| 1 |
|
<?php |
| 2 |
|
// $Id$ |
| 3 |
|
/**
 * @file
 * Helper functions for the standard word2web modules.
 *
 * TODO: It'd be nice to allow users to toggle between simple quotes and the
 * HTML entities as shown on
 * http://shiflett.org/blog/2005/oct/convert-smart-quotes-with-php
 */
| 10 |
|
|
| 11 |
|
/**
 * Replaces Microsoft "smart" punctuation with plain ASCII equivalents.
 *
 * Curly single quotes become ', curly double quotes become " and the em dash
 * becomes -. Based on
 * http://shiflett.org/blog/2005/oct/convert-smart-quotes-with-php
 *
 * The Windows-1252 bytes for these characters (145-148 and 151) are also
 * legal continuation bytes inside UTF-8 multi-byte sequences, so replacing
 * them blindly corrupts UTF-8 text (for example Arabic U+0651, encoded as
 * D9 91). The encoding is therefore detected first: input that is well-formed
 * UTF-8 has the UTF-8 encoded smart characters replaced, anything else is
 * treated as single-byte Windows-1252 data and handled byte-wise.
 *
 * @param $string
 *   Text in either Windows-1252 or UTF-8.
 * @return
 *   The text with the five smart characters replaced by ASCII characters.
 */
function _word2web_convert_chr($string) {
  // Replacements, in the same order as both search lists below.
  $replace = array("'", "'", '"', '"', '-');

  // preg_match() with the "u" modifier only returns 1 for well-formed UTF-8.
  // Pure ASCII also lands here, which is harmless: nothing will match.
  // A Windows-1252 string that happens to be valid UTF-8 is ambiguous; the
  // UTF-8 reading is preferred as it is by far the more likely one.
  if (is_string($string) && preg_match('//u', $string) === 1) {
    $search = array(
      "\xE2\x80\x98", // U+2018 left single quotation mark.
      "\xE2\x80\x99", // U+2019 right single quotation mark.
      "\xE2\x80\x9C", // U+201C left double quotation mark.
      "\xE2\x80\x9D", // U+201D right double quotation mark.
      "\xE2\x80\x94", // U+2014 em dash.
    );
  }
  else {
    // Single-byte Windows-1252 code points of the same five characters.
    $search = array(
      chr(145),
      chr(146),
      chr(147),
      chr(148),
      chr(151),
    );
  }

  return str_replace($search, $replace, $string);
}
| 34 |
|
|
| 35 |
|
/**
 * Helper function that strips out MS Word tags.
 *
 * The markup is transcoded to UTF-8 when it declares a known Windows code
 * page, image tags are optionally preserved, non-ASCII characters are turned
 * into HTML entities and the result is run through the module's two XSL
 * stylesheets (empty.xsl, then w2html.xslt).
 *
 * @param $html
 *   A raw HTML string containing MS Word tags.
 * @param $strip_images
 *   Optional. Boolean value that triggers the stripping of image tags.
 * @return
 *   Cleaned up HTML. An empty string is returned for empty input.
 */
function _word2web_filter($html, $strip_images = FALSE) {
  // Nothing to clean. This also keeps an empty string away from
  // DOMDocument::loadHTML(), which rejects empty input on PHP 8.
  if ((string) $html === '') {
    return '';
  }

  // This is useful but breaks other utf8 characters.
  // $html = _word2web_convert_chr($html);

  // Transcode the Windows code pages we know about to UTF-8. The declared
  // charset is only read when one was actually found, and it is compared
  // case-insensitively because charset names are not case-sensitive.
  $charset = '';
  if (preg_match('/charset=([\w-]+)/', $html, $matches)) {
    $charset = strtolower($matches[1]);
  }
  if (in_array($charset, array('windows-1256', 'windows-1252'), TRUE)) {
    $converted = iconv($charset, 'utf-8', $html);
    // iconv() returns FALSE on failure; keep the untouched markup in that
    // case instead of replacing the whole document with FALSE.
    if ($converted !== FALSE) {
      $html = $converted;
    }
  }

  // If we want to strip images we just skip converting them.
  if (!$strip_images) {
    // Convert MS Word image tags into more HTML standard tags so they aren't
    // filtered out below. They still exist in the HTML for now.
    $html = _word2web_covert_image_tags($html);
  }

  // Drop byte sequences that are not valid UTF-8, then express every
  // non-ASCII character as an HTML entity so the XSL steps receive ASCII.
  $cleaned = iconv('UTF-8', 'UTF-8//IGNORE', $html);
  if ($cleaned !== FALSE) {
    $html = $cleaned;
  }
  // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of PHP 8.2;
  // left unchanged here to preserve the exact output.
  $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');

  // Apply our XSL transformations.
  $path = drupal_get_path('module', 'word2web') . '/';
  $html = _word2web_xslt_transform($html, $path . 'empty.xsl');
  $html = _word2web_xslt_transform($html, $path . 'w2html.xslt');

  return $html;
}
| 75 |
|
|
| 76 |
|
/**
 * Finds img tags and adds an equivalent image tag in front of each of them.
 *
 * Note: Old tag is not removed and expected to be cleaned up by the caller.
 *
 * @param $html
 *   HTML string containing MS Word tags.
 * @return
 *   The whole parsed document serialized with DOMDocument::saveXML(), or an
 *   empty string when the input is empty.
 */
function _word2web_covert_image_tags($html) {
  // DOMDocument::loadHTML() rejects empty input (ValueError on PHP 8), and
  // there is nothing to convert anyway.
  if ((string) $html === '') {
    return '';
  }

  $html_dom = new DOMDocument();

  // This is guaranteed not to be entirely valid HTML, so collect the parser
  // complaints internally instead of emitting them as PHP warnings.
  $previous = libxml_use_internal_errors(TRUE);
  $html_dom->loadHTML($html);
  if (!$previous) {
    // Nobody asked for these errors; do not let them pile up.
    libxml_clear_errors();
  }
  libxml_use_internal_errors($previous);

  // TODO: do something with v:imagedata information?

  // Put an <image> twin carrying the same src in front of every <img> so the
  // picture isn't wiped out by the (validating) XSL transformations. The
  // inserted elements have a different tag name, so the 'img' node list being
  // iterated is not affected by the insertions.
  $images = $html_dom->getElementsByTagName('img');
  foreach ($images as $im) {
    $image_node = $html_dom->createElement('image');
    $image_node->setAttribute('src', $im->getAttribute('src'));
    $im->parentNode->insertBefore($image_node, $im);
  }

  return $html_dom->saveXML();
}
| 108 |
|
|
| 109 |
|
/**
 * Utility function for crunching XML through XSLT.
 *
 * @param $xml
 *   Either a markup string, which is parsed leniently as HTML, or an already
 *   built DOM object, which is transformed as is.
 * @param $xsl_file
 *   Path of the XSL stylesheet to apply.
 * @param $params
 *   Optional. Associative array of stylesheet parameters (name => value),
 *   set in the default namespace.
 * @return
 *   The transformation result as a string, or FALSE when the stylesheet
 *   cannot be loaded or the transformation fails.
 */
function _word2web_xslt_transform($xml, $xsl_file, $params = array()) {
  // Load the specified stylesheet and set any parameters.
  // Without Domdocument charsets: french breaks, arabic remains broken
  $xsl = new DOMDocument();
  if (!$xsl->load($xsl_file)) {
    return FALSE;
  }
  $xslt = new XSLTProcessor();
  if (!$xslt->importStylesheet($xsl)) {
    return FALSE;
  }
  // Hand the caller's parameters to the stylesheet; previously the argument
  // was accepted but silently ignored.
  if (!empty($params)) {
    $xslt->setParameter('', $params);
  }

  // Check whether input is string or object.
  if (!is_object($xml)) {
    $x = new DOMDocument();
    $markup = (string) $xml;
    // DOMDocument::loadHTML() rejects empty input (ValueError on PHP 8); an
    // empty string simply leaves the document empty.
    if ($markup !== '') {
      // This is guaranteed not to be entirely valid HTML, so collect the
      // parser complaints internally instead of emitting PHP warnings.
      $previous = libxml_use_internal_errors(TRUE);
      $x->loadHTML($markup);
      if (!$previous) {
        libxml_clear_errors();
      }
      libxml_use_internal_errors($previous);
    }
  }
  else {
    $x = $xml;
  }

  // Return transformed xml.
  return $xslt->transformToXML($x);
}
| 133 |
|
|