| 2 |
|
|
| 3 |
/* |
/* |
| 4 |
* Author : Tom MacWright, Young Hahn |
* Author : Tom MacWright, Young Hahn |
| 5 |
* Revision : $Id: word2web.module,v 1.6 2008/07/24 15:33:12 tmcw Exp $ |
* Revision : $Id: word2web.module,v 1.6.2.2 2009/01/22 00:27:32 sumit Exp $ |
| 6 |
*/ |
*/ |
| 7 |
|
|
| 8 |
/** |
/** |
| 243 |
$validators = array( |
$validators = array( |
| 244 |
); |
); |
| 245 |
if ($html_file = file_save_upload('word_document', $validators)) { |
if ($html_file = file_save_upload('word_document', $validators)) { |
| 246 |
$path = drupal_get_path('module', 'word2web'); |
// Include our helper file. |
| 247 |
$html_raw = file_get_contents($html_file->filepath); |
module_load_include('inc', 'word2web'); |
| 248 |
$html_raw = _word2web_convert_chr($html_raw); |
$html = _word2web_filter(file_get_contents($html_file->filepath)); |
|
set_error_handler('_word2web_suppress_errors'); |
|
|
|
|
|
preg_match('/charset=([\w-]+)/', $html_raw, $matches); |
|
|
|
|
|
if ($matches[1] == 'windows-1256') { |
|
|
$html_raw = iconv('windows-1256', 'utf-8', $html_raw); |
|
|
} |
|
|
if ($matches[1] == 'windows-1252') { |
|
|
$html_raw = iconv('windows-1252', 'utf-8', $html_raw); |
|
|
} |
|
|
$html = new DOMDocument(); |
|
|
$html->loadHTML($html_raw); |
|
|
$images = $html->getElementsByTagName("img"); |
|
|
$vimages = $html->getElementsByTagName("v:imagedata"); |
|
|
/* |
|
|
* This little bit changes img tags into |
|
|
* image tags so that they aren't wiped out by the (validating) |
|
|
* XSL transformations. It also collects a list of URLs of images |
|
|
* so that we'll know what to fetch from the user |
|
|
*/ |
|
|
foreach ($images as $im) { |
|
|
$image_node = $html->createElement("image"); |
|
|
$image_node->appendChild($html->createTextNode(" ")); |
|
|
$image_node->setAttribute("src", $im->getAttribute("src")); |
|
|
$im->parentNode->insertBefore($image_node, $im); |
|
|
} |
|
|
$html = $html->saveXML(); |
|
|
$html = iconv("UTF-8", "UTF-8//IGNORE", $html); |
|
|
$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); |
|
|
$html = _word2web_xslt_transform($html, $path .'/empty.xsl'); |
|
|
$html = _word2web_xslt_transform($html, $path .'/w2html.xslt'); |
|
|
// Somehow /> is getting produced at certain places in the document -- let's take them out |
|
|
$html = str_replace('/>', '', $html); |
|
|
restore_error_handler(); |
|
| 249 |
|
|
| 250 |
if (variable_get('word2web_filter', 0)) { |
if (variable_get('word2web_filter', 0)) { |
| 251 |
$node->format = variable_get('word2web_filter', 0); |
$node->format = variable_get('word2web_filter', 0); |
| 303 |
*/ |
*/ |
| 304 |
|
|
| 305 |
function word2web_manual($html_raw) {
  // Swap every <img> for an <addr class='image'> placeholder that carries the
  // same attributes and a "-" text child. A trailing "/" from an XHTML-style
  // <img ... /> is dropped from the copied attributes: left in place it would
  // self-close the placeholder and strand the "-</addr>" that follows.
  $html_raw = preg_replace('/<(img)([^>]*?)\/?>/mi',
    "<addr class='image' $2>-</addr>",
    $html_raw);

  // Include our helper file.
  // NOTE(review): _word2web_filter() is not defined in this file; presumably
  // it lives in the include loaded here -- confirm.
  module_load_include('inc', 'word2web');
  return _word2web_filter($html_raw);
}
| 315 |
|
|
| 316 |
/** |
/** |
| 373 |
} |
} |
| 374 |
|
|
| 375 |
/**
 * Utility function to replace microsoft "smart" characters with plain ASCII
 * equivalents. From http://shiflett.org/blog/2005/oct/convert-smart-quotes-with-php
 *
 * The replacement works on single bytes (145-148 and 151), so it is only
 * applied when the input is not already valid UTF-8. In UTF-8 those byte
 * values occur as continuation bytes of multi-byte characters, and rewriting
 * them would corrupt the text.
 *
 * @param $string
 *   The raw text to clean up.
 *
 * @return
 *   The text with curly single quotes, curly double quotes and the dash byte
 *   replaced by ', " and - respectively; returned unchanged if it is valid
 *   UTF-8.
 */
function _word2web_convert_chr($string) {
  // Valid UTF-8 (which includes pure ASCII) never contains these values as
  // stand-alone characters, so there is nothing safe to replace.
  if (is_string($string) && mb_check_encoding($string, 'UTF-8')) {
    return $string;
  }

  $search = array(
    chr(145),
    chr(146),
    chr(147),
    chr(148),
    chr(151)
  );

  $replace = array(
    "'",
    "'",
    '"',
    '"',
    '-'
  );

  return str_replace($search, $replace, $string);
}
|
|
|
|
|
/** |
|
| 376 |
* Utility function for hiding errors, esp. on loading XML from bad HTML input |
* Utility function for hiding errors, esp. on loading XML from bad HTML input |
| 377 |
*/ |
*/ |
| 378 |
function _word2web_suppress_errors() {
  // Intentionally empty. When installed with set_error_handler() this
  // callback returns nothing (not FALSE), so PHP treats the error as handled
  // and does not fall through to its standard handler.
}
|
|
|
|
/**
 * Utility function for crunching XML through XSLT.
 *
 * @param $xml
 *   Either a string of markup, which is parsed with DOMDocument::loadHTML(),
 *   or an object, which is handed to the processor as-is (assumed to be a
 *   DOMDocument or compatible node -- TODO confirm against callers).
 * @param $xsl_file
 *   Path of the XSLT stylesheet to load.
 * @param $params
 *   Optional array of stylesheet parameters, keyed by parameter name. They
 *   are set in the default (empty) namespace.
 *
 * @return
 *   The result of XSLTProcessor::transformToXML(), or FALSE when the
 *   stylesheet cannot be loaded.
 */
function _word2web_xslt_transform($xml, $xsl_file, $params = array()) {
  // Load the specified stylesheet. Bail out early if it cannot be read rather
  // than importing an empty document and transforming against nothing.
  $xsl = new DOMDocument();
  if (!$xsl->load($xsl_file)) {
    return FALSE;
  }

  $xslt = new XSLTProcessor();
  $xslt->importStylesheet($xsl);

  // Hand any caller-supplied parameters to the stylesheet.
  if (is_array($params) && !empty($params)) {
    $xslt->setParameter('', $params);
  }

  // Check whether input is string or object. Strings go through the lenient
  // HTML parser (without DOMDocument charsets: french breaks, arabic remains
  // broken).
  if (!is_object($xml)) {
    $x = new DOMDocument();
    $x->loadHTML($xml);
  }
  else {
    $x = $xml;
  }

  // Return transformed xml.
  return $xslt->transformToXML($x);
}
|
|
|
|