/[drupal]/contributions/modules/word2web/word2web.module
ViewVC logotype

Diff of /contributions/modules/word2web/word2web.module

Parent Directory | Revision Log | View Revision Graph | View Patch

revision 1.6.2.2, Thu Jan 22 00:27:32 2009 UTC revision 1.6.2.3, Tue Nov 10 18:17:55 2009 UTC
# Line 2  Line 2 
2    
3  /*  /*
4   * Author   : Tom MacWright, Young Hahn   * Author   : Tom MacWright, Young Hahn
5   * Revision : $Id: word2web.module,v 1.6 2008/07/24 15:33:12 tmcw Exp $   * Revision : $Id: word2web.module,v 1.6.2.2 2009/01/22 00:27:32 sumit Exp $
6   */   */
7    
8  /**  /**
# Line 243  function word2web_nodeapi(&$node, $op) { Line 243  function word2web_nodeapi(&$node, $op) {
243    $validators = array(    $validators = array(
244    );    );
245        if ($html_file = file_save_upload('word_document', $validators)) {        if ($html_file = file_save_upload('word_document', $validators)) {
246          $path = drupal_get_path('module', 'word2web');          // Include our helper file.
247          $html_raw = file_get_contents($html_file->filepath);          module_load_include('inc', 'word2web');
248          $html_raw = _word2web_convert_chr($html_raw);          $html = _word2web_filter(file_get_contents($html_file->filepath));
         set_error_handler('_word2web_suppress_errors');  
   
         preg_match('/charset=([\w-]+)/', $html_raw, $matches);  
   
         if ($matches[1] == 'windows-1256') {  
           $html_raw = iconv('windows-1256', 'utf-8', $html_raw);  
         }  
         if ($matches[1] == 'windows-1252') {  
           $html_raw = iconv('windows-1252', 'utf-8', $html_raw);  
         }  
         $html = new DOMDocument();  
         $html->loadHTML($html_raw);  
         $images = $html->getElementsByTagName("img");  
         $vimages = $html->getElementsByTagName("v:imagedata");  
         /*  
          * This little bit changes img tags into  
          * image tags so that they aren't wiped out by the (validating)  
          * XSL transformations. It also collects a list of URLs of images  
          * so that we'll know what to fetch from the user  
          */  
         foreach ($images as $im) {  
           $image_node = $html->createElement("image");  
           $image_node->appendChild($html->createTextNode(" "));  
           $image_node->setAttribute("src", $im->getAttribute("src"));  
           $im->parentNode->insertBefore($image_node, $im);  
         }  
         $html = $html->saveXML();  
         $html = iconv("UTF-8", "UTF-8//IGNORE", $html);  
         $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");  
         $html = _word2web_xslt_transform($html, $path .'/empty.xsl');  
         $html = _word2web_xslt_transform($html, $path .'/w2html.xslt');  
         // Somehow /> is getting produced at certain places in the document -- let's take them out  
         $html = str_replace('/>', '', $html);  
         restore_error_handler();  
249    
250          if (variable_get('word2web_filter', 0)) {          if (variable_get('word2web_filter', 0)) {
251            $node->format = variable_get('word2web_filter', 0);            $node->format = variable_get('word2web_filter', 0);
# Line 337  function _word2web_get_images($c) { Line 303  function _word2web_get_images($c) {
303   */   */
304    
305  function word2web_manual($html_raw) {  function word2web_manual($html_raw) {
   $html_raw = _word2web_convert_chr($html_raw);  
   preg_match('/charset=([\w-]+)/', $html_raw, $matches);  
306    
   if ($matches[1] == 'windows-1256') {  
       $html_raw = iconv('windows-1256', 'utf-8', $html_raw);  
     }  
   if ($matches[1] == 'windows-1252') {  
       $html_raw = iconv('windows-1252', 'utf-8', $html_raw);  
     }  
   $path = drupal_get_path('module', 'word2web');  
   set_error_handler('_word2web_suppress_errors');  
   $html = $html_raw;  
307    $html_raw = preg_replace("/<(img)([^>]*)>/mi",    $html_raw = preg_replace("/<(img)([^>]*)>/mi",
308    "<addr class='image' $2>-</addr>",    "<addr class='image' $2>-</addr>",
309    $html_raw);    $html_raw);
310    
311    //preg_match('/charset=([\w-]+)/', $html_raw, $matches);    // Include our helper file.
312    // This step apparently cleans up the XML a little.    module_load_include('inc', 'word2web');
313    $html = new DOMDocument();    return _word2web_filter($html_raw);
   $html->loadHTML($html_raw);  
   //echo "charset? ".$charset;  
   $html = $html->saveXML();  
   // Normalizes some odd utf-8 characters  
   
   // Convert from the charset it says it is into UTF-8  
   //$html = iconv($matches[1], "UTF-8//IGNORE", $html);  
   $html = iconv("UTF-8", "UTF-8//IGNORE", $html);  
         $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");  
   // Actually run the xsl transformations  
   $html = _word2web_xslt_transform($html, $path .'/empty.xsl');  
   
   $html = _word2web_xslt_transform($html, $path .'/w2html.xslt');  
   restore_error_handler();  
   $html = str_replace('/&gt;', '', $html);  
   return $html;  
314  }  }
315    
316  /**  /**
# Line 434  function word2web_settings_form() { Line 373  function word2web_settings_form() {
373  }  }
374    
375  /**  /**
  * Utility function to convert Microsoft "smart" punctuation characters into
  * plain ASCII equivalents. From http://shiflett.org/blog/2005/oct/convert-smart-quotes-with-php
  */  
 function _word2web_convert_chr($string) {  
   $search = array(  
     chr(145),  
     chr(146),  
     chr(147),  
     chr(148),  
     chr(151)  
   );  
   
   $replace = array(  
     "'",  
     "'",  
     '"',  
     '"',  
     '-'  
   );  
   
   return str_replace($search, $replace, $string);  
 }  
   
 /**  
376   * Utility function for hiding errors, esp. on loading XML from bad HTML input   * Utility function for hiding errors, esp. on loading XML from bad HTML input
377   */   */
378  function _word2web_suppress_errors() { }  function _word2web_suppress_errors() { }
   
 /**  
  * Utility function for crunching XML through XSLT  
  */  
 function _word2web_xslt_transform($xml, $xsl_file, $params = array()) {  
   // load specified stylesheet and set any parameters  
   // Without Domdocument charsets: french breaks, arabic remains broken  
   //  
   $xsl = new DOMDocument();  
   $xsl->load($xsl_file);  
   $xslt = new XSLTProcessor();  
   $xslt->importStylesheet($xsl);  
   // check whether input is string or object  
   if (!is_object($xml)) {  
     $x = new DOMDocument();  
     $x->loadHTML($xml);  
   }  
   else {  
     $x = $xml;  
   }  
   // return transformed xml  
   return $xslt->transformToXML($x);  
 }  
   

Legend:
Removed from v.1.6.2.2  
changed lines
  Added in v.1.6.2.3

  ViewVC Help
Powered by ViewVC 1.1.2