/[drupal]/contributions/modules/word2web/word2web.inc
ViewVC logotype

Diff of /contributions/modules/word2web/word2web.inc

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph | View Patch Patch

revision 1.1, Tue Nov 10 18:17:55 2009 UTC revision 1.1.2.1, Tue Nov 10 18:17:55 2009 UTC
# Line 0  Line 1 
1    <?php
2    // $Id$
3    /**
4     * @file
5     * Helper functions for the standard word2web modules.
6     *
7     * TODO: It'd be nice to allow users to toggle between simple quotes and the
8     * HTML entities as shown on http://shiflett.org/blog/2005/oct/convert-smart-quotes-with-php
9     */
10    
/**
 * Replaces Microsoft "smart" punctuation bytes with plain ASCII equivalents.
 *
 * Based on http://shiflett.org/blog/2005/oct/convert-smart-quotes-with-php
 *
 * The bytes handled are the Windows-1252 code points 145-148 (curly single
 * and double quotes) and 151 (em dash). They become "'", '"' and '-'.
 *
 * @param $string
 *   Text that may contain single-byte Windows-1252 smart characters.
 * @return
 *   The text with those bytes replaced. Input that is already valid UTF-8 is
 *   returned unchanged.
 */
function _word2web_convert_chr($string) {
  // Bytes 0x91-0x97 are also legal continuation bytes inside multibyte UTF-8
  // sequences (for example "Ñ" is 0xC3 0x91), so a blind byte replacement
  // corrupts UTF-8 text. A bare smart-quote byte is never valid UTF-8 by
  // itself, so well-formed UTF-8 input is passed through untouched.
  if (preg_match('//u', $string) === 1) {
    return $string;
  }

  $search = array(
    chr(145),
    chr(146),
    chr(147),
    chr(148),
    chr(151),
  );

  $replace = array(
    "'",
    "'",
    '"',
    '"',
    '-',
  );

  return str_replace($search, $replace, $string);
}
34    
/**
 * Helper function that strips out MS Word tags.
 *
 * The document is converted to UTF-8 when it declares a windows-1252 or
 * windows-1256 charset, image tags are optionally preserved, non-ASCII
 * characters are turned into HTML entities, and the result is run through the
 * module's two XSL stylesheets (empty.xsl, then w2html.xslt).
 *
 * @param $html
 *   A raw HTML string containing MS Word tags.
 * @param $strip_images
 *   Optional. Boolean value that triggers the stripping of image tags.
 * @return
 *   Cleaned up HTML, as returned by the final XSL transformation.
 */
function _word2web_filter($html, $strip_images = FALSE) {

  // Smart-quote replacement is intentionally left disabled here.
  // This is useful but breaks other utf8 characters.
  //  $html = _word2web_convert_chr($html);

  // Look for a declared charset. $matches is only read when the pattern
  // actually matched, so documents without a charset declaration no longer
  // trigger an undefined-index error.
  $charset = '';
  if (preg_match('/charset=([\w-]+)/', $html, $matches)) {
    // Charset names are case-insensitive ("Windows-1252" == "windows-1252").
    $charset = strtolower($matches[1]);
  }
  if ($charset == 'windows-1256' || $charset == 'windows-1252') {
    $converted = iconv($charset, 'utf-8', $html);
    // iconv() returns FALSE on failure; keep the original markup in that case
    // instead of replacing the whole document with FALSE.
    if ($converted !== FALSE) {
      $html = $converted;
    }
  }

  // If we want to strip images we just skip converting them.
  if (!$strip_images) {
    // Convert MS Word image tags into more HTML standard tags so they aren't
    // filtered out below. They still exist in the html for now.
    $html = _word2web_covert_image_tags($html);
  }

  // Drop byte sequences that are not valid UTF-8. On some platforms iconv()
  // with //IGNORE returns FALSE when it meets an illegal sequence, so fall
  // back to mbstring rather than losing the whole document.
  $clean = iconv('UTF-8', 'UTF-8//IGNORE', $html);
  if ($clean === FALSE) {
    $clean = mb_convert_encoding($html, 'UTF-8', 'UTF-8');
  }
  // Encode everything outside ASCII as HTML entities so the XSL step receives
  // ASCII-only input.
  // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of PHP 8.2;
  // revisit when raising the minimum PHP version.
  $html = mb_convert_encoding($clean, 'HTML-ENTITIES', 'UTF-8');

  // Apply our XSL transformations.
  $path = drupal_get_path('module', 'word2web') . '/';
  $html = _word2web_xslt_transform($html, $path .'empty.xsl');
  $html = _word2web_xslt_transform($html, $path .'w2html.xslt');

  return $html;
}
75    
/**
 * Finds word image tags and converts them into html style image tags.
 *
 * For every <img> element an <image> element carrying the same src attribute
 * is inserted immediately before it. Only the src attribute is copied.
 *
 * Note: Old tag is not removed and expected to be cleaned up by the caller.
 *
 * @param $html
 *   HTML string containing MS Word tags.
 * @return
 *   The parsed document serialized with DOMDocument::saveXML(), so it
 *   includes the XML declaration added by that method.
 */
function _word2web_covert_image_tags($html) {
  $html_dom = new DOMDocument();

  // loadHTML() rejects an empty string (an exception on PHP 8, which "@"
  // cannot silence), so only parse when there is something to parse. An empty
  // input then serializes as an empty document, as it did on older PHP.
  if ((string) $html !== '') {
    // This is guaranteed not to be entirely valid HTML, so collect the parser
    // errors internally and discard them instead of emitting warnings.
    $previous = libxml_use_internal_errors(TRUE);
    $html_dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
  }

  // TODO do something with v:imagedata information?
  // $vimages = $html_dom->getElementsByTagName("v:imagedata");

  /*
   * This little bit changes img tags into image tags so that they aren't wiped
   * out by the (validating) XSL transformations.
   */
  // getElementsByTagName() returns a live list; take a snapshot so inserting
  // nodes while looping cannot disturb the iteration.
  $images = iterator_to_array($html_dom->getElementsByTagName('img'));
  foreach ($images as $im) {
    $image_node = $html_dom->createElement('image');
    $image_node->setAttribute('src', $im->getAttribute('src'));
    $im->parentNode->insertBefore($image_node, $im);
  }
  return $html_dom->saveXML();
}
108    
/**
 * Utility function for crunching XML through XSLT.
 *
 * @param $xml
 *   Either an HTML string, which is parsed leniently with
 *   DOMDocument::loadHTML(), or an already built document object, which is
 *   handed to the processor as is.
 * @param $xsl_file
 *   Path of the XSL stylesheet to apply.
 * @param $params
 *   Optional. Associative array of stylesheet parameters (name => value),
 *   set in the default (empty) namespace.
 * @return
 *   The transformation result as returned by XSLTProcessor::transformToXML(),
 *   or FALSE when the stylesheet cannot be loaded or imported.
 */
function _word2web_xslt_transform($xml, $xsl_file, $params = array()) {
  // Load specified stylesheet and set any parameters.
  // Original author's note: without DOMDocument charsets, French breaks and
  // Arabic remains broken.
  $xsl = new DOMDocument();
  if (!$xsl->load($xsl_file)) {
    return FALSE;
  }
  $xslt = new XSLTProcessor();
  if (!$xslt->importStylesheet($xsl)) {
    return FALSE;
  }
  // Previously $params was accepted but silently ignored.
  if (is_array($params) && !empty($params)) {
    $xslt->setParameter('', $params);
  }

  // Check whether input is string or object.
  if (is_object($xml)) {
    $x = $xml;
  }
  else {
    $x = new DOMDocument();
    // loadHTML() rejects an empty string (an exception on PHP 8, which "@"
    // cannot silence). Empty or FALSE input, e.g. the result of a failed
    // earlier transformation, is therefore left as an empty document.
    if ((string) $xml !== '') {
      // This is guaranteed not to be entirely valid HTML, so collect the
      // parser errors internally and discard them instead of emitting
      // warnings.
      $previous = libxml_use_internal_errors(TRUE);
      $x->loadHTML($xml);
      libxml_clear_errors();
      libxml_use_internal_errors($previous);
    }
  }

  // Return transformed xml.
  return $xslt->transformToXML($x);
}
133    

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.1.2.1

  ViewVC Help
Powered by ViewVC 1.1.2