/[drupal]/contributions/modules/import_html/import_html_process.inc
ViewVC logotype

Diff of /contributions/modules/import_html/import_html_process.inc

Parent Directory | Revision Log | View Revision Graph | View Patch

revision 1.5.4.16, Wed Sep 9 06:47:22 2009 UTC revision 1.5.4.17, Tue Oct 6 14:09:02 2009 UTC
# Line 1  Line 1 
1  <?php  <?php
2  // $Id: import_html_process.inc,v 1.5.4.15 2009/07/01 02:40:56 dman Exp $  // $Id: import_html_process.inc,v 1.5.4.16 2009/09/09 06:47:22 dman Exp $
3  /**  /**
4   * @file Actual routines for importing files.   * @file Actual routines for importing files.
5   *   *
6   *   *
7   * @ingroup import_html Import HTML   * @ingroup import_html Import HTML
8   * @author Dan Morrison http://coders.co.nz/   * @author Dan Morrison http://coders.co.nz/
9   *   *
# Line 15  module_load_include('inc', 'import_html' Line 15  module_load_include('inc', 'import_html'
15    
16  /**  /**
17   * Files have been selected, set them up for processing   * Files have been selected, set them up for processing
18   *   *
19   * @param $file_list   * @param $file_list
20   *   an  array of simple file paths, probably selected from the file_list form   *   an  array of simple file paths, probably selected from the file_list form
21   * @param $context   * @param $context
22   *   A set of parameters, similar to the import_html profile, possibly from   *   A set of parameters, similar to the import_html profile, possibly from
23   *   the list_filesystem form. Should contain the base path that the   *   the list_filesystem form. Should contain the base path that the
24   *   submitted files are relative to.   *   submitted files are relative to.
25   * Note that context is NOT a full profile.   * Note that context is NOT a full profile.
26   *   *
27   * @return A result set of nodes   * @return A result set of nodes
28   */   */
29  function import_html_import_files($file_list, $context) {  function import_html_import_files($file_list, $context) {
# Line 39  function import_html_import_files($file_ Line 39  function import_html_import_files($file_
39    }    }
40    
41    // TODO see what we can do about clearing out our memory    // TODO see what we can do about clearing out our memory
42    
43    $results = array();    $results = array();
44    foreach ($file_list as $list_index => $rel_path) {    foreach ($file_list as $list_index => $rel_path) {
45      if (! empty($rel_path)) {      if (! empty($rel_path)) {
46        $file_results = import_html_import_file($rel_path, $context);        $file_results = import_html_import_file($rel_path, $context);
47        if (! $file_results) {        if (! $file_results) {
48          drupal_set_message(t('          drupal_set_message(t('
49            Failed to get any results from the attempted analysis of %rel_path.            Failed to get any results from the attempted analysis of %rel_path.
50            The source file path was probably unavailable or incorrect.',            The source file path was probably unavailable or incorrect.',
51            array('%rel_path' => $rel_path)), 'error');            array('%rel_path' => $rel_path)), 'error');
52          continue;          continue;
53        }        }
54        // Result of importing a file MAY be more than one node,        // Result of importing a file MAY be more than one node,
55        // unlikely as it may be for XHTML, but is possible for XML extension        // unlikely as it may be for XHTML, but is possible for XML extension
56        import_html_debug_code("Result of processing file $rel_path", $file_results, WATCHDOG_DEBUG);        import_html_debug_code("Result of processing file $rel_path", $file_results, WATCHDOG_DEBUG);
57        foreach ($file_results as $node) {        foreach ($file_results as $node) {
# Line 75  function import_html_import_files($file_ Line 75  function import_html_import_files($file_
75    
76  /**  /**
77   * Given a html file, prepare all the node info we can get out of it.   * Given a html file, prepare all the node info we can get out of it.
78   *   *
79   * This func mainly prepares the paths and relative links. Data extraction happens in _import_html_process_html_page()   * This func mainly prepares the paths and relative links. Data extraction happens in _import_html_process_html_page()
80   *   *
81   * It does submit and save the node to the database.   * It does submit and save the node to the database.
82   *   *
83   * @param $context   * @param $context
84   *   describes the context this function was called in. It should contain   *   describes the context this function was called in. It should contain
85   * 'profile_id' and 'source_siteroot'. Also 'form_id'   * 'profile_id' and 'source_siteroot'. Also 'form_id'
86   *   *
87   * @return array that may contain more than one node (in extreme cases)   * @return array that may contain more than one node (in extreme cases)
88   */   */
89  function import_html_import_file($rel_path, $context) {  function import_html_import_file($rel_path, $context) {
90    
91    // Read the profile id and use that as a context for all settings    // Read the profile id and use that as a context for all settings
92    $profile = import_html_profile($context['profile_id']);    $profile = import_html_profile($context['profile_id']);
93    
94    $source_siteroot = $context['source_siteroot'];    $source_siteroot = $context['source_siteroot'];
95    $dest_root = ensure_trailing_slash($profile['file_storage_path']);    $dest_root = ensure_trailing_slash($profile['file_storage_path']);
96    $is_remote = valid_url($source_siteroot, TRUE);    $is_remote = valid_url($source_siteroot, TRUE);
97    
98    import_html_debug(    import_html_debug(
99      "<strong>Importing</strong> '%rel_path'",      "<strong>Importing</strong> '%rel_path'",
100      array('%rel_path' => $rel_path),      array('%rel_path' => $rel_path),
101      WATCHDOG_NOTICE      WATCHDOG_NOTICE
102    );    );
103    
104    $source_path = $source_siteroot . $rel_path;    $source_path = $source_siteroot . $rel_path;
# Line 108  function import_html_import_file($rel_pa Line 108  function import_html_import_file($rel_pa
108    // Handle URLS/Folders with trailing slash    // Handle URLS/Folders with trailing slash
109    if (preg_match("|/$|", $rel_path)) {    if (preg_match("|/$|", $rel_path)) {
110      // Handle trailing slashes differently at home and away      // Handle trailing slashes differently at home and away
111      if ($is_remote) {      if ($is_remote) {
112        // It's remote        // It's remote
113        $default_documents = split(",", $profile['default_document']);        $default_documents = split(",", $profile['default_document']);
114        // need a dummy filename if retrieving default docs.        // need a dummy filename if retrieving default docs.
115        $dest_path .= trim(array_shift($default_documents));;        $dest_path .= trim(array_shift($default_documents));;
116      }      }
117      else {      else {
118        return; //skip local directories altogether (their contents is selected individually)        return; //skip local directories altogether (their contents is selected individually)
119      }      }
120    }    }
121    
122    $file = array(    $file = array(
123      'source' => $source_path,      'source' => $source_path,
124      'dest' => $dest_path,      'dest' => $dest_path,
# Line 132  function import_html_import_file($rel_pa Line 132  function import_html_import_file($rel_pa
132      // non-page resource - what sort of processing can I do here?      // non-page resource - what sort of processing can I do here?
133    
134      import_html_debug(      import_html_debug(
135        "I think (due to file suffix '%doctype')        "I think (due to file suffix '%doctype')
136          that '%source' is not a html page I can process.<br/>          that '%source' is not a html page I can process.<br/>
137          It's just been copied into '!dest'.",          It's just been copied into '!dest'.",
138        array(        array(
139          '%source' => basename($source_path),          '%source' => basename($source_path),
140          '!dest' => l($dest_path, $dest_path),          '!dest' => l($dest_path, $dest_path),
141          '%doctype' => import_html_guess_file_class($checkfile),          '%doctype' => import_html_guess_file_class($checkfile),
142        ),        ),
143        WATCHDOG_NOTICE        WATCHDOG_NOTICE
144      );      );
145    
146      import_html_get_raw_file_local($source_path, $dest_path, $is_remote);      import_html_get_raw_file_local($source_path, $dest_path, $is_remote);
# Line 158  function import_html_import_file($rel_pa Line 158  function import_html_import_file($rel_pa
158      if ($profile['handle_duplicates'] == IMPORT_HTML_SKIP) {      if ($profile['handle_duplicates'] == IMPORT_HTML_SKIP) {
159    
160        import_html_debug(        import_html_debug(
161          "We already have '%new_path' in the system as '%normal_path'.          "We already have '%new_path' in the system as '%normal_path'.
162            According to import_html settings, this import is being skipped",            According to import_html settings, this import is being skipped",
163          array(          array(
164            '%new_path' => $new_path,            '%new_path' => $new_path,
165            '%normal_path' => $normal_path,            '%normal_path' => $normal_path,
166          ),          ),
167          WATCHDOG_INFO          WATCHDOG_INFO
168        );        );
169    
170        return;        return;
171      }      }
172      import_html_debug(      import_html_debug(
173        "We already have '%new_path' in the system as '%normal_path'.        "We already have '%new_path' in the system as '%normal_path'.
174          Overwriting/updating it with the new import",          Overwriting/updating it with the new import",
175        array(        array(
176          '%new_path' => $new_path,          '%new_path' => $new_path,
177          '%normal_path' => $normal_path,          '%normal_path' => $normal_path,
178        ),        ),
179        WATCHDOG_INFO        WATCHDOG_INFO
180      );      );
181    }    }
182    
# Line 198  function import_html_import_file($rel_pa Line 198  function import_html_import_file($rel_pa
198        import_html_debug(        import_html_debug(
199          "Failed to fetch a copy of %source_path into %dest_path",          "Failed to fetch a copy of %source_path into %dest_path",
200          array('%dest_path' => $dest_path, '%source_path' => $source_path),          array('%dest_path' => $dest_path, '%source_path' => $source_path),
201          WATCHDOG_ERROR          WATCHDOG_ERROR
202        );        );
203    
204        return FALSE;        return FALSE;
205      }      }
206    
207      import_html_debug(      import_html_debug(
208        "Fetched a %persistant local copy to %dest_path",        "Fetched a %persistant local copy to %dest_path",
209        array(        array(
210          '%dest_path' => $dest_path,          '%dest_path' => $dest_path,
211          '%persistant' => ($profile['keep_temp_files'] ? 'persistant' : 'temporary')          '%persistant' => ($profile['keep_temp_files'] ? 'persistant' : 'temporary')
212        )        )
213      );      );
# Line 215  function import_html_import_file($rel_pa Line 215  function import_html_import_file($rel_pa
215    }    }
216    else {    else {
217      import_html_debug(      import_html_debug(
218        "Local copy exists at %dest_path",        "Local copy exists at %dest_path",
219        array('%dest_path' => $dest_path)        array('%dest_path' => $dest_path)
220      );      );
221    }    }
222    
223    if ($is_remote) {    if ($is_remote) {
224      // Importing a remote file - as for demo      // Importing a remote file - as for demo
225      // relinking will happen to point back at where it came from, not here      // relinking will happen to point back at where it came from, not here
226      // TODO need yet another parameter to indicate this, the path to neighbours      // TODO need yet another parameter to indicate this, the path to neighbours
227      # $rel_path = $source_path;      # $rel_path = $source_path;
228      // that worked, but created some odd paths in places when using a prefix.      // that worked, but created some odd paths in places when using a prefix.
229      // Resources and cross-links were found, but local alias was wrong      // Resources and cross-links were found, but local alias was wrong
230    
231      import_html_debug("Relinking this source will point back to the original URL context!");      import_html_debug("Relinking this source will point back to the original URL context!");
232    }    }
233    // We have a local copy now.    // We have a local copy now.
# Line 244  function import_html_import_file($rel_pa Line 244  function import_html_import_file($rel_pa
244    
245    if (!$nodes) {    if (!$nodes) {
246      import_html_debug(      import_html_debug(
247        "Failed to process a node out of file '%rel_path'",        "Failed to process a node out of file '%rel_path'",
248        array('%rel_path' => $rel_path),        array('%rel_path' => $rel_path),
249        WATCHDOG_ERROR        WATCHDOG_ERROR
250      );      );
251      return FALSE;      return FALSE;
252    }    }
253    
254    // We can immediately discard the source file -    // We can immediately discard the source file -
255    // it should have been a temp copy made by import_html_get_raw_file_local() above    // it should have been a temp copy made by import_html_get_raw_file_local() above
256    if ( file_exists($dest_path) && ! $profile['keep_temp_files']) {    if ( file_exists($dest_path) && ! $profile['keep_temp_files']) {
257      unlink($dest_path);      unlink($dest_path);
258    }    }
259    
260    // Almost trivial loop (probably over 1 item)    // Almost trivial loop (probably over 1 item)
261    foreach ($nodes as $node) {    foreach ($nodes as $node) {
262      // The node data object has been initialized      // The node data object has been initialized
# Line 267  function import_html_import_file($rel_pa Line 267  function import_html_import_file($rel_pa
267        array('%title' => $node->title),        array('%title' => $node->title),
268        WATCHDOG_INFO        WATCHDOG_INFO
269      );      );
270    
271      // If it's overwriting an existing path, merge values      // If it's overwriting an existing path, merge values
272      $node = import_html_merge_over_existing_node($node, $profile);      $node = import_html_merge_over_existing_node($node, $profile);
273    
# Line 277  function import_html_import_file($rel_pa Line 277  function import_html_import_file($rel_pa
277    
278      // Finished prep, now save      // Finished prep, now save
279    
280      // 'prepare' occasionally ensures that some required fields are filled in      // 'prepare' occasionally ensures that some required fields are filled in
281      // depending on enabled modules. Maybe.      // depending on enabled modules. Maybe.
282      // node_invoke_nodeapi($node, 'prepare');      // node_invoke_nodeapi($node, 'prepare');
283    
284      // I really should VALIDATE now!      // I really should VALIDATE now!
285      // but what to do with errors?      // but what to do with errors?
286      // path_nodeapi complains if I try to validate before I know my nid. Is that correct?      // path_nodeapi complains if I try to validate before I know my nid. Is that correct?
287      // node_invoke_nodeapi($node, 'validate');      // node_invoke_nodeapi($node, 'validate');
288    
289      // Submit doesn't actually save, it just fills in extra fields      // Submit doesn't actually save, it just fills in extra fields
290      $node = node_submit($node);      $node = node_submit($node);
291    
292    
   
293      if ($context['form_id'] == 'import_html_demo_form') {      if ($context['form_id'] == 'import_html_demo_form') {
294        // DO NOT actually save stuff to the database        // DO NOT actually save stuff to the database
295        $file['node'] = $node;        $file['node'] = $node;
# Line 315  function import_html_import_file($rel_pa Line 315  function import_html_import_file($rel_pa
315          if (! empty($node->nid)) {          if (! empty($node->nid)) {
316    
317            import_html_debug(            import_html_debug(
318              "!node_link Exists, updating it with content from %source_path.",              "!node_link Exists, updating it with content from %source_path.",
319              array(              array(
320                '!node_link' => l('node '. $node->nid, 'node/'. $node->nid),                '!node_link' => l('node '. $node->nid, 'node/'. $node->nid),
321                '%source_path' => $source_path                '%source_path' => $source_path
322              ),              ),
323              WATCHDOG_INFO              WATCHDOG_INFO
# Line 329  function import_html_import_file($rel_pa Line 329  function import_html_import_file($rel_pa
329          else {          else {
330    
331            import_html_debug(            import_html_debug(
332              "Inserting New Node !node_link with content from %source_path",              "Inserting New Node !node_link with content from %source_path",
333              array('%source_path' => $source_path, '!node_link' => l($node->path, $node->path)),              array('%source_path' => $source_path, '!node_link' => l($node->path, $node->path)),
334              WATCHDOG_INFO              WATCHDOG_INFO
335            );            );
336            #dpm($node);            #dpm($node);
337            node_save($node);            node_save($node);
338    
339            // Had to wait until I had an ID to do this            // Had to wait until I had an ID to do this
340            // These callbacks add the aliases and menus            // These callbacks add the aliases and menus
341            module_invoke_all('import_html_after_save', $profile, $node);            module_invoke_all('import_html_after_save', $profile, $node);
342            // note, navigation items only gets set up on first import.            // note, navigation items only gets set up on first import.
343            // After that you are on your own            // After that you are on your own
344          }          }
345        } // Finished updating database        } // Finished updating database
346    
347    
348        #// Keep a copy for auditing (maybe not if memory gets heavy)        #// Keep a copy for auditing (maybe not if memory gets heavy)
349        $mini_node = (object) array();        $mini_node = (object) array();
# Line 351  function import_html_import_file($rel_pa Line 351  function import_html_import_file($rel_pa
351          $mini_node->$att = $node->$att;          $mini_node->$att = $node->$att;
352        }        }
353        $file['node'] = $mini_node;        $file['node'] = $mini_node;
354    
355        import_html_debug(        import_html_debug(
356          "<strong>Imported Node</strong> !node_link with content from %source_path . [mem: %memory]",          "<strong>Imported Node</strong> !node_link with content from %source_path . [mem: %memory]",
357          array(          array(
358            '%source_path' => $source_path,            '%source_path' => $source_path,
359            '!node_link' => l($node->title, 'node/' . $node->nid),            '!node_link' => l($node->title, 'node/' . $node->nid),
360            # Node path is usually right, but we will actually let the system figure that out - path may be off!            # Node path is usually right, but we will actually let the system figure that out - path may be off!
361            #'!node_link' => l($node->title, $node->path),            #'!node_link' => l($node->title, $node->path),
# Line 368  function import_html_import_file($rel_pa Line 368  function import_html_import_file($rel_pa
368      $files[] = $file;      $files[] = $file;
369    } // Looped over all files    } // Looped over all files
370    
371    return $files;    return $files;
372  }  }
373    
374  /**  /**
375   * Big brother to import_html_import_file   * Big brother to import_html_import_file
376   *   *
377   * Recursively imports ALL FILES in a given folder and returns a result array   * Recursively imports ALL FILES in a given folder and returns a result array
378   *   *
379   * This does this immediately in normal flow, and really should be done in   * This does this immediately in normal flow, and really should be done in
380   * batch. Try not to do this directly a lot.   * batch. Try not to do this directly a lot.
381   */   */
382  function import_html_import_directory($rel_path, $context) {  function import_html_import_directory($rel_path, $context) {
383    // Read the profile id and use that as a context for all settings    // Read the profile id and use that as a context for all settings
384    $profile = import_html_profile($context['profile_id']);    $profile = import_html_profile($context['profile_id']);
385    $source_siteroot = $context['source_siteroot'];    $source_siteroot = $context['source_siteroot'];
386    import_html_debug(    import_html_debug(
387      "<strong>Importing Directory</strong> '%rel_path'",      "<strong>Importing Directory</strong> '%rel_path'",
388      array('%rel_path' => $rel_path),      array('%rel_path' => $rel_path),
389      WATCHDOG_INFO      WATCHDOG_INFO
390    );    );
# Line 411  function import_html_get_raw_file_local( Line 411  function import_html_get_raw_file_local(
411    mkdirs(dirname($dest_path), FILE_CREATE_DIRECTORY);    mkdirs(dirname($dest_path), FILE_CREATE_DIRECTORY);
412    if (! mkdirs(dirname($dest_path)) ) {    if (! mkdirs(dirname($dest_path)) ) {
413      trigger_error("Failed to create directory for $dest_path Might be permissions.", E_USER_ERROR);      trigger_error("Failed to create directory for $dest_path Might be permissions.", E_USER_ERROR);
414    }    }
415    $debug_info = array(    $debug_info = array(
416      '%source' => $source_path,      '%source' => $source_path,
417      '%dest' => $dest_path,      '%dest' => $dest_path,
418    );    );
419    
420    import_html_debug(    import_html_debug(
421      "Fetching content from %location '<a href='!realpath'>%source_path</a>' now. Saving temp file locally as %dest_path",      "Fetching content from %location '<a href='!realpath'>%source_path</a>' now. Saving temp file locally as %dest_path",
422      array(      array(
423        '%source_path' => $source_path,        '%source_path' => $source_path,
424        '%dest_path' =>  $dest_path,        '%dest_path' =>  $dest_path,
# Line 430  function import_html_get_raw_file_local( Line 430  function import_html_get_raw_file_local(
430    
431    $orig_path = $source_path;    $orig_path = $source_path;
432    
433    if ($host) {    if ($host) {
434      // It's remote. Trust PHP5 and allow_url_fopen is available      // It's remote. Trust PHP5 and allow_url_fopen is available
435      if (!copy($source_path, $dest_path)) {      if (!copy($source_path, $dest_path)) {
436        import_html_debug(        import_html_debug(
437          "Remote file copy from %source to %dest failed",          "Remote file copy from %source to %dest failed",
438          $debug_info,          $debug_info,
439          WATCHDOG_ERROR          WATCHDOG_ERROR
440        );        );
441        return FALSE;        return FALSE;
442      }      }
443    }    }
444    else {    else {
445      // local copy      // local copy
446      if (realpath($source_path) == realpath($dest_path)) {      if (realpath($source_path) == realpath($dest_path)) {
447        import_html_debug(        import_html_debug(
# Line 456  function import_html_get_raw_file_local( Line 456  function import_html_get_raw_file_local(
456          'Local file copy failed (%source_path to %dest_path).          'Local file copy failed (%source_path to %dest_path).
457          Source %orig_path is <pre>!source_stat</pre>          Source %orig_path is <pre>!source_stat</pre>
458          Dest folder %dest_path is <pre>!dest_stat</pre>          Dest folder %dest_path is <pre>!dest_stat</pre>
459          ',          ',
460          array(          array(
461            '%source_path' => $source_path,            '%source_path' => $source_path,
462            '%dest_path' => $dest_path,            '%dest_path' => $dest_path,
463            '!source_stat' => print_r(stat($source_path), 1),            '!source_stat' => print_r(stat($source_path), 1),
464            '!dest_stat' => print_r(stat($dest_path), 1),            '!dest_stat' => print_r(stat($dest_path), 1),
465          ),          ),
466          WATCHDOG_ERROR          WATCHDOG_ERROR
467        );        );
468    
469        return FALSE;        return FALSE;
470      }      }
471    }    }
472    import_html_debug(    import_html_debug(
473      "Copied import file from %source_path to %dest_path",      "Copied import file from %source_path to %dest_path",
474      $debug_info      $debug_info
475    );    );
476    return TRUE;    return TRUE;
# Line 488  function import_html_get_raw_file_local( Line 488  function import_html_get_raw_file_local(
488   * This processing is still in the 'validate' phase, so should   * This processing is still in the 'validate' phase, so should
489   * not cause anything to happen, just configure the node object   * not cause anything to happen, just configure the node object
490   *   *
491   * @param $path/$node   * @param $path/$node
492   *   the file (or object) to read the data from. If it's a string, it's taken   *   the file (or object) to read the data from. If it's a string, it's taken
493   * to be the filename, if an object, it's the node. A node should contain a   * to be the filename, if an object, it's the node. A node should contain a
494   * ->body (or ->raw_html) and a ->path at least.   * ->body (or ->raw_html) and a ->path at least.
495   * @param $rel_path   * @param $rel_path
496   *   Where this html page was found, relative to its own server root. This is   *   Where this html page was found, relative to its own server root. This is
497   * used to rewrite its urls. If the path is a directory, it should end with a   * used to rewrite its urls. If the path is a directory, it should end with a
498   * slash. ( /a/path/ == /a/path/index.html != /a/path )   * slash. ( /a/path/ == /a/path/index.html != /a/path )
499   * @param $profile   * @param $profile
500   *   The  settings for this import process.   *   The  settings for this import process.
501   *   *
502   * @return array containing the new node object as the first item. Some   * @return array containing the new node object as the first item. Some
503   * processes may return multiple nodes   * processes may return multiple nodes
504   */   */
# Line 509  function import_html_process_html_page($ Line 509  function import_html_process_html_page($
509    }    }
510    import_html_debug_code("The import profile settings being used to import_html_process_html_page($rel_path)", $profile);    import_html_debug_code("The import profile settings being used to import_html_process_html_page($rel_path)", $profile);
511    
512    if (is_string($path)) {    if (is_string($path)) {
513      // read from file      // read from file
514    
515      import_html_debug(      import_html_debug(
516        "Processing file as HTML page.        "Processing file as HTML page.
517          Full file path: %path , will be imported as a relative path          Full file path: %path , will be imported as a relative path
518          under the current section.          under the current section.
519          Relative-path is: %rel_path",          Relative-path is: %rel_path",
520          array('%path' => $path, '%rel_path' => $rel_path),          array('%path' => $path, '%rel_path' => $rel_path),
521          WATCHDOG_INFO          WATCHDOG_INFO
522      );      );
523    
524      if (! file_exists($path)) {      if (! file_exists($path)) {
525        trigger_error("Path '$path' was not found. This should have been a local copy of the file being imported, but the paths may be wrong somehow. Abject failure processing $rel_path");        trigger_error("Path '$path' was not found. This should have been a local copy of the file being imported, but the paths may be wrong somehow. Abject failure processing $rel_path");
526      }      }
527    
528      /*      /*
# Line 530  function import_html_process_html_page($ Line 530  function import_html_process_html_page($
530       * Either I want everything to be html, (always tidy)       * Either I want everything to be html, (always tidy)
531       * or I allow for exsl:document blocks (which can't be tidied)       * or I allow for exsl:document blocks (which can't be tidied)
532       * Option for now is try to parse, and only tidy if that fails.       * Option for now is try to parse, and only tidy if that fails.
533       *       *
534       */       */
535       // temporarily ignore parser errors (catch?)       // temporarily ignore parser errors (catch?)
536      set_error_handler('stfu');      set_error_handler('stfu');
537      $xmldoc = parse_in_xml_file($path, $profile['force_tidy']);      $xmldoc = parse_in_xml_file($path, $profile['force_tidy']);
538      restore_error_handler();      restore_error_handler();
539    
540      if (! $xmldoc && $profile['force_tidy'] ) {      if (! $xmldoc && $profile['force_tidy'] ) {
541        import_html_debug(        import_html_debug(
542          "%path was not tidy enough - running tidy over it now so I can parse it.",          "%path was not tidy enough - running tidy over it now so I can parse it.",
543          array('%path' => $path, '%rel_path' => $rel_path)          array('%path' => $path, '%rel_path' => $rel_path)
544        );        );
545        // If a raw XML parse failed,        // If a raw XML parse failed,
# Line 550  function import_html_process_html_page($ Line 550  function import_html_process_html_page($
550      #import_html_debug_code("Finished reading from file:", xml_tostring($xmldoc));      #import_html_debug_code("Finished reading from file:", xml_tostring($xmldoc));
551      $source_node = new stdClass();      $source_node = new stdClass();
552    }    }
553    else {    else {
554      // We may have passed in a source-node object where the path was expected instead.      // We may have passed in a source-node object where the path was expected instead.
555      // A bit of a sneak. The given node has the source HTML in $node->raw_html      // A bit of a sneak. The given node has the source HTML in $node->raw_html
556      if (is_object($path)) {      if (is_object($path)) {
# Line 566  function import_html_process_html_page($ Line 566  function import_html_process_html_page($
566      }      }
567    }    }
568    
569    if (!$xmldoc) {    if (!$xmldoc) {
570      // parsing failed      // parsing failed
571      import_html_debug("Import_HTML failed to initialize or parse XMLdoc input", array(), WATCHDOG_ERROR);      import_html_debug("Import_HTML failed to initialize or parse XMLdoc input", array(), WATCHDOG_ERROR);
572      return FALSE;      return FALSE;
# Line 591  function import_html_process_html_page($ Line 591  function import_html_process_html_page($
591    if (import_html_variable('debug_level')) {    if (import_html_variable('debug_level')) {
592      $source_node->file_data['after_rewriting'] = xml_tostring($xmldoc);      $source_node->file_data['after_rewriting'] = xml_tostring($xmldoc);
593    }    }
594    
595    // Import content as node.    // Import content as node.
596    // Translate the source text to the known tidy simple, tagged HTML structure now    // Translate the source text to the known tidy simple, tagged HTML structure now
597    $parameters = array(    $parameters = array(
# Line 607  function import_html_process_html_page($ Line 607  function import_html_process_html_page($
607      $xml_top = $xmldoc->firstChild;      $xml_top = $xmldoc->firstChild;
608      $xsl_top = $xsldoc->firstChild;      $xsl_top = $xsldoc->firstChild;
609      import_html_debug("      import_html_debug("
610        Using XSL translation template to extract semantic content.        Using XSL translation template to extract semantic content.
611        Will search for body content labelled '". $parameters['contentid']        Will search for body content labelled '". $parameters['contentid']
612        ."' in the source.        ."' in the source.
613        Active XML Namespaces are        Active XML Namespaces are
614        {$xml_top->nodeName} : {$xml_top->namespaceURI} -        {$xml_top->nodeName} : {$xml_top->namespaceURI} -
615        {$xsl_top->nodeName} : {$xsl_top->namespaceURI}  \n"        {$xsl_top->nodeName} : {$xsl_top->namespaceURI}  \n"
616        , array());        , array());
617      $importxml = xmldoc_plus_xsldoc($xmldoc, $xsldoc, $parameters);      $importxml = xmldoc_plus_xsldoc($xmldoc, $xsldoc, $parameters);
# Line 658  function import_html_process_html_page($ Line 658  function import_html_process_html_page($
658        $node->old_path = _import_html_calc_path($rel_path, TRUE);        $node->old_path = _import_html_calc_path($rel_path, TRUE);
659      }      }
660    
661      // May need extra care when creating multiples.      // May need extra care when creating multiples.
662      // Invent new paths for the new documents if the exsl:document didn't define them      // Invent new paths for the new documents if the exsl:document didn't define them
663      if (isset($nodes[$node->path])) {      if (isset($nodes[$node->path])) {
664        // already using this path, extend a new one        // already using this path, extend a new one
665        $node->path .= '/'. import_html_check_name(!empty($node->label)?$node->label:$node->title);        $node->path .= '/'. import_html_check_name(!empty($node->label)?$node->label:$node->title);
666      }      }
667    
# Line 671  function import_html_process_html_page($ Line 671  function import_html_process_html_page($
671    
672    
673      $nodes[$node->path] = $node;      $nodes[$node->path] = $node;
674    
675      import_html_debug("Path to save this page as is %path", array('%path' => $path));      import_html_debug("Path to save this page as is %path", array('%path' => $path));
676    }    }
677    
678    return $nodes;    return $nodes;
679  }  }
# Line 685  function import_html_process_html_page($ Line 685  function import_html_process_html_page($
685   * with all useful parameters set.   * with all useful parameters set.
686   * A shell node object may be passed in with some values already set. The data   * A shell node object may be passed in with some values already set. The data
687   * extracted from the XHTML structure will be layered onto that.   * extracted from the XHTML structure will be layered onto that.
688   *   *
689   * Here is where we map HTML info to node data, like H1 -> $node->title   * Here is where we map HTML info to node data, like H1 -> $node->title
690   * TODO tidy this up with a lookup table or something   * TODO tidy this up with a lookup table or something
691   *   *
692   * node may have defined its own $node->type even   * node may have defined its own $node->type even
693   *   *
694   * Called by   * Called by
695   * @see import_html_process_html_page()   * @see import_html_process_html_page()
696   *   *
697   * THIS IS THE ENGINE OF IMPORT_HTML   * THIS IS THE ENGINE OF IMPORT_HTML
698   *   *
699   * @param $datadoc   * @param $datadoc
700   *   An XML document containing the whole source data   *   An XML document containing the whole source data
701   * @param $node   * @param $node
# Line 704  function import_html_process_html_page($ Line 704  function import_html_process_html_page($
704   * @param $profile   * @param $profile
705   *   A   set of settings and preferences for the import_html process currently   *   A   set of settings and preferences for the import_html process currently
706   * underway. May include some context information like paths.   * underway. May include some context information like paths.
707   *   *
708   */   */
709  function import_html_xhtml_to_node($datadoc, $node, $profile) {  function import_html_xhtml_to_node($datadoc, $node, $profile) {
710    import_html_debug("Importing from XML object to node object");    import_html_debug("Importing from XML object to node object");
# Line 716  function import_html_xhtml_to_node($data Line 716  function import_html_xhtml_to_node($data
716    if (import_html_variable('debug_level')) {    if (import_html_variable('debug_level')) {
717      $node->file_data['raw_xhtml'] = xml_toString($datadoc);      $node->file_data['raw_xhtml'] = xml_toString($datadoc);
718    }    }
719    
720    
721    // Now read the input into node structure    // Now read the input into node structure
722    //    //
723    // Absorb the most generic bits first. Later processes may overwrite them more accurately.    // Absorb the most generic bits first. Later processes may overwrite them more accurately.
724    
725    // This initial import is a totally generic catch-all.    // This initial import is a totally generic catch-all.
726    import_html_absorb_all_tagged_elements($node, $datadoc);    import_html_absorb_all_tagged_elements($node, $datadoc);
727    
728    //    //
# Line 738  function import_html_xhtml_to_node($data Line 738  function import_html_xhtml_to_node($data
738    // Loop over a bunch of hook-like per-module extensions    // Loop over a bunch of hook-like per-module extensions
739    // MENU, PATH, TAXONOMY, CCK all add values in their own callbacks in import_html_modules.inc    // MENU, PATH, TAXONOMY, CCK all add values in their own callbacks in import_html_modules.inc
740    // Also the core node elements - body, title, teaser get set in a 'core' callback    // Also the core node elements - body, title, teaser get set in a 'core' callback
741    
742    import_html_include_add_on_module_handlers();    import_html_include_add_on_module_handlers();
743    module_invoke_all('import_html', $profile, $node, $datadoc);    module_invoke_all('import_html', $profile, $node, $datadoc);
744    
745    // 'content' is now a reserved word in Drupal5    // 'content' is now a reserved word in Drupal5
746    // If I have a string there, the body cannot be rendered right later    // If I have a string there, the body cannot be rendered right later
747    unset($node->content);    unset($node->content);
# Line 750  function import_html_xhtml_to_node($data Line 750  function import_html_xhtml_to_node($data
750    $node->format = import_html_get_preferred_filter();    $node->format = import_html_get_preferred_filter();
751    
752    import_html_debug_code(    import_html_debug_code(
753      "After absorbing absolutely everything I could find,      "After absorbing absolutely everything I could find,
754      the node object now contains the following blocks and bits:",      the node object now contains the following blocks and bits:",
755      $node      $node
756    );    );
# Line 760  function import_html_xhtml_to_node($data Line 760  function import_html_xhtml_to_node($data
760    
761  /**  /**
762   * Import ALL tagged classes and IDs as node attributes.   * Import ALL tagged classes and IDs as node attributes.
763   *   *
764   * If the input has ANY id or classes at all, grab that info and apply it to   * If the input has ANY id or classes at all, grab that info and apply it to
765   * this object. Assume anything important enough to have a label is important   * this object. Assume anything important enough to have a label is important
766   * enough to remember.   * enough to remember.
767   *   *
768   * This will probably produce a very cloggy node, filled with trash, possibly   * This will probably produce a very cloggy node, filled with trash, possibly
769   * even some arrays where there shouldn't be. But any unrecognised property   * even some arrays where there shouldn't be. But any unrecognised property
770   * names will be discarded on save, leaving only the serializable values. This   * names will be discarded on save, leaving only the serializable values. This
# Line 774  function import_html_xhtml_to_node($data Line 774  function import_html_xhtml_to_node($data
774  function import_html_absorb_all_tagged_elements(&$node, $datadoc) {  function import_html_absorb_all_tagged_elements(&$node, $datadoc) {
775    
776    foreach (array('id', 'class') as $attribute_label) {    foreach (array('id', 'class') as $attribute_label) {
777    
778      import_html_debug(      import_html_debug(
779        "Absorbing all elements with an %attribute_label        "Absorbing all elements with an %attribute_label
780        as incidental data blobs (possibly html) into node structure",        as incidental data blobs (possibly html) into node structure",
781        array('%attribute_label' => $attribute_label)        array('%attribute_label' => $attribute_label)
782      );      );
783      $found_elements = xml_query($datadoc, './/*[@'. $attribute_label .']');      $found_elements = xml_query($datadoc, './/*[@'. $attribute_label .']');
784    
785      // I now have a collection of tagged nodes.      // I now have a collection of tagged nodes.
786      foreach ($found_elements as $found_element) {      foreach ($found_elements as $found_element) {
787    
788        $attribute_value = xml_getattribute($found_element, $attribute_label);        $attribute_value = xml_getattribute($found_element, $attribute_label);
789        // if it was a class, it may be multiple!        // if it was a class, it may be multiple!
790        // Usually just one however...        // Usually just one however...
791        $keys = explode(' ', $attribute_value);        $keys = explode(' ', $attribute_value);
792        // debug("Found an node with $attribute_label of ".print_r($keys, 1) , 3);        // debug("Found an node with $attribute_label of ".print_r($keys, 1) , 3);
793    
794        foreach ($keys as $key) {        foreach ($keys as $key) {
795          // Found 'something' labelled 'something'          // Found 'something' labelled 'something'
796          if (! trim($key)) continue;          if (! trim($key)) continue;
797    
798          // Allow HTML though. Sometimes this will not be right...          // Allow HTML though. Sometimes this will not be right...
799          // TODO, figure it out?          // TODO, figure it out?
800          $value = xml_tostring($found_element, TRUE);          $value = xml_tostring($found_element, TRUE);
# Line 802  function import_html_absorb_all_tagged_e Line 802  function import_html_absorb_all_tagged_e
802    
803          // The value just gets absorbed          // The value just gets absorbed
804          import_html_debug(          import_html_debug(
805            "Found an unexpected tagged value - %key ,            "Found an unexpected tagged value - %key ,
806              Absorbing it into the node as a default text/html value",              Absorbing it into the node as a default text/html value",
807            array('%key' => $key)            array('%key' => $key)
808          );          );
809    
810          // Set it onto the node,          // Set it onto the node,
811          // If it's a class, carefully combine to preserve pre-existing arrays          // If it's a class, carefully combine to preserve pre-existing arrays
812          if ( $attribute_label == 'class') {          if ( $attribute_label == 'class') {
813            import_html_absorb_properties($node, $key, $value);            import_html_absorb_properties($node, $key, $value);
# Line 819  function import_html_absorb_all_tagged_e Line 819  function import_html_absorb_all_tagged_e
819        } // each multiple key        } // each multiple key
820      } // each found element      } // each found element
821    } // each attribute type    } // each attribute type
822  }  }
823    
824  /**  /**
825   * Scan a given dom object for metas of a certain persuasion, and add all found   * Scan a given dom object for metas of a certain persuasion, and add all found
826   * key-values to the $node.   * key-values to the $node.
827   *   *
828   * Supports different metas, like   * Supports different metas, like
829   * <meta name="key" content="value" />   * <meta name="key" content="value" />
830   * or   * or
831   * <rel type="top" href="url" />   * <rel type="top" href="url" />
832   *   *
833   * import_html_absorb_metas($node, $htmlnode, 'meta', 'name', 'content');   * import_html_absorb_metas($node, $htmlnode, 'meta', 'name', 'content');
834   * import_html_absorb_metas($node, $htmlnode, 'rel', 'type', 'href');   * import_html_absorb_metas($node, $htmlnode, 'rel', 'type', 'href');
835   *   *
836   * ...   * ...
837   * .. would result in :   * .. would result in :
838   *   *
839   * $node->key='value';   * $node->key='value';
840   * $node->top='url';   * $node->top='url';
841   *   *
842   *   *
843   */   */
844  function import_html_absorb_metas(&$node, $xml_element, $tagname, $keyname, $valname) {  function import_html_absorb_metas(&$node, $xml_element, $tagname, $keyname, $valname) {
845    
846    import_html_debug(    import_html_debug(
847      "Absorbing the '%valname' of '%tagname's with a '%keyname'      "Absorbing the '%valname' of '%tagname's with a '%keyname'
848        from source doc into node structure",        from source doc into node structure",
849      array(      array(
850        '%valname' => $valname,        '%valname' => $valname,
# Line 866  function import_html_absorb_metas(&$node Line 866  function import_html_absorb_metas(&$node
866      }      }
867      else{      else{
868        import_html_debug(        import_html_debug(
869          "When absorbing '%valname' from '%tagname's with a '%keyname' from source doc,          "When absorbing '%valname' from '%tagname's with a '%keyname' from source doc,
870          (%key='%value') had a null value. Not a great problem, just letting you know.",          (%key='%value') had a null value. Not a great problem, just letting you know.",
871          array(          array(
872            '%valname' => $valname,            '%valname' => $valname,
# Line 898  function import_html_absorb_properties(& Line 898  function import_html_absorb_properties(&
898      debug("Odd, when absorbing properties, '$key' had a null value. This is probably not an error.", 2);      debug("Odd, when absorbing properties, '$key' had a null value. This is probably not an error.", 2);
899      return;      return;
900    }    }
901    
902    // Auto-expand into arrays - most metas can legally have duplicates    // Auto-expand into arrays - most metas can legally have duplicates
903    if ( ! isset($node->$key) ) {    if ( ! isset($node->$key) ) {
904      $node->$key = $value;      $node->$key = $value;
905    }    }
906    else if ( is_array($node->$key) ) {    else if ( is_array($node->$key) ) {
907      $a = $node->$key; $a[] = $value; $node->$key = $a;      $a = $node->$key; $a[] = $value; $node->$key = $a;
908    }    }
909    else { $node->$key = array($node->$key, $value); }    else { $node->$key = array($node->$key, $value); }
910  }  }
# Line 931  function import_html_include_add_on_modu Line 931  function import_html_include_add_on_modu
931  function import_html_guess_document_title($node) {  function import_html_guess_document_title($node) {
932    if (empty($node->title) ) {    if (empty($node->title) ) {
933      import_html_debug(      import_html_debug(
934        "Failed to extract a useful title for this node, falling back to a default value.",        "Failed to extract a useful title for this node, falling back to a default value.",
935        array(),        array(),
936        WATCHDOG_NOTICE        WATCHDOG_NOTICE
937      );      );
938      switch (import_html_variable('handle_no_title')) {      switch (import_html_variable('handle_no_title')) {
# Line 960  function import_html_guess_label($title, Line 960  function import_html_guess_label($title,
960      if (!$label) {      if (!$label) {
961        // it had a trailing slash        // it had a trailing slash
962        $label = array_pop($path_bits);        $label = array_pop($path_bits);
963      }      }
964      $label = preg_replace('/\?.*$/', '?', $label); // messiness from mirrored URLs with args in      $label = preg_replace('/\?.*$/', '?', $label); // messiness from mirrored URLs with args in
965      // TODO maybe adjust this title-munging algorithm to make better guesses      // TODO maybe adjust this title-munging algorithm to make better guesses
966      $label = str_replace('_', ' ', $label);      $label = str_replace('_', ' ', $label);
967      $label = (strstr($label, '.')) ? substr($label, 0, strrpos($label, ".")) : $label;      $label = (strstr($label, '.')) ? substr($label, 0, strrpos($label, ".")) : $label;
968    }    }
969    return $label;    return $label;
970  }  }
971    
972    
973  /**  /**
974   * Return the nice path alias of an imported page.   * Return the nice path alias of an imported page.
975   *   *
976   * Simplify a legacy URL path into something better looking.   * Simplify a legacy URL path into something better looking.
977   */   */
978  function _import_html_calc_path($rel_path, $leave_suffix = FALSE) {  function _import_html_calc_path($rel_path, $leave_suffix = FALSE) {
# Line 986  function _import_html_calc_path($rel_pat Line 986  function _import_html_calc_path($rel_pat
986    if (import_html_variable('trim_suffixes')) {    if (import_html_variable('trim_suffixes')) {
987      // Simplify the URL if possible by trimming the suffix and 'index'      // Simplify the URL if possible by trimming the suffix and 'index'
988      // but remember the original path somewhere, we'll need to link it forward      // but remember the original path somewhere, we'll need to link it forward
989      // once the new node is established.      // once the new node is established.
990    
991      // To be clever, special-case the 'index.html' files to be      // To be clever, special-case the 'index.html' files to be
992      // linked to their parent directories.      // linked to their parent directories.
993      // Trailing slash is tricky.      // Trailing slash is tricky.
994      // /this/path is a whole navigation level above      // /this/path is a whole navigation level above
995      // /this/path/ and will resolve relative links differently!      // /this/path/ and will resolve relative links differently!
996      // We need to actually redirect, not just alias any links like that      // We need to actually redirect, not just alias any links like that
997      $default_documents = split(",", import_html_variable('default_document'));      $default_documents = split(",", import_html_variable('default_document'));
998      $trimmed_path = $path;      $trimmed_path = $path;
# Line 1001  function _import_html_calc_path($rel_pat Line 1001  function _import_html_calc_path($rel_pat
1001      }      }
1002      if ($trimmed_path != $path) {      if ($trimmed_path != $path) {
1003        import_html_debug(        import_html_debug(
1004          "It's an index page, so we will refer to $path as $trimmed_path",          "It's an index page, so we will refer to $path as $trimmed_path",
1005          array('%path' => $path, '%trimmed_path' => $trimmed_path),          array('%path' => $path, '%trimmed_path' => $trimmed_path),
1006          WATCHDOG_INFO          WATCHDOG_INFO
1007        );        );
1008        $path = $trimmed_path;        $path = $trimmed_path;
1009      }      }
1010      else {      else {
1011        // No change, Chop suffix instead.        // No change, Chop suffix instead.
1012        // Take care - don't break a path like        // Take care - don't break a path like
1013        // /path/site-mirror/drupal.org/about        // /path/site-mirror/drupal.org/about
1014        // incorrectly. So make sure that we split off the basename, chop its suffix, then glue it back onto the dirname        // incorrectly. So make sure that we split off the basename, chop its suffix, then glue it back onto the dirname
1015        // $path = (! empty($path) ? dirname($path) .'/' : '') . preg_replace('|\.[^\.]+$|', "", basename($path));        // $path = (! empty($path) ? dirname($path) .'/' : '') . preg_replace('|\.[^\.]+$|', "", basename($path));
# Line 1022  function _import_html_calc_path($rel_pat Line 1022  function _import_html_calc_path($rel_pat
1022  }  }
1023    
1024  /**  /**
1025   * Find and initialize the transformation template.   * Find and initialize the transformation template.
1026   *   *
1027   * Includes caching retrieval for a bit of speed-up over bulks.   * Includes caching retrieval for a bit of speed-up over bulks.
1028   *   *
1029   * @return XML Document   * @return XML Document
1030   */   */
1031  function _import_html_get_xsl_doc($xslfile) {  function _import_html_get_xsl_doc($xslfile) {
# Line 1060  function _import_html_get_xsl_doc($xslfi Line 1060  function _import_html_get_xsl_doc($xslfi
1060  /**  /**
1061   * Run the url-rewrite XSL over the source document   * Run the url-rewrite XSL over the source document
1062   * TODO allow for the non-base version of Drupal links   * TODO allow for the non-base version of Drupal links
1063   *   *
1064   * The relative links need to be converted into path-to-top and back down   * The relative links need to be converted into path-to-top and back down
1065   * again. Relative references just cannot be maintained.   * again. Relative references just cannot be maintained.
1066   *   *
1067   * @return an XML doc again   * @return an XML doc again
1068   */   */
1069  function import_html_rewrite_links($xmldoc, $rel_path, $profile) {  function import_html_rewrite_links($xmldoc, $rel_path, $profile) {
# Line 1099  function import_html_rewrite_links($xmld Line 1099  function import_html_rewrite_links($xmld
1099    $src_root = base_path() . ensure_trailing_slash($profile['file_storage_path']);    $src_root = base_path() . ensure_trailing_slash($profile['file_storage_path']);
1100    
1101    $src_base = ensure_trailing_slash($src_root) . (($rel_base == '/') ? '' : $rel_base);    $src_base = ensure_trailing_slash($src_root) . (($rel_base == '/') ? '' : $rel_base);
1102    
1103    // Or not, if we are still linking to full URLs (demo or partial import)    // Or not, if we are still linking to full URLs (demo or partial import)
1104    if (valid_url($rel_path, TRUE)) {    if (valid_url($rel_path, TRUE)) {
1105      // it's remote!      // it's remote!
# Line 1116  function import_html_rewrite_links($xmld Line 1116  function import_html_rewrite_links($xmld
1116    import_html_debug("    import_html_debug("
1117      <b>Rewrite patterns:</b>      <b>Rewrite patterns:</b>
1118      Path to the top of this (relative) server is $site_root .      Path to the top of this (relative) server is $site_root .
1119      Path to top of the prefixed section      Path to top of the prefixed section
1120      ({$profile['import_site_prefix']})      ({$profile['import_site_prefix']})
1121      from here ($rel_path)      from here ($rel_path)
1122      to our import base      to our import base
1123      ({$profile['import_site_prefix']})      ({$profile['import_site_prefix']})
1124      would be '$path_to_import_top'.      would be '$path_to_import_top'.
1125      Path to a relative <em>neighbour</em> of this page would be      Path to a relative <em>neighbour</em> of this page would be
# Line 1127  function import_html_rewrite_links($xmld Line 1127  function import_html_rewrite_links($xmld
1127      or to find the base for <em>relative</em> resource files over in      or to find the base for <em>relative</em> resource files over in
1128      the file storage area      the file storage area
1129      ({$profile['file_storage_path']})      ({$profile['file_storage_path']})
1130      would be '$src_base' ",      would be '$src_base' ",
1131      array(),      array(),
1132      WATCHDOG_DEBUG      WATCHDOG_DEBUG
1133    );    );
1134    
1135    
1136    $parameters = array(    $parameters = array(
1137      // These parameters tell the rewriter what to prepend to the links.      // These parameters tell the rewriter what to prepend to the links.
1138      // They are instructions how this page will find its missing brethren      // They are instructions how this page will find its missing brethren
1139      // when we put it where we put it.      // when we put it where we put it.
1140      // Images and Pages may end up in different places.      // Images and Pages may end up in different places.
1141      'site_root'      => $site_root,      'site_root'      => $site_root,
1142      'src_root'       => $src_root,      'src_root'       => $src_root,
1143      'src_base'       => $src_base,      'src_base'       => $src_base,
1144      'href_base'      => $href_base,      'href_base'      => $href_base,
1145      'replace_suffix' => $profile['relink_files'],      'replace_suffix' => $profile['relink_files'],
1146      'new_suffix'     => '',      'new_suffix'     => '',
1147      'xsl_path'       => $xslfilepath,      'xsl_path'       => $xslfilepath,
1148      'strip_scripts'  => $profile['strip_scripts'],      'strip_scripts'  => $profile['strip_scripts'],
1149    );    );
1150    import_html_debug("    import_html_debug("
1151      XSL for URL rewrites loaded OK.      XSL for URL rewrites loaded OK.
1152      HTML links for files that were under '$rel_base' will be made relative to '"      HTML links for files that were under '$rel_base' will be made relative to '"
1153      . $parameters['href_base'] ."' (for pages) and '". $parameters['src_base'] ."' (for resources) "      . $parameters['href_base'] ."' (for pages) and '". $parameters['src_base'] ."' (for resources) "
1154      . ( $parameters['strip_scripts'] ? 'All inline script blocks will be discarded from the source.'. $parameters['strip_scripts'] : '')      . ( $parameters['strip_scripts'] ? 'All inline script blocks will be discarded from the source.'. $parameters['strip_scripts'] : '')
1155      ,      ,
1156      array(),      array(),
# Line 1176  function import_html_rewrite_links($xmld Line 1176  function import_html_rewrite_links($xmld
1176    
1177  /**  /**
1178   * Run the strip_tables XSL over the source document   * Run the strip_tables XSL over the source document
1179   *   *
1180   * @return an XML doc again   * @return an XML doc again
1181   */   */
1182  function import_html_strip_tables($xmldoc) {  function import_html_strip_tables($xmldoc) {
# Line 1243  function import_html_check_name($name) { Line 1243  function import_html_check_name($name) {
1243   * Avoid double-ups, if the path already exists, UPDATE the existing node.   * Avoid double-ups, if the path already exists, UPDATE the existing node.
1244   * Can't have two content nodes claiming the same path or it won't validate.   * Can't have two content nodes claiming the same path or it won't validate.
1245   * Plus, we want to retain any info that's been added via drupal. Probably.   * Plus, we want to retain any info that's been added via drupal. Probably.
1246   *   *
1247   * @param $node   * @param $node
1248   *   partially  created node from import. Key lookup on $node->path   *   partially  created node from import. Key lookup on $node->path
1249   * @param $profile   * @param $profile
1250   *  May  contain some rules for conflict resolution - which values to keep,   *  May  contain some rules for conflict resolution - which values to keep,
1251   * which to over-write.   * which to over-write.
1252   *   *
1253   * @return $node   * @return $node
1254   *   possibly with pre-existing values blended in. Importantly - the nid   *   possibly with pre-existing values blended in. Importantly - the nid
1255   */   */
1256  function import_html_merge_over_existing_node($node, $profile) {  function import_html_merge_over_existing_node($node, $profile) {
# Line 1259  function import_html_merge_over_existing Line 1259  function import_html_merge_over_existing
1259    if ($internal_link != $node->path) {    if ($internal_link != $node->path) {
1260      // Found an internal match, the alias is already assigned to a node      // Found an internal match, the alias is already assigned to a node
1261      // Merge info to avoid losing any Drupal-only info      // Merge info to avoid losing any Drupal-only info
1262    
1263      $probable_nid = array_pop(explode("/", $internal_link));      $probable_nid = array_pop(explode("/", $internal_link));
1264      if (! is_numeric($probable_nid)) {      if (! is_numeric($probable_nid)) {
1265        // This may happen if the menu builder has created a placeholder alias        // This may happen if the menu builder has created a placeholder alias
1266        // pseudo-page, or the alias conflicts with an already-created system path.        // pseudo-page, or the alias conflicts with an already-created system path.
1267        import_html_debug("        import_html_debug("
1268          When looking for an alias to '%nodepath',          When looking for an alias to '%nodepath',
1269          Found some pre-existing (non-node) content there.          Found some pre-existing (non-node) content there.
1270          the internal link          the internal link
1271          '%internal_link' - which was expected to return a nid.",          '%internal_link' - which was expected to return a nid.",
1272          array(          array(
1273            '%nodepath' => $node->path,            '%nodepath' => $node->path,
# Line 1280  function import_html_merge_over_existing Line 1280  function import_html_merge_over_existing
1280      $node->nid = $probable_nid;      $node->nid = $probable_nid;
1281    
1282      import_html_debug("      import_html_debug("
1283          Page path alias '%nodepath' already exists,          Page path alias '%nodepath' already exists,
1284          It's already linked to node id '%nodenid'.          It's already linked to node id '%nodenid'.
1285          This data import will <em>replace</em> that content,          This data import will <em>replace</em> that content,
1286          but try to keep any other values.          but try to keep any other values.
1287        ",        ",
1288        array(        array(
# Line 1309  function import_html_merge_over_existing Line 1309  function import_html_merge_over_existing
1309      // Now do the rest by copying values as best we can      // Now do the rest by copying values as best we can
1310      foreach ($node as $key => $value) {      foreach ($node as $key => $value) {
1311        // Do a deep merge        // Do a deep merge
1312        if (is_array($value)) {        if (is_array($value)) {
1313          // merge deeper sets, like taxonomy          // merge deeper sets, like taxonomy
1314          if (!@is_array($old_node->$key)) {          if (!@is_array($old_node->$key)) {
1315            $old_node->$key=array();            $old_node->$key=array();
# Line 1317  function import_html_merge_over_existing Line 1317  function import_html_merge_over_existing
1317          foreach ($value as $k => $v) {          foreach ($value as $k => $v) {
1318            $old_node->{$key}[$k] = $v;            $old_node->{$key}[$k] = $v;
1319          }          }
1320        }        }
1321        else {        else {
1322          $old_node-> $key = $value;          $old_node-> $key = $value;
1323        }        }
# Line 1329  function import_html_merge_over_existing Line 1329  function import_html_merge_over_existing
1329    
1330    
1331  /**  /**
1332   * Utility function   * Utility function
1333   *   *
1334   * file_scan_directory() does not support max_depth.   * file_scan_directory() does not support max_depth.
1335   * I need it so my folder listings don't go insane when recursing   * I need it so my folder listings don't go insane when recursing
1336   *   *
1337   * This is a version of file_scan_directory that does respect max_depth   * This is a version of file_scan_directory that does respect max_depth
1338   * when recursing.   * when recursing.
1339   * It also adds a filecount value to the returned item to assist feedback   * It also adds a filecount value to the returned item to assist feedback
1340   *   *
1341   * @see file_scan_directory.   * @see file_scan_directory.
1342   *   *
1343   */   */
1344  function import_html_file_scan_directory($dir, $mask, $nomask = array('.', '..', 'CVS'), $callback = 0, $recurse = TRUE, $key = 'filename', $min_depth = 0, $depth = 0, $max_depth = NULL) {  function import_html_file_scan_directory($dir, $mask, $nomask = array('.', '..', 'CVS'), $callback = 0, $recurse = TRUE, $key = 'filename', $min_depth = 0, $depth = 0, $max_depth = NULL) {
1345    // If no max_depth is set, the normal recursed version is OK    // If no max_depth is set, the normal recursed version is OK
1346    if (! isset($max_depth)) {    if (! isset($max_depth)) {
1347      return file_scan_directory($dir, $mask, $nomask, $callback, TRUE, $key, $min_depth, $depth);      return file_scan_directory($dir, $mask, $nomask, $callback, TRUE, $key, $min_depth, $depth);
1348    }    }
1349    
1350    $files = array();    $files = array();
1351    
1352    // Use file_scan_directory - non-recursive    // Use file_scan_directory - non-recursive
# Line 1357  function import_html_file_scan_directory Line 1357  function import_html_file_scan_directory
1357        if ($depth < $max_depth) {        if ($depth < $max_depth) {
1358          $files = array_merge(import_html_file_scan_directory($filepath, $mask, $nomask, $callback, $recurse, $key, $min_depth, $depth + 1, $max_depth), $files);          $files = array_merge(import_html_file_scan_directory($filepath, $mask, $nomask, $callback, $recurse, $key, $min_depth, $depth + 1, $max_depth), $files);
1359        }        }
1360    
1361        // This may be intensive, but will help debugging        // This may be intensive, but will help debugging
1362        $count_files = file_scan_directory($filepath, $mask);        $count_files = file_scan_directory($filepath, $mask);
1363        $files[$filepath]->child_count = count($count_files);        $files[$filepath]->child_count = count($count_files);
# Line 1388  function import_html_scan_rel_dir($selec Line 1388  function import_html_scan_rel_dir($selec
1388    
1389  /**  /**
1390   * Tidy URLs before saving locally - for URL imports   * Tidy URLs before saving locally - for URL imports
1391   *   *
1392   * Squash/hash query strings, but don't discard them.   * Squash/hash query strings, but don't discard them.
1393   * Do discard fragment ids   * Do discard fragment ids
1394   *   *
1395   * Replace spaces and non-alphanumerics with underscore   * Replace spaces and non-alphanumerics with underscore
1396   */   */
1397  function safe_filepath_from_url($rel_path) {  function safe_filepath_from_url($rel_path) {
# Line 1419  function charset_decode_utf_8($string) { Line 1419  function charset_decode_utf_8($string) {
1419    
1420    // decode three byte unicode characters    // decode three byte unicode characters
1421    $string = preg_replace(    $string = preg_replace(
1422      "/([\340-\357])([\200-\277])([\200-\277])/e",      "/([\340-\357])([\200-\277])([\200-\277])/e",
1423      "'&#'.((ord('\\1')-224)*4096 + (ord('\\2')-128)*64 + (ord('\\3')-128)).';'",      "'&#'.((ord('\\1')-224)*4096 + (ord('\\2')-128)*64 + (ord('\\3')-128)).';'",
1424      $string      $string
1425    );    );
1426    

Legend:
Removed from v.1.5.4.16  
changed lines
  Added in v.1.5.4.17

  ViewVC Help
Powered by ViewVC 1.1.2