/[drupal]/contributions/modules/import_html/import_html.module
ViewVC logotype

Diff of /contributions/modules/import_html/import_html.module

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph | View Patch Patch

revision 1.71.2.15, Wed Jul 1 02:40:56 2009 UTC revision 1.71.2.16, Tue Oct 6 14:09:02 2009 UTC
# Line 1  Line 1 
1  <?php  <?php
2  // $Id: import_html.module,v 1.71.2.14 2009/05/17 15:12:46 dman Exp $  // $Id: import_html.module,v 1.71.2.15 2009/07/01 02:40:56 dman Exp $
3    
4  /**  /**
5   * @file Main drupal interface to import_html.   * @file Main drupal interface to import_html.
6   *   *
7   * This file contains Drupal hooks   * This file contains Drupal hooks
8   * and some config/preferences. Actual processing functions are in   * and some config/preferences. Actual processing functions are in
9   * import_html_process.inc   * import_html_process.inc
10   *   *
11   * Synopsis:   * Synopsis:
12   *   *
13   * Facility to import an existing, static HTML site structure into the Drupal   * Facility to import an existing, static HTML site structure into the Drupal
14   * CMS as structured nodes.   * CMS as structured nodes.
15   *   *
# Line 30  Line 30 
30   *   *
31   * import_html libraries can be used by external modules, specifically wrapper.   * import_html libraries can be used by external modules, specifically wrapper.
32   * module (private development) and static.module (experimental)   * module (private development) and static.module (experimental)
33   *   *
34   * CODE:   * CODE:
35   * Internally, there are a lot of filepath fragments being passed around.   * Internally, there are a lot of filepath fragments being passed around.
36   * Any filepath known to represent a folder (eg $source_siteroot) will end with   * Any filepath known to represent a folder (eg $source_siteroot) will end with
# Line 39  Line 39 
39   * dividers. However, many times, small functions like 'ensure_trailing_slash()'   * dividers. However, many times, small functions like 'ensure_trailing_slash()'
40   * may be called. Not all of them are needed, but most of them were put in due   * may be called. Not all of them are needed, but most of them were put in due
41   * to some weird filesystem vagary or filename encountered.   * to some weird filesystem vagary or filename encountered.
42   *   *
43   *   *
44   * @ingroup import_html Import HTML   * @ingroup import_html Import HTML
45   *   *
46   * @author Dan Morrison http://coders.co.nz/   * @author Dan Morrison http://coders.co.nz/
47   *   *
48   */   */
# Line 108  define("IMPORT_HTML_MAX_LABEL_LENGTH", 2 Line 108  define("IMPORT_HTML_MAX_LABEL_LENGTH", 2
108  define('IMPORT_HTML_ADMIN_PATH', 'admin/build/import_html');  define('IMPORT_HTML_ADMIN_PATH', 'admin/build/import_html');
109    
110    
111  define('IMPORT_HTML_GLOB_BEFORE', 'glob before');  define('IMPORT_HTML_GLOB_BEFORE', 'glob before');
112  define('IMPORT_HTML_RECURSE_AFTER', 'recurse after ');  define('IMPORT_HTML_RECURSE_AFTER', 'recurse after ');
113    
114  /**  /**
115   * To avoid overloading the treeview display, directory listing depth recursion   * To avoid overloading the treeview display, directory listing depth recursion
116   * will be truncated if the number of found files is greater than this.   * will be truncated if the number of found files is greater than this.
117   */   */
118  define('IMPORT_HTML_MAX_FILE_LISTING_COUNT', 500);  define('IMPORT_HTML_MAX_FILE_LISTING_COUNT', 500);
119    
120  /**  /**
121   * If the database max_allowed_packet size is limited, then huge batch jobs   * If the database max_allowed_packet size is limited, then huge batch jobs
# Line 134  define('IMPORT_HTML_REQUIRED_MAX_ALLOWED Line 134  define('IMPORT_HTML_REQUIRED_MAX_ALLOWED
134   */   */
135  function import_html_menu() {  function import_html_menu() {
136    $items[IMPORT_HTML_ADMIN_PATH] = array(    $items[IMPORT_HTML_ADMIN_PATH] = array(
137      'title' => 'Import HTML',      'title' => 'Import HTML',
138      'description' => "Import/Export an entire site or directory to/from static HTML",      'description' => "Import/Export an entire site or directory to/from static HTML",
139    
140      // Use the built-in Overview menu thing      // Use the built-in Overview menu thing
# Line 142  function import_html_menu() { Line 142  function import_html_menu() {
142      'file' => 'system.admin.inc',      'file' => 'system.admin.inc',
143      'file path' => drupal_get_path('module', 'system'),      'file path' => drupal_get_path('module', 'system'),
144    
145      'access arguments' => array('access import_html'),      'access arguments' => array('access import_html'),
146      'type' => MENU_NORMAL_ITEM,      'type' => MENU_NORMAL_ITEM,
147    );    );
148    $items[IMPORT_HTML_ADMIN_PATH .'/settings'] = array(    $items[IMPORT_HTML_ADMIN_PATH .'/settings'] = array(
# Line 165  function import_html_menu() { Line 165  function import_html_menu() {
165      'type' => MENU_NORMAL_ITEM,      'type' => MENU_NORMAL_ITEM,
166    );    );
167    $items[IMPORT_HTML_ADMIN_PATH .'/import_site'] = array(    $items[IMPORT_HTML_ADMIN_PATH .'/import_site'] = array(
168      'title' => 'Import HTML Site',      'title' => 'Import HTML Site',
169      'description' => "Import/Export an entire site to/from static HTML",      'description' => "Import/Export an entire site to/from static HTML",
170      'weight' => -1,      'weight' => -1,
171      'page callback' => 'drupal_get_form',      'page callback' => 'drupal_get_form',
172      'page arguments' => array('import_html_process_form'),      'page arguments' => array('import_html_process_form'),
173      'file' => 'import_html_ui.inc',      'file' => 'import_html_ui.inc',
174      'access arguments' => array('access import_html'),      'access arguments' => array('access import_html'),
175      'type' => MENU_NORMAL_ITEM,      'type' => MENU_NORMAL_ITEM,
176    );    );
177    $items[IMPORT_HTML_ADMIN_PATH .'/demo'] = array(    $items[IMPORT_HTML_ADMIN_PATH .'/demo'] = array(
178      'title' => 'Demo',      'title' => 'Demo',
179      'weight' => 3,      'weight' => 3,
180      'description' => 'Demonstrate or test HTML Import on one file.',      'description' => 'Demonstrate or test HTML Import on one file.',
181      'page callback' => 'drupal_get_form',      'page callback' => 'drupal_get_form',
# Line 185  function import_html_menu() { Line 185  function import_html_menu() {
185      'type' => MENU_NORMAL_ITEM,      'type' => MENU_NORMAL_ITEM,
186    );    );
187    $items[IMPORT_HTML_ADMIN_PATH .'/results'] = array(    $items[IMPORT_HTML_ADMIN_PATH .'/results'] = array(
188      'title' => 'Import Results Summary',      'title' => 'Import Results Summary',
189      'weight' => 5,      'weight' => 5,
190      'description' => "View log of recent import messages",      'description' => "View log of recent import messages",
191      'page callback' => 'import_html_results',      'page callback' => 'import_html_results',
192      'file' => 'import_html_ui.inc',      'file' => 'import_html_ui.inc',
193      'access arguments' => array('access import_html'),      'access arguments' => array('access import_html'),
194      'type' => MENU_NORMAL_ITEM,      'type' => MENU_NORMAL_ITEM,
195    );    );
196    
# Line 228  function import_html_help($path, $arg) { Line 228  function import_html_help($path, $arg) {
228        $output = t("<p>        $output = t("<p>
229            For background, remember to read <a href='!help_link'>the Import Html help page</a>.            For background, remember to read <a href='!help_link'>the Import Html help page</a>.
230            The <a href='!settings_link'>settings page</a> contains the config options.            The <a href='!settings_link'>settings page</a> contains the config options.
231          </p>",          </p>",
232          array(          array(
233            '!help_link' => url('admin/help/import_html'),            '!help_link' => url('admin/help/import_html'),
234            '!settings_link' => url(IMPORT_HTML_ADMIN_PATH .'/settings'),            '!settings_link' => url(IMPORT_HTML_ADMIN_PATH .'/settings'),
235          )          )
236        );        );
# Line 248  function import_html_help($path, $arg) { Line 248  function import_html_help($path, $arg) {
248        </p><p>        </p><p>
249          If you see <em>too much</em> of the page in the 'body' area (nested navbars and layout)          If you see <em>too much</em> of the page in the 'body' area (nested navbars and layout)
250          then the XSL import template or selector needs to be made more specific.          then the XSL import template or selector needs to be made more specific.
251          If you see none, or not enough content in the body area, the template or          If you see none, or not enough content in the body area, the template or
252          selector needs to be adjusted to encompass the text correctly.          selector needs to be adjusted to encompass the text correctly.
253        </p><p>        </p><p>
254          If you get an error or no result, the input HTML is probably too invalid to work with.          If you get an error or no result, the input HTML is probably too invalid to work with.
255        </p><p>        </p><p>
256          Single demo imports do not have the full context information to work with,          Single demo imports do not have the full context information to work with,
257          so the menu or URL alias (and internal relinking) shown may not be representative of the real result.          so the menu or URL alias (and internal relinking) shown may not be representative of the real result.
258        </p>");        </p>");
259    
# Line 282  function import_html_form_alter(&$form, Line 282  function import_html_form_alter(&$form,
282    
283  /**  /**
284   * A wrapper to variable_set, variable_get to encapsulate multiple import 'profiles'   * A wrapper to variable_set, variable_get to encapsulate multiple import 'profiles'
285   *   *
286   * This natively just returns the settings from the current 'default' profile,   * This natively just returns the settings from the current 'default' profile,
287   * but also allows the settings forms to be extended to other sets.   * but also allows the settings forms to be extended to other sets.
288   *   *
289   * Use INSTEAD OF variable_get() and it will return the 'default' or 'active' profile vars.   * Use INSTEAD OF variable_get() and it will return the 'default' or 'active' profile vars.
290   *   *
291   * @param $var Name of the variable within the currently active profile to retrieve.   * @param $var Name of the variable within the currently active profile to retrieve.
292   * @param $val If set, sets this variable within the profile and saves it.   * @param $val If set, sets this variable within the profile and saves it.
293   * @see import_html_current_profile();   * @see import_html_current_profile();
294   */   */
295  function import_html_variable($var, $val = NULL) {  function import_html_variable($var, $val = NULL) {
# Line 304  function import_html_variable($var, $val Line 304  function import_html_variable($var, $val
304    if (empty($import_html_profiles[$import_html_current_profile_id])) {    if (empty($import_html_profiles[$import_html_current_profile_id])) {
305      // Fill in defaults (should only be needed first time, if that      // Fill in defaults (should only be needed first time, if that
306      drupal_set_message('import_html_variable initing profile from nowhere - should this ever happen?');      drupal_set_message('import_html_variable initing profile from nowhere - should this ever happen?');
307      $import_html_profiles[$import_html_current_profile_id] = import_html_profile_defaults();      $import_html_profiles[$import_html_current_profile_id] = import_html_profile_defaults();
308    }    }
309    
310    $import_html_profile = &$import_html_profiles[$import_html_current_profile_id];    $import_html_profile = &$import_html_profiles[$import_html_current_profile_id];
# Line 314  function import_html_variable($var, $val Line 314  function import_html_variable($var, $val
314      #$import_html_profiles[$import_html_current_profile_id] = $import_html_profile;      #$import_html_profiles[$import_html_current_profile_id] = $import_html_profile;
315      variable_set('import_html_profiles', $import_html_profiles);      variable_set('import_html_profiles', $import_html_profiles);
316    }    }
317    
318    return $import_html_profile[$var];    return $import_html_profile[$var];
319  }  }
320    
321  /**  /**
322   * Accessor for the current profile data   * Accessor for the current profile data
323   *   *
324   * Returns the data in a named import_html_profile.   * Returns the data in a named import_html_profile.
325   * The currently active one if not explicitly defined.   * The currently active one if not explicitly defined.
326   * Profile will be padded with expected default fields if not explicitly set (to   * Profile will be padded with expected default fields if not explicitly set (to
327   * assist upgrades)   * assist upgrades)
328   *   *
329   * @param $profile If set, saves this data back to the saved settings. Pass FALSE to delete it.   * @param $profile If set, saves this data back to the saved settings. Pass FALSE to delete it.
330   */   */
331  function import_html_profile($profile_id = NULL, $profile = NULL) {  function import_html_profile($profile_id = NULL, $profile = NULL) {
# Line 337  function import_html_profile($profile_id Line 337  function import_html_profile($profile_id
337    if (isset($profile)) {    if (isset($profile)) {
338      $import_html_profiles[$import_html_profile_id] = $profile;      $import_html_profiles[$import_html_profile_id] = $profile;
339      // Allow a quiet delete      // Allow a quiet delete
340      if (!$profile) {      if (!$profile) {
341        unset($import_html_profiles[$import_html_profile_id]);        unset($import_html_profiles[$import_html_profile_id]);
342      }      }
343      variable_set('import_html_profiles', $import_html_profiles);      variable_set('import_html_profiles', $import_html_profiles);
344    }    }
# Line 348  function import_html_profile($profile_id Line 348  function import_html_profile($profile_id
348    
349  /**  /**
350   * Accessor for a persistent profile switcher.   * Accessor for a persistent profile switcher.
351   *   *
352   * @param $profile_id If set, this becomes the active profile for the duration of the request.   * @param $profile_id If set, this becomes the active profile for the duration of the request.
353   * @returns the current set profile id. 'default' by default.   * @returns the current set profile id. 'default' by default.
354   */   */
# Line 406  function import_html_profile_defaults() Line 406  function import_html_profile_defaults()
406      'recursion_behaviour' => IMPORT_HTML_GLOB_BEFORE,      'recursion_behaviour' => IMPORT_HTML_GLOB_BEFORE,
407      'debug_level' => 0,      'debug_level' => 0,
408      'keep_temp_files' => FALSE,      'keep_temp_files' => FALSE,
409    
410    );    );
411  }  }
412    
# Line 425  function import_html_profile_defaults() Line 425  function import_html_profile_defaults()
425  function import_html_get_preferred_filter($as_list = FALSE) {  function import_html_get_preferred_filter($as_list = FALSE) {
426    static $fid;    static $fid;
427    if ($fid) return $fid;    if ($fid) return $fid;
428    
429    if (($preferred_filter = variable_get('import_html_preferred_filter', 0)) && (!$as_list)) {    if (($preferred_filter = variable_get('import_html_preferred_filter', 0)) && (!$as_list)) {
430      return $preferred_filter;      return $preferred_filter;
431    }    }
# Line 435  function import_html_get_preferred_filte Line 435  function import_html_get_preferred_filte
435    // Detect the ID of 'Unfiltered HTML' format, if available    // Detect the ID of 'Unfiltered HTML' format, if available
436    // it's almost always 3, but I guess we must look it up.    // it's almost always 3, but I guess we must look it up.
437    $ff = filter_formats();    $ff = filter_formats();
438    
439    $formats = array();    $formats = array();
440    foreach ($ff as $f) {    foreach ($ff as $f) {
441      $formats[$f->format] = $f->name;      $formats[$f->format] = $f->name;
# Line 465  function import_html_get_preferred_filte Line 465  function import_html_get_preferred_filte
465  /**  /**
466   * Returns what general 'type' a file probably is, based on suffix or mime if   * Returns what general 'type' a file probably is, based on suffix or mime if
467   * available.   * available.
468   *   *
469   * @returns one of the defined 'file_classes' : page|image|resource|document ... or null if unknown   * @returns one of the defined 'file_classes' : page|image|resource|document ... or null if unknown
470   *   *
471   * This is mainly used for UI coloring, so is not totally canonical. HTML-or-not   * This is mainly used for UI coloring, so is not totally canonical. HTML-or-not
472   * is all that really counts.   * is all that really counts.
473   *   *
474   * @see $_import_html_file_classes   * @see $_import_html_file_classes
475   */   */
476  function import_html_guess_file_class($filename) {  function import_html_guess_file_class($filename) {
# Line 488  function import_html_guess_file_class($f Line 488  function import_html_guess_file_class($f
488      if ($mime_type == 'application') return 'document'; // gross generalization      if ($mime_type == 'application') return 'document'; // gross generalization
489      # return 'resource';      # return 'resource';
490    }    }
491    
492    // Some file mirrors (wget or myself) may have produced odd filenames    // Some file mirrors (wget or myself) may have produced odd filenames
493    // strip URL args like # and ? off it    // strip URL args like # and ? off it
494    $filename = preg_replace('|[\?\#].*$|', '', $filename);    $filename = preg_replace('|[\?\#].*$|', '', $filename);
# Line 497  function import_html_guess_file_class($f Line 497  function import_html_guess_file_class($f
497      // assume no suffix at all is a html page      // assume no suffix at all is a html page
498      return 'html';      return 'html';
499    }    }
500    $extension = pathinfo($filename, PATHINFO_EXTENSION);    $extension = pathinfo($filename, PATHINFO_EXTENSION);
501    
502    return @$_import_html_file_classes[strtolower($extension)];    return @$_import_html_file_classes[strtolower($extension)];
503  }  }
# Line 521  function import_html_theme() { Line 521  function import_html_theme() {
521    
522  /**  /**
523   * Implementation of hook_elements().   * Implementation of hook_elements().
524   *   *
525   * Declare our custom pseudo form item   * Declare our custom pseudo form item
526   */   */
527  function import_html_elements() {  function import_html_elements() {
# Line 534  function import_html_elements() { Line 534  function import_html_elements() {
534    
535    
536  ///////////////////////////////////////////////////////  ///////////////////////////////////////////////////////
537  // Batch operations are really part of import_html_process, but  // Batch operations are really part of import_html_process, but
538  // need to be in this module file so the batch runner can FIND the jobs  // need to be in this module file so the batch runner can FIND the jobs
539  // Batch ops do not allow inc file inclusions AFAIK.  // Batch ops do not allow inc file inclusions AFAIK.
540    
541  /**  /**
542   * Return a batch set containing the instructions to run over many files.   * Return a batch set containing the instructions to run over many files.
543   *   *
544   * A batch set is a queue consisting of multiple batch jobs.   * A batch set is a queue consisting of multiple batch jobs.
545   *   *
546   * @see import_html_import_file_batch_job()   * @see import_html_import_file_batch_job()
547   *   *
548   * @param $file_list   * @param $file_list
549   * @param $profile context settings.   * @param $profile context settings.
550   */   */
# Line 575  function import_html_import_file_batch_j Line 575  function import_html_import_file_batch_j
575    $results = import_html_import_file($rel_path, $profile);    $results = import_html_import_file($rel_path, $profile);
576    if (! $results) {    if (! $results) {
577      import_html_debug(      import_html_debug(
578        'Failed to get any results from the attempted analysis of %rel_path.        'Failed to get any results from the attempted analysis of %rel_path.
579          The source file path was probably unavailable, invalid or incorrect.',          The source file path was probably unavailable, invalid or incorrect.',
580        array('%rel_path' => $rel_path),        array('%rel_path' => $rel_path),
581        WATCHDOG_ERROR        WATCHDOG_ERROR
582      );      );
583      return FALSE;      return FALSE;
584    }    }
585    
586    foreach ($results as $summary) {    foreach ($results as $summary) {
587      // Almost useless loop, usually just one node per file      // Almost useless loop, usually just one node per file
588      if (! empty($summary['node'])) {      if (! empty($summary['node'])) {
589        $batch_context['message'] = "Processed <span class='rel-path'>$rel_path</span> : <span class='node-title'>" . $summary['node']->title ."</span>";        $batch_context['message'] = "Processed <span class='rel-path'>$rel_path</span> : <span class='node-title'>" . $summary['node']->title ."</span>";
# Line 610  function import_html_import_file_batch_j Line 610  function import_html_import_file_batch_j
610  /**  /**
611   * Called by the batch queue, invoke import_html_import_directory() and update   * Called by the batch queue, invoke import_html_import_directory() and update
612   * the batch context with the status   * the batch context with the status
613   *   *
614   * @param $rel_path   * @param $rel_path
615   * @param $profile   * @param $profile
616   */   */
# Line 618  function import_html_import_directory_ba Line 618  function import_html_import_directory_ba
618    // Processing a directory when in batch context means adding batch jobs to the end of the current process    // Processing a directory when in batch context means adding batch jobs to the end of the current process
619    // - that in turn may add more jobs.    // - that in turn may add more jobs.
620    // List the files we can, and add them as jobs.    // List the files we can, and add them as jobs.
621    
622    import_html_debug("Starting batch import dir job '$rel_path' ", array('%rel_path' => $rel_path), WATCHDOG_DEBUG );    import_html_debug("Starting batch import dir job '$rel_path' ", array('%rel_path' => $rel_path), WATCHDOG_DEBUG );
623    
624    $working_path = $profile['source_siteroot'] . $rel_path;    $working_path = $profile['source_siteroot'] . $rel_path;
625    
626    // Scan the given directory (NOT deep), add the files, and the subdirectories.    // Scan the given directory (NOT deep), add the files, and the subdirectories.
627    // Processing the subdir will do the recursion itself    // Processing the subdir will do the recursion itself
628    $dir_structure = import_html_file_scan_directory($working_path, ".*", array('.', '..', 'CVS'), 0, FALSE, 'filename', 0, NULL, 1);    $dir_structure = import_html_file_scan_directory($working_path, ".*", array('.', '..', 'CVS'), 0, FALSE, 'filename', 0, NULL, 1);
# Line 662  function import_html_batch_import_finish Line 662  function import_html_batch_import_finish
662    $duration = time() - variable_get('import_html_last_import_timestamp', time());    $duration = time() - variable_get('import_html_last_import_timestamp', time());
663    
664    // results returns a list of node path -titles . Convert to something we can theme    // results returns a list of node path -titles . Convert to something we can theme
665    array_walk(    array_walk(
666      $results,      $results,
667      create_function('&$title, $link', '$title = array("href" => $link, "title" => url($link) ." : ". $title);')      create_function('&$title, $link', '$title = array("href" => $link, "title" => url($link) ." : ". $title);')
668    );    );
669    
670    
671    import_html_debug(    import_html_debug(
672      '<strong>Batch import completed</strong> in %duration. %count items processed (%average) : !links',      '<strong>Batch import completed</strong> in %duration. %count items processed (%average) : !links',
673      array(      array(
# Line 675  function import_html_batch_import_finish Line 675  function import_html_batch_import_finish
675        '%count' => count($results),        '%count' => count($results),
676        '%average' => count($results) ? format_interval($duration / count($results)) .' per item' : "no time details",        '%average' => count($results) ? format_interval($duration / count($results)) .' per item' : "no time details",
677        '!links' => theme('links', array_values($results), 'tree'),        '!links' => theme('links', array_values($results), 'tree'),
678      ),      ),
679      WATCHDOG_NOTICE      WATCHDOG_NOTICE
680    );    );
681    #dpm($results);    #dpm($results);
682  }  }

Legend:
Removed from v.1.71.2.15  
changed lines
  Added in v.1.71.2.16

  ViewVC Help
Powered by ViewVC 1.1.2