| 1 |
<?php |
<?php |
| 2 |
// $Id: import_html.module,v 1.71.2.14 2009/05/17 15:12:46 dman Exp $ |
// $Id: import_html.module,v 1.71.2.15 2009/07/01 02:40:56 dman Exp $ |
| 3 |
|
|
| 4 |
/** |
/** |
| 5 |
* @file Main drupal interface to import_html. |
* @file Main drupal interface to import_html. |
| 6 |
* |
* |
| 7 |
* This file contains Drupal hooks |
* This file contains Drupal hooks |
| 8 |
* and some config/preferences. Actual processing functions are in |
* and some config/preferences. Actual processing functions are in |
| 9 |
* import_html_process.inc |
* import_html_process.inc |
| 10 |
* |
* |
| 11 |
* Synopsis: |
* Synopsis: |
| 12 |
* |
* |
| 13 |
* Facility to import an existing, static HTML site structure into the Drupal |
* Facility to import an existing, static HTML site structure into the Drupal |
| 14 |
* CMS as structured nodes. |
* CMS as structured nodes. |
| 15 |
* |
* |
| 30 |
* |
* |
| 31 |
* import_html libraries can be used by external modules, specifically |
* import_html libraries can be used by external modules, specifically |
| 32 |
* wrapper.module (private development) and static.module (experimental) |
* wrapper.module (private development) and static.module (experimental) |
| 33 |
* |
* |
| 34 |
* CODE: |
* CODE: |
| 35 |
* Internally, there are a lot of filepath fragments being passed around. |
* Internally, there are a lot of filepath fragments being passed around. |
| 36 |
* Any filepath known to represent a folder (eg $source_siteroot) will end with |
* Any filepath known to represent a folder (eg $source_siteroot) will end with |
| 39 |
* dividers. However, many times, small functions like 'ensure_trailing_slash()' |
* dividers. However, many times, small functions like 'ensure_trailing_slash()' |
| 40 |
* may be called. Not all of them are needed, but most of them were put in due |
* may be called. Not all of them are needed, but most of them were put in due |
| 41 |
* to some weird filesystem vagary or filename encountered. |
* to some weird filesystem vagary or filename encountered. |
| 42 |
* |
* |
| 43 |
* |
* |
| 44 |
* @ingroup import_html Import HTML |
* @ingroup import_html Import HTML |
| 45 |
* |
* |
| 46 |
* @author Dan Morrison http://coders.co.nz/ |
* @author Dan Morrison http://coders.co.nz/ |
| 47 |
* |
* |
| 48 |
*/ |
*/ |
| 108 |
define('IMPORT_HTML_ADMIN_PATH', 'admin/build/import_html'); |
define('IMPORT_HTML_ADMIN_PATH', 'admin/build/import_html'); |
| 109 |
|
|
| 110 |
|
|
| 111 |
define('IMPORT_HTML_GLOB_BEFORE', 'glob before'); |
define('IMPORT_HTML_GLOB_BEFORE', 'glob before'); |
| 112 |
define('IMPORT_HTML_RECURSE_AFTER', 'recurse after '); |
define('IMPORT_HTML_RECURSE_AFTER', 'recurse after '); |
| 113 |
|
|
| 114 |
/** |
/** |
| 115 |
* To avoid overloading the treeview display, directory listing depth recursion |
* To avoid overloading the treeview display, directory listing depth recursion |
| 116 |
* will be truncated if the number of found files is greater than this. |
* will be truncated if the number of found files is greater than this. |
| 117 |
*/ |
*/ |
| 118 |
define('IMPORT_HTML_MAX_FILE_LISTING_COUNT', 500); |
define('IMPORT_HTML_MAX_FILE_LISTING_COUNT', 500); |
| 119 |
|
|
| 120 |
/** |
/** |
| 121 |
* If the database max_allowed_packet size is limited, then huge batch jobs |
* If the database max_allowed_packet size is limited, then huge batch jobs |
| 134 |
*/ |
*/ |
| 135 |
function import_html_menu() { |
function import_html_menu() { |
| 136 |
$items[IMPORT_HTML_ADMIN_PATH] = array( |
$items[IMPORT_HTML_ADMIN_PATH] = array( |
| 137 |
'title' => 'Import HTML', |
'title' => 'Import HTML', |
| 138 |
'description' => "Import/Export an entire site or directory to/from static HTML", |
'description' => "Import/Export an entire site or directory to/from static HTML", |
| 139 |
|
|
| 140 |
// Use the built-in Overview menu thing |
// Use the built-in Overview menu thing |
| 142 |
'file' => 'system.admin.inc', |
'file' => 'system.admin.inc', |
| 143 |
'file path' => drupal_get_path('module', 'system'), |
'file path' => drupal_get_path('module', 'system'), |
| 144 |
|
|
| 145 |
'access arguments' => array('access import_html'), |
'access arguments' => array('access import_html'), |
| 146 |
'type' => MENU_NORMAL_ITEM, |
'type' => MENU_NORMAL_ITEM, |
| 147 |
); |
); |
| 148 |
$items[IMPORT_HTML_ADMIN_PATH .'/settings'] = array( |
$items[IMPORT_HTML_ADMIN_PATH .'/settings'] = array( |
| 165 |
'type' => MENU_NORMAL_ITEM, |
'type' => MENU_NORMAL_ITEM, |
| 166 |
); |
); |
| 167 |
$items[IMPORT_HTML_ADMIN_PATH .'/import_site'] = array( |
$items[IMPORT_HTML_ADMIN_PATH .'/import_site'] = array( |
| 168 |
'title' => 'Import HTML Site', |
'title' => 'Import HTML Site', |
| 169 |
'description' => "Import/Export an entire site to/from static HTML", |
'description' => "Import/Export an entire site to/from static HTML", |
| 170 |
'weight' => -1, |
'weight' => -1, |
| 171 |
'page callback' => 'drupal_get_form', |
'page callback' => 'drupal_get_form', |
| 172 |
'page arguments' => array('import_html_process_form'), |
'page arguments' => array('import_html_process_form'), |
| 173 |
'file' => 'import_html_ui.inc', |
'file' => 'import_html_ui.inc', |
| 174 |
'access arguments' => array('access import_html'), |
'access arguments' => array('access import_html'), |
| 175 |
'type' => MENU_NORMAL_ITEM, |
'type' => MENU_NORMAL_ITEM, |
| 176 |
); |
); |
| 177 |
$items[IMPORT_HTML_ADMIN_PATH .'/demo'] = array( |
$items[IMPORT_HTML_ADMIN_PATH .'/demo'] = array( |
| 178 |
'title' => 'Demo', |
'title' => 'Demo', |
| 179 |
'weight' => 3, |
'weight' => 3, |
| 180 |
'description' => 'Demonstrate or test HTML Import on one file.', |
'description' => 'Demonstrate or test HTML Import on one file.', |
| 181 |
'page callback' => 'drupal_get_form', |
'page callback' => 'drupal_get_form', |
| 185 |
'type' => MENU_NORMAL_ITEM, |
'type' => MENU_NORMAL_ITEM, |
| 186 |
); |
); |
| 187 |
$items[IMPORT_HTML_ADMIN_PATH .'/results'] = array( |
$items[IMPORT_HTML_ADMIN_PATH .'/results'] = array( |
| 188 |
'title' => 'Import Results Summary', |
'title' => 'Import Results Summary', |
| 189 |
'weight' => 5, |
'weight' => 5, |
| 190 |
'description' => "View log of recent import messages", |
'description' => "View log of recent import messages", |
| 191 |
'page callback' => 'import_html_results', |
'page callback' => 'import_html_results', |
| 192 |
'file' => 'import_html_ui.inc', |
'file' => 'import_html_ui.inc', |
| 193 |
'access arguments' => array('access import_html'), |
'access arguments' => array('access import_html'), |
| 194 |
'type' => MENU_NORMAL_ITEM, |
'type' => MENU_NORMAL_ITEM, |
| 195 |
); |
); |
| 196 |
|
|
| 228 |
$output = t("<p> |
$output = t("<p> |
| 229 |
For background, remember to read <a href='!help_link'>the Import Html help page</a>. |
For background, remember to read <a href='!help_link'>the Import Html help page</a>. |
| 230 |
The <a href='!settings_link'>settings page</a> contains the config options. |
The <a href='!settings_link'>settings page</a> contains the config options. |
| 231 |
</p>", |
</p>", |
| 232 |
array( |
array( |
| 233 |
'!help_link' => url('admin/help/import_html'), |
'!help_link' => url('admin/help/import_html'), |
| 234 |
'!settings_link' => url(IMPORT_HTML_ADMIN_PATH .'/settings'), |
'!settings_link' => url(IMPORT_HTML_ADMIN_PATH .'/settings'), |
| 235 |
) |
) |
| 236 |
); |
); |
| 248 |
</p><p> |
</p><p> |
| 249 |
If you see <em>too much</em> of the page in the 'body' area (nested navbars and layout) |
If you see <em>too much</em> of the page in the 'body' area (nested navbars and layout) |
| 250 |
then the XSL import template or selector needs to be made more specific. |
then the XSL import template or selector needs to be made more specific. |
| 251 |
If you see none, or not enough content in the body area, the template or |
If you see none, or not enough content in the body area, the template or |
| 252 |
selector needs to be adjusted to encompass the text correctly. |
selector needs to be adjusted to encompass the text correctly. |
| 253 |
</p><p> |
</p><p> |
| 254 |
If you get an error or no result, the input HTML is probably too invalid to work with. |
If you get an error or no result, the input HTML is probably too invalid to work with. |
| 255 |
</p><p> |
</p><p> |
| 256 |
Single demo imports do not have the full context information to work with, |
Single demo imports do not have the full context information to work with, |
| 257 |
so the menu or URL alias (and internal relinking) shown may not be representative of the real result. |
so the menu or URL alias (and internal relinking) shown may not be representative of the real result. |
| 258 |
</p>"); |
</p>"); |
| 259 |
|
|
| 282 |
|
|
| 283 |
/** |
/** |
| 284 |
* A wrapper to variable_set, variable_get to encapsulate multiple import 'profiles' |
* A wrapper to variable_set, variable_get to encapsulate multiple import 'profiles' |
| 285 |
* |
* |
| 286 |
* This natively just returns the settings from the current 'default' profile, |
* This natively just returns the settings from the current 'default' profile, |
| 287 |
* but also allows the settings forms to be extended to other sets. |
* but also allows the settings forms to be extended to other sets. |
| 288 |
* |
* |
| 289 |
* Use INSTEAD OF variable_get() and it will return the 'default' or 'active' profile vars. |
* Use INSTEAD OF variable_get() and it will return the 'default' or 'active' profile vars. |
| 290 |
* |
* |
| 291 |
* @param $var Name of the variable within the currently active profile to retrieve. |
* @param $var Name of the variable within the currently active profile to retrieve. |
| 292 |
* @param $val If set, sets this variable within the profile and saves it. |
* @param $val If set, sets this variable within the profile and saves it. |
| 293 |
* @see import_html_current_profile(); |
* @see import_html_current_profile(); |
| 294 |
*/ |
*/ |
| 295 |
function import_html_variable($var, $val = NULL) { |
function import_html_variable($var, $val = NULL) { |
| 304 |
if (empty($import_html_profiles[$import_html_current_profile_id])) { |
if (empty($import_html_profiles[$import_html_current_profile_id])) { |
| 305 |
// Fill in defaults (should only be needed first time, if that |
// Fill in defaults (should only be needed first time, if that |
| 306 |
drupal_set_message('import_html_variable initing profile from nowhere - should this ever happen?'); |
drupal_set_message('import_html_variable initing profile from nowhere - should this ever happen?'); |
| 307 |
$import_html_profiles[$import_html_current_profile_id] = import_html_profile_defaults(); |
$import_html_profiles[$import_html_current_profile_id] = import_html_profile_defaults(); |
| 308 |
} |
} |
| 309 |
|
|
| 310 |
$import_html_profile = &$import_html_profiles[$import_html_current_profile_id]; |
$import_html_profile = &$import_html_profiles[$import_html_current_profile_id]; |
| 314 |
#$import_html_profiles[$import_html_current_profile_id] = $import_html_profile; |
#$import_html_profiles[$import_html_current_profile_id] = $import_html_profile; |
| 315 |
variable_set('import_html_profiles', $import_html_profiles); |
variable_set('import_html_profiles', $import_html_profiles); |
| 316 |
} |
} |
| 317 |
|
|
| 318 |
return $import_html_profile[$var]; |
return $import_html_profile[$var]; |
| 319 |
} |
} |
| 320 |
|
|
| 321 |
/** |
/** |
| 322 |
* Accessor for the current profile data |
* Accessor for the current profile data |
| 323 |
* |
* |
| 324 |
* Returns the data in a named import_html_profile. |
* Returns the data in a named import_html_profile. |
| 325 |
* The currently active one if not explicitly defined. |
* The currently active one if not explicitly defined. |
| 326 |
* Profile will be padded with expected default fields if not explicitly set (to |
* Profile will be padded with expected default fields if not explicitly set (to |
| 327 |
* assist upgrades) |
* assist upgrades) |
| 328 |
* |
* |
| 329 |
* @param $profile If set, saves this data back to the saved settings. Pass FALSE to delete it. |
* @param $profile If set, saves this data back to the saved settings. Pass FALSE to delete it. |
| 330 |
*/ |
*/ |
| 331 |
function import_html_profile($profile_id = NULL, $profile = NULL) { |
function import_html_profile($profile_id = NULL, $profile = NULL) { |
| 337 |
if (isset($profile)) { |
if (isset($profile)) { |
| 338 |
$import_html_profiles[$import_html_profile_id] = $profile; |
$import_html_profiles[$import_html_profile_id] = $profile; |
| 339 |
// Allow a quiet delete |
// Allow a quiet delete |
| 340 |
if (!$profile) { |
if (!$profile) { |
| 341 |
unset($import_html_profiles[$import_html_profile_id]); |
unset($import_html_profiles[$import_html_profile_id]); |
| 342 |
} |
} |
| 343 |
variable_set('import_html_profiles', $import_html_profiles); |
variable_set('import_html_profiles', $import_html_profiles); |
| 344 |
} |
} |
| 348 |
|
|
| 349 |
/** |
/** |
| 350 |
* Accessor for a persistent profile switcher. |
* Accessor for a persistent profile switcher. |
| 351 |
* |
* |
| 352 |
* @param $profile_id If set, this becomes the active profile for the duration of the request. |
* @param $profile_id If set, this becomes the active profile for the duration of the request. |
| 353 |
* @returns the current set profile id. 'default' by default. |
* @returns the current set profile id. 'default' by default. |
| 354 |
*/ |
*/ |
| 406 |
'recursion_behaviour' => IMPORT_HTML_GLOB_BEFORE, |
'recursion_behaviour' => IMPORT_HTML_GLOB_BEFORE, |
| 407 |
'debug_level' => 0, |
'debug_level' => 0, |
| 408 |
'keep_temp_files' => FALSE, |
'keep_temp_files' => FALSE, |
| 409 |
|
|
| 410 |
); |
); |
| 411 |
} |
} |
| 412 |
|
|
| 425 |
function import_html_get_preferred_filter($as_list = FALSE) { |
function import_html_get_preferred_filter($as_list = FALSE) { |
| 426 |
static $fid; |
static $fid; |
| 427 |
if ($fid) return $fid; |
if ($fid) return $fid; |
| 428 |
|
|
| 429 |
if (($preferred_filter = variable_get('import_html_preferred_filter', 0)) && (!$as_list)) { |
if (($preferred_filter = variable_get('import_html_preferred_filter', 0)) && (!$as_list)) { |
| 430 |
return $preferred_filter; |
return $preferred_filter; |
| 431 |
} |
} |
| 435 |
// Detect the ID of 'Unfiltered HTML' format, if available |
// Detect the ID of 'Unfiltered HTML' format, if available |
| 436 |
// it's almost always 3, but I guess we must look it up. |
// it's almost always 3, but I guess we must look it up. |
| 437 |
$ff = filter_formats(); |
$ff = filter_formats(); |
| 438 |
|
|
| 439 |
$formats = array(); |
$formats = array(); |
| 440 |
foreach ($ff as $f) { |
foreach ($ff as $f) { |
| 441 |
$formats[$f->format] = $f->name; |
$formats[$f->format] = $f->name; |
| 465 |
/** |
/** |
| 466 |
* Returns what general 'type' a file probably is, based on suffix or mime if |
* Returns what general 'type' a file probably is, based on suffix or mime if |
| 467 |
* available. |
* available. |
| 468 |
* |
* |
| 469 |
* @returns one of the defined 'file_classes' : page|image|resource|document ... or null if unknown |
* @returns one of the defined 'file_classes' : page|image|resource|document ... or null if unknown |
| 470 |
* |
* |
| 471 |
* This is mainly used for UI coloring, so is not totally canonical. HTML-or-not |
* This is mainly used for UI coloring, so is not totally canonical. HTML-or-not |
| 472 |
* is all that really counts. |
* is all that really counts. |
| 473 |
* |
* |
| 474 |
* @see $_import_html_file_classes |
* @see $_import_html_file_classes |
| 475 |
*/ |
*/ |
| 476 |
function import_html_guess_file_class($filename) { |
function import_html_guess_file_class($filename) { |
| 488 |
if ($mime_type == 'application') return 'document'; // gross generalization |
if ($mime_type == 'application') return 'document'; // gross generalization |
| 489 |
# return 'resource'; |
# return 'resource'; |
| 490 |
} |
} |
| 491 |
|
|
| 492 |
// Some file mirrors (wget or myself) may have produced odd filenames |
// Some file mirrors (wget or myself) may have produced odd filenames |
| 493 |
// strip URL args like # and ? off it |
// strip URL args like # and ? off it |
| 494 |
$filename = preg_replace('|[\?\#].*$|', '', $filename); |
$filename = preg_replace('|[\?\#].*$|', '', $filename); |
| 497 |
// assume no suffix at all is a html page |
// assume no suffix at all is a html page |
| 498 |
return 'html'; |
return 'html'; |
| 499 |
} |
} |
| 500 |
$extension = pathinfo($filename, PATHINFO_EXTENSION); |
$extension = pathinfo($filename, PATHINFO_EXTENSION); |
| 501 |
|
|
| 502 |
return @$_import_html_file_classes[strtolower($extension)]; |
return @$_import_html_file_classes[strtolower($extension)]; |
| 503 |
} |
} |
| 521 |
|
|
| 522 |
/** |
/** |
| 523 |
* Implementation of hook_elements(). |
* Implementation of hook_elements(). |
| 524 |
* |
* |
| 525 |
* Declare our custom pseudo for item |
* Declare our custom pseudo for item |
| 526 |
*/ |
*/ |
| 527 |
function import_html_elements() { |
function import_html_elements() { |
| 534 |
|
|
| 535 |
|
|
| 536 |
/////////////////////////////////////////////////////// |
/////////////////////////////////////////////////////// |
| 537 |
// Batch operations are really part of import_html_process, but |
// Batch operations are really part of import_html_process, but |
| 538 |
// need to be in this module file so the batch runner can FIND the jobs |
// need to be in this module file so the batch runner can FIND the jobs |
| 539 |
// Batch ops do not allow inc file inclusions AFAIK. |
// Batch ops do not allow inc file inclusions AFAIK. |
| 540 |
|
|
| 541 |
/** |
/** |
| 542 |
* Return a batch set containing the instructions to run over many files. |
* Return a batch set containing the instructions to run over many files. |
| 543 |
* |
* |
| 544 |
* A batch set is a queue consisting of multiple batch jobs. |
* A batch set is a queue consisting of multiple batch jobs. |
| 545 |
* |
* |
| 546 |
* @see import_html_import_file_batch_job() |
* @see import_html_import_file_batch_job() |
| 547 |
* |
* |
| 548 |
* @param $file_list |
* @param $file_list |
| 549 |
* @param $profile context settings. |
* @param $profile context settings. |
| 550 |
*/ |
*/ |
| 575 |
$results = import_html_import_file($rel_path, $profile); |
$results = import_html_import_file($rel_path, $profile); |
| 576 |
if (! $results) { |
if (! $results) { |
| 577 |
import_html_debug( |
import_html_debug( |
| 578 |
'Failed to get any results from the attempted analysis of %rel_path. |
'Failed to get any results from the attempted analysis of %rel_path. |
| 579 |
The source file path was probably unavailable, invalid or incorrect.', |
The source file path was probably unavailable, invalid or incorrect.', |
| 580 |
array('%rel_path' => $rel_path), |
array('%rel_path' => $rel_path), |
| 581 |
WATCHDOG_ERROR |
WATCHDOG_ERROR |
| 582 |
); |
); |
| 583 |
return FALSE; |
return FALSE; |
| 584 |
} |
} |
| 585 |
|
|
| 586 |
foreach ($results as $summary) { |
foreach ($results as $summary) { |
| 587 |
// Almost useless loop, usually just one node per file |
// Almost useless loop, usually just one node per file |
| 588 |
if (! empty($summary['node'])) { |
if (! empty($summary['node'])) { |
| 589 |
$batch_context['message'] = "Processed <span class='rel-path'>$rel_path</span> : <span class='node-title'>" . $summary['node']->title ."</span>"; |
$batch_context['message'] = "Processed <span class='rel-path'>$rel_path</span> : <span class='node-title'>" . $summary['node']->title ."</span>"; |
| 610 |
/** |
/** |
| 611 |
* Called by the batch queue, invoke import_html_import_directory() and update |
* Called by the batch queue, invoke import_html_import_directory() and update |
| 612 |
* the batch context with the status |
* the batch context with the status |
| 613 |
* |
* |
| 614 |
* @param $rel_path |
* @param $rel_path |
| 615 |
* @param $profile |
* @param $profile |
| 616 |
*/ |
*/ |
| 618 |
// Processing a directory when in batch context means adding batch jobs to the end of the current process |
// Processing a directory when in batch context means adding batch jobs to the end of the current process |
| 619 |
// - that in turn may add more jobs. |
// - that in turn may add more jobs. |
| 620 |
// List the files we can, and add them as jobs. |
// List the files we can, and add them as jobs. |
| 621 |
|
|
| 622 |
import_html_debug("Starting batch import dir job '$rel_path' ", array('%rel_path' => $rel_path), WATCHDOG_DEBUG ); |
import_html_debug("Starting batch import dir job '$rel_path' ", array('%rel_path' => $rel_path), WATCHDOG_DEBUG ); |
| 623 |
|
|
| 624 |
$working_path = $profile['source_siteroot'] . $rel_path; |
$working_path = $profile['source_siteroot'] . $rel_path; |
| 625 |
|
|
| 626 |
// Scan the given directory (NOT deep), add the files, and the subdirectories. |
// Scan the given directory (NOT deep), add the files, and the subdirectories. |
| 627 |
// Processing the subdir will do the recursion itself |
// Processing the subdir will do the recursion itself |
| 628 |
$dir_structure = import_html_file_scan_directory($working_path, ".*", array('.', '..', 'CVS'), 0, FALSE, 'filename', 0, NULL, 1); |
$dir_structure = import_html_file_scan_directory($working_path, ".*", array('.', '..', 'CVS'), 0, FALSE, 'filename', 0, NULL, 1); |
| 662 |
$duration = time() - variable_get('import_html_last_import_timestamp', time()); |
$duration = time() - variable_get('import_html_last_import_timestamp', time()); |
| 663 |
|
|
| 664 |
// results returns a list of node path -titles . Convert to something we can theme |
// results returns a list of node path -titles . Convert to something we can theme |
| 665 |
array_walk( |
array_walk( |
| 666 |
$results, |
$results, |
| 667 |
create_function('&$title, $link', '$title = array("href" => $link, "title" => url($link) ." : ". $title);') |
create_function('&$title, $link', '$title = array("href" => $link, "title" => url($link) ." : ". $title);') |
| 668 |
); |
); |
| 669 |
|
|
| 670 |
|
|
| 671 |
import_html_debug( |
import_html_debug( |
| 672 |
'<strong>Batch import completed</strong> in %duration. %count items processed (%average) : !links', |
'<strong>Batch import completed</strong> in %duration. %count items processed (%average) : !links', |
| 673 |
array( |
array( |
| 675 |
'%count' => count($results), |
'%count' => count($results), |
| 676 |
'%average' => count($results) ? format_interval($duration / count($results)) .' per item' : "no time details", |
'%average' => count($results) ? format_interval($duration / count($results)) .' per item' : "no time details", |
| 677 |
'!links' => theme('links', array_values($results), 'tree'), |
'!links' => theme('links', array_values($results), 'tree'), |
| 678 |
), |
), |
| 679 |
WATCHDOG_NOTICE |
WATCHDOG_NOTICE |
| 680 |
); |
); |
| 681 |
#dpm($results); |
#dpm($results); |
| 682 |
} |
} |