| 1 |
<?php |
<?php |
| 2 |
// $Id: import_html_process.inc,v 1.5.4.15 2009/07/01 02:40:56 dman Exp $ |
// $Id: import_html_process.inc,v 1.5.4.16 2009/09/09 06:47:22 dman Exp $ |
| 3 |
/** |
/** |
| 4 |
* @file Actual routines for importing files. |
* @file Actual routines for importing files. |
| 5 |
* |
* |
| 6 |
* |
* |
| 7 |
* @ingroup import_html Import HTML |
* @ingroup import_html Import HTML |
| 8 |
* @author Dan Morrison http://coders.co.nz/ |
* @author Dan Morrison http://coders.co.nz/ |
| 9 |
* |
* |
| 15 |
|
|
| 16 |
/** |
/** |
| 17 |
* Files have been selected, set them up for processing |
* Files have been selected, set them up for processing |
| 18 |
* |
* |
| 19 |
* @param $file_list |
* @param $file_list |
| 20 |
* an array of simple file paths, probably selected from the file_list form |
* an array of simple file paths, probably selected from the file_list form |
| 21 |
* @param $context |
* @param $context |
| 22 |
* A set of parameters, similar to the import_html profile, possibly from |
* A set of parameters, similar to the import_html profile, possibly from |
| 23 |
* the list_filesystem form. Should contain the base path that the |
* the list_filesystem form. Should contain the base path that the |
| 24 |
* submitted files are relative to. |
* submitted files are relative to. |
| 25 |
* Note that context is NOT a full profile. |
* Note that context is NOT a full profile. |
| 26 |
* |
* |
| 27 |
* @return A result set of nodes |
* @return A result set of nodes |
| 28 |
*/ |
*/ |
| 29 |
function import_html_import_files($file_list, $context) { |
function import_html_import_files($file_list, $context) { |
| 39 |
} |
} |
| 40 |
|
|
| 41 |
// TODO see what we can do about clearing out our memory |
// TODO see what we can do about clearing out our memory |
| 42 |
|
|
| 43 |
$results = array(); |
$results = array(); |
| 44 |
foreach ($file_list as $list_index => $rel_path) { |
foreach ($file_list as $list_index => $rel_path) { |
| 45 |
if (! empty($rel_path)) { |
if (! empty($rel_path)) { |
| 46 |
$file_results = import_html_import_file($rel_path, $context); |
$file_results = import_html_import_file($rel_path, $context); |
| 47 |
if (! $file_results) { |
if (! $file_results) { |
| 48 |
drupal_set_message(t(' |
drupal_set_message(t(' |
| 49 |
Failed to get any results from the attempted analysis of %rel_path. |
Failed to get any results from the attempted analysis of %rel_path. |
| 50 |
The source file path was probably unavailable or incorrect.', |
The source file path was probably unavailable or incorrect.', |
| 51 |
array('%rel_path' => $rel_path)), 'error'); |
array('%rel_path' => $rel_path)), 'error'); |
| 52 |
continue; |
continue; |
| 53 |
} |
} |
| 54 |
// Result of importing a file MAY be more than one node, |
// Result of importing a file MAY be more than one node, |
| 55 |
// unlikely as it may be for XHTML, but is possible for XML extension |
// unlikely as it may be for XHTML, but is possible for XML extension |
| 56 |
import_html_debug_code("Result of processing file $rel_path", $file_results, WATCHDOG_DEBUG); |
import_html_debug_code("Result of processing file $rel_path", $file_results, WATCHDOG_DEBUG); |
| 57 |
foreach ($file_results as $node) { |
foreach ($file_results as $node) { |
| 75 |
|
|
| 76 |
/** |
/** |
| 77 |
* Given a html file, prepare all the node info we can get out of it. |
* Given a html file, prepare all the node info we can get out of it. |
| 78 |
* |
* |
| 79 |
* This func mainly prepares the paths and relative links. Data extraction happens in _import_html_process_html_page() |
* This func mainly prepares the paths and relative links. Data extraction happens in _import_html_process_html_page() |
| 80 |
* |
* |
| 81 |
* It does submit and save the node to the database. |
* It does submit and save the node to the database. |
| 82 |
* |
* |
| 83 |
* @param $context |
* @param $context |
| 84 |
* describes the context this function was called in. It should contain |
* describes the context this function was called in. It should contain |
| 85 |
* 'profile_id' and 'source_siteroot'. Also 'form_id' |
* 'profile_id' and 'source_siteroot'. Also 'form_id' |
| 86 |
* |
* |
| 87 |
* @return array that may contain more than one node (in extreme cases) |
* @return array that may contain more than one node (in extreme cases) |
| 88 |
*/ |
*/ |
| 89 |
function import_html_import_file($rel_path, $context) { |
function import_html_import_file($rel_path, $context) { |
| 90 |
|
|
| 91 |
// Read the profile id and use that as a context for all settings |
// Read the profile id and use that as a context for all settings |
| 92 |
$profile = import_html_profile($context['profile_id']); |
$profile = import_html_profile($context['profile_id']); |
| 93 |
|
|
| 94 |
$source_siteroot = $context['source_siteroot']; |
$source_siteroot = $context['source_siteroot']; |
| 95 |
$dest_root = ensure_trailing_slash($profile['file_storage_path']); |
$dest_root = ensure_trailing_slash($profile['file_storage_path']); |
| 96 |
$is_remote = valid_url($source_siteroot, TRUE); |
$is_remote = valid_url($source_siteroot, TRUE); |
| 97 |
|
|
| 98 |
import_html_debug( |
import_html_debug( |
| 99 |
"<strong>Importing</strong> '%rel_path'", |
"<strong>Importing</strong> '%rel_path'", |
| 100 |
array('%rel_path' => $rel_path), |
array('%rel_path' => $rel_path), |
| 101 |
WATCHDOG_NOTICE |
WATCHDOG_NOTICE |
| 102 |
); |
); |
| 103 |
|
|
| 104 |
$source_path = $source_siteroot . $rel_path; |
$source_path = $source_siteroot . $rel_path; |
| 108 |
// Handle URLS/Folders with trailing slash |
// Handle URLS/Folders with trailing slash |
| 109 |
if (preg_match("|/$|", $rel_path)) { |
if (preg_match("|/$|", $rel_path)) { |
| 110 |
// Handle trailing slashes differently at home and away |
// Handle trailing slashes differently at home and away |
| 111 |
if ($is_remote) { |
if ($is_remote) { |
| 112 |
// It's remote |
// It's remote |
| 113 |
$default_documents = split(",", $profile['default_document']); |
$default_documents = split(",", $profile['default_document']); |
| 114 |
// need a dummy filename if retrieving default docs. |
// need a dummy filename if retrieving default docs. |
| 115 |
$dest_path .= trim(array_shift($default_documents));; |
$dest_path .= trim(array_shift($default_documents));; |
| 116 |
} |
} |
| 117 |
else { |
else { |
| 118 |
return; //skip local directories altogether (their contents is selected individually) |
return; //skip local directories altogether (their contents is selected individually) |
| 119 |
} |
} |
| 120 |
} |
} |
| 121 |
|
|
| 122 |
$file = array( |
$file = array( |
| 123 |
'source' => $source_path, |
'source' => $source_path, |
| 124 |
'dest' => $dest_path, |
'dest' => $dest_path, |
| 132 |
// non-page resource - what sort of processing can I do here? |
// non-page resource - what sort of processing can I do here? |
| 133 |
|
|
| 134 |
import_html_debug( |
import_html_debug( |
| 135 |
"I think (due to file suffix '%doctype') |
"I think (due to file suffix '%doctype') |
| 136 |
that '%source' is not a html page I can process.<br/> |
that '%source' is not a html page I can process.<br/> |
| 137 |
It's just been copied into '!dest'.", |
It's just been copied into '!dest'.", |
| 138 |
array( |
array( |
| 139 |
'%source' => basename($source_path), |
'%source' => basename($source_path), |
| 140 |
'!dest' => l($dest_path, $dest_path), |
'!dest' => l($dest_path, $dest_path), |
| 141 |
'%doctype' => import_html_guess_file_class($checkfile), |
'%doctype' => import_html_guess_file_class($checkfile), |
| 142 |
), |
), |
| 143 |
WATCHDOG_NOTICE |
WATCHDOG_NOTICE |
| 144 |
); |
); |
| 145 |
|
|
| 146 |
import_html_get_raw_file_local($source_path, $dest_path, $is_remote); |
import_html_get_raw_file_local($source_path, $dest_path, $is_remote); |
| 158 |
if ($profile['handle_duplicates'] == IMPORT_HTML_SKIP) { |
if ($profile['handle_duplicates'] == IMPORT_HTML_SKIP) { |
| 159 |
|
|
| 160 |
import_html_debug( |
import_html_debug( |
| 161 |
"We already have '%new_path' in the system as '%normal_path'. |
"We already have '%new_path' in the system as '%normal_path'. |
| 162 |
According to import_html settings, this import is being skipped", |
According to import_html settings, this import is being skipped", |
| 163 |
array( |
array( |
| 164 |
'%new_path' => $new_path, |
'%new_path' => $new_path, |
| 165 |
'%normal_path' => $normal_path, |
'%normal_path' => $normal_path, |
| 166 |
), |
), |
| 167 |
WATCHDOG_INFO |
WATCHDOG_INFO |
| 168 |
); |
); |
| 169 |
|
|
| 170 |
return; |
return; |
| 171 |
} |
} |
| 172 |
import_html_debug( |
import_html_debug( |
| 173 |
"We already have '%new_path' in the system as '%normal_path'. |
"We already have '%new_path' in the system as '%normal_path'. |
| 174 |
Overwriting/updating it with the new import", |
Overwriting/updating it with the new import", |
| 175 |
array( |
array( |
| 176 |
'%new_path' => $new_path, |
'%new_path' => $new_path, |
| 177 |
'%normal_path' => $normal_path, |
'%normal_path' => $normal_path, |
| 178 |
), |
), |
| 179 |
WATCHDOG_INFO |
WATCHDOG_INFO |
| 180 |
); |
); |
| 181 |
} |
} |
| 182 |
|
|
| 198 |
import_html_debug( |
import_html_debug( |
| 199 |
"Failed to fetch a copy of %source_path into %dest_path", |
"Failed to fetch a copy of %source_path into %dest_path", |
| 200 |
array('%dest_path' => $dest_path, '%source_path' => $source_path), |
array('%dest_path' => $dest_path, '%source_path' => $source_path), |
| 201 |
WATCHDOG_ERROR |
WATCHDOG_ERROR |
| 202 |
); |
); |
| 203 |
|
|
| 204 |
return FALSE; |
return FALSE; |
| 205 |
} |
} |
| 206 |
|
|
| 207 |
import_html_debug( |
import_html_debug( |
| 208 |
"Fetched a %persistant local copy to %dest_path", |
"Fetched a %persistant local copy to %dest_path", |
| 209 |
array( |
array( |
| 210 |
'%dest_path' => $dest_path, |
'%dest_path' => $dest_path, |
| 211 |
'%persistant' => ($profile['keep_temp_files'] ? 'persistant' : 'temporary') |
'%persistant' => ($profile['keep_temp_files'] ? 'persistant' : 'temporary') |
| 212 |
) |
) |
| 213 |
); |
); |
| 215 |
} |
} |
| 216 |
else { |
else { |
| 217 |
import_html_debug( |
import_html_debug( |
| 218 |
"Local copy exists at %dest_path", |
"Local copy exists at %dest_path", |
| 219 |
array('%dest_path' => $dest_path) |
array('%dest_path' => $dest_path) |
| 220 |
); |
); |
| 221 |
} |
} |
| 222 |
|
|
| 223 |
if ($is_remote) { |
if ($is_remote) { |
| 224 |
// Importing a remote file - as for demo |
// Importing a remote file - as for demo |
| 225 |
// relinking will happen to point back at where it came from, not here |
// relinking will happen to point back at where it came from, not here |
| 226 |
// TODO need yet another parameter to indicate this, the path to neighbours |
// TODO need yet another parameter to indicate this, the path to neighbours |
| 227 |
# $rel_path = $source_path; |
# $rel_path = $source_path; |
| 228 |
// that worked, but created some odd paths in places when using a prefix. |
// that worked, but created some odd paths in places when using a prefix. |
| 229 |
// Resources and cross-links were found, but local alias was wrong |
// Resources and cross-links were found, but local alias was wrong |
| 230 |
|
|
| 231 |
import_html_debug("Relinking this source will point back to the original URL context!"); |
import_html_debug("Relinking this source will point back to the original URL context!"); |
| 232 |
} |
} |
| 233 |
// We have a local copy now. |
// We have a local copy now. |
| 244 |
|
|
| 245 |
if (!$nodes) { |
if (!$nodes) { |
| 246 |
import_html_debug( |
import_html_debug( |
| 247 |
"Failed to process a node out of file '%rel_path'", |
"Failed to process a node out of file '%rel_path'", |
| 248 |
array('%rel_path' => $rel_path), |
array('%rel_path' => $rel_path), |
| 249 |
WATCHDOG_ERROR |
WATCHDOG_ERROR |
| 250 |
); |
); |
| 251 |
return FALSE; |
return FALSE; |
| 252 |
} |
} |
| 253 |
|
|
| 254 |
// We can immediately discard the source file - |
// We can immediately discard the source file - |
| 255 |
// it should have been a temp copy made by import_html_get_raw_file_local() above |
// it should have been a temp copy made by import_html_get_raw_file_local() above |
| 256 |
if ( file_exists($dest_path) && ! $profile['keep_temp_files']) { |
if ( file_exists($dest_path) && ! $profile['keep_temp_files']) { |
| 257 |
unlink($dest_path); |
unlink($dest_path); |
| 258 |
} |
} |
| 259 |
|
|
| 260 |
// Almost trivial loop (probably over 1 item) |
// Almost trivial loop (probably over 1 item) |
| 261 |
foreach ($nodes as $node) { |
foreach ($nodes as $node) { |
| 262 |
// The node data object has been initialized |
// The node data object has been initialized |
| 267 |
array('%title' => $node->title), |
array('%title' => $node->title), |
| 268 |
WATCHDOG_INFO |
WATCHDOG_INFO |
| 269 |
); |
); |
| 270 |
|
|
| 271 |
// If it's overwriting an existing path, merge values |
// If it's overwriting an existing path, merge values |
| 272 |
$node = import_html_merge_over_existing_node($node, $profile); |
$node = import_html_merge_over_existing_node($node, $profile); |
| 273 |
|
|
| 277 |
|
|
| 278 |
// Finished prep, now save |
// Finished prep, now save |
| 279 |
|
|
| 280 |
// 'prepare' occasionally ensures that some required fields are filled in |
// 'prepare' occasionally ensures that some required fields are filled in |
| 281 |
// depending on enabled modules. Maybe. |
// depending on enabled modules. Maybe. |
| 282 |
// node_invoke_nodeapi($node, 'prepare'); |
// node_invoke_nodeapi($node, 'prepare'); |
| 283 |
|
|
| 284 |
// I really should VALIDATE now! |
// I really should VALIDATE now! |
| 285 |
// but what to do with errors? |
// but what to do with errors? |
| 286 |
// path_nodeapi complains if I try to validate before I know my nid. Is that correct? |
// path_nodeapi complains if I try to validate before I know my nid. Is that correct? |
| 287 |
// node_invoke_nodeapi($node, 'validate'); |
// node_invoke_nodeapi($node, 'validate'); |
| 288 |
|
|
| 289 |
// Submit doesn't actually save, it just fills in extra fields |
// Submit doesn't actually save, it just fills in extra fields |
| 290 |
$node = node_submit($node); |
$node = node_submit($node); |
| 291 |
|
|
| 292 |
|
|
|
|
|
| 293 |
if ($context['form_id'] == 'import_html_demo_form') { |
if ($context['form_id'] == 'import_html_demo_form') { |
| 294 |
// DO NOT actually save stuff to the database |
// DO NOT actually save stuff to the database |
| 295 |
$file['node'] = $node; |
$file['node'] = $node; |
| 315 |
if (! empty($node->nid)) { |
if (! empty($node->nid)) { |
| 316 |
|
|
| 317 |
import_html_debug( |
import_html_debug( |
| 318 |
"!node_link Exists, updating it with content from %source_path.", |
"!node_link Exists, updating it with content from %source_path.", |
| 319 |
array( |
array( |
| 320 |
'!node_link' => l('node '. $node->nid, 'node/'. $node->nid), |
'!node_link' => l('node '. $node->nid, 'node/'. $node->nid), |
| 321 |
'%source_path' => $source_path |
'%source_path' => $source_path |
| 322 |
), |
), |
| 323 |
WATCHDOG_INFO |
WATCHDOG_INFO |
| 329 |
else { |
else { |
| 330 |
|
|
| 331 |
import_html_debug( |
import_html_debug( |
| 332 |
"Inserting New Node !node_link with content from %source_path", |
"Inserting New Node !node_link with content from %source_path", |
| 333 |
array('%source_path' => $source_path, '!node_link' => l($node->path, $node->path)), |
array('%source_path' => $source_path, '!node_link' => l($node->path, $node->path)), |
| 334 |
WATCHDOG_INFO |
WATCHDOG_INFO |
| 335 |
); |
); |
| 336 |
#dpm($node); |
#dpm($node); |
| 337 |
node_save($node); |
node_save($node); |
| 338 |
|
|
| 339 |
// Had to wait until I had an ID to do this |
// Had to wait until I had an ID to do this |
| 340 |
// These callbacks add the aliases and menus |
// These callbacks add the aliases and menus |
| 341 |
module_invoke_all('import_html_after_save', $profile, $node); |
module_invoke_all('import_html_after_save', $profile, $node); |
| 342 |
// note, navigation items only gets set up on first import. |
// note, navigation items only gets set up on first import. |
| 343 |
// After that you are on your own |
// After that you are on your own |
| 344 |
} |
} |
| 345 |
} // Finished updating database |
} // Finished updating database |
| 346 |
|
|
| 347 |
|
|
| 348 |
#// Keep a copy for auditing (maybe not if memory gets heavy) |
#// Keep a copy for auditing (maybe not if memory gets heavy) |
| 349 |
$mini_node = (object) array(); |
$mini_node = (object) array(); |
| 351 |
$mini_node->$att = $node->$att; |
$mini_node->$att = $node->$att; |
| 352 |
} |
} |
| 353 |
$file['node'] = $mini_node; |
$file['node'] = $mini_node; |
| 354 |
|
|
| 355 |
import_html_debug( |
import_html_debug( |
| 356 |
"<strong>Imported Node</strong> !node_link with content from %source_path . [mem: %memory]", |
"<strong>Imported Node</strong> !node_link with content from %source_path . [mem: %memory]", |
| 357 |
array( |
array( |
| 358 |
'%source_path' => $source_path, |
'%source_path' => $source_path, |
| 359 |
'!node_link' => l($node->title, 'node/' . $node->nid), |
'!node_link' => l($node->title, 'node/' . $node->nid), |
| 360 |
# Node path is usually right, but we will actually let the system figure that out - path may be off! |
# Node path is usually right, but we will actually let the system figure that out - path may be off! |
| 361 |
#'!node_link' => l($node->title, $node->path), |
#'!node_link' => l($node->title, $node->path), |
| 368 |
$files[] = $file; |
$files[] = $file; |
| 369 |
} // Looped over all files |
} // Looped over all files |
| 370 |
|
|
| 371 |
return $files; |
return $files; |
| 372 |
} |
} |
| 373 |
|
|
| 374 |
/** |
/** |
| 375 |
* Big brother to import_html_import_file |
* Big brother to import_html_import_file |
| 376 |
* |
* |
| 377 |
* Recursively imports ALL FILES in a given folder and returns a result array |
* Recursively imports ALL FILES in a given folder and returns a result array |
| 378 |
* |
* |
| 379 |
* This does this immediately in normal flow, and really should be done in |
* This does this immediately in normal flow, and really should be done in |
| 380 |
* batch. Try not to do this directly a lot. |
* batch. Try not to do this directly a lot. |
| 381 |
*/ |
*/ |
| 382 |
function import_html_import_directory($rel_path, $context) { |
function import_html_import_directory($rel_path, $context) { |
| 383 |
// Read the profile id and use that as a context for all settings |
// Read the profile id and use that as a context for all settings |
| 384 |
$profile = import_html_profile($context['profile_id']); |
$profile = import_html_profile($context['profile_id']); |
| 385 |
$source_siteroot = $context['source_siteroot']; |
$source_siteroot = $context['source_siteroot']; |
| 386 |
import_html_debug( |
import_html_debug( |
| 387 |
"<strong>Importing Directory</strong> '%rel_path'", |
"<strong>Importing Directory</strong> '%rel_path'", |
| 388 |
array('%rel_path' => $rel_path), |
array('%rel_path' => $rel_path), |
| 389 |
WATCHDOG_INFO |
WATCHDOG_INFO |
| 390 |
); |
); |
| 411 |
mkdirs(dirname($dest_path), FILE_CREATE_DIRECTORY); |
mkdirs(dirname($dest_path), FILE_CREATE_DIRECTORY); |
| 412 |
if (! mkdirs(dirname($dest_path)) ) { |
if (! mkdirs(dirname($dest_path)) ) { |
| 413 |
trigger_error("Failed to create directory for $dest_path Might be permissions.", E_USER_ERROR); |
trigger_error("Failed to create directory for $dest_path Might be permissions.", E_USER_ERROR); |
| 414 |
} |
} |
| 415 |
$debug_info = array( |
$debug_info = array( |
| 416 |
'%source' => $source_path, |
'%source' => $source_path, |
| 417 |
'%dest' => $dest_path, |
'%dest' => $dest_path, |
| 418 |
); |
); |
| 419 |
|
|
| 420 |
import_html_debug( |
import_html_debug( |
| 421 |
"Fetching content from %location '<a href='!realpath'>%source_path</a>' now. Saving temp file locally as %dest_path", |
"Fetching content from %location '<a href='!realpath'>%source_path</a>' now. Saving temp file locally as %dest_path", |
| 422 |
array( |
array( |
| 423 |
'%source_path' => $source_path, |
'%source_path' => $source_path, |
| 424 |
'%dest_path' => $dest_path, |
'%dest_path' => $dest_path, |
| 430 |
|
|
| 431 |
$orig_path = $source_path; |
$orig_path = $source_path; |
| 432 |
|
|
| 433 |
if ($host) { |
if ($host) { |
| 434 |
// It's remote. Trust PHP5 and allow_url_fopen is available |
// It's remote. Trust PHP5 and allow_url_fopen is available |
| 435 |
if (!copy($source_path, $dest_path)) { |
if (!copy($source_path, $dest_path)) { |
| 436 |
import_html_debug( |
import_html_debug( |
| 437 |
"Remote file copy from %source to %dest failed", |
"Remote file copy from %source to %dest failed", |
| 438 |
$debug_info, |
$debug_info, |
| 439 |
WATCHDOG_ERROR |
WATCHDOG_ERROR |
| 440 |
); |
); |
| 441 |
return FALSE; |
return FALSE; |
| 442 |
} |
} |
| 443 |
} |
} |
| 444 |
else { |
else { |
| 445 |
// local copy |
// local copy |
| 446 |
if (realpath($source_path) == realpath($dest_path)) { |
if (realpath($source_path) == realpath($dest_path)) { |
| 447 |
import_html_debug( |
import_html_debug( |
| 456 |
'Local file copy failed (%source_path to %dest_path). |
'Local file copy failed (%source_path to %dest_path). |
| 457 |
Source %orig_path is <pre>!source_stat</pre> |
Source %orig_path is <pre>!source_stat</pre> |
| 458 |
Dest folder %dest_path is <pre>!dest_stat</pre> |
Dest folder %dest_path is <pre>!dest_stat</pre> |
| 459 |
', |
', |
| 460 |
array( |
array( |
| 461 |
'%source_path' => $source_path, |
'%source_path' => $source_path, |
| 462 |
'%dest_path' => $dest_path, |
'%dest_path' => $dest_path, |
| 463 |
'!source_stat' => print_r(stat($source_path), 1), |
'!source_stat' => print_r(stat($source_path), 1), |
| 464 |
'!dest_stat' => print_r(stat($dest_path), 1), |
'!dest_stat' => print_r(stat($dest_path), 1), |
| 465 |
), |
), |
| 466 |
WATCHDOG_ERROR |
WATCHDOG_ERROR |
| 467 |
); |
); |
| 468 |
|
|
| 469 |
return FALSE; |
return FALSE; |
| 470 |
} |
} |
| 471 |
} |
} |
| 472 |
import_html_debug( |
import_html_debug( |
| 473 |
"Copied import file from %source_path to %dest_path", |
"Copied import file from %source_path to %dest_path", |
| 474 |
$debug_info |
$debug_info |
| 475 |
); |
); |
| 476 |
return TRUE; |
return TRUE; |
| 488 |
* This processing is still in the 'validate' phase, so should |
* This processing is still in the 'validate' phase, so should |
| 489 |
* not cause anything to happen, just configure the node object |
* not cause anything to happen, just configure the node object |
| 490 |
* |
* |
| 491 |
* @param $path/$node |
* @param $path/$node |
| 492 |
* the file (or object) to read the data from. If it's a string, it's taken |
* the file (or object) to read the data from. If it's a string, it's taken |
| 493 |
* to be the filename, if an object, it's the node. A node should contain a |
* to be the filename, if an object, it's the node. A node should contain a |
| 494 |
* ->body (or ->raw_html) and a ->path at least. |
* ->body (or ->raw_html) and a ->path at least. |
| 495 |
* @param $rel_path |
* @param $rel_path |
| 496 |
* Where this html page was found, relative to its own server root. This is |
* Where this html page was found, relative to its own server root. This is |
| 497 |
* used to rewrite its urls. If the path is a directory, it should end with a |
* used to rewrite its urls. If the path is a directory, it should end with a |
| 498 |
* slash. ( /a/path/ == /a/path/index.html != /a/path ) |
* slash. ( /a/path/ == /a/path/index.html != /a/path ) |
| 499 |
* @param $profile |
* @param $profile |
| 500 |
* The settings for this import process. |
* The settings for this import process. |
| 501 |
* |
* |
| 502 |
* @return array containing the new node object as the first item. Some |
* @return array containing the new node object as the first item. Some |
| 503 |
* processes may return multiple nodes |
* processes may return multiple nodes |
| 504 |
*/ |
*/ |
| 509 |
} |
} |
| 510 |
import_html_debug_code("The import profile settings being used to import_html_process_html_page($rel_path)", $profile); |
import_html_debug_code("The import profile settings being used to import_html_process_html_page($rel_path)", $profile); |
| 511 |
|
|
| 512 |
if (is_string($path)) { |
if (is_string($path)) { |
| 513 |
// read from file |
// read from file |
| 514 |
|
|
| 515 |
import_html_debug( |
import_html_debug( |
| 516 |
"Processing file as HTML page. |
"Processing file as HTML page. |
| 517 |
Full file path: %path , will be imported as a relative path |
Full file path: %path , will be imported as a relative path |
| 518 |
under the current section. |
under the current section. |
| 519 |
Relative-path is: %rel_path", |
Relative-path is: %rel_path", |
| 520 |
array('%path' => $path, '%rel_path' => $rel_path), |
array('%path' => $path, '%rel_path' => $rel_path), |
| 521 |
WATCHDOG_INFO |
WATCHDOG_INFO |
| 522 |
); |
); |
| 523 |
|
|
| 524 |
if (! file_exists($path)) { |
if (! file_exists($path)) { |
| 525 |
trigger_error("Path '$path' was not found. This should have been a local copy of the file being imported, but the paths may be wrong somehow. Abject failure processing $rel_path"); |
trigger_error("Path '$path' was not found. This should have been a local copy of the file being imported, but the paths may be wrong somehow. Abject failure processing $rel_path"); |
| 526 |
} |
} |
| 527 |
|
|
| 528 |
/* |
/* |
| 530 |
* Either I want everything to be html, (always tidy) |
* Either I want everything to be html, (always tidy) |
| 531 |
* or I allow for exsl:document blocks (which can't be tidied) |
* or I allow for exsl:document blocks (which can't be tidied) |
| 532 |
* Option for now is try to parse, and only tidy if that fails. |
* Option for now is try to parse, and only tidy if that fails. |
| 533 |
* |
* |
| 534 |
*/ |
*/ |
| 535 |
// temporarily ignore parser errors (catch?) |
// temporarily ignore parser errors (catch?) |
| 536 |
set_error_handler('stfu'); |
set_error_handler('stfu'); |
| 537 |
$xmldoc = parse_in_xml_file($path, $profile['force_tidy']); |
$xmldoc = parse_in_xml_file($path, $profile['force_tidy']); |
| 538 |
restore_error_handler(); |
restore_error_handler(); |
| 539 |
|
|
| 540 |
if (! $xmldoc && $profile['force_tidy'] ) { |
if (! $xmldoc && $profile['force_tidy'] ) { |
| 541 |
import_html_debug( |
import_html_debug( |
| 542 |
"%path was not tidy enough - running tidy over it now so I can parse it.", |
"%path was not tidy enough - running tidy over it now so I can parse it.", |
| 543 |
array('%path' => $path, '%rel_path' => $rel_path) |
array('%path' => $path, '%rel_path' => $rel_path) |
| 544 |
); |
); |
| 545 |
// If a raw XML parse failed, |
// If a raw XML parse failed, |
| 550 |
#import_html_debug_code("Finished reading from file:", xml_tostring($xmldoc)); |
#import_html_debug_code("Finished reading from file:", xml_tostring($xmldoc)); |
| 551 |
$source_node = new stdClass(); |
$source_node = new stdClass(); |
| 552 |
} |
} |
| 553 |
else { |
else { |
| 554 |
// We may have passed in a source-node object where the path was expected instead. |
// We may have passed in a source-node object where the path was expected instead. |
| 555 |
// A bit of a sneak. The given node has the source HTML in $node->raw_html |
// A bit of a sneak. The given node has the source HTML in $node->raw_html |
| 556 |
if (is_object($path)) { |
if (is_object($path)) { |
| 566 |
} |
} |
| 567 |
} |
} |
| 568 |
|
|
| 569 |
if (!$xmldoc) { |
if (!$xmldoc) { |
| 570 |
// parsing failed |
// parsing failed |
| 571 |
import_html_debug("Import_HTML failed to initialize or parse XMLdoc input", array(), WATCHDOG_ERROR); |
import_html_debug("Import_HTML failed to initialize or parse XMLdoc input", array(), WATCHDOG_ERROR); |
| 572 |
return FALSE; |
return FALSE; |
| 591 |
if (import_html_variable('debug_level')) { |
if (import_html_variable('debug_level')) { |
| 592 |
$source_node->file_data['after_rewriting'] = xml_tostring($xmldoc); |
$source_node->file_data['after_rewriting'] = xml_tostring($xmldoc); |
| 593 |
} |
} |
| 594 |
|
|
| 595 |
// Import content as node. |
// Import content as node. |
| 596 |
// Translate the source text to the known tidy simple, tagged HTML structure now |
// Translate the source text to the known tidy simple, tagged HTML structure now |
| 597 |
$parameters = array( |
$parameters = array( |
| 607 |
$xml_top = $xmldoc->firstChild; |
$xml_top = $xmldoc->firstChild; |
| 608 |
$xsl_top = $xsldoc->firstChild; |
$xsl_top = $xsldoc->firstChild; |
| 609 |
import_html_debug(" |
import_html_debug(" |
| 610 |
Using XSL translation template to extract semantic content. |
Using XSL translation template to extract semantic content. |
| 611 |
Will search for body content labelled '". $parameters['contentid'] |
Will search for body content labelled '". $parameters['contentid'] |
| 612 |
."' in the source. |
."' in the source. |
| 613 |
Active XML Namespaces are |
Active XML Namespaces are |
| 614 |
{$xml_top->nodeName} : {$xml_top->namespaceURI} - |
{$xml_top->nodeName} : {$xml_top->namespaceURI} - |
| 615 |
{$xsl_top->nodeName} : {$xsl_top->namespaceURI} \n" |
{$xsl_top->nodeName} : {$xsl_top->namespaceURI} \n" |
| 616 |
, array()); |
, array()); |
| 617 |
$importxml = xmldoc_plus_xsldoc($xmldoc, $xsldoc, $parameters); |
$importxml = xmldoc_plus_xsldoc($xmldoc, $xsldoc, $parameters); |
| 658 |
$node->old_path = _import_html_calc_path($rel_path, TRUE); |
$node->old_path = _import_html_calc_path($rel_path, TRUE); |
| 659 |
} |
} |
| 660 |
|
|
| 661 |
// May need extra care when creating multiples. |
// May need extra care when creating multiples. |
| 662 |
// Invent new paths for the new documents if the exsl:document didn't define them |
// Invent new paths for the new documents if the exsl:document didn't define them |
| 663 |
if (isset($nodes[$node->path])) { |
if (isset($nodes[$node->path])) { |
| 664 |
// already using this path, extend a new one |
// already using this path, extend a new one |
| 665 |
$node->path .= '/'. import_html_check_name(!empty($node->label)?$node->label:$node->title); |
$node->path .= '/'. import_html_check_name(!empty($node->label)?$node->label:$node->title); |
| 666 |
} |
} |
| 667 |
|
|
| 671 |
|
|
| 672 |
|
|
| 673 |
$nodes[$node->path] = $node; |
$nodes[$node->path] = $node; |
| 674 |
|
|
| 675 |
import_html_debug("Path to save this page as is %path", array('%path' => $path)); |
import_html_debug("Path to save this page as is %path", array('%path' => $path)); |
| 676 |
} |
} |
| 677 |
|
|
| 678 |
return $nodes; |
return $nodes; |
| 679 |
} |
} |
| 685 |
* with all useful parameters set. |
* with all useful parameters set. |
| 686 |
* A shell node object may be passed in with some values already set. The data |
* A shell node object may be passed in with some values already set. The data |
| 687 |
* extracted from the XHTML structure will be layered onto that. |
* extracted from the XHTML structure will be layered onto that. |
| 688 |
* |
* |
| 689 |
* Here is where we map HTML info to node data, like H1 -> $node->title |
* Here is where we map HTML info to node data, like H1 -> $node->title |
| 690 |
* TODO tidy this up with a lookup table or something |
* TODO tidy this up with a lookup table or something |
| 691 |
* |
* |
| 692 |
* node may have defined its own $node->type even |
* node may have defined its own $node->type even |
| 693 |
* |
* |
| 694 |
* Called by |
* Called by |
| 695 |
* @see import_html_process_html_page() |
* @see import_html_process_html_page() |
| 696 |
* |
* |
| 697 |
* THIS IS THE ENGINE OF IMPORT_HTML |
* THIS IS THE ENGINE OF IMPORT_HTML |
| 698 |
* |
* |
| 699 |
* @param $datadoc |
* @param $datadoc |
| 700 |
* An XML document containing the whole source data |
* An XML document containing the whole source data |
| 701 |
* @param $node |
* @param $node |
| 704 |
* @param $profile |
* @param $profile |
| 705 |
* A set of settings and preferences for the import_html process currently |
* A set of settings and preferences for the import_html process currently |
| 706 |
* underway. May include some context information like paths. |
* underway. May include some context information like paths. |
| 707 |
* |
* |
| 708 |
*/ |
*/ |
| 709 |
function import_html_xhtml_to_node($datadoc, $node, $profile) { |
function import_html_xhtml_to_node($datadoc, $node, $profile) { |
| 710 |
import_html_debug("Importing from XML object to node object"); |
import_html_debug("Importing from XML object to node object"); |
| 716 |
if (import_html_variable('debug_level')) { |
if (import_html_variable('debug_level')) { |
| 717 |
$node->file_data['raw_xhtml'] = xml_toString($datadoc); |
$node->file_data['raw_xhtml'] = xml_toString($datadoc); |
| 718 |
} |
} |
| 719 |
|
|
| 720 |
|
|
| 721 |
// Now read the input into node structure |
// Now read the input into node structure |
| 722 |
// |
// |
| 723 |
// Absorb the most generic bits first. Later processes may overwrite them more accurately. |
// Absorb the most generic bits first. Later processes may overwrite them more accurately. |
| 724 |
|
|
| 725 |
// This initial import is a totally generic catch-all. |
// This initial import is a totally generic catch-all. |
| 726 |
import_html_absorb_all_tagged_elements($node, $datadoc); |
import_html_absorb_all_tagged_elements($node, $datadoc); |
| 727 |
|
|
| 728 |
// |
// |
| 738 |
// Loop over a bunch of hook-like per-module extensions |
// Loop over a bunch of hook-like per-module extensions |
| 739 |
// MENU, PATH, TAXONOMY, CCK all add values in their own callbacks in import_html_modules.inc |
// MENU, PATH, TAXONOMY, CCK all add values in their own callbacks in import_html_modules.inc |
| 740 |
// Also the core node elements - body, title, teaser get set in a 'core' callback |
// Also the core node elements - body, title, teaser get set in a 'core' callback |
| 741 |
|
|
| 742 |
import_html_include_add_on_module_handlers(); |
import_html_include_add_on_module_handlers(); |
| 743 |
module_invoke_all('import_html', $profile, $node, $datadoc); |
module_invoke_all('import_html', $profile, $node, $datadoc); |
| 744 |
|
|
| 745 |
// 'content' is now a reserved word in Drupal5 |
// 'content' is now a reserved word in Drupal5 |
| 746 |
// If I have a string there, the body cannot be rendered right later |
// If I have a string there, the body cannot be rendered right later |
| 747 |
unset($node->content); |
unset($node->content); |
| 750 |
$node->format = import_html_get_preferred_filter(); |
$node->format = import_html_get_preferred_filter(); |
| 751 |
|
|
| 752 |
import_html_debug_code( |
import_html_debug_code( |
| 753 |
"After absorbing absolutely everything I could find, |
"After absorbing absolutely everything I could find, |
| 754 |
the node object now contains the following blocks and bits:", |
the node object now contains the following blocks and bits:", |
| 755 |
$node |
$node |
| 756 |
); |
); |
| 760 |
|
|
| 761 |
/** |
/** |
| 762 |
* Import ALL tagged classes and IDs as node attributes. |
* Import ALL tagged classes and IDs as node attributes. |
| 763 |
* |
* |
| 764 |
* If the input has ANY id or classes at all, grab that info and apply it to |
* If the input has ANY id or classes at all, grab that info and apply it to |
| 765 |
* this object. Assume anything important enough to have a label is important |
* this object. Assume anything important enough to have a label is important |
| 766 |
* enough to remember. |
* enough to remember. |
| 767 |
* |
* |
| 768 |
* This will probably produce a very cloggy node, filled with trash, Possibly |
* This will probably produce a very cloggy node, filled with trash, Possibly |
| 769 |
* even some arrays where there shouldn't be. But any unrecognised property |
* even some arrays where there shouldn't be. But any unrecognised property |
| 770 |
* names will be discarded on save, leaving only the serializable values. This |
* names will be discarded on save, leaving only the serializable values. This |
| 774 |
function import_html_absorb_all_tagged_elements(&$node, $datadoc) { |
function import_html_absorb_all_tagged_elements(&$node, $datadoc) { |
| 775 |
|
|
| 776 |
foreach (array('id', 'class') as $attribute_label) { |
foreach (array('id', 'class') as $attribute_label) { |
| 777 |
|
|
| 778 |
import_html_debug( |
import_html_debug( |
| 779 |
"Absorbing all elements with an %attribute_label |
"Absorbing all elements with an %attribute_label |
| 780 |
as incidental data blobs (possibly html) into node structure", |
as incidental data blobs (possibly html) into node structure", |
| 781 |
array('%attribute_label' => $attribute_label) |
array('%attribute_label' => $attribute_label) |
| 782 |
); |
); |
| 783 |
$found_elements = xml_query($datadoc, './/*[@'. $attribute_label .']'); |
$found_elements = xml_query($datadoc, './/*[@'. $attribute_label .']'); |
| 784 |
|
|
| 785 |
// I now have a collection of tagged nodes. |
// I now have a collection of tagged nodes. |
| 786 |
foreach ($found_elements as $found_element) { |
foreach ($found_elements as $found_element) { |
| 787 |
|
|
| 788 |
$attribute_value = xml_getattribute($found_element, $attribute_label); |
$attribute_value = xml_getattribute($found_element, $attribute_label); |
| 789 |
// if it was a class, it may be multiple! |
// if it was a class, it may be multiple! |
| 790 |
// Usually just one however... |
// Usually just one however... |
| 791 |
$keys = explode(' ', $attribute_value); |
$keys = explode(' ', $attribute_value); |
| 792 |
// debug("Found an node with $attribute_label of ".print_r($keys, 1) , 3); |
// debug("Found an node with $attribute_label of ".print_r($keys, 1) , 3); |
| 793 |
|
|
| 794 |
foreach ($keys as $key) { |
foreach ($keys as $key) { |
| 795 |
// Found 'something' labelled 'something' |
// Found 'something' labelled 'something' |
| 796 |
if (! trim($key)) continue; |
if (! trim($key)) continue; |
| 797 |
|
|
| 798 |
// Allow HTML though. Sometimes this will not be right... |
// Allow HTML though. Sometimes this will not be right... |
| 799 |
// TODO, figure it out? |
// TODO, figure it out? |
| 800 |
$value = xml_tostring($found_element, TRUE); |
$value = xml_tostring($found_element, TRUE); |
| 802 |
|
|
| 803 |
// The value just gets absorbed |
// The value just gets absorbed |
| 804 |
import_html_debug( |
import_html_debug( |
| 805 |
"Found an unexpected tagged value - %key , |
"Found an unexpected tagged value - %key , |
| 806 |
Absorbing it into the node as a default text/html value", |
Absorbing it into the node as a default text/html value", |
| 807 |
array('%key' => $key) |
array('%key' => $key) |
| 808 |
); |
); |
| 809 |
|
|
| 810 |
// Set it onto the node, |
// Set it onto the node, |
| 811 |
// If it's a class, carefully combine to preserve pre-existing arrays |
// If it's a class, carefully combine to preserve pre-existing arrays |
| 812 |
if ( $attribute_label == 'class') { |
if ( $attribute_label == 'class') { |
| 813 |
import_html_absorb_properties($node, $key, $value); |
import_html_absorb_properties($node, $key, $value); |
| 819 |
} // each multiple key |
} // each multiple key |
| 820 |
} // each found element |
} // each found element |
| 821 |
} // each attribute type |
} // each attribute type |
| 822 |
} |
} |
| 823 |
|
|
| 824 |
/** |
/** |
| 825 |
* Scan a given dom object for metas of a certain persuasion, and add all found |
* Scan a given dom object for metas of a certain persuasion, and add all found |
| 826 |
* key-values to the $node. |
* key-values to the $node. |
| 827 |
* |
* |
| 828 |
* Supports different metas, like |
* Supports different metas, like |
| 829 |
* <meta name="key" content="value" /> |
* <meta name="key" content="value" /> |
| 830 |
* or |
* or |
| 831 |
* <rel type="top" href="url" /> |
* <rel type="top" href="url" /> |
| 832 |
* |
* |
| 833 |
* import_html_absorb_metas($node, $htmlnode, 'meta', 'name', 'content'); |
* import_html_absorb_metas($node, $htmlnode, 'meta', 'name', 'content'); |
| 834 |
* import_html_absorb_metas($node, $htmlnode, 'rel', 'type', 'href'); |
* import_html_absorb_metas($node, $htmlnode, 'rel', 'type', 'href'); |
| 835 |
* |
* |
| 836 |
* ... |
* ... |
| 837 |
* .. would result in : |
* .. would result in : |
| 838 |
* |
* |
| 839 |
* $node->key='value'; |
* $node->key='value'; |
| 840 |
* $node->top='url'; |
* $node->top='url'; |
| 841 |
* |
* |
| 842 |
* |
* |
| 843 |
*/ |
*/ |
| 844 |
function import_html_absorb_metas(&$node, $xml_element, $tagname, $keyname, $valname) { |
function import_html_absorb_metas(&$node, $xml_element, $tagname, $keyname, $valname) { |
| 845 |
|
|
| 846 |
import_html_debug( |
import_html_debug( |
| 847 |
"Absorbing the '%valname' of '%tagname's with a '%keyname' |
"Absorbing the '%valname' of '%tagname's with a '%keyname' |
| 848 |
from source doc into node structure", |
from source doc into node structure", |
| 849 |
array( |
array( |
| 850 |
'%valname' => $valname, |
'%valname' => $valname, |
| 866 |
} |
} |
| 867 |
else{ |
else{ |
| 868 |
import_html_debug( |
import_html_debug( |
| 869 |
"When absorbing '%valname' from '%tagname's with a '%keyname' from source doc, |
"When absorbing '%valname' from '%tagname's with a '%keyname' from source doc, |
| 870 |
(%key='%value') had a null value. Not a great problem, just letting you know.", |
(%key='%value') had a null value. Not a great problem, just letting you know.", |
| 871 |
array( |
array( |
| 872 |
'%valname' => $valname, |
'%valname' => $valname, |
| 898 |
debug("Odd, when absorbing properties, '$key' had a null value. This is probably not an error.", 2); |
debug("Odd, when absorbing properties, '$key' had a null value. This is probably not an error.", 2); |
| 899 |
return; |
return; |
| 900 |
} |
} |
| 901 |
|
|
| 902 |
// Auto-expand into arrays - most metas can legally have duplicates |
// Auto-expand into arrays - most metas can legally have duplicates |
| 903 |
if ( ! isset($node->$key) ) { |
if ( ! isset($node->$key) ) { |
| 904 |
$node->$key = $value; |
$node->$key = $value; |
| 905 |
} |
} |
| 906 |
else if ( is_array($node->$key) ) { |
else if ( is_array($node->$key) ) { |
| 907 |
$a = $node->$key; $a[] = $value; $node->$key = $a; |
$a = $node->$key; $a[] = $value; $node->$key = $a; |
| 908 |
} |
} |
| 909 |
else { $node->$key = array($node->$key, $value); } |
else { $node->$key = array($node->$key, $value); } |
| 910 |
} |
} |
| 931 |
function import_html_guess_document_title($node) { |
function import_html_guess_document_title($node) { |
| 932 |
if (empty($node->title) ) { |
if (empty($node->title) ) { |
| 933 |
import_html_debug( |
import_html_debug( |
| 934 |
"Failed to extract a useful title for this node, falling back to a default value.", |
"Failed to extract a useful title for this node, falling back to a default value.", |
| 935 |
array(), |
array(), |
| 936 |
WATCHDOG_NOTICE |
WATCHDOG_NOTICE |
| 937 |
); |
); |
| 938 |
switch (import_html_variable('handle_no_title')) { |
switch (import_html_variable('handle_no_title')) { |
| 960 |
if (!$label) { |
if (!$label) { |
| 961 |
// it had a trailing slash |
// it had a trailing slash |
| 962 |
$label = array_pop($path_bits); |
$label = array_pop($path_bits); |
| 963 |
} |
} |
| 964 |
$label = preg_replace('/\?.*$/', '?', $label); // messiness from mirrored URLs with args in |
$label = preg_replace('/\?.*$/', '?', $label); // messiness from mirrored URLs with args in |
| 965 |
// TODO maybe adjust this title-munging algorithm to make better guesses |
// TODO maybe adjust this title-munging algorithm to make better guesses |
| 966 |
$label = str_replace('_', ' ', $label); |
$label = str_replace('_', ' ', $label); |
| 967 |
$label = (strstr($label, '.')) ? substr($label, 0, strrpos($label, ".")) : $label; |
$label = (strstr($label, '.')) ? substr($label, 0, strrpos($label, ".")) : $label; |
| 968 |
} |
} |
| 969 |
return $label; |
return $label; |
| 970 |
} |
} |
| 971 |
|
|
| 972 |
|
|
| 973 |
/** |
/** |
| 974 |
* Return the nice path alias of an imported page. |
* Return the nice path alias of an imported page. |
| 975 |
* |
* |
| 976 |
* Simplify a legacy URL path into something better looking. |
* Simplify a legacy URL path into something better looking. |
| 977 |
*/ |
*/ |
| 978 |
function _import_html_calc_path($rel_path, $leave_suffix = FALSE) { |
function _import_html_calc_path($rel_path, $leave_suffix = FALSE) { |
| 986 |
if (import_html_variable('trim_suffixes')) { |
if (import_html_variable('trim_suffixes')) { |
| 987 |
// Simplify the URL if possible by trimming the suffix and 'index' |
// Simplify the URL if possible by trimming the suffix and 'index' |
| 988 |
// but remember the original path somewhere, we'll need to link it forward |
// but remember the original path somewhere, we'll need to link it forward |
| 989 |
// once the new node is established. |
// once the new node is established. |
| 990 |
|
|
| 991 |
// To be clever, special-case the 'index.html' files to be |
// To be clever, special-case the 'index.html' files to be |
| 992 |
// linked to their parent directories. |
// linked to their parent directories. |
| 993 |
// Trailing slash is tricky. |
// Trailing slash is tricky. |
| 994 |
// /this/path is a whole navigation level above |
// /this/path is a whole navigation level above |
| 995 |
// /this/path/ and will resolve relative links differently! |
// /this/path/ and will resolve relative links differently! |
| 996 |
// We need to actually redirect, not just alias any links like that |
// We need to actually redirect, not just alias any links like that |
| 997 |
$default_documents = split(",", import_html_variable('default_document')); |
$default_documents = split(",", import_html_variable('default_document')); |
| 998 |
$trimmed_path = $path; |
$trimmed_path = $path; |
| 1001 |
} |
} |
| 1002 |
if ($trimmed_path != $path) { |
if ($trimmed_path != $path) { |
| 1003 |
import_html_debug( |
import_html_debug( |
| 1004 |
"It's an index page, so we will refer to $path as $trimmed_path", |
"It's an index page, so we will refer to $path as $trimmed_path", |
| 1005 |
array('%path' => $path, '%trimmed_path' => $trimmed_path), |
array('%path' => $path, '%trimmed_path' => $trimmed_path), |
| 1006 |
WATCHDOG_INFO |
WATCHDOG_INFO |
| 1007 |
); |
); |
| 1008 |
$path = $trimmed_path; |
$path = $trimmed_path; |
| 1009 |
} |
} |
| 1010 |
else { |
else { |
| 1011 |
// No change, Chop suffix instead. |
// No change, Chop suffix instead. |
| 1012 |
// Take care - don't break a path like |
// Take care - don't break a path like |
| 1013 |
// /path/site-mirror/drupal.org/about |
// /path/site-mirror/drupal.org/about |
| 1014 |
// incorrectly. So make sure that we split off the basename, chop its suffix, then glue it back onto the dirname |
// incorrectly. So make sure that we split off the basename, chop its suffix, then glue it back onto the dirname |
| 1015 |
// $path = (! empty($path) ? dirname($path) .'/' : '') . preg_replace('|\.[^\.]+$|', "", basename($path)); |
// $path = (! empty($path) ? dirname($path) .'/' : '') . preg_replace('|\.[^\.]+$|', "", basename($path)); |
| 1022 |
} |
} |
| 1023 |
|
|
| 1024 |
/** |
/** |
| 1025 |
* Find and initialize the transformation template. |
* Find and initialize the transformation template. |
| 1026 |
* |
* |
| 1027 |
* Includes caching retrieval for a bit of speed-up over bulks. |
* Includes caching retrieval for a bit of speed-up over bulks. |
| 1028 |
* |
* |
| 1029 |
* @return XML Document |
* @return XML Document |
| 1030 |
*/ |
*/ |
| 1031 |
function _import_html_get_xsl_doc($xslfile) { |
function _import_html_get_xsl_doc($xslfile) { |
| 1060 |
/** |
/** |
| 1061 |
* Run the url-rewrite XSL over the source document |
* Run the url-rewrite XSL over the source document |
| 1062 |
* TODO allow for the non-base version of Drupal links |
* TODO allow for the non-base version of Drupal links |
| 1063 |
* |
* |
| 1064 |
* The relative links need to be converted into path-to-top and back down |
* The relative links need to be converted into path-to-top and back down |
| 1065 |
* again. Relative references just cannot be maintained. |
* again. Relative references just cannot be maintained. |
| 1066 |
* |
* |
| 1067 |
* @return an XML doc again |
* @return an XML doc again |
| 1068 |
*/ |
*/ |
| 1069 |
function import_html_rewrite_links($xmldoc, $rel_path, $profile) { |
function import_html_rewrite_links($xmldoc, $rel_path, $profile) { |
| 1099 |
$src_root = base_path() . ensure_trailing_slash($profile['file_storage_path']); |
$src_root = base_path() . ensure_trailing_slash($profile['file_storage_path']); |
| 1100 |
|
|
| 1101 |
$src_base = ensure_trailing_slash($src_root) . (($rel_base == '/') ? '' : $rel_base); |
$src_base = ensure_trailing_slash($src_root) . (($rel_base == '/') ? '' : $rel_base); |
| 1102 |
|
|
| 1103 |
// Or not, if we are still linking to full URLs (demo or partial import) |
// Or not, if we are still linking to full URLs (demo or partial import) |
| 1104 |
if (valid_url($rel_path, TRUE)) { |
if (valid_url($rel_path, TRUE)) { |
| 1105 |
// it's remote! |
// it's remote! |
| 1116 |
import_html_debug(" |
import_html_debug(" |
| 1117 |
<b>Rewrite patterns:</b> |
<b>Rewrite patterns:</b> |
| 1118 |
Path to the top of this (relative) server is $site_root . |
Path to the top of this (relative) server is $site_root . |
| 1119 |
Path to top of the prefixed section |
Path to top of the prefixed section |
| 1120 |
({$profile['import_site_prefix']}) |
({$profile['import_site_prefix']}) |
| 1121 |
from here ($rel_path) |
from here ($rel_path) |
| 1122 |
to our import base |
to our import base |
| 1123 |
({$profile['import_site_prefix']}) |
({$profile['import_site_prefix']}) |
| 1124 |
would be '$path_to_import_top'. |
would be '$path_to_import_top'. |
| 1125 |
Path to a relative <em>neighbour</em> of this page would be |
Path to a relative <em>neighbour</em> of this page would be |
| 1127 |
or to find the base for <em>relative</em> resource files over in |
or to find the base for <em>relative</em> resource files over in |
| 1128 |
the file storage area |
the file storage area |
| 1129 |
({$profile['file_storage_path']}) |
({$profile['file_storage_path']}) |
| 1130 |
would be '$src_base' ", |
would be '$src_base' ", |
| 1131 |
array(), |
array(), |
| 1132 |
WATCHDOG_DEBUG |
WATCHDOG_DEBUG |
| 1133 |
); |
); |
| 1134 |
|
|
| 1135 |
|
|
| 1136 |
$parameters = array( |
$parameters = array( |
| 1137 |
// These parameters tell the rewriter what to prepend to the links. |
// These parameters tell the rewriter what to prepend to the links. |
| 1138 |
// They are instructions how this page will find its missing brethren |
// They are instructions how this page will find its missing brethren |
| 1139 |
// when we put it where we put it. |
// when we put it where we put it. |
| 1140 |
// Images and Pages may end up in different places. |
// Images and Pages may end up in different places. |
| 1141 |
'site_root' => $site_root, |
'site_root' => $site_root, |
| 1142 |
'src_root' => $src_root, |
'src_root' => $src_root, |
| 1143 |
'src_base' => $src_base, |
'src_base' => $src_base, |
| 1144 |
'href_base' => $href_base, |
'href_base' => $href_base, |
| 1145 |
'replace_suffix' => $profile['relink_files'], |
'replace_suffix' => $profile['relink_files'], |
| 1146 |
'new_suffix' => '', |
'new_suffix' => '', |
| 1147 |
'xsl_path' => $xslfilepath, |
'xsl_path' => $xslfilepath, |
| 1148 |
'strip_scripts' => $profile['strip_scripts'], |
'strip_scripts' => $profile['strip_scripts'], |
| 1149 |
); |
); |
| 1150 |
import_html_debug(" |
import_html_debug(" |
| 1151 |
XSL for URL rewrites loaded OK. |
XSL for URL rewrites loaded OK. |
| 1152 |
HTML links for files that were under '$rel_base' will be made relative to '" |
HTML links for files that were under '$rel_base' will be made relative to '" |
| 1153 |
. $parameters['href_base'] ."' (for pages) and '". $parameters['src_base'] ."' (for resources) " |
. $parameters['href_base'] ."' (for pages) and '". $parameters['src_base'] ."' (for resources) " |
| 1154 |
. ( $parameters['strip_scripts'] ? 'All inline script blocks will be discarded from the source.'. $parameters['strip_scripts'] : '') |
. ( $parameters['strip_scripts'] ? 'All inline script blocks will be discarded from the source.'. $parameters['strip_scripts'] : '') |
| 1155 |
, |
, |
| 1156 |
array(), |
array(), |
| 1176 |
|
|
| 1177 |
/** |
/** |
| 1178 |
* Run the strip_tables XSL over the source document |
* Run the strip_tables XSL over the source document |
| 1179 |
* |
* |
| 1180 |
* @return an XML doc again |
* @return an XML doc again |
| 1181 |
*/ |
*/ |
| 1182 |
function import_html_strip_tables($xmldoc) { |
function import_html_strip_tables($xmldoc) { |
| 1243 |
* Avoid double-ups, if the path already exists, UPDATE the existing node. |
* Avoid double-ups, if the path already exists, UPDATE the existing node. |
| 1244 |
* Can't have two content nodes claiming the same path or it won't validate. |
* Can't have two content nodes claiming the same path or it won't validate. |
| 1245 |
* Plus, we want to retain any info that's been added via drupal. Probably. |
* Plus, we want to retain any info that's been added via drupal. Probably. |
| 1246 |
* |
* |
| 1247 |
* @param $node |
* @param $node |
| 1248 |
* partially created node from import. Key lookup on $node->path |
* partially created node from import. Key lookup on $node->path |
| 1249 |
* @param $profile |
* @param $profile |
| 1250 |
* May contain some rules for conflict resolution - which values to keep, |
* May contain some rules for conflict resolution - which values to keep, |
| 1251 |
* which to over-write. |
* which to over-write. |
| 1252 |
* |
* |
| 1253 |
* @return $node |
* @return $node |
| 1254 |
* possibly with pre-existing values blended in. Importantly - the nid |
* possibly with pre-existing values blended in. Importantly - the nid |
| 1255 |
*/ |
*/ |
| 1256 |
function import_html_merge_over_existing_node($node, $profile) { |
function import_html_merge_over_existing_node($node, $profile) { |
| 1259 |
if ($internal_link != $node->path) { |
if ($internal_link != $node->path) { |
| 1260 |
// Found an internal match, the alias is already assigned to a node |
// Found an internal match, the alias is already assigned to a node |
| 1261 |
// Merge info to avoid losing any Drupal-only info |
// Merge info to avoid losing any Drupal-only info |
| 1262 |
|
|
| 1263 |
$probable_nid = array_pop(explode("/", $internal_link)); |
$probable_nid = array_pop(explode("/", $internal_link)); |
| 1264 |
if (! is_numeric($probable_nid)) { |
if (! is_numeric($probable_nid)) { |
| 1265 |
// This may happen if the menu builder has created a placeholder alias |
// This may happen if the menu builder has created a placeholder alias |
| 1266 |
// pseudo-page, or the alias conflicts with an already-created system path. |
// pseudo-page, or the alias conflicts with an already-created system path. |
| 1267 |
import_html_debug(" |
import_html_debug(" |
| 1268 |
When looking for an alias to '%nodepath', |
When looking for an alias to '%nodepath', |
| 1269 |
Found some pre-existing (non-node) content there. |
Found some pre-existing (non-node) content there. |
| 1270 |
the internal link |
the internal link |
| 1271 |
'%internal_link' - which was expected to return a nid.", |
'%internal_link' - which was expected to return a nid.", |
| 1272 |
array( |
array( |
| 1273 |
'%nodepath' => $node->path, |
'%nodepath' => $node->path, |
| 1280 |
$node->nid = $probable_nid; |
$node->nid = $probable_nid; |
| 1281 |
|
|
| 1282 |
import_html_debug(" |
import_html_debug(" |
| 1283 |
Page path alias '%nodepath' already exists, |
Page path alias '%nodepath' already exists, |
| 1284 |
It's already linked to node id '%nodenid'. |
It's already linked to node id '%nodenid'. |
| 1285 |
This data import will <em>replace</em> that content, |
This data import will <em>replace</em> that content, |
| 1286 |
but try to keep any other values. |
but try to keep any other values. |
| 1287 |
", |
", |
| 1288 |
array( |
array( |
| 1309 |
// Now do the rest by copying values as best we can |
// Now do the rest by copying values as best we can |
| 1310 |
foreach ($node as $key => $value) { |
foreach ($node as $key => $value) { |
| 1311 |
// Do a deep merge |
// Do a deep merge |
| 1312 |
if (is_array($value)) { |
if (is_array($value)) { |
| 1313 |
// merge deeper sets, like taxonomy |
// merge deeper sets, like taxonomy |
| 1314 |
if (!@is_array($old_node->$key)) { |
if (!@is_array($old_node->$key)) { |
| 1315 |
$old_node->$key=array(); |
$old_node->$key=array(); |
| 1317 |
foreach ($value as $k => $v) { |
foreach ($value as $k => $v) { |
| 1318 |
$old_node->{$key}[$k] = $v; |
$old_node->{$key}[$k] = $v; |
| 1319 |
} |
} |
| 1320 |
} |
} |
| 1321 |
else { |
else { |
| 1322 |
$old_node-> $key = $value; |
$old_node-> $key = $value; |
| 1323 |
} |
} |
| 1329 |
|
|
| 1330 |
|
|
| 1331 |
/** |
/** |
| 1332 |
* Utility function |
* Utility function |
| 1333 |
* |
* |
| 1334 |
* file_scan_directory() does not support max_depth. |
* file_scan_directory() does not support max_depth. |
| 1335 |
* I need it so my folder listings don't go insane when recursing |
* I need it so my folder listings don't go insane when recursing |
| 1336 |
* |
* |
| 1337 |
* This is a version of file_scan_directory that does respect max_depth |
* This is a version of file_scan_directory that does respect max_depth |
| 1338 |
* when recursing. |
* when recursing. |
| 1339 |
* It also adds a filecount value to the returned item to assist feedback |
* It also adds a filecount value to the returned item to assist feedback |
| 1340 |
* |
* |
| 1341 |
* @see file_scan_directory. |
* @see file_scan_directory. |
| 1342 |
* |
* |
| 1343 |
*/ |
*/ |
| 1344 |
function import_html_file_scan_directory($dir, $mask, $nomask = array('.', '..', 'CVS'), $callback = 0, $recurse = TRUE, $key = 'filename', $min_depth = 0, $depth = 0, $max_depth = NULL) { |
function import_html_file_scan_directory($dir, $mask, $nomask = array('.', '..', 'CVS'), $callback = 0, $recurse = TRUE, $key = 'filename', $min_depth = 0, $depth = 0, $max_depth = NULL) { |
| 1345 |
// If no max_depth is set, the normal recursed version is OK |
// If no max_depth is set, the normal recursed version is OK |
| 1346 |
if (! isset($max_depth)) { |
if (! isset($max_depth)) { |
| 1347 |
return file_scan_directory($dir, $mask, $nomask, $callback, TRUE, $key, $min_depth, $depth); |
return file_scan_directory($dir, $mask, $nomask, $callback, TRUE, $key, $min_depth, $depth); |
| 1348 |
} |
} |
| 1349 |
|
|
| 1350 |
$files = array(); |
$files = array(); |
| 1351 |
|
|
| 1352 |
// Use file_scan_directory - non-recursive |
// Use file_scan_directory - non-recursive |
| 1357 |
if ($depth < $max_depth) { |
if ($depth < $max_depth) { |
| 1358 |
$files = array_merge(import_html_file_scan_directory($filepath, $mask, $nomask, $callback, $recurse, $key, $min_depth, $depth + 1, $max_depth), $files); |
$files = array_merge(import_html_file_scan_directory($filepath, $mask, $nomask, $callback, $recurse, $key, $min_depth, $depth + 1, $max_depth), $files); |
| 1359 |
} |
} |
| 1360 |
|
|
| 1361 |
// This may be intensive, but will help debugging |
// This may be intensive, but will help debugging |
| 1362 |
$count_files = file_scan_directory($filepath, $mask); |
$count_files = file_scan_directory($filepath, $mask); |
| 1363 |
$files[$filepath]->child_count = count($count_files); |
$files[$filepath]->child_count = count($count_files); |
| 1388 |
|
|
| 1389 |
/** |
/** |
| 1390 |
* Tidy URLs before saving locally - for URL imports |
* Tidy URLs before saving locally - for URL imports |
| 1391 |
* |
* |
| 1392 |
* Squash/hash query strings, but don't discard them. |
* Squash/hash query strings, but don't discard them. |
| 1393 |
* Do discard fragment ids |
* Do discard fragment ids |
| 1394 |
* |
* |
| 1395 |
* Replace spaces and non-alphanumerics with underscore |
* Replace spaces and non-alphanumerics with underscore |
| 1396 |
*/ |
*/ |
| 1397 |
function safe_filepath_from_url($rel_path) { |
function safe_filepath_from_url($rel_path) { |
| 1419 |
|
|
| 1420 |
// decode three byte unicode characters |
// decode three byte unicode characters |
| 1421 |
$string = preg_replace( |
$string = preg_replace( |
| 1422 |
"/([\340-\357])([\200-\277])([\200-\277])/e", |
"/([\340-\357])([\200-\277])([\200-\277])/e", |
| 1423 |
"'&#'.((ord('\\1')-224)*4096 + (ord('\\2')-128)*64 + (ord('\\3')-128)).';'", |
"'&#'.((ord('\\1')-224)*4096 + (ord('\\2')-128)*64 + (ord('\\3')-128)).';'", |
| 1424 |
$string |
$string |
| 1425 |
); |
); |
| 1426 |
|
|