| 1 |
<?php
|
| 2 |
/**
|
| 3 |
* @file Actual routines for importing files.
|
| 4 |
*
|
| 5 |
*
|
| 6 |
* @ingroup import_html Import HTML
|
| 7 |
* @author Dan Morrison http://coders.co.nz/
|
| 8 |
* @version $Id: import_html_process.inc,v 1.5.2.2 2009/03/14 03:18:51 dman Exp $
|
| 9 |
*
|
| 10 |
*/
|
| 11 |
|
| 12 |
|
| 13 |
/**
 * Files have been selected, import each of them immediately.
 *
 * Every non-empty entry of $file_list is handed to import_html_import_file()
 * and whatever that call returns is appended to one combined result list.
 * The run is aborted at the first file that yields no result.
 *
 * @param $file_list an array of simple file paths, probably selected from the file_list form
 * @param $context A set of parameters, similar to the import_html profile, possibly from the list_filesystem form. Should contain the base path that the submitted files are relative to.
 *
 * @return The combined list of results for all files. FALSE if one of the
 * files produced no result. Nothing (NULL) if $file_list was empty.
 */
function import_html_import_files($file_list, $context) {
  drupal_set_message(t('Processing files!'));

  if (empty($file_list)) {
    drupal_set_message(t("No Files Selected. Nothing to import"), 'error');
    return;
  }

  // TODO see if we really need to keep a handle on all these results at once.
  // It will be eating into our memory.
  $results = array();
  foreach ($file_list as $rel_path) {
    if (!$rel_path) {
      continue;
    }

    // Keep the per-file result in its own variable. Re-using $results here
    // would throw away everything collected from the previous files.
    $file_results = import_html_import_file($rel_path, $context);
    if (!$file_results) {
      drupal_set_message(t('Failed to get any results from the attempted analysis of %rel_path. The source file path was probably unavailable or incorrect.', array('%rel_path' => $rel_path)), 'error');
      return FALSE;
    }

    // result of importing a file MAY be more than one item, unlikely as it may be
    debug_pre(array("Result of processing file $rel_path" => $file_results), 4);
    foreach ($file_results as $result) {
      // discard debug logs, try to save space.
      // Only objects can carry a ->file_data property; array entries are
      // passed through untouched.
      if (is_object($result)) {
        unset($result->file_data);
      }
      $results[] = $result;
    }
  }

  // This isn't happening until I visit admin?
  menu_rebuild();
  return $results;
}
|
| 54 |
|
| 55 |
/**
 * Given a html file, prepare all the node info we can get out of it.
 *
 * This func mainly prepares the paths and relative links. Data extraction
 * happens in import_html_process_html_page()
 *
 * It does submit and save the node to the database, unless the request came
 * from the demo form ($context['form_id'] == 'import_html_demo_form').
 *
 * @param $rel_path Path of the file, relative to $context['base_path'].
 * @param $context Array; 'profile_id' and 'base_path' are read
 * unconditionally, 'form_id' is optional.
 *
 * @return an array of file-info arrays (keys 'source', 'dest', 'rel_path' plus
 * either 'type' => 'resource', or 'node' and possibly 'teaser'). It may
 * contain more than one entry (in extreme cases). Returns nothing (NULL) when
 * the file is skipped or could not be fetched, FALSE when processing failed.
 */
function import_html_import_file($rel_path, $context) {

  // Read the profile id and use that as a context for all settings
  $profile = import_html_profile($context['profile_id']);
  $base_path = $context['base_path'];
  $dest_root = $profile['file_storage_path'];
  $url_parts = parse_url($base_path);
  // parse_url() only supplies 'host' for real URLs (and may return FALSE for
  // malformed input), so never index it blindly.
  $host = (is_array($url_parts) && !empty($url_parts['host'])) ? $url_parts['host'] : NULL;

  drupal_set_message(t("<strong>Importing</strong> '%rel_path'", array('%rel_path' => $rel_path)));

  $source_path = $base_path . $rel_path;
  $save_as = safe_filepath_from_url($rel_path);
  $dest_path = preg_replace("|/+|", "/", $dest_root . $save_as);

  if (preg_match("|/$|", $rel_path)) {
    // Handle trailing slashes differently at home and away
    if ($host) {
      // It's remote
      // explode() replaces split(), which no longer exists in PHP 7.
      $default_documents = explode(",", $profile['default_document']);
      // need a dummy filename if retrieving default docs.
      $dest_path .= trim(array_shift($default_documents));
    }
    else {
      return; //skip local directories altogether (their contents is selected individually)
    }
  }

  $file = array(
    'source' => $source_path,
    'dest' => $dest_path,
    'rel_path' => $rel_path,
  );
  $files = array();

  // Handle files that are resources.
  // Copy them into the files folder and return
  $checkfile = is_local($source_path) ? $source_path : $dest_path;
  // can't use mime detection on remote lookups yet
  $file_class = import_html_guess_file_class($checkfile);
  if ($file_class != 'html') {
    // non-page resource - what sort of processing can I do here?
    drupal_set_message(t("
      I think (due to file suffix '%doctype') that '%source' is not a html page
      I can process.<br/> It's just been copied into '!dest'.",
      array(
        '%source' => basename($source_path),
        '!dest' => l($dest_path, $dest_path),
        '%doctype' => $file_class,
      )
    ));
    import_html_get_raw_file_local($source_path, $dest_path, $host);
    $file['type'] = 'resource';
    $files[] = $file;
    return $files;
  }

  // Compare the alias path of this new page with what we've already got
  $new_path = _import_html_calc_path($rel_path);
  $normal_path = drupal_get_normal_path($new_path);
  if ($normal_path != $new_path) {
    // We recognise that alias, thus an item already exists in that path.
    $path_args = array('%new_path' => $new_path, '%normal_path' => $normal_path);
    if ($profile['handle_duplicates'] == IMPORT_HTML_SKIP) {
      drupal_set_message(t("We already have the URL alias '%new_path' in the system as '%normal_path'. According to import_html settings, this import is being skipped", $path_args));
      return;
    }
    drupal_set_message(t("We already have '%new_path' in the system as '%normal_path'. Overwriting/updating it with the new import", $path_args));
  }

  $keep_temp_files = !empty($profile['keep_temp_files']);

  // Minor clean-up. Helps recover from crashes and prevents files getting renamed into file-01.etc
  if (file_exists($dest_path) && !$keep_temp_files) {
    unlink($dest_path);
  }

  if (!file_exists($dest_path) || !$keep_temp_files) {
    if (!import_html_get_raw_file_local($source_path, $dest_path, $host)) {
      drupal_set_message(t("Failed to fetch a copy of %source_path into %dest_path", array('%dest_path' => $dest_path, '%source_path' => $source_path)), 'error');
      return NULL;
    }
    debug(t("Fetched a %persistant local copy to %dest_path", array('%dest_path' => $dest_path, '%persistant' => ($keep_temp_files ? 'persistant' : 'temporary'))), 1);
  }
  else {
    debug(t("Local copy exists at %dest_path", array('%dest_path' => $dest_path)), 1);
  }

  // We have a local copy now.
  // $node initialized and processed HERE. Produces a node OBJECT
  if ($host) {
    // Importing a remote file - as for demo
    // relinking will happen to point back at where it came from, not here
    $rel_path = $source_path;
    debug("Relinking this source will point back to the original URL context!", 2);
  }

  $nodes = import_html_process_html_page($dest_path, $rel_path, $profile);

  // At this point, the node(s) are full of data, but not yet saved.

  // On rare occasions, (using xt:document) the processing can produce an ARRAY of nodes that need saving
  // Otherwise it's a node object

  if (!$nodes) {
    drupal_set_message(t("Failed to process file '%rel_path'", array('%rel_path' => $rel_path)), "error");
    return FALSE;
  }

  // We can immediately discard the source file -
  // it should have been a temp copy made by import_html_get_raw_file_local() above
  if (file_exists($dest_path) && !$keep_temp_files) {
    unlink($dest_path);
  }

  if (!is_array($nodes)) { // cast into array for iteration anyway.
    $nodes = array($nodes);
  }

  $is_demo = isset($context['form_id']) && $context['form_id'] == 'import_html_demo_form';

  // Almost trivial loop (probably over 1 item)
  foreach ($nodes as $node) {
    // The node data object has been initialized
    // It may contain heaps of extra junk set in via a random absorbtion of elements in the XML import.
    // They will get ignored if not recognised.
    // The title comes straight from the imported page, so pass it as a
    // placeholder and let t() escape it instead of interpolating raw markup.
    drupal_set_message(t("Processed page to extract content. Title: '%title' ", array('%title' => $node->title)));

    // figure if it's overwriting an existing path
    $node = import_html_merge_over_existing_node($node);

    // If processing in bulk, any error will stop any further processing.
    // (form_errors is non-null) need to clear it between each pass.
    unset($GLOBALS['form']);
    node_validate($node);

    if (!trim($node->body)) {
      form_set_error('body', t("No body content found in this node"));
    }

    $file['node'] = $node;

    // Finished prep, now save

    if ($is_demo) {
      // DO NOT actually save stuff to the database
      node_submit($node);
    }
    else {

      if ($errors = form_get_errors()) {
        drupal_set_message(t("Import of '%rel_path' did not quite validate. I'm not sure how to recover from that problem. <br/>!errors", array(
          '%rel_path' => $file['rel_path'],
          '!errors' => join(',<br/> ', $errors),
        )), 'error');
        // TODO This is not very helpful in bulk mode.
        // what can I do now?
      }
      else {
        if (!empty($node->nid)) {
          drupal_set_message(t("!node_link Exists, updating it.", array('!node_link' => l('node '. $node->nid, 'node/'. $node->nid))));
          watchdog('Import HTML', t("Updating node !node_link with content from %source_path", array('%source_path' => $source_path, '!node_link' => l($node->path, $node->path))), WATCHDOG_NOTICE);
          node_save($node);
          module_invoke_all('import_html_after_save', $profile, $node);
        }
        else {
          drupal_set_message(t("Inserting New Node. !node_link", array('!node_link' => l($node->path, $node->path))));
          watchdog('Import HTML', t("Inserting New Node !node_link with content from %source_path", array('%source_path' => $source_path, '!node_link' => l($node->path, $node->path))), WATCHDOG_NOTICE);
          $node = node_submit($node);
          // Submit doesn't actually save, it just fills in extra fields
          node_save($node);

          // Had to wait until I had an ID to do this
          // These callbacks add the aliases and menus
          module_invoke_all('import_html_after_save', $profile, $node);
          // note, navigation only gets set up on first import. After that you are on your own
        }
      } // Finished updating database

      // and show the rendering of the new page teaser
      $file['teaser'] = node_view($node, TRUE);
    }
    $files[] = $file;
  }

  return $files;
}
|
| 250 |
|
| 251 |
|
| 252 |
|
| 253 |
/**
 * Files have been selected, put each of them on the queue for later import.
 *
 * Empty entries in the list are ignored. An empty list only produces an
 * error message.
 *
 * @param $file_list an array of simple file paths, probably selected from the file_list form
 * @param $context A set of parameters, similar to the import_html profile, possibly from the list_filesystem form. Should contain the base path that the submitted files are relative to.
 */
function import_html_queue_files($file_list, $context) {
  $nothing_selected = empty($file_list);
  if ($nothing_selected) {
    drupal_set_message(t("No Files Selected. Nothing to import"), 'error');
    return;
  }

  foreach ($file_list as $queued_path) {
    // Unticked form entries come through as empty values; skip them.
    if (!$queued_path) {
      continue;
    }
    import_html_add_to_queue($queued_path, $context);
  }
}
|
| 271 |
|
| 272 |
/**
 * Inserts the requested action into the daemon queue to be called later.
 *
 * Writes one row to {import_html_queue}: the command name
 * 'import_html_import_file', the path as subject, the serialized context and
 * the current timestamp.
 *
 * @param $rel_path Path of the file to import later.
 * @param $context The import context; stored serialized alongside the path.
 */
function import_html_add_to_queue($rel_path, $context) {
  db_query(
    "INSERT INTO {import_html_queue} (command, subject, context, date) VALUES ('%s', '%s', '%s', '%d')",
    'import_html_import_file',
    $rel_path,
    serialize($context),
    time()
  );

  $notice = t("%rel_path added to processing queue for importing soon", array('%rel_path' => $rel_path));
  drupal_set_message($notice);
}
|
| 280 |
|
| 281 |
|
| 282 |
|
| 283 |
|
| 284 |
|
| 285 |
/**
 * Carefully fetch a (potentially remote?) file and save it nearby
 *
 * @param $source_path Where to read from: a URL when $host is set, otherwise
 * a local file path.
 * @param $dest_path Local path to write the copy to. Its directory is
 * created first via mkdirs().
 * @param $host Host name of the source if it is remote; empty for local
 * sources.
 *
 * @return TRUE if the copy succeeded, or if source and destination are the
 * same existing file (nothing needs copying). FALSE if the copy failed.
 */
function import_html_get_raw_file_local($source_path, $dest_path, $host) {
  mkdirs(dirname($dest_path), FILE_CREATE_DIRECTORY);
  drupal_set_message(t("Fetching content from %location '<a href='!realpath'>%source_path</a>' now.", array(
    '%source_path' => $source_path,
    '%location' => $host,
    '!realpath' => realpath($source_path),
  )));
  debug("Saving temp file locally as '$dest_path' ", 3);

  if ($host) {
    // It's remote. Trust PHP5 and allow_url_fopen is available
    if (!copy($source_path, $dest_path)) {
      drupal_set_message(t("Remote file copy failed"), 'error');
      return FALSE;
    }
  }
  else {
    // local copy
    // realpath() returns FALSE for paths that do not exist. Without the
    // explicit FALSE check a missing source and a missing destination would
    // compare equal and be reported as a successful in-place import.
    $real_source = realpath($source_path);
    if ($real_source !== FALSE && $real_source == realpath($dest_path)) {
      drupal_set_message("Copying between identical source and destination, $source_path $dest_path , importing file in-place.");
      return TRUE;
    }

    // drupal file copy assumes everything is under 'files' directory,
    // so use the plain PHP copy() instead.
    if (!copy($source_path, $dest_path)) {
      drupal_set_message(t('Local file copy failed (%from to %to)', array('%from' => $source_path, '%to' => $dest_path)), 'error');
      drupal_set_message("Source <code>$source_path</code> is <pre>". print_r(stat($source_path), 1) ."</pre>", 'error');
      drupal_set_message("Dest folder properties are <code>$dest_path</code> <pre>". print_r(stat(dirname($dest_path)), 1) ."</pre>", 'error');

      return FALSE;
    }
  }

  debug("Copied import file from '$source_path' to '$dest_path'", 3);
  return TRUE;
}
|
| 327 |
|
| 328 |
|
| 329 |
/**
 * Analyse a source page and create a node definition from it.
 *
 * Most of the processing magic is in here.
 * The $node handle may be provided initialized with some pre-set values.
 * Internally we should continue using the object methods.
 *
 * This processing is still in the 'validate' phase, so should
 * not cause anything to happen, just configure the node object
 *
 * @param $path the file (or object) to read the data from. If it's a
 * string, it's taken to be the filename, if an object, it's the node. A node
 * should contain a ->raw_html and a ->path at least.
 * @param $rel_path Where this html page was found, relative to its own server
 * root. This is used to rewrite its urls. If the path is a directory, it should
 * end with a slash. ( /a/path/ == /a/path/index.html != /a/path )
 * @param $profile The settings for this import process.
 *
 * @return an ARRAY of new node objects, keyed by the path each node will be
 * saved as. Some processes may return multiple nodes. FALSE if the source
 * could not be parsed or transformed; nothing (NULL) if XSL support is
 * unavailable.
 */
function import_html_process_html_page($path, $rel_path, $profile) {
  if (!init_xsl()) {
    trigger_error("Sorry, with no XML support there will be no content scanning AT ALL. Aborting process. See the import_html_help.htm for info on enabling XML under PHP.", E_USER_ERROR);
    return;
  }
  debug(t("import_html.module debugging is enabled to level %debug_level. Visit the <a href='!profile_config'>profile configuration</a> (advanced) to turn down the volume.", array('%debug_level' => $profile['debug_level'], '!profile_config' => url( IMPORT_HTML_ADMIN_PATH .'/profile/'. $profile['profile_id']))), 1);
  debug_pre(array($profile, "The import profile settings being used to import_html_process_html_page($rel_path)"), 2);

  // Defaults, so every later branch can rely on these existing even when
  // $path turns out to be neither a string nor an object.
  $xmldoc = NULL;
  $source_node = new stdClass();

  if (is_string($path)) {
    // read from file
    debug("Processing file as HTML page. Full file path: '$path' , will be imported as a relative path under the current section. relative-path is:'$rel_path'", 1);

    if (!file_exists($path)) {
      trigger_error("Path '$path' was not found. This should have been a local copy of the file being imported, but the paths may be wrong somehow. Abject failure processing $rel_path");
    }

    /*
     * Trying to parse pure XML first is causing problems
     * Either I want everything to be html, (always tidy)
     * or I allow for exsl:document blocks (which can't be tidied)
     * Option for now is try to parse, and only tidy if that fails.
     */
    // temporarily ignore parser errors (catch?)
    set_error_handler('stfu');
    $xmldoc = parse_in_xml_file($path, $profile['force_tidy']);
    restore_error_handler();

    // NOTE(review): the retry only happens when force_tidy is already on, in
    // which case the first attempt was presumably tidied too - confirm intent.
    if (!$xmldoc && $profile['force_tidy']) {
      debug("$path was not tidy enough - running tidy over it now so I can parse it.", 1);
      // If a raw XML parse failed,
      // tell parse_in_xml_file() to use htmlTidy before it begins
      // TODO - add a flag to skip this double-processing, (parsing twice) it may be a bit slow if it's not often used
      $xmldoc = parse_in_xml_file($path, TRUE);
    }
    debug_pre(array("Finished reading from file:" => xml_tostring($xmldoc)), 3);
  }
  elseif (is_object($path)) {
    // We may have passed in a source-node object where the path was expected instead.
    // A bit of a sneak. The given node has the source HTML in $node->raw_html
    $source_node = $path;
    $path = $source_node->path;
    if (!$source_node->raw_html) {
      trigger_error(t("import_html_process_html_page called with no HTML source to analyse"), E_USER_ERROR);
    }
    debug("Processing page source, ". strlen($source_node->raw_html) ." chars long", 2);
    debug_pre(array("Raw source" => $source_node->raw_html), 3);

    $xmldoc = parse_in_xml_string($source_node->raw_html, $profile['force_tidy']);
  }

  if (!$xmldoc) {
    // parsing failed
    drupal_set_message(t("Import_HTML failed to initialize or parse XMLdoc input"), "error");
    // Insert extra debug to see why. Only possible when we were handed the
    // raw source; a file-based import has no ->raw_html to tidy.
    if (isset($source_node->raw_html)) {
      $source_node->file_data['after_tidying'] = xml_tidy_string($source_node->raw_html);
    }
    return FALSE;
  }
  debug_pre(array("PARSED XML $path . XHTML" => xml_tostring($xmldoc)), 2);

  if ($profile['rewrite_links']) {
    // use XSL to rewrite links to fit into Drupal
    $xmldoc = import_html_rewrite_links($xmldoc, $rel_path, $profile);
  }
  if ($profile['strip_tables']) {
    $xmldoc = import_html_strip_tables($xmldoc);
  }
  if ($profile['strip_scripts']) {
    $xmldoc = import_html_strip_scripts($xmldoc);
  }
  // The leading TRUE forces this step on regardless of the profile setting.
  if (TRUE || $profile['tag_editable_areas']) {
    $xmldoc = import_html_tag_editable_areas($xmldoc);
  }

  // Debug trace data
  if (import_html_variable('debug_level')) {
    $source_node->file_data['after_rewriting'] = xml_tostring($xmldoc);
  }

  // Import content as node.
  // Translate the source text to the known tidy simple, tagged HTML structure now
  $parameters = array(
    'xmlid' => TRUE,
  );
  if (!empty($profile['content_tag_id'])) {
    $parameters['contentid'] = $profile['content_tag_id'];
  }

  $importxml = NULL;
  if ($xsldoc = _import_html_get_xsl_doc($profile['translation_template'])) {
    // 'contentid' is only set when the profile names a content tag.
    $content_id = isset($parameters['contentid']) ? $parameters['contentid'] : '';
    debug("Using XSL translation template to extract semantic content. Will search for body content labelled '". $content_id ."' in the source. Active XML Namespaces are {$xmldoc->firstchild->nodename} : {$xmldoc->firstchild->namespaceuri} - {$xsldoc->firstchild->nodename} : {$xsldoc->firstchild->namespaceuri} \n", 1);
    $importxml = xmldoc_plus_xsldoc($xmldoc, $xsldoc, $parameters);
    debug_pre(array("Transform Successful. TRANSLATED from messy source into a pure xhtml page to import" => $importxml), 2);
  }
  else {
    trigger_error("Failed to initialize XSLdoc", E_USER_WARNING);
  }

  if (!$importxml) {
    trigger_error("Nothing useful extracted via XML from that content", E_USER_WARNING);
    return FALSE;
  }

  $xmldoc = parse_in_xml_string($importxml, FALSE);
  //
  // Allow one source document to produce multiple nodes
  // If the process has resulted in xt:document blocks, each block
  // is a new item.
  // Either there is a html element in the input ... or many of them.

  $html_elements = xml_getelementsbytagname($xmldoc, 'html');

  debug("Found ". count($html_elements) ." html elements in source doc", 3);

  $nodes = array();
  // probably only one, but we'll iterate over an array of one then
  foreach ($html_elements as $html_element) {
    // Each document gets its own copy of the shell node. Objects are passed
    // by handle, so sharing $source_node would make every entry in $nodes
    // point at one and the same node.
    $node = import_html_xhtml_to_node($html_element, clone $source_node, $profile);

    // Set what we want the alias to be.
    if (empty($node->path)) {
      $node->path = _import_html_calc_path($rel_path);
      $node->old_path = _import_html_calc_path($rel_path, TRUE);
    }

    // May need extra care when creating multiples.
    // Invent new paths for the new documents if the exsl:document didn't define them
    if (isset($nodes[$node->path])) {
      // already using this path, extend a new one
      $suffix_source = !empty($node->label) ? $node->label : (isset($node->title) ? $node->title : '');
      $node->path .= '/'. import_html_check_name($suffix_source);
    }

    $node->title = import_html_guess_document_title($node);

    $node->status = $profile['import_status'];
    $node->promote = $profile['import_promote'];

    // Tag this new content if the profile has a global tag set
    $cats = $profile['import_category'];
    if (is_array($cats)) {
      foreach ($cats as $cat) {
        $node->taxonomy[$cat] = taxonomy_get_term($cat);
      }
    }

    // debug notes/trace logs. Can be removed
    if (import_html_variable('debug_level')) {
      $node->file_data['raw_xhtml'] = xml_toString($html_element);
    }

    $nodes[$node->path] = $node;

    debug("Path to save this page as is '". $node->path ."'", 1);
  }

  // $node is the last node processed above; it does not exist at all when the
  // transformed document contained no html element.
  debug_pre(array("PROCESSED node body" => (isset($node->body) ? $node->body : NULL)), 3);
  return $nodes;
}
|
| 510 |
|
| 511 |
|
| 512 |
|
| 513 |
/**
 * From a given XML document, create a node structure
 * with all useful parameters set.
 * A shell node object may be passed in with some values already set. The data
 * extracted from the XHTML structure will be layered onto that.
 *
 * Here is where we map HTML info to node data, like H1 -> $node->title
 * TODO tidy this up with a lookup table or something
 *
 * node may have defined its own $node->type even
 *
 * THIS IS THE ENGINE OF IMPORT_HTML
 *
 * @param $datadoc The XML element (normally one html element) to read from.
 * @param $node Optional shell node object; a new stdClass is used if empty.
 * @param $profile The import profile settings. 'content_type' is the
 * fallback node type.
 *
 * @return the populated node object.
 */
function import_html_xhtml_to_node($datadoc, $node, $profile) {
  debug("Importing from XML object to node object", 3);
  $node = $node ? $node : new stdClass();
  // A fresh shell has neither property yet, so test with isset() first.
  $node->type = (isset($node->type) && is_string($node->type)) ? $node->type : $profile['content_type'];
  $node->taxonomy = (isset($node->taxonomy) && is_array($node->taxonomy)) ? $node->taxonomy : array();

  // Now read the input into node structure
  //
  // Absorb the most generic bits first. Later processes may overwrite them more accurately.

  // This initial import is a totally generic catch-all.
  // The callee declares &$node itself; call-time pass-by-reference
  // (f(&$node)) is a fatal error from PHP 5.4 on.
  import_html_absorb_all_tagged_elements($node, $datadoc);

  //
  // Get all metas as properties
  //
  $head_element = xml_getelementsbytagname($datadoc, 'head', TRUE);
  // Allow ALL values I find (some may get lost later)
  import_html_absorb_metas($node, $head_element, 'meta', 'name', 'content');
  import_html_absorb_metas($node, $head_element, 'link', 'rel', 'href');

  // If there are any other things to come from HTML into $node, let me know now!
  // Loop over a bunch of hook-like per-module extensions
  // MENU, PATH, TAXONOMY, CCK all add values in their own callbacks in import_html_modules.inc
  // Also the core node elements - body, title, teaser get set in a callback
  //
  require_once('import_html_modules.inc');
  import_html_include_add_on_module_handlers();

  module_invoke_all('import_html', $profile, $node, $datadoc);

  // 'content' is now a reserved word in Drupal5
  // If I have a string there, the body cannot be rendered right later
  unset($node->content);

  // The preferred filter 'format' of this body is none - not even line breaks
  $node->format = import_html_get_preferred_filter();

  debug("After absorbing absolutely everything I could find, the node object now contains the following blocks and bits:", 3);
  debug_pre(array( 'Absorbed all node structure from the XHTML. Node is:' => $node), 2);
  return $node;
}
|
| 569 |
|
| 570 |
/**
 * Import ALL tagged classes and IDs as node attributes.
 *
 * If the input has ANY id or classes at all, grab that info and apply it to
 * this object. Assume anything important enough to have a label is important
 * enough to remember.
 *
 * This will probably produce a very cloggy node, filled with trash, Possibly
 * even some arrays where there shouldn't be. But any unrecognised property
 * names will be discarded on save, leaving only the serializable values. This
 * approach will allow arbitrary data to come and go in the future.
 *
 * @param $node The node object to set the properties on (modified in place).
 * @param $datadoc The XML element to search for elements carrying an id or
 * class attribute.
 */
function import_html_absorb_all_tagged_elements(&$node, $datadoc) {

  foreach (array('id', 'class') as $attribute_label) {

    debug("Absorbing all blocks with an $attribute_label as incidental data blobs (possibly html) into node structure", 3);
    $found_elements = xml_query($datadoc, './/*[@'. $attribute_label .']');

    // I now have a collection of tagged nodes.
    foreach ($found_elements as $found_element) {

      $attribute_value = xml_getattribute($found_element, $attribute_label);
      // if it was a class, it may be multiple!
      // Usually just one however...
      // Class names may be separated by any whitespace (tabs, newlines), not
      // just a single space; empty pieces are dropped right here.
      $keys = preg_split('/\s+/', (string) $attribute_value, -1, PREG_SPLIT_NO_EMPTY);
      if (!$keys) {
        continue;
      }

      // Allow HTML though. Sometimes this will not be right...
      // TODO, figure it out?
      // The serialized element is the same for every key, so build it once.
      $value = xml_tostring($found_element, TRUE);
      if (!trim($value)) {
        continue;
      }

      foreach ($keys as $key) {
        // Found 'something' labelled 'something'
        // The value just gets absorbed
        debug("Found an unexpected tagged value - '$key' , Absorbing it into the node as a default text/html value", 2);

        // Set it onto the node,
        // If it's a class, carefully combine to preserve pre-existing arrays
        if ($attribute_label == 'class') {
          import_html_absorb_properties($node, $key, $value);
        }
        else {
          // but if it's an ID, there can be only one, just set it
          $node->$key = $value;
        }
      } // each multiple key
    } // each found element
  } // each attribute type
}
|
| 623 |
|
| 624 |
/**
 * Copy key/value attribute pairs of the given child elements onto the node.
 *
 * For each $tagname element under $xml_element, the attribute $keyname is
 * used as the property name and the attribute $valname as its value. If the
 * nodewords module is enabled the pair is also stored, with a lower-cased key,
 * in $node->nodewords.
 *
 * @param $node The node object to set the properties on (modified in place).
 * @param $xml_element The element to search in.
 * @param $tagname Tag name of the elements to read, eg 'meta'.
 * @param $keyname Attribute holding the property name, eg 'name'.
 * @param $valname Attribute holding the property value, eg 'content'.
 */
function import_html_absorb_metas(&$node, $xml_element, $tagname, $keyname, $valname) {
  $tagged_elements = xml_getelementsbytagname($xml_element, $tagname);
  debug("Absorbing the '$valname' of '{$tagname}'s with a '$keyname' from source doc into node structure", 3);

  foreach ($tagged_elements as $tagged_element) {
    if (empty($tagged_element)) {
      continue;
    }

    $attr_key = xml_getattribute($tagged_element, $keyname);
    $attr_value = xml_getattribute($tagged_element, $valname);

    // Both halves of the pair are needed; report and move on otherwise.
    if (!$attr_key || !$attr_value) {
      debug("When absorbing '$valname' from '{$tagname}'s with a '$keyname' from source doc ({$attr_key}={$attr_value}) had a null value. Not a great problem, just letting you know.", 2);
      continue;
    }

    import_html_absorb_properties($node, $attr_key, $attr_value);
    if (module_exists('nodewords')) {
      $node->nodewords[strtolower($attr_key)] = $attr_value;
    }
  }
}
|
| 642 |
|
| 643 |
/**
 * Include what we can find in the /modules directory.
 * Only once.
 *
 * Every file ending in '.inc' below the import_html module's 'modules'
 * sub-directory is included. Repeated calls within one request do nothing.
 */
function import_html_include_add_on_module_handlers() {
  static $done = FALSE;
  if ($done) {
    return;
  }

  // Scan add-on dir and include all bits found there.
  // The mask is a regular expression: escape the dot and anchor it to the end
  // of the name, otherwise any file merely containing "inc" would be matched
  // and executed as PHP.
  $addon_dir = drupal_get_path('module', 'import_html') .'/modules';
  $inc_files = file_scan_directory($addon_dir, '\.inc$', array('.', '..', 'CVS'));
  foreach ($inc_files as $inc_file) {
    include_once($inc_file->filename);
  }
  $done = TRUE;
}
|
| 657 |
|
| 658 |
/**
 * Set the given property on the given object,
 * allowing multiple values to expand into arrays.
 *
 * The first value for a key is stored as-is. A second value for the same key
 * turns the property into an array of both, and further values are appended.
 * Deal with that yourself. Empty keys and empty values are ignored.
 *
 * @param $node The object to set the property on (modified in place).
 * @param $key Name of the property.
 * @param $value Value to set or append.
 */
function import_html_absorb_properties(&$node, $key, $value) {
  if (!$key) {
    debug("Odd, when absorbing properties, value:'$value' is a value for what key? The calling function passed a null key to be absorbed.");
    return;
  }
  if (!$value) {
    debug("Odd, when absorbing properties, '$key' had a null value. This is probably not an error.", 2);
    return;
  }

  // First sighting of this key: a plain scalar value is enough.
  if (!isset($node->$key)) {
    $node->$key = $value;
    return;
  }

  // auto-expand into arrays - most metas can legally have duplicates
  $collected = $node->$key;
  if (!is_array($collected)) {
    $collected = array($collected);
  }
  $collected[] = $value;
  $node->$key = $collected;
}
|
| 678 |
|
| 679 |
|
| 680 |
/**
 * Work out a title for a node, falling back on the configured strategy
 * ('handle_no_title') when the node has none.
 *
 * @param $node Node object; ->title is read, and ->path when guessing.
 *
 * @return The node's own title if it has one. Otherwise a label guessed from
 * the path (IMPORT_HTML_GUESS), 'Untitled Document' (IMPORT_HTML_DEFAULT), or
 * the unchanged empty title for any other setting.
 */
function import_html_guess_document_title($node) {
  if ($node->title) {
    return $node->title;
  }

  $strategy = import_html_variable('handle_no_title');
  if ($strategy == IMPORT_HTML_GUESS) {
    return import_html_guess_label($node->path);
  }
  if ($strategy == IMPORT_HTML_DEFAULT) {
    return 'Untitled Document';
  }

  return $node->title;
}
|
| 693 |
|
| 694 |
/**
 * Derive a human-readable label from a file path.
 *
 * Takes the last path segment (or the one before it when the path ends in a
 * slash), turns underscores into spaces and drops everything from the last
 * dot onward.
 *
 * @param $title A path such as 'about/our_team.html'.
 *
 * @return The guessed label, eg 'our team'.
 */
function import_html_guess_label($title) {
  // explode() stands in for split(), which no longer exists as of PHP 7.
  $path_bits = explode('/', (string) $title);
  $label = array_pop($path_bits);
  // Compare against '' instead of testing truthiness, so that a segment
  // named '0' is not mistaken for a trailing slash.
  if ($label === '' && $path_bits) {
    // it had a trailing slash
    $label = array_pop($path_bits);
  }
  $label = str_replace('_', ' ', (string) $label);

  // Chop the suffix, if there is one.
  $last_dot = strrpos($label, '.');
  if ($last_dot !== FALSE) {
    $label = substr($label, 0, $last_dot);
  }
  return $label;
}
|
| 702 |
|
| 703 |
|
| 704 |
/**
 * Return the nice path alias of an imported page.
 *
 * The alias is the 'import_site_prefix' setting followed by the relative path
 * without its leading slash, with spaces encoded as %20. When the
 * 'trim_suffixes' setting is on (and $leave_suffix is not), a trailing
 * default document name such as 'index.html' is removed so the page is known
 * by its directory; failing that, the file suffix is chopped.
 *
 * @param $rel_path Path of the source file relative to the import root.
 * @param $leave_suffix TRUE to skip all suffix and index trimming.
 *
 * @return The path alias.
 */
function _import_html_calc_path($rel_path, $leave_suffix = FALSE) {
  $path = import_html_variable('import_site_prefix') . preg_replace('|^/|', '', $rel_path);
  // URLs should NOT have spaces, but old sites may have done this
  $path = str_replace(' ', '%20', $path);

  if ($leave_suffix || !import_html_variable('trim_suffixes')) {
    return $path;
  }

  // Simplify the URL if possible by trimming the suffix and 'index'
  // but remember the original path somewhere, we'll need to link it forward
  // once the new node is established.

  // To be clever, special-case the 'index.html' files to be
  // linked to their parent directories.
  // Trailing slash is tricky.
  // /this/path is a whole navigation level above
  // /this/path/ and will resolve relative links differently!
  // We need to actually redirect, not just alias any links like that

  // explode() stands in for split(), which no longer exists as of PHP 7.
  $default_documents = explode(',', import_html_variable('default_document'));
  $trimmed_path = $path;
  foreach ($default_documents as $default_document) {
    // Quote the name so the dot in 'index.html' only matches a real dot.
    $pattern = '|/('. preg_quote(trim($default_document), '|') .')$|';
    $trimmed_path = preg_replace($pattern, '', $trimmed_path);
  }

  if ($trimmed_path !== $path) {
    debug("It's an index page, so we will refer to $path as $trimmed_path", 2);
    return $trimmed_path;
  }

  // No change, chop suffix instead. The suffix may not contain a slash,
  // otherwise 'dir.v2/page' would be cut down to 'dir'.
  return preg_replace('|\.[^\./]+$|', '', $path);
}
|
| 742 |
|
| 743 |
/**
 * Find and initialize the transformation template. Caching retrieval.
 *
 * The file is looked for under the path as given, then in the import_html
 * module directory, then in the files directory. Successfully parsed
 * stylesheets are cached per requested file name, so asking for a different
 * stylesheet later no longer returns the first one that was loaded.
 *
 * @param $xslfile Path or file name of the XSL stylesheet.
 *
 * @return The parsed stylesheet as returned by parse_in_xml_file(), or FALSE
 * (after setting an error message) if the file could not be located.
 */
function _import_html_get_xsl_doc($xslfile) {
  static $xsldocs = array();
  if (!empty($xsldocs[$xslfile])) {
    return $xsldocs[$xslfile];
  }

  // Check if and where filepath can be found
  // Search first under full path, then module dir, then under files dir
  $xslfilepath = $xslfile;
  if (!file_exists($xslfilepath)) {
    $xslfilepath = drupal_get_path('module', 'import_html') ."/$xslfile";
  }
  if (!file_exists($xslfilepath)) {
    $xslfilepath = file_directory_path() ."/$xslfile";
  }

  if (!file_exists($xslfilepath)) {
    drupal_set_message("Unable to locate the Transformation Stylesheet '$xslfilepath' ", "error");
    return false;
  }

  debug("Loading Transformation Stylesheet from $xslfilepath", 2);
  $xsldoc = parse_in_xml_file($xslfilepath, false);
  if ($xsldoc) {
    // Only remember successful parses, so a failure is retried next time.
    $xsldocs[$xslfile] = $xsldoc;
  }
  return $xsldoc;
}
|
| 773 |
|
| 774 |
/**
 * Run the url-rewrite XSL over the source document
 * TODO allow for the non-base version of Drupal links
 *
 * The relative links need to be converted into path-to-top and back down
 * again. Relative references just cannot be maintained.
 *
 * @param $xmldoc The source document to transform.
 * @param $rel_path Path of the page being imported, relative to the import
 * root. May also be a full URL, in which case links are kept pointing at the
 * remote host.
 * @param $profile import_html profile settings. The keys read here are
 * 'import_site_prefix', 'file_storage_path', 'relink_files' and
 * 'strip_script_tags'.
 *
 * @return an XML doc again, or FALSE if the stylesheet could not be loaded or
 * the rewritten source no longer parses.
 */
function import_html_rewrite_links($xmldoc, $rel_path, $profile) {
  static $rewrite_xsldoc; // memo this to speed up bulk imports
  static $xslfilepath;
  if (!$rewrite_xsldoc) {
    $xslfilepath = drupal_get_path('module', 'import_html') ."/rewrite_href_and_src.xsl";
    $rewrite_xsldoc = parse_in_xml_file($xslfilepath, false);
  }
  if (!$rewrite_xsldoc) {
    trigger_error('Cannot rewrite links. rewrite_href_and_src.xsl unavailable', E_USER_WARNING);
    return FALSE;
  }

  debug("Rewriting links for a file called '$rel_path'. dirname($rel_path) is ". dirname($rel_path), 2);
  debug_pre(array("import_html profile settings used for rewriting" => $profile), 3);

  // dirname('/ok.htm') returns '\'; No idea why, may only happen at root level on Win
  // !! B-X

  // $rel_base is the path from the import root to the current page dir
  // I want a trailing slash, but not a leading one for the next concatenation
  // dirname('/a/dir/') returns '/a' - which is not what I want
  $rel_dir = preg_match('|/$|', $rel_path) ? $rel_path : dirname($rel_path);
  $rel_base = ensure_trailing_slash($rel_dir);

  // The site root handed to the stylesheet is the top of the imported
  // section, not the top of the Drupal site.
  $path_to_import_top = url(ensure_trailing_slash($profile['import_site_prefix']));
  $site_root = $path_to_import_top;

  // if we are re-writing thing/index.htm to thing - our links will resolve differently!
  // either too high for thing, or too low for the thing/index.htm alias.
  $href_base = url(ensure_trailing_slash($profile['import_site_prefix']) . $rel_base);

  // Create the prefix for resource sources
  // Is url() OK for files with unclean urls? - NO. Neither is file_create_url
  $src_root = base_path() . $profile['file_storage_path'];

  // The ternary needs its own parentheses: '.' binds tighter than '?:', so
  // without them the whole concatenation became the condition and $src_base
  // always came out as ''.
  $src_base = ensure_trailing_slash($src_root) . (($rel_base == '/') ? '' : $rel_base);

  // Or not, if we are still linking to full URLs (demo or partial import)
  $url_parts = parse_url($rel_path);
  if (!empty($url_parts['host'])) {
    // it's remote! Keep the scheme and port of the source URL.
    $scheme = !empty($url_parts['scheme']) ? $url_parts['scheme'] : 'http';
    $port = !empty($url_parts['port']) ? ':'. $url_parts['port'] : '';
    $path_to_import_top = $rel_path;
    $site_root = $scheme .'://'. $url_parts['host'] . $port .'/';
    $src_root = $site_root;
    $src_base = $rel_path;
  }
  $src_base = str_replace('/./', '/', $src_base);
  $href_base = str_replace('/./', '/', $href_base);

  debug("
    <b>Rewrite patterns:</b>
    Path to the top of this (relative) server is $site_root .
    Path to top of the prefixed section
    ({$profile['import_site_prefix']})
    from here ($rel_path)
    to our import base
    ({$profile['import_site_prefix']})
    would be '$path_to_import_top'.
    Path to a relative <em>neighbour</em> of this page would be
    ($href_base)
    or to find the base for <em>relative</em> resource files over in
    the file storage area
    ({$profile['file_storage_path']})
    would be '$src_base' ", 2
  );

  $parameters = array(
    // These parameters tell the rewriter what to prepend to the links.
    // They are instructions how this page will find its missing brethren
    // when we put it where we put it.
    // Images and Pages may end up in different places.
    'site_root' => $site_root,
    'src_root' => $src_root,
    'src_base' => $src_base,
    'href_base' => $href_base,
    'replace_suffix' => $profile['relink_files'],
    'new_suffix' => '',
    'xsl_path' => $xslfilepath,
    'strip_script_tags' => $profile['strip_script_tags'],
  );
  debug("
    XSL for URL rewrites loaded OK.
    HTML links for files that were under '$rel_base' will be made relative to '"
    . $parameters['href_base'] ."' and '". $parameters['src_base'] ."'"
    . ( $parameters['strip_script_tags'] ? 'All inline script blocks will be discarded from the source.'. $parameters['strip_script_tags'] : '')
    , 2);
  debug_pre(array("PARSED XSL $xslfilepath . XSL" => xml_tostring($rewrite_xsldoc)), 4);

  $rewritten = xmldoc_plus_xsldoc($xmldoc, $rewrite_xsldoc, $parameters);

  // collapse dir-up "../" paths. Too tricky for XSL. Hope it doesn't break anything
  $rewritten = preg_replace('|/[^\.][^/\s"\'>]*/\.\./|', '/', $rewritten);

  // Label => value, like the other debug_pre() calls in this function.
  debug_pre(array("The source after URL rewriting . XHTML (string)" => $rewritten), 2);

  $xmldoc = parse_in_xml_string($rewritten, FALSE);
  if (empty($xmldoc)) {
    trigger_error("Failed to rewrite links into a valid XML file", E_USER_WARNING);
    return FALSE;
  }

  debug_pre(array("Parsed in again. XHTML (XML)" => xml_tostring($xmldoc)), 3);
  return $xmldoc;
}
|
| 886 |
|
| 887 |
/**
 * Run the strip_tables XSL over the source document
 *
 * @param $xmldoc The source document to transform.
 *
 * @return an XML doc again, or false if the stylesheet could not be loaded
 * or the transformed source no longer parses.
 */
function import_html_strip_tables($xmldoc) {
  static $strip_tables_xsldoc; // memo this to speed up bulk imports

  // Worked out on every call: the debug output below names the file even
  // when the stylesheet itself comes from the memo.
  $xslfilepath = drupal_get_path('module', 'import_html') ."/strip_tables.xsl";
  if (!$strip_tables_xsldoc) {
    $strip_tables_xsldoc = parse_in_xml_file($xslfilepath, FALSE);
  }
  if (!$strip_tables_xsldoc) {
    trigger_error('Cannot strip_tables. strip_tables.xsl unavailable', E_USER_WARNING);
    return false;
  }

  debug_pre(array("PARSED strip_tables XSL $xslfilepath . XSL" => xml_tostring($strip_tables_xsldoc)) , 3);
  $parameters = array();
  $rewritten = xmldoc_plus_xsldoc($xmldoc, $strip_tables_xsldoc, $parameters);

  // normalize space to clean up the gaps
  $rewritten = preg_replace("/\\s*\\n\\s*/", "\n", $rewritten);

  debug_pre(array("The source after stripping tables . XHTML (string)" => $rewritten), 3);
  $xmldoc = parse_in_xml_string($rewritten, false);
  if (!$xmldoc) {
    trigger_error("Failed to strip tables and end up with a valid XML file", E_USER_WARNING);
    return false;
  }

  return $xmldoc;
}
|
| 915 |
|
| 916 |
/**
 * Run the strip_scripts XSL over the source document
 *
 * @param $xmldoc The source document to transform.
 *
 * @return an XML doc again, or false if the stylesheet could not be loaded
 * or the transformed source no longer parses.
 *
 * @see import_html_strip_tables
 */
function import_html_strip_scripts($xmldoc) {
  static $strip_scripts_xsldoc; // memo this to speed up bulk imports
  if (!$strip_scripts_xsldoc) {
    $xslfilepath = drupal_get_path('module', 'import_html') ."/strip_scripts.xsl";
    $strip_scripts_xsldoc = parse_in_xml_file($xslfilepath, false);
  }
  if (!$strip_scripts_xsldoc) {
    trigger_error('Cannot strip_scripts. strip_scripts.xsl unavailable', E_USER_WARNING);
    return false;
  }

  $parameters = array();
  $rewritten = xmldoc_plus_xsldoc($xmldoc, $strip_scripts_xsldoc, $parameters);

  $xmldoc = parse_in_xml_string($rewritten, false);
  if (!$xmldoc) {
    // This message used to say "strip tables", copied from the sibling function.
    trigger_error("Failed to strip scripts and end up with a valid XML file", E_USER_WARNING);
    return false;
  }
  return $xmldoc;
}
|
| 935 |
|
| 936 |
/**
 * Use XSL to convert Dreamweaver 'instanceEditable' comments into semantic
 * tagged divs
 *
 * @param $xmldoc The source document to transform.
 *
 * @return an XML doc again. NULL if tag_editable_areas.xsl could not be
 * loaded, false if the transformed source no longer parses.
 *
 * @see import_html_strip_tables
 */
function import_html_tag_editable_areas($xmldoc) {
  static $editable_xsldoc; // memo this to speed up bulk imports

  if (!$editable_xsldoc) {
    $module_dir = drupal_get_path('module', 'import_html');
    $editable_xsldoc = parse_in_xml_file($module_dir ."/tag_editable_areas.xsl", false);
    if (!$editable_xsldoc) {
      trigger_error('Cannot tag_editable_areas. tag_editable_areas.xsl unavailable', E_USER_WARNING);
      return NULL;
    }
  }

  $parameters = array();
  $tagged_source = xmldoc_plus_xsldoc($xmldoc, $editable_xsldoc, $parameters);

  $tagged_doc = parse_in_xml_string($tagged_source, false);
  if ($tagged_doc) {
    return $tagged_doc;
  }

  trigger_error("Failed to tag commented editable areas (eg from Dreamweaver) and end up with a valid XML file", E_USER_WARNING);
  return false;
}
|
| 962 |
|
| 963 |
|
| 964 |
|
| 965 |
|
| 966 |
/**
 * Ensure a string is able to be used as an XML, CSS or Javascript ID.
 * Every run of characters other than ASCII letters, digits and underscore is
 * replaced by a single underscore.
 * http://www.w3.org/TR/REC-xml/#NT-Name
 *
 * @param $name The candidate name.
 *
 * @return The cleaned name.
 *
 * @see form_clean_id() - which should have done this
 */
function import_html_check_name($name) {
  $disallowed_run = '|[^a-zA-Z0-9_]+|';
  $safe_name = preg_replace($disallowed_run, '_', $name);
  return $safe_name;
}
|
| 975 |
|
| 976 |
|
| 977 |
|
| 978 |
/**
 * Avoid double-ups, if the path already exists, UPDATE the existing node.
 * Can't have two content nodes claiming the same path or it won't validate.
 * Plus, we want to retain any info that's been added via drupal. Probably.
 *
 * The alias is only treated as a match when it resolves to an internal path
 * of the form node/N and that node can be loaded. An alias that points
 * anywhere else is reported as an error and the imported node is returned
 * unchanged.
 *
 * @param $node - partially created node from import. Key lookup on $node->path
 * @return $node - possibly with pre-existing values blended in. Importantly - the nid
 */
function import_html_merge_over_existing_node($node) {
  $internal_link = drupal_get_normal_path($node->path);

  if ($internal_link == $node->path) {
    // The alias is not in use yet, nothing to merge with.
    return $node;
  }

  // Found an internal match, the alias is already assigned.
  // Only accept it if it really is a node path: taking the last path segment
  // of something like 'taxonomy/term/5' would overwrite an unrelated node.
  $nid = preg_match('|^node/(\d+)$|', $internal_link, $matches) ? $matches[1] : NULL;
  $old_node = $nid ? node_load($nid) : FALSE;

  if (!$old_node) {
    // Should never happen - just paranoia
    drupal_set_message("
      When looking for an alias to '{$node->path}',
      got error finding node ID from the internal link
      '$internal_link' - which was supposed to return a nid", 'error'
    );
    return $node;
  }

  $node->nid = $nid;
  debug("
    Page path alias '{$node->path}' already exists,
    It's already linked to node id '{$node->nid}'.
    This data import will <em>replace</em> that content,
    but try to keep any other values.
    ", 2);

  // Layer the imported values on top of the existing item, so that
  // Drupal-only info on the old node is not lost.
  foreach ($node as $key => $value) {
    if (is_array($value)) { // merge deeper sets, like taxonomy
      if (!isset($old_node->$key) || !is_array($old_node->$key)) {
        $old_node->$key = array();
      }
      foreach ($value as $k => $v) {
        $old_node->{$key}[$k] = $v;
      }
    }
    else {
      $old_node->$key = $value;
    }
  }
  return $old_node;
}
|
| 1028 |
|
| 1029 |
|
| 1030 |
/**
 * Tidy URLs before saving locally - for URL imports
 *
 * Query strings are kept but made filesystem-friendly: '?' becomes '%3f' and
 * '&' becomes '%26'. Fragment ids ('#' and everything after it) are dropped.
 *
 * Unless the 'allow_bad_urls' setting is on, every run of characters other
 * than letters, digits and _ - ~ . / % is then replaced with an underscore.
 *
 * @param $rel_path The URL or path to clean up.
 *
 * @return The path to save the file under.
 */
function safe_filepath_from_url($rel_path) {
  // Applied in order, each one to the result of the one before.
  $patterns = array("|\?|", "|\&|", "|#.*|");
  $replacements = array("%3f", "%26", "");
  $save_as = preg_replace($patterns, $replacements, $rel_path);

  if (!import_html_variable('allow_bad_urls')) {
    $save_as = preg_replace("|[^A-Za-z0-9_\-~\./%]+|", "_", $save_as);
  }
  return $save_as;
}
|
| 1050 |
|
| 1051 |
/**
 * Callback for charset_decode_utf_8(): turn one matched byte sequence into a
 * numeric character reference.
 *
 * @param $matches preg match array; $matches[0] holds either a 2-4 byte UTF-8
 * sequence or a single stray high byte.
 *
 * @return The reference, eg '&#233;'.
 */
function _charset_decode_utf_8_entity($matches) {
  $bytes = $matches[0];
  $length = strlen($bytes);
  if ($length == 1) {
    // A stray high byte is encoded by its own value.
    return '&#'. ord($bytes) .';';
  }

  // The lead byte carries 5, 4 or 3 payload bits for a 2, 3 or 4 byte
  // sequence; every following byte carries 6.
  $codepoint = ord($bytes[0]) & (0xFF >> ($length + 1));
  for ($i = 1; $i < $length; $i++) {
    $codepoint = ($codepoint << 6) | (ord($bytes[$i]) & 0x3F);
  }
  return '&#'. $codepoint .';';
}

/**
 * Replace every 8-bit character in a string with a numeric character
 * reference, leaving plain 7-bit text alone.
 *
 * UTF-8 sequences of two, three or four bytes become the reference for the
 * character they encode. Any other high byte is encoded by its byte value;
 * this was added because htmltidy was failing to catch the (copyright) 0xA9
 * (#169) symbol.
 *
 * Written with preg functions only: ereg() and the /e modifier used by the
 * original are gone as of PHP 7.
 *
 * http://nz2.php.net/manual/en/function.utf8-decode.php#85034
 *
 * @param $string The text to convert.
 *
 * @return The text with only 7-bit characters in it.
 */
function charset_decode_utf_8 ($string) {
  /* Only do the slow convert if there are 8-bit characters */
  if (!preg_match('/[\x80-\xFF]/', $string)) {
    return $string;
  }

  // Longest sequences first, then whatever high bytes are left over.
  return preg_replace_callback(
    '/[\xF0-\xF7][\x80-\xBF]{3}|[\xE0-\xEF][\x80-\xBF]{2}|[\xC0-\xDF][\x80-\xBF]|[\x80-\xFF]/',
    '_charset_decode_utf_8_entity',
    $string
  );
}
|
| 1085 |
|
| 1086 |
|
| 1087 |
/**
 * dummy error handler
 * Used to shush DOM errors when we know the doc is probably invalid
 *
 * Does nothing with its arguments and returns nothing.
 *
 * @param $err The error level. Ignored.
 * @param $str The error message. Ignored.
 */
function stfu($err, $str) {
  # debug($str, 4);
}
|