/[drupal]/contributions/modules/import_html/import_html_process.inc
ViewVC logotype

Contents of /contributions/modules/import_html/import_html_process.inc

Parent Directory | Revision Log | View Revision Graph


Revision 1.6 - (show annotations) (download) (as text)
Sat Mar 14 04:30:48 2009 UTC (8 months, 2 weeks ago) by dman
Branch: MAIN
CVS Tags: HEAD
Changes since 1.5: +64 -24 lines
File MIME type: text/x-php
Trying to move everything into a fresh branch, tagged 6--1
1 <?php
2 /**
3 * @file Actual routines for importing files.
4 *
5 *
6 * @ingroup import_html Import HTML
7 * @author Dan Morrison http://coders.co.nz/
8 * @version $Id: import_html_process.inc,v 1.5.2.2 2009/03/14 03:18:51 dman Exp $
9 *
10 */
11
12
/**
 * Files have been selected: import each one in turn and collect the results.
 *
 * @param $file_list
 *   An array of simple file paths, probably selected from the file_list form.
 *   Empty entries are skipped.
 * @param $context
 *   A set of parameters, similar to the import_html profile, possibly from the
 *   list_filesystem form. Should contain the base path that the submitted
 *   files are relative to.
 *
 * @return
 *   The accumulated items returned by import_html_import_file() for every
 *   file, NULL if no files were given, or FALSE as soon as one file yields no
 *   result (the remaining files are then not attempted).
 */
function import_html_import_files($file_list, $context) {
  drupal_set_message(t('Processing files!'));
  # dsm(count($file_list) ." files to go on");
  # dsm($file_list);
  # dsm(array('Context' => $context));

  if (empty($file_list)) {
    drupal_set_message(t("No Files Selected. Nothing to import"), 'error');
    return;
  }

  // TODO see if we really need to keep a handle on all these results at once.
  // It will be eating into our memory.
  $results = array();
  foreach ($file_list as $rel_path) {
    if (!$rel_path) {
      continue;
    }

    // Keep the per-file result separate from the accumulator. Re-using
    // $results here used to throw away everything imported so far and then
    // append the current items onto themselves.
    $file_results = import_html_import_file($rel_path, $context);
    if (!$file_results) {
      drupal_set_message(t('Failed to get any results from the attempted analysis of %rel_path. The source file path was probably unavailable or incorrect.', array('%rel_path' => $rel_path)), 'error');
      return FALSE;
    }

    // Result of importing a file MAY be more than one item, unlikely as it may be.
    debug_pre(array("Result of processing file $rel_path" => $file_results), 4);
    foreach ($file_results as $item) {
      // Discard debug logs, try to save space.
      if (is_object($item)) {
        unset($item->file_data);
      }
      $results[] = $item;
    }
  }

  // This isn't happening until I visit admin?
  menu_rebuild();
  return $results;
}
54
/**
 * Given a html file, prepare all the node info we can get out of it.
 *
 * This func mainly prepares the paths and relative links. Data extraction
 * happens in import_html_process_html_page().
 *
 * Unless called from the demo form, it does validate and save the node(s) to
 * the database.
 *
 * @param $rel_path
 *   Path of the file to import, relative to $context['base_path'].
 * @param $context
 *   Import parameters. 'profile_id' and 'base_path' are read here, and
 *   'form_id' is compared against the demo form id.
 *
 * @return
 *   An array of file-info arrays ('source', 'dest', 'rel_path', plus 'type'
 *   for plain resources or 'node' and, when saving, 'teaser' for pages). It
 *   may contain more than one entry in extreme cases. NULL when the file is
 *   skipped or could not be fetched, FALSE when processing produced nothing.
 */
function import_html_import_file($rel_path, $context) {

  // Read the profile id and use that as a context for all settings
  $profile = import_html_profile($context['profile_id']);
  $base_path = $context['base_path'];
  $dest_root = $profile['file_storage_path'];
  $url_parts = parse_url($base_path);
  // parse_url() may return FALSE, and 'host' is absent for local paths.
  $host = (is_array($url_parts) && isset($url_parts['host'])) ? $url_parts['host'] : NULL;

  // Collected result entries, one per resource or node.
  $files = array();

  drupal_set_message(t("<strong>Importing</strong> '%rel_path'", array('%rel_path' => $rel_path)));

  $source_path = $base_path . $rel_path;
  $save_as = safe_filepath_from_url($rel_path);
  $dest_path = preg_replace("|/+|", "/", $dest_root . $save_as);

  if (preg_match("|/$|", $rel_path)) {
    // Handle trailing slashes differently at home and away
    if ($host) {
      // It's remote
      // explode() replaces split(), which no longer exists in PHP 7+.
      $default_documents = explode(",", $profile['default_document']);
      // need a dummy filename if retrieving default docs.
      $dest_path .= trim(array_shift($default_documents));
    }
    else {
      return; //skip local directories altogether (their contents is selected individually)
    }
  }

  $file = array(
    'source' => $source_path,
    'dest' => $dest_path,
    'rel_path' => $rel_path,
  );

  // Handle files that are resources.
  // Copy them into the files folder and return
  $checkfile = is_local($source_path) ? $source_path : $dest_path;
  // can't use mime detection on remote lookups yet
  $file_class = import_html_guess_file_class($checkfile);
  if ($file_class != 'html') {
    // non-page resource - what sort of processing can I do here?
    drupal_set_message(t("
      I think (due to file suffix '%doctype') that '%source' is not a html page
      I can process.<br/> It's just been copied into '!dest'.",
      array(
        '%source' => basename($source_path),
        '!dest' => l($dest_path, $dest_path),
        '%doctype' => $file_class,
      )
    ));
    import_html_get_raw_file_local($source_path, $dest_path, $host);
    $file['type'] = 'resource';
    $files[] = $file;
    return $files;
  }

  // Compare the alias path of this new page with what we've already got
  $new_path = _import_html_calc_path($rel_path);
  if (($normal_path = drupal_get_normal_path($new_path)) != $new_path) {
    // We recognise that alias, thus an item already exists in that path.
    if ($profile['handle_duplicates'] == IMPORT_HTML_SKIP) {
      drupal_set_message("We already have the URL alias '$new_path' in the system as '$normal_path'. According to import_html settings, this import is being skipped");
      return;
    }
    drupal_set_message("We already have '$new_path' in the system as '$normal_path'. Overwriting/updating it with the new import");
  }

  // Minor clean-up. Helps recover from crashes and prevents files getting renamed into file-01.etc
  if (file_exists($dest_path) && !$profile['keep_temp_files']) {
    unlink($dest_path);
  }

  if (!file_exists($dest_path) || !$profile['keep_temp_files']) {
    if (!import_html_get_raw_file_local($source_path, $dest_path, $host)) {
      drupal_set_message(t("Failed to fetch a copy of %source_path into %dest_path", array('%dest_path' => $dest_path, '%source_path' => $source_path)), 'error');
      return NULL;
    }
    debug(t("Fetched a %persistant local copy to %dest_path", array('%dest_path' => $dest_path, '%persistant' => ($profile['keep_temp_files'] ? 'persistant' : 'temporary'))), 1);
  }
  else {
    debug(t("Local copy exists at %dest_path", array('%dest_path' => $dest_path)), 1);
  }

  // We have a local copy now.
  // $node initialized and processed HERE. Produces a node OBJECT
  if ($host) {
    // Importing a remote file - as for demo
    // relinking will happen to point back at where it came from, not here
    $rel_path = $source_path;
    debug("Relinking this source will point back to the original URL context!", 2);
  }

  $nodes = import_html_process_html_page($dest_path, $rel_path, $profile);

  // At this point, the node(s) are full of data, but not yet saved.

  // On rare occasions, (using xt:document) the processing can produce an ARRAY of nodes that need saving
  // Otherwise it's a node object
  if (!$nodes) {
    drupal_set_message(t("Failed to process file '%rel_path'", array('%rel_path' => $rel_path)), "error");
    return FALSE;
  }

  // We can immediately discard the source file -
  // it should have been a temp copy made by import_html_get_raw_file_local() above
  if (file_exists($dest_path) && !$profile['keep_temp_files']) {
    unlink($dest_path);
  }

  if (!is_array($nodes)) { // cast into array for iteration anyway.
    $nodes = array($nodes);
  }

  $is_demo = isset($context['form_id']) && $context['form_id'] == 'import_html_demo_form';

  // Almost trivial loop (probably over 1 item)
  foreach ($nodes as $node) {
    // The node data object has been initialized
    // It may contain heaps of extra junk set in via a random absorbtion of elements in the XML import.
    // They will get ignored if not recognised.
    // The title goes in as a placeholder so it is escaped and the
    // translatable string stays constant.
    drupal_set_message(t("Processed page to extract content. Title: '%title' ", array('%title' => $node->title)));

    // figure if it's overwriting an existing path
    $node = import_html_merge_over_existing_node($node);

    // If processing in bulk, any error will stop any further processing.
    // (form_errors is non-null) need to clear it between each pass.
    unset($GLOBALS['form']);
    node_validate($node);

    if (!trim($node->body)) {
      form_set_error('body', t("No body content found in this node"));
    }

    $file['node'] = $node;

    // Finished prep, now save
    if ($is_demo) {
      // DO NOT actually save stuff to the database
      node_submit($node);
    }
    else {
      if ($errors = form_get_errors()) {
        drupal_set_message(t("Import of '%rel_path' did not quite validate. I'm not sure how to recover from that problem. <br/>!errors", array(
          '%rel_path' => $file['rel_path'],
          '!errors' => join(',<br/> ', $errors),
        )), 'error');
        // TODO This is not very helpful in bulk mode.
        // what can I do now?
      }
      else {
        // NOTE(review): the watchdog() calls below pass an already-translated
        // message with the severity as third argument - confirm this matches
        // the watchdog() signature of the targeted Drupal core version.
        if ($node->nid) {
          drupal_set_message(t("!node_link Exists, updating it.", array('!node_link' => l('node '. $node->nid, 'node/'. $node->nid))));
          watchdog('Import HTML', t("Updating node !node_link with content from %source_path", array('%source_path' => $source_path, '!node_link' => l($node->path, $node->path))), WATCHDOG_NOTICE);
          node_save($node);
          module_invoke_all('import_html_after_save', $profile, $node);
        }
        else {
          drupal_set_message(t("Inserting New Node. !node_link", array('!node_link' => l($node->path, $node->path))));
          watchdog('Import HTML', t("Inserting New Node !node_link with content from %source_path", array('%source_path' => $source_path, '!node_link' => l($node->path, $node->path))), WATCHDOG_NOTICE);
          $node = node_submit($node);
          // Submit doesn't actually save, it just fills in extra fields
          node_save($node);

          // Had to wait until I had an ID to do this
          // These callbacks add the aliases and menus
          module_invoke_all('import_html_after_save', $profile, $node);
          // note, navigation only gets set up on first import. After that you are on your own
        }
      } // Finished updating database

      // and show the rendering of the new page teaser
      $file['teaser'] = node_view($node, TRUE);
    }
    $files[] = $file;
  }

  #dpm(array('file will be' => $file));
  #dpm(array('Ready to save processed node(s) is' => $files));

  return $files;
}
250
251
252
/**
 * Push every selected file onto the import queue instead of importing it now.
 *
 * @param $file_list
 *   An array of simple file paths, probably selected from the file_list form.
 *   Empty entries are ignored.
 * @param $context
 *   A set of parameters, similar to the import_html profile, possibly from the
 *   list_filesystem form. Should contain the base path that the submitted
 *   files are relative to. Passed through to import_html_add_to_queue().
 */
function import_html_queue_files($file_list, $context) {
  if (empty($file_list)) {
    drupal_set_message(t("No Files Selected. Nothing to import"), 'error');
    return;
  }

  foreach ($file_list as $queued_path) {
    if (!$queued_path) {
      // Unselected entry, nothing to queue.
      continue;
    }
    import_html_add_to_queue($queued_path, $context);
  }
}
271
/**
 * Inserts the requested action into the daemon queue to be called later.
 *
 * A row naming 'import_html_import_file' as the command is written to the
 * {import_html_queue} table, with the serialized context and the current
 * time, and a status message is shown.
 *
 * @param $rel_path
 *   The file path to be imported later; stored as the queue 'subject'.
 * @param $context
 *   The import context; stored serialized.
 */
function import_html_add_to_queue($rel_path, $context) {
  $queued_at = time();
  $stored_context = serialize($context);
  db_query(
    "INSERT INTO {import_html_queue} (command, subject, context, date) VALUES ('%s', '%s', '%s', '%d')",
    'import_html_import_file',
    $rel_path,
    $stored_context,
    $queued_at
  );

  $notice = t("%rel_path added to processing queue for importing soon", array('%rel_path' => $rel_path));
  drupal_set_message($notice);
}
280
281
282
283
284
/**
 * Carefully fetch a (potentially remote?) file and save it nearby.
 *
 * @param $source_path
 *   Where to read from: a local path, or a URL when $host is set.
 * @param $dest_path
 *   Local path to write the copy to. Its directory is created via mkdirs().
 * @param $host
 *   Host part of the import base URL. A non-empty value means the source is
 *   treated as remote.
 *
 * @return
 *   TRUE if the copy succeeded (or source and destination are the same
 *   existing file), FALSE if the copy failed.
 */
function import_html_get_raw_file_local($source_path, $dest_path, $host) {
  mkdirs(dirname($dest_path), FILE_CREATE_DIRECTORY);
  drupal_set_message(t("Fetching content from %location '<a href='!realpath'>%source_path</a>' now.", array(
    '%source_path' => $source_path,
    '%location' => $host,
    '!realpath' => realpath($source_path),
  )));
  debug("Saving temp file locally as '$dest_path' ", 3);

  if ($host) {
    // It's remote. Trust PHP5 and allow_url_fopen is available
    if (!copy($source_path, $dest_path)) {
      drupal_set_message(t("Remote file copy failed"), 'error');
      return FALSE;
    }
  }
  else {
    // local copy
    // realpath() returns FALSE for a path that does not exist. Without the
    // explicit check a missing source and a not-yet-created destination
    // compare equal (FALSE == FALSE) and a failed import is reported as a
    // successful in-place one.
    $real_source = realpath($source_path);
    if ($real_source !== FALSE && $real_source == realpath($dest_path)) {
      drupal_set_message("Copying between identical source and destination, $source_path $dest_path , importing file in-place.");
      return TRUE;
    }
    // drupal file copy assumes everything is under 'files' directory
    #$dest_path = realpath(dirname($dest_path)) .'/'. basename($dest_path);

    if (!copy($source_path, $dest_path)) {
      drupal_set_message(t('Local file copy failed (%from to %to)', array('%from' => $source_path, '%to' => $dest_path)), 'error');
      drupal_set_message("Source <code>$source_path</code> is <pre>". print_r(stat($source_path), 1) ."</pre>", 'error');
      drupal_set_message("Dest folder properties are <code>$dest_path</code> <pre>". print_r(stat(dirname($dest_path)), 1) ."</pre>", 'error');

      return FALSE;
    }
  }

  debug("Copied import file from '$source_path' to '$dest_path'", 3);
  return TRUE;
}
327
328
/**
 * Analyse a source page and create a node definition from it.
 *
 * Most of the processing magic is in here.
 * The source may be given as a file path or as a node object that already
 * carries some pre-set values.
 * Internally we should continue using the object methods.
 *
 * This processing is still in the 'validate' phase, so should
 * not cause anything to happen, just configure the node object
 *
 * @param $path
 *   The file (or object) to read the data from. If it's a string, it's taken
 *   to be the filename, if an object, it's the node. A node should contain a
 *   ->raw_html and a ->path at least.
 * @param $rel_path
 *   Where this html page was found, relative to its own server root. This is
 *   used to rewrite its urls. If the path is a directory, it should end with
 *   a slash. ( /a/path/ == /a/path/index.html != /a/path )
 * @param $profile
 *   The settings for this import process.
 *
 * @return
 *   An ARRAY of new node objects keyed by their path. Some processes may
 *   return multiple nodes. FALSE if parsing or transformation failed, NULL if
 *   XML/XSL support is unavailable.
 */
function import_html_process_html_page($path, $rel_path, $profile) {
  if (!init_xsl()) {
    trigger_error("Sorry, with no XML support there will be no content scanning AT ALL. Aborting process. See the import_html_help.htm for info on enabling XML under PHP.", E_USER_ERROR);
    return;
  }
  debug(t("import_html.module debugging is enabled to level %debug_level. Visit the <a href='!profile_config'>profile configuration</a> (advanced) to turn down the volume.", array('%debug_level' => $profile['debug_level'], '!profile_config' => url(IMPORT_HTML_ADMIN_PATH .'/profile/'. $profile['profile_id']))), 1);
  debug_pre(array($profile, "The import profile settings being used to import_html_process_html_page($rel_path)"), 2);

  // Defined up front so the failure checks below never touch an undefined
  // variable, whichever input branch is taken.
  $xmldoc = NULL;
  $source_node = NULL;
  $importxml = NULL;

  if (is_string($path)) {
    // read from file
    debug("Processing file as HTML page. Full file path: '$path' , will be imported as a relative path under the current section. relative-path is:'$rel_path'", 1);

    if (!file_exists($path)) {
      trigger_error("Path '$path' was not found. This should have been a local copy of the file being imported, but the paths may be wrong somehow. Abject failure processing $rel_path");
    }

    /*
     * Trying to parse pure XML first is causing problems
     * Either I want everything to be html, (always tidy)
     * or I allow for exsl:document blocks (which can't be tidied)
     * Option for now is try to parse, and only tidy if that fails.
     */
    // temporarily ignore parser errors (catch?)
    set_error_handler('stfu');
    $xmldoc = parse_in_xml_file($path, $profile['force_tidy']);
    restore_error_handler();

    if (!$xmldoc && $profile['force_tidy']) {
      debug("$path was not tidy enough - running tidy over it now so I can parse it.", 1);
      // If a raw XML parse failed,
      // tell parse_in_xml_file() to use htmlTidy before it begins
      // TODO - add a flag to skip this double-processing, (parsing twice) it may be a bit slow if it's not often used
      $xmldoc = parse_in_xml_file($path, TRUE);
    }
    debug_pre(array("Finished reading from file:" => xml_tostring($xmldoc)), 3);
    $source_node = new stdClass();
  }
  elseif (is_object($path)) {
    // We may have passed in a source-node object where the path was expected instead.
    // A bit of a sneak. The given node has the source HTML in $node->raw_html
    $source_node = $path;
    $path = $source_node->path;
    if (empty($source_node->raw_html)) {
      trigger_error(t("import_html_process_html_page called with no HTML source to analyse"), E_USER_ERROR);
    }
    debug("Processing page source, ". strlen($source_node->raw_html) ." chars long", 2);
    debug_pre(array("Raw source" => $source_node->raw_html), 3);

    $xmldoc = parse_in_xml_string($source_node->raw_html, $profile['force_tidy']);
  }

  if (!$xmldoc) {
    // parsing failed
    drupal_set_message(t("Import_HTML failed to initialize or parse XMLdoc input"), "error");
    // Insert extra debug to see why. Only possible when the caller handed in
    // a node carrying the raw HTML; a file-based import has none.
    if (is_object($source_node) && isset($source_node->raw_html)) {
      $source_node->file_data['after_tidying'] = xml_tidy_string($source_node->raw_html);
    }
    return false;
  }
  debug_pre(array("PARSED XML $path . XHTML" => xml_tostring($xmldoc)), 2);

  if ($profile['rewrite_links']) {
    // use XSL to rewrite links to fit into Drupal
    $xmldoc = import_html_rewrite_links($xmldoc, $rel_path, $profile);
  }
  if ($profile['strip_tables']) {
    $xmldoc = import_html_strip_tables($xmldoc);
  }
  if ($profile['strip_scripts']) {
    $xmldoc = import_html_strip_scripts($xmldoc);
  }
  // The leading TRUE forces this step on regardless of the profile setting.
  if (TRUE || $profile['tag_editable_areas']) {
    $xmldoc = import_html_tag_editable_areas($xmldoc);
  }

  // Debug trace data
  if (import_html_variable('debug_level')) {
    $source_node->file_data['after_rewriting'] = xml_tostring($xmldoc);
  }

  // Import content as node.
  // Translate the source text to the known tidy simple, tagged HTML structure now
  $parameters = array(
    'xmlid' => TRUE,
  );
  if (!empty($profile['content_tag_id'])) {
    $parameters['contentid'] = $profile['content_tag_id'];
  }
  // Only used for the debug message; the parameter itself stays optional.
  $content_id_label = isset($parameters['contentid']) ? $parameters['contentid'] : '';

  if ($xsldoc = _import_html_get_xsl_doc($profile['translation_template'])) {
    debug("Using XSL translation template to extract semantic content. Will search for body content labelled '". $content_id_label ."' in the source. Active XML Namespaces are {$xmldoc->firstchild->nodename} : {$xmldoc->firstchild->namespaceuri} - {$xsldoc->firstchild->nodename} : {$xsldoc->firstchild->namespaceuri} \n", 1);
    $importxml = xmldoc_plus_xsldoc($xmldoc, $xsldoc, $parameters);
    debug_pre(array("Transform Successful. TRANSLATED from messy source into a pure xhtml page to import" => $importxml), 2);
  }
  else {
    trigger_error("Failed to initialize XSLdoc", E_USER_WARNING);
  }

  if (!$importxml) {
    trigger_error("Nothing useful extracted via XML from that content", E_USER_WARNING);
    return false;
  }

  $xmldoc = parse_in_xml_string($importxml, false);
  //
  // Allow one source document to produce multiple nodes
  // If the process has resulted in xt:document blocks, each block
  // is a new item.
  // Either there is a html element in the input ... or many of them.
  $html_elements = xml_getelementsbytagname($xmldoc, 'html');

  debug("Found ". count($html_elements) ." html elements in source doc", 3);

  $nodes = array();
  $node = NULL;
  // probably only one, but we'll iterate over an array of one then
  foreach ($html_elements as $html_element) {
    $node = import_html_xhtml_to_node($html_element, $source_node, $profile);

    // Set what we want the alias to be.
    if (empty($node->path)) {
      $node->path = _import_html_calc_path($rel_path);
      $node->old_path = _import_html_calc_path($rel_path, TRUE);
    }

    // May need extra care when creating multiples.
    // Invent new paths for the new documents if the exsl:document didn't define them
    if (!empty($nodes[$node->path])) {
      // already using this path, extend a new one
      $node->path .= '/'. import_html_check_name(!empty($node->label) ? $node->label : $node->title);
    }

    $node->title = import_html_guess_document_title($node);

    $node->status = $profile['import_status'];
    $node->promote = $profile['import_promote'];

    // Tag this new content if the profile has a global tag set
    $cats = isset($profile['import_category']) ? $profile['import_category'] : NULL;
    if (is_array($cats)) {
      foreach ($cats as $cat) {
        $node->taxonomy[$cat] = taxonomy_get_term($cat);
      }
    }

    // debug notes/trace logs. Can be removed
    if (import_html_variable('debug_level')) {
      $node->file_data['raw_xhtml'] = xml_toString($html_element);
    }
    # $node->xml = $html_element;

    $nodes[$node->path] = $node;

    debug("Path to save this page as is '". $node->path ."'", 1);
  }

  // $node stays NULL when the transformed document held no html element.
  debug_pre(array("PROCESSED node body" => (is_object($node) && isset($node->body)) ? $node->body : NULL), 3);
  return $nodes;
}
510
511
512
/**
 * From a given XML document, create a node structure
 * with all useful parameters set.
 * A shell node object may be passed in with some values already set. The data
 * extracted from the XHTML structure will be layered onto that.
 *
 * Here is where we map HTML info to node data, like H1 -> $node->title
 * TODO tidy this up with a lookup table or something
 *
 * node may have defined its own $node->type even
 *
 * THIS IS THE ENGINE OF IMPORT_HTML
 *
 * @param $datadoc
 *   The XML element (normally one html element) to read values from.
 * @param $node
 *   Optional shell node object; a new stdClass is used when this is empty.
 * @param $profile
 *   The import profile; 'content_type' supplies the default node type.
 *
 * @return
 *   The node object with everything that could be absorbed set on it.
 */
function import_html_xhtml_to_node($datadoc, $node, $profile) {
  debug("Importing from XML object to node object", 3);
  $node = $node ? $node : new stdClass();
  $node->type = (isset($node->type) && is_string($node->type)) ? $node->type : $profile['content_type'];

  $node->taxonomy = (isset($node->taxonomy) && is_array($node->taxonomy)) ? $node->taxonomy : array();

  // Now read the input into node structure
  //
  // Absorb the most generic bits first. Later processes may overwrite them more accurately.

  // This initial import is a totally generic catch-all.
  // The callee declares &$node itself; marking the argument with & at the
  // call site is a fatal error from PHP 5.4 onwards.
  import_html_absorb_all_tagged_elements($node, $datadoc);

  //
  // Get all metas as properties
  //
  $head_element = xml_getelementsbytagname($datadoc, 'head', TRUE);
  // Allow ALL values I find (some may get lost later)
  import_html_absorb_metas($node, $head_element, 'meta', 'name', 'content');
  import_html_absorb_metas($node, $head_element, 'link', 'rel', 'href');

  // If there are any other things to come from HTML into $node, let me know now!
  // Loop over a bunch of hook-like per-module extensions
  // MENU, PATH, TAXONOMY, CCK all add values in their own callbacks in import_html_modules.inc
  // Also the core node elements - body, title, teaser get set in a callback
  //
  require_once('import_html_modules.inc');
  import_html_include_add_on_module_handlers();

  module_invoke_all('import_html', $profile, $node, $datadoc);

  // 'content' is now a reserved word in Drupal5
  // If I have a string there, the body cannot be rendered right later
  unset($node->content);

  // The preferred filter 'format' of this body is none - not even line breaks
  $node->format = import_html_get_preferred_filter();

  debug("After absorbing absolutely everything I could find, the node object now contains the following blocks and bits:", 3);
  debug_pre(array('Absorbed all node structure from the XHTML. Node is:' => $node), 2);
  return $node;
}
569
/**
 * Import ALL tagged classes and IDs as node attributes.
 *
 * If the input has ANY id or classes at all, grab that info and apply it to
 * this object. Assume anything important enough to have a label is important
 * enough to remember.
 *
 * This will probably produce a very cloggy node, filled with trash, Possibly
 * even some arrays where there shouldn't be. But any unrecognised property
 * names will be discarded on save, leaving only the serializable values. This
 * approach will allow arbitrary data to come and go in the future.
 *
 * @param $node
 *   The node object to set the found values on (modified in place).
 * @param $datadoc
 *   The XML element to search for elements carrying an id or class.
 */
function import_html_absorb_all_tagged_elements(&$node, $datadoc) {

  foreach (array('id', 'class') as $attribute_label) {

    debug("Absorbing all blocks with an $attribute_label as incidental data blobs (possibly html) into node structure", 3);
    $found_elements = xml_query($datadoc, './/*[@'. $attribute_label .']');

    // I now have a collection of tagged nodes.
    foreach ($found_elements as $found_element) {

      $attribute_value = xml_getattribute($found_element, $attribute_label);
      // if it was a class, it may be multiple!
      // Usually just one however...
      // Names may be separated by any run of whitespace (tabs and newlines
      // included), not only by single spaces.
      $keys = preg_split('/\s+/', (string) $attribute_value, -1, PREG_SPLIT_NO_EMPTY);
      if (!$keys) {
        continue;
      }

      // Allow HTML though. Sometimes this will not be right...
      // TODO, figure it out?
      // The serialised element is the same for every key, so build it once.
      $value = xml_tostring($found_element, TRUE);
      if (!trim($value)) {
        continue;
      }

      foreach ($keys as $key) {
        // Found 'something' labelled 'something'
        // The value just gets absorbed
        debug("Found an unexpected tagged value - '$key' , Absorbing it into the node as a default text/html value", 2);

        // Set it onto the node,
        // If it's a class, carefully combine to preserve pre-existing arrays
        if ($attribute_label == 'class') {
          import_html_absorb_properties($node, $key, $value);
        }
        else {
          // but if it's an ID, there can be only one, just set it
          $node->$key = $value;
        }
      } // each multiple key
    } // each found element
  } // each attribute type
}
623
/**
 * Copy attribute pairs from a set of child elements onto the node.
 *
 * For every $tagname element below $xml_element, the value of its $keyname
 * attribute becomes a property name and the value of its $valname attribute
 * the property value (combined via import_html_absorb_properties()). When the
 * nodewords module exists the pair is also stored in $node->nodewords under
 * the lower-cased key.
 *
 * @param $node
 *   The node object to set the values on (modified in place).
 * @param $xml_element
 *   The element to search within.
 * @param $tagname
 *   Element name to look for, e.g. 'meta' or 'link'.
 * @param $keyname
 *   Attribute that provides the property name, e.g. 'name' or 'rel'.
 * @param $valname
 *   Attribute that provides the property value, e.g. 'content' or 'href'.
 */
function import_html_absorb_metas(&$node, $xml_element, $tagname, $keyname, $valname) {
  $tagged_elements = xml_getelementsbytagname($xml_element, $tagname);
  debug("Absorbing the '$valname' of '{$tagname}'s with a '$keyname' from source doc into node structure", 3);

  foreach ($tagged_elements as $tagged_element) {
    if (empty($tagged_element)) {
      continue;
    }

    $key = xml_getattribute($tagged_element, $keyname);
    $value = xml_getattribute($tagged_element, $valname);

    if (!$key || !$value) {
      // Incomplete pair, nothing to store.
      debug("When absorbing '$valname' from '{$tagname}'s with a '$keyname' from source doc ($key=$value) had a null value. Not a great problem, just letting you know.", 2);
      continue;
    }

    import_html_absorb_properties($node, $key, $value);
    if (module_exists('nodewords')) {
      $node->nodewords[strtolower($key)] = $value;
    }
  }
}
642
/**
 * Include what we can find in the /modules directory.
 * Only once.
 *
 * Every file ending in ".inc" found below the import_html module's 'modules'
 * sub-directory is included. Repeat calls within a request do nothing.
 */
function import_html_include_add_on_module_handlers() {
  static $done;
  if ($done) {
    return;
  }
  // Scan add-on dir and include all bits found there.
  // The mask is a regular expression: escape the dot and anchor it to the end
  // of the name so that only real *.inc files match (the old ".*.inc" matched
  // any name merely containing "inc", backup copies included).
  $inc_files = file_scan_directory(drupal_get_path('module', 'import_html') .'/modules', '\.inc$', array('.', '..', 'CVS'));
  foreach ($inc_files as $inc_path) {
    include_once($inc_path->filename);
  }
  $done = TRUE;
}
657
/**
 * Store a value on the object under the given property name, growing the
 * property into an array when it is set more than once.
 *
 * The first value for a key is stored as-is. A second value turns the
 * property into a two-item array, and later values are appended. Callers
 * must be prepared to find either a scalar or an array.
 *
 * @param $node
 *   The object to set the property on (modified in place).
 * @param $key
 *   Property name. An empty key is reported via debug() and ignored.
 * @param $value
 *   Value to store. An empty value is reported via debug() and ignored.
 */
function import_html_absorb_properties(&$node, $key, $value) {
  if (!$key) {
    debug("Odd, when absorbing properties, value:'$value' is a value for what key? The calling function passed a null key to be absorbed.");
    return;
  }
  if (!$value) {
    debug("Odd, when absorbing properties, '$key' had a null value. This is probably not an error.", 2);
    return;
  }

  // First sighting of this key: plain assignment.
  if (!isset($node->$key)) {
    $node->$key = $value;
    return;
  }

  // Repeat sighting: most metas can legally have duplicates, so collect them.
  $collected = $node->$key;
  if (!is_array($collected)) {
    $collected = array($collected);
  }
  $collected[] = $value;
  $node->$key = $collected;
}
678
679
/**
 * Decide on a title for a node, falling back when it has none.
 *
 * A node that already has a title keeps it. Otherwise the 'handle_no_title'
 * setting chooses between a label guessed from the node path and the fixed
 * text 'Untitled Document'; any other setting leaves the (empty) title as is.
 *
 * @param $node
 *   Node object; ->title and ->path are read.
 *
 * @return
 *   The title to use.
 */
function import_html_guess_document_title($node) {
  if ($node->title) {
    return $node->title;
  }

  $policy = import_html_variable('handle_no_title');
  if ($policy == IMPORT_HTML_GUESS) {
    return import_html_guess_label($node->path);
  }
  if ($policy == IMPORT_HTML_DEFAULT) {
    return 'Untitled Document';
  }
  return $node->title;
}
693
/**
 * Derive a human-readable label from a path.
 *
 * Takes the last path segment (or the one before it when the path ends in a
 * slash), turns underscores into spaces and drops the final ".suffix".
 *
 * @param $title
 *   A path such as 'about/our_team.html' or 'about/our_team/'.
 *
 * @return
 *   The guessed label, e.g. 'our team'.
 */
function import_html_guess_label($title) {
  // explode() replaces split(), which no longer exists in PHP 7+.
  $path_bits = explode('/', (string) $title);
  $label = (string) array_pop($path_bits);
  if ($label === '') {
    // it had a trailing slash
    $label = (string) array_pop($path_bits);
  }
  $label = str_replace('_', ' ', $label);

  // Chop the suffix at the last dot, if there is one.
  $last_dot = strrpos($label, '.');
  if ($last_dot !== FALSE) {
    $label = substr($label, 0, $last_dot);
  }
  return $label;
}
702
703
/**
 * Return the nice path alias of an imported page.
 *
 * The 'import_site_prefix' setting is prepended to the relative path (minus
 * its leading slash) and spaces are encoded as %20. When the 'trim_suffixes'
 * setting is on, a trailing default document name (e.g. /index.html) is
 * removed, or failing that the file suffix is chopped off.
 *
 * @param $rel_path
 *   Path of the page relative to the import root.
 * @param $leave_suffix
 *   TRUE to return the prefixed path without any trimming.
 *
 * @return
 *   The path alias.
 */
function _import_html_calc_path($rel_path, $leave_suffix = FALSE) {
  $path = import_html_variable('import_site_prefix') . preg_replace('|^/|', '', $rel_path);
  // URLs should NOT have spaces, but old sites may have done this
  $path = str_replace(' ', '%20', $path);

  if ($leave_suffix) {
    return $path;
  }

  if (import_html_variable('trim_suffixes')) {
    // Simplify the URL if possible by trimming the suffix and 'index'
    // but remember the original path somewhere, we'll need to link it forward
    // once the new node is established.

    // To be clever, special-case the 'index.html' files to be
    // linked to their parent directories.
    // Trailing slash is tricky.
    // /this/path is a whole navigation level above
    // /this/path/ and will resolve relative links differently!
    // We need to actually redirect, not just alias any links like that
    // explode() replaces split(), which no longer exists in PHP 7+.
    $default_documents = explode(",", import_html_variable('default_document'));
    $trimmed_path = $path;
    foreach ($default_documents as $default_document) {
      // Quote the name so that e.g. the dot in "index.html" is matched
      // literally rather than as "any character".
      $trimmed_path = preg_replace('|/('. preg_quote(trim($default_document), '|') .')$|', "", $trimmed_path);
    }
    if ($trimmed_path != $path) {
      debug("It's an index page, so we will refer to $path as $trimmed_path", 2);
      $path = $trimmed_path;
    }
    else {
      // No change, Chop suffix instead
      $path = preg_replace('|\.[^\.]+$|', "", $path);
    }
  }
  return $path;
}
742
/**
 * Find and initialize the transformation template. Caching retrieval.
 *
 * The file is looked for under the given path first, then inside the
 * import_html module directory, then under the site files directory.
 *
 * @param $xslfile
 *   Path or file name of the XSL template.
 *
 * @return
 *   The parsed XSL document, or FALSE (with an error message set) if the file
 *   cannot be located.
 */
function _import_html_get_xsl_doc($xslfile) {
  // Cache per requested template. A single shared slot would hand the first
  // template loaded to every later request, whatever file was asked for.
  static $xsldocs = array();
  $cache_key = (string) $xslfile;
  if (!empty($xsldocs[$cache_key])) {
    return $xsldocs[$cache_key];
  }

  // Check if and where filepath can be found
  // Search first under full path, then module dir, then under files dir
  $xslfilepath = $xslfile;
  if (!file_exists($xslfilepath)) {
    #dpm("Did not find $xslfilepath, trying module dir");
    $xslfilepath = drupal_get_path('module', 'import_html') ."/$xslfile";
  }
  if (!file_exists($xslfilepath)) {
    $xslfilepath = file_directory_path() ."/$xslfile";
  }

  if (!file_exists($xslfilepath)) {
    drupal_set_message("Unable to locate the Transformation Stylesheet '$xslfilepath' ", "error");
    return false;
  }

  debug("Loading Transformation Stylesheet from $xslfilepath", 2);
  $xsldoc = parse_in_xml_file($xslfilepath, false);
  if ($xsldoc) {
    // Only remember successful parses so a failure is retried next time.
    $xsldocs[$cache_key] = $xsldoc;
  }
  return $xsldoc;
}
773
/**
 * Run the url-rewrite XSL over the source document
 * TODO allow for the non-base version of Drupal links
 *
 * The relative links need to be converted into path-to-top and back down
 * again. Relative references just cannot be maintained.
 *
 * @param $xmldoc
 *   The source document, in whatever form xmldoc_plus_xsldoc() accepts.
 * @param $rel_path
 *   Path of the page being imported, relative to the import root. May also
 *   be a full URL (with a host), in which case links are rewritten against
 *   that remote site instead of the local one.
 * @param $profile
 *   Import profile settings. The keys read here are 'import_site_prefix',
 *   'file_storage_path', 'relink_files' and 'strip_script_tags'.
 *
 * @return
 *   An XML doc again, or FALSE if the stylesheet is unavailable or the
 *   rewritten source can no longer be parsed.
 */
function import_html_rewrite_links($xmldoc, $rel_path, $profile) {
  static $rewrite_xsldoc; // memo this to speed up bulk imports
  static $xslfilepath;
  if (!$rewrite_xsldoc) {
    $xslfilepath = drupal_get_path('module', 'import_html') ."/rewrite_href_and_src.xsl";
    $rewrite_xsldoc = parse_in_xml_file($xslfilepath, false);
  }
  if (!$rewrite_xsldoc) {
    // Same handling as import_html_tag_editable_areas(): without the
    // stylesheet there is nothing sensible to transform with.
    trigger_error('Cannot rewrite links. rewrite_href_and_src.xsl unavailable', E_USER_WARNING);
    return FALSE;
  }

  debug("Rewriting links for a file called '$rel_path'. dirname($rel_path) is ". dirname($rel_path), 2);
  debug_pre(array("import_html profile settings used for rewriting" => $profile), 3);

  // $rel_base is the path from the import root to the current page dir
  // I want a trailing slash, but not a leading one for the next concatenation
  // dirname('/a/dir/') returns '/a' - which is not what I want
  $rel_dir = preg_match('|/$|', $rel_path) ? $rel_path : dirname($rel_path);
  // dirname('/ok.htm') returns '\' at root level on Windows. Treat it as the
  // root so the "$rel_base == '/'" test below still recognises it.
  if ($rel_dir === '\\') {
    $rel_dir = '/';
  }
  $rel_base = ensure_trailing_slash($rel_dir);

  $path_to_import_top = url( ensure_trailing_slash($profile['import_site_prefix']) );
  $site_root = $path_to_import_top;

  // if we are re-writing thing/index.htm to thing - our links will resolve differently!
  // either too high for thing, or too low for the thing/index.htm alias.
  $href_base = url( ensure_trailing_slash($profile['import_site_prefix']) . $rel_base);

  // Create the prefix for resource sources
  // Is url() OK for files with unclean urls? - NO. Neither is file_create_url
  $src_root = base_path() . $profile['file_storage_path'];

  // The ternary must be parenthesised: '.' binds tighter than '?:', so the
  // unparenthesised form evaluated the concatenation as the condition and
  // left $src_base empty for every local import.
  $src_base = ensure_trailing_slash($src_root) . (($rel_base == '/') ? '' : $rel_base);

  // Or not, if we are still linking to full URLs (demo or partial import)
  $url_parts = parse_url($rel_path);
  if (!empty($url_parts['host'])) {
    // it's remote!
    $scheme = !empty($url_parts['scheme']) ? $url_parts['scheme'] : 'http';
    $port = isset($url_parts['port']) ? ':'. $url_parts['port'] : '';
    $path_to_import_top = $rel_path;
    $site_root = $scheme .'://'. $url_parts['host'] . $port .'/';
    $src_root = $site_root;
    $src_base = $rel_path;
  }
  $src_base = str_replace('/./', '/', $src_base);
  $href_base = str_replace('/./', '/', $href_base);


  debug("
    <b>Rewrite patterns:</b>
    Path to the top of this (relative) server is $site_root .
    Path to top of the prefixed section
    ({$profile['import_site_prefix']})
    from here ($rel_path)
    to our import base
    ({$profile['import_site_prefix']})
    would be '$path_to_import_top'.
    Path to a relative <em>neighbour</em> of this page would be
    ($href_base)
    or to find the base for <em>relative</em> resource files over in
    the file storage area
    ({$profile['file_storage_path']})
    would be '$src_base' ", 2
  );


  $parameters = array(
    // These parameters tell the rewriter what to prepend to the links.
    // They are instructions how this page will find its missing brethren
    // when we put it where we put it.
    // Images and Pages may end up in different places.
    'site_root' => $site_root,
    'src_root' => $src_root,
    'src_base' => $src_base,
    'href_base' => $href_base,
    'replace_suffix' => $profile['relink_files'],
    'new_suffix' => '',
    'xsl_path' => $xslfilepath,
    'strip_script_tags' => $profile['strip_script_tags'],
  );
  debug("
    XSL for URL rewrites loaded OK.
    HTML links for files that were under '$rel_base' will be made relative to '"
    . $parameters['href_base'] ."' and '". $parameters['src_base'] ."'"
    . ( $parameters['strip_script_tags'] ? 'All inline script blocks will be discarded from the source.'. $parameters['strip_script_tags'] : '')
    , 2);
  debug_pre(array("PARSED XSL $xslfilepath . XSL" => xml_tostring($rewrite_xsldoc)), 4);

  $rewritten = xmldoc_plus_xsldoc($xmldoc, $rewrite_xsldoc, $parameters);

  // collapse dir-up "../" paths. Too tricky for XSL. Hope it doesn't break anything
  $rewritten = preg_replace('|/[^\.][^/\s"\'>]*/\.\./|', '/', $rewritten);

  // Label => content, like every other debug_pre() call in this file.
  debug_pre(array("The source after URL rewriting . XHTML (string)" => $rewritten), 2);

  $xmldoc = parse_in_xml_string($rewritten, FALSE);
  if (empty($xmldoc)) {
    trigger_error("Failed to rewrite links into a valid XML file", E_USER_WARNING);
    return FALSE;
  }

  debug_pre(array("Parsed in again. XHTML (XML)" => xml_tostring($xmldoc)), 3);
  return $xmldoc;
}
886
/**
 * Run the strip_tables XSL over the source document
 *
 * The stylesheet is parsed once and kept for the rest of the request.
 *
 * @param $xmldoc
 *   The source document, in whatever form xmldoc_plus_xsldoc() accepts.
 *
 * @return
 *   An XML doc again, or FALSE if the stylesheet is unavailable or the
 *   transformed source can no longer be parsed.
 */
function import_html_strip_tables($xmldoc) {
  static $strip_tables_xsldoc; // memo this to speed up bulk imports
  // Static too: the path is reported in the debug output on every call, not
  // just on the call that loads the stylesheet.
  static $xslfilepath;
  if (!$strip_tables_xsldoc) {
    $xslfilepath = drupal_get_path('module', 'import_html') ."/strip_tables.xsl";
    $strip_tables_xsldoc = parse_in_xml_file($xslfilepath, FALSE);
  }
  if (!$strip_tables_xsldoc) {
    trigger_error('Cannot strip tables. strip_tables.xsl unavailable', E_USER_WARNING);
    return false;
  }

  debug_pre(array("PARSED strip_tables XSL $xslfilepath . XSL" => xml_tostring($strip_tables_xsldoc)) , 3);
  $parameters = array();
  $rewritten = xmldoc_plus_xsldoc($xmldoc, $strip_tables_xsldoc, $parameters);

  // normalize space to clean up the gaps
  $rewritten = preg_replace("/\\s*\\n\\s*/", "\n", $rewritten);

  debug_pre(array("The source after stripping tables . XHTML (string)" => $rewritten), 3);
  $xmldoc = parse_in_xml_string($rewritten, false);
  if (!$xmldoc) {
    trigger_error("Failed to strip tables and end up with a valid XML file", E_USER_WARNING);
    return false;
  }

  return $xmldoc;
}
915
/**
 * Run the strip_scripts XSL over the source document
 *
 * The stylesheet is parsed once and kept for the rest of the request.
 *
 * @param $xmldoc
 *   The source document, in whatever form xmldoc_plus_xsldoc() accepts.
 *
 * @return
 *   An XML doc again, or FALSE if the stylesheet is unavailable or the
 *   transformed source can no longer be parsed.
 *
 * @see import_html_strip_tables
 */
function import_html_strip_scripts($xmldoc) {
  static $strip_scripts_xsldoc; // memo this to speed up bulk imports
  if (!$strip_scripts_xsldoc) {
    $xslfilepath = drupal_get_path('module', 'import_html') ."/strip_scripts.xsl";
    $strip_scripts_xsldoc = parse_in_xml_file($xslfilepath, false);
  }
  if (!$strip_scripts_xsldoc) {
    trigger_error('Cannot strip scripts. strip_scripts.xsl unavailable', E_USER_WARNING);
    return false;
  }
  $parameters = array();
  $rewritten = xmldoc_plus_xsldoc($xmldoc, $strip_scripts_xsldoc, $parameters);

  $xmldoc = parse_in_xml_string($rewritten, false);
  if (!$xmldoc) {
    // This message used to be a copy of the strip_tables one, which pointed
    // anyone reading the log at the wrong transformation.
    trigger_error("Failed to strip scripts and end up with a valid XML file", E_USER_WARNING);
    return false;
  }
  return $xmldoc;
}
935
/**
 * Convert Dreamweaver 'instanceEditable' comments into semantic tagged divs
 * by running tag_editable_areas.xsl over the document.
 *
 * The stylesheet is parsed on first use and kept for later calls.
 *
 * @param $xmldoc
 *   The source document, in whatever form xmldoc_plus_xsldoc() accepts.
 *
 * @return
 *   The re-parsed document; NULL when the stylesheet cannot be loaded; FALSE
 *   when the transformed source does not parse.
 *
 * @see import_html_strip_tables
 */
function import_html_tag_editable_areas($xmldoc) {
  static $editable_xsldoc; // memo this to speed up bulk imports

  if (!$editable_xsldoc) {
    $module_dir = drupal_get_path('module', 'import_html');
    $editable_xsldoc = parse_in_xml_file($module_dir ."/tag_editable_areas.xsl", false);

    // Still nothing after trying to load it: give up on this step.
    if (!$editable_xsldoc) {
      trigger_error('Cannot tag_editable_areas. tag_editable_areas.xsl unavailable', E_USER_WARNING);
      return NULL;
    }
  }

  $xsl_parameters = array();
  $tagged_source = xmldoc_plus_xsldoc($xmldoc, $editable_xsldoc, $xsl_parameters);
  $tagged_doc = parse_in_xml_string($tagged_source, false);

  if ($tagged_doc) {
    return $tagged_doc;
  }

  trigger_error("Failed to tag commented editable areas (eg from Dreamweaver) and end up with a valid XML file", E_USER_WARNING);
  return false;
}
962
963
964
965
/**
 * Make a string usable as an XML, CSS or Javascript ID.
 *
 * Every run of characters other than ASCII letters, digits and underscore is
 * collapsed into a single underscore.
 * http://www.w3.org/TR/REC-xml/#NT-Name
 *
 * @param $name
 *   The raw string.
 *
 * @return
 *   The cleaned string.
 *
 * @see form_clean_id() - which should have done this
 */
function import_html_check_name($name) {
  $disallowed_run = '|[^a-zA-Z0-9_]+|';
  $safe_name = preg_replace($disallowed_run, '_', $name);
  return $safe_name;
}
975
976
977
/**
 * Avoid double-ups, if the path already exists, UPDATE the existing node.
 * Can't have two content nodes claiming the same path or it won't validate.
 * Plus, we want to retain any info that's been added via drupal. Probably.
 *
 * Only aliases that resolve to exactly "node/NID" are merged. An alias that
 * resolves to anything else (a taxonomy term, a user page, a node sub-path)
 * is reported and the incoming node is returned untouched, rather than
 * taking the last path segment as a node id.
 *
 * @param $node - partially created node from import. Key lookup on $node->path
 * @return $node - possibly with pre-existing values blended in. Importantly - the nid
 */
function import_html_merge_over_existing_node($node) {
  if (!isset($node->path)) {
    // Nothing to look up.
    return $node;
  }

  $internal_link = drupal_get_normal_path($node->path);
  if ($internal_link == $node->path) {
    // No alias match, this is new content.
    return $node;
  }

  // Found an internal match, the alias is already assigned to something.
  // Merge info to avoid losing any Drupal-only info
  $matches = array();
  if (!preg_match('|^node/(\d+)$|', $internal_link, $matches) || !$matches[1]) {
    drupal_set_message("
      When looking for an alias to '{$node->path}',
      got error finding node ID from the internal link
      '$internal_link' - which was supposed to return a nid", 'error'
    );
    return $node;
  }
  $nid = $matches[1];

  // Load existing item, layer changes on top of it
  $old_node = node_load($nid);
  if (!$old_node) {
    // Stale alias: there is nothing to merge over.
    drupal_set_message("
      Page path alias '{$node->path}' points at node id '$nid',
      but that node could not be loaded. Nothing was merged.", 'error'
    );
    return $node;
  }

  $node->nid = $nid;
  debug("
    Page path alias '{$node->path}' already exists,
    It's already linked to node id '{$node->nid}'.
    This data import will <em>replace</em> that content,
    but try to keep any other values.
    ", 2);

  foreach ($node as $key => $value) {
    if (is_array($value)) { // merge deeper sets, like taxonomy
      if (!isset($old_node->$key) || !is_array($old_node->$key)) {
        $old_node->$key = array();
      }
      foreach ($value as $k => $v) {
        $old_node->{$key}[$k] = $v;
      }
    }
    else {
      $old_node->$key = $value;
    }
  }
  return $old_node;
}
1028
1029
/**
 * Turn a URL (or relative path) into a filename that is safe to save locally.
 *
 * Query-string punctuation is kept but percent-encoded ("?" and "&"), and
 * anything from a "#" to the end of the line is dropped. Unless the
 * 'allow_bad_urls' setting is on, every remaining run of characters outside
 * letters, digits and _ - ~ . / % becomes a single underscore.
 *
 * @param $rel_path
 *   The URL or path to clean.
 *
 * @return
 *   The cleaned path.
 */
function safe_filepath_from_url($rel_path) {
  // Applied in this order, each on the result of the previous one.
  $patterns = array("|\?|", "|\&|", "|#.*|");
  $replacements = array("%3f", "%26", "");
  $save_as = preg_replace($patterns, $replacements, $rel_path);

  if (!import_html_variable('allow_bad_urls')) {
    $save_as = preg_replace("|[^A-Za-z0-9_\-~\./%]+|", "_", $save_as);
  }
  return $save_as;
}
1050
/**
 * Replace every 8-bit byte in a string with a numeric HTML entity.
 *
 * Well-formed two, three and four byte UTF-8 sequences become the entity for
 * the code point they encode. Any other byte >= 0x80 becomes the entity for
 * its own byte value - htmltidy was failing to catch e.g. the copyright
 * symbol 0xA9 (&#169;). Pure 7-bit strings are returned unchanged.
 *
 * Uses preg_match() and preg_replace_callback() with a named callback rather
 * than ereg() and the /e modifier, both of which were removed in PHP 7.
 *
 * Based on http://nz2.php.net/manual/en/function.utf8-decode.php#85034
 *
 * @param $string
 *   The byte string to convert.
 *
 * @return
 *   The string with all bytes >= 0x80 replaced by "&#NNN;" entities.
 */
function charset_decode_utf_8 ($string) {
  /* Only do the slow convert if there are 8-bit characters */
  if (!preg_match('/[\x80-\xFF]/', $string)) {
    return $string;
  }

  // Longest sequences first; the last alternative mops up stray high bytes.
  // Lead and continuation byte ranges do not overlap, so one left-to-right
  // pass finds the same two and three byte sequences as separate passes.
  return preg_replace_callback(
    '/[\xF0-\xF7][\x80-\xBF]{3}|[\xE0-\xEF][\x80-\xBF]{2}|[\xC0-\xDF][\x80-\xBF]|[\x80-\xFF]/',
    '_charset_decode_utf_8_entity',
    $string
  );
}

/**
 * preg_replace_callback() helper for charset_decode_utf_8().
 *
 * @param $matches
 *   Match array; $matches[0] is either one stray high byte or one complete
 *   two to four byte UTF-8 sequence.
 *
 * @return
 *   The numeric entity for the byte or for the decoded code point.
 */
function _charset_decode_utf_8_entity($matches) {
  $bytes = $matches[0];
  $length = strlen($bytes);
  if ($length == 1) {
    return '&#'. ord($bytes) .';';
  }

  // Payload bits carried by the lead byte, by sequence length.
  $lead_masks = array(2 => 0x1F, 3 => 0x0F, 4 => 0x07);
  $codepoint = ord($bytes[0]) & $lead_masks[$length];
  // Each continuation byte contributes its low six bits.
  for ($i = 1; $i < $length; $i++) {
    $codepoint = ($codepoint << 6) | (ord($bytes[$i]) & 0x3F);
  }
  return '&#'. $codepoint .';';
}
1085
1086
/**
 * Do-nothing error handler.
 *
 * Used to silence DOM errors while loading a document that is known to be
 * probably invalid.
 *
 * @param $err
 *   Unused.
 * @param $str
 *   Unused.
 */
function stfu($err, $str) {
  // Deliberately ignores both arguments.
  return NULL;
}

  ViewVC Help
Powered by ViewVC 1.1.2