/[drupal]/contributions/modules/import_html/import_html_modules.inc
ViewVC logotype

Contents of /contributions/modules/import_html/import_html_modules.inc

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.12 - (show annotations) (download) (as text)
Sat Mar 14 04:30:48 2009 UTC (8 months, 2 weeks ago) by dman
Branch: MAIN
CVS Tags: HEAD
Changes since 1.11: +2 -2 lines
File MIME type: text/x-php
Trying to move everything into a fresh branch, tagged 6--1
1 <?php
2 /**
3 * @file
4 * Additional per-module functionality used by import_html.
5 *
6 * Each available module MAY implement a hook_import_html() function to add or manipulate
7 * its own data as the node gets saved.
8 *
9 * @ingroup import_html Import HTML
10 * @author Dan Morrison http://coders.co.nz/
11 * @version $Id: import_html_modules.inc,v 1.11.2.2 2009/03/14 03:18:51 dman Exp $
12 */
13
/**
 * hook_import_html - prototype of the hook, for documentation purposes.
 *
 * Invoke this when creating a node out of nothing.
 *
 * This prototype has an empty body: it does nothing and returns nothing.
 *
 * @param $profile array of settings for the import_html process
 * @param $node node object (by reference) carrying lots of extra data, only some of which we want to save
 * @param $datadoc XML doc of the 'simple' node source; may be inspected for extra info when the cooked node object is not good enough
 */
function hook_import_html($profile, &$node, $datadoc = NULL) {
  // Deliberately empty - this only documents the hook signature.
}
/**
 * hook_import_html_after_save - prototype of the hook, for documentation purposes.
 *
 * Some operations (e.g. aliasing) can only run once the nid is known.
 * Invoke this when the node has been created.
 *
 * This prototype has an empty body: it does nothing and returns nothing.
 *
 * @param $profile array of settings for the import_html process
 * @param $node node object (by reference) carrying lots of extra data, only some of which we want to save
 * @param $datadoc XML doc of the 'simple' node source; may be inspected for extra info when the cooked node object is not good enough
 */
function hook_import_html_after_save($profile, &$node, $datadoc = NULL) {
  // Deliberately empty - this only documents the hook signature.
}
39
40
41
/**
 * Implementation of hook_import_html
 *
 * Add appropriate normal node attributes - title, body and teaser
 * - to the node.
 *
 * - Title: the content of the first 'h1', else the 'title' element.
 * - Body: the element with id='content'. A message is shown if none exists.
 * - Teaser: the text of the element with id='teaser', else id='description'.
 *   Only when neither yields text is a teaser generated from the body.
 *
 * @param $profile array of settings for the import_html process (unused here)
 * @param $node node object being built; title, body, teaser and
 *   import_html_precision may be set on it
 * @param $datadoc XML doc of the 'simple' node source to extract data from
 */
function node_import_html($profile, &$node, $datadoc = NULL) {
  // Title
  // Note, internally the title should always be plaintext, encoded characters and no newlines.
  // A meta 'title' may have been set, but the real visible one (h1) is used by preference.
  $title = xml_get_element_content($datadoc, 'h1');
  if (!$title) {
    $title = xml_get_element_content($datadoc, 'title');
  }
  if ($title) {
    $node->title = $title;
  }

  //
  // BODY is the thing with id=content
  //
  debug_pre(array('XML DOM being scanned by node_import_html() for XPATH data extraction to find the body content' => xml_tostring($datadoc)), 3);
  $content_element = xml_getelementbyid($datadoc, 'content');

  if ($content_element) {
    $node->body = trim(xml_tostring($content_element, TRUE));

    // The XSL may give us further hints about how it found this body data
    $precision = xml_getattribute($content_element, 'precision');
    if ($precision != NULL) {
      $node->import_html_precision = $precision;
      $explanation = xml_getattribute($content_element, 'title');
      debug(t("I'm %precision % confident that we found the right body element. !explanation", array('%precision' => 100*$precision, '!explanation' => $explanation)));
    }
  }
  else {
    drupal_set_message("Failed to find a body, anything with id='content' in this page");
    // TODO some cck may be valid without a 'content' body?
  }

  // It's possible that our input was content-encoded (if it came from RSS or the old import-xml node)
  // If so, the entities should be unwrapped.
  // Other (HTML) imports should not require this
  // @TODO when was $node->content_encoded being flagged?
  // Read both properties defensively: either may be unset at this point.
  $body = isset($node->body) ? $node->body : NULL;
  $node->body = !empty($node->content_encoded) ? html_entity_decode($body) : $body;

  //
  // Teaser
  //
  $teaser = xml_textcontent(xml_getelementbyid($datadoc, 'teaser'));
  if (!$teaser) {
    $teaser = xml_textcontent(xml_getelementbyid($datadoc, 'description'));
  }
  if (!$teaser) {
    // No custom teaser found, but
    // Imported HTML is highly likely to contain formatting that
    // will ruin teasers. Do our own cut-down version that assumes
    // FILTERED HTML.
    // This used to run unconditionally and discarded any custom teaser.
    $teaser = filter_filter('process', 0, 1, node_teaser($node->body));
  }
  if ($teaser) {
    $node->teaser = $teaser;
  }
}
106
107
/**
 * Implementation of hook_import_html
 *
 * Add appropriate user identification to the new node.
 *
 * The node name is taken from $profile['import_user']. If a user with that
 * name can be loaded, the node is assigned to them; otherwise it is assigned
 * to the current global user.
 *
 * @param $profile array of settings for the import_html process
 * @param $node node object being built; name and uid are set on it
 * @param $datadoc XML doc of the node source (unused here)
 */
function user_import_html($profile, &$node, $datadoc = NULL) {
  global $user;

  $node->name = $profile['import_user'];
  $account = user_load(array('name' => $node->name));
  $node->uid = $account ? $account->uid : $user->uid;
}
121
122
/**
 * Set navigation (a menu item) for the given node.
 * Helper for import_html_import_files()
 *
 * Does nothing unless $profile['create_menus'] is set. Otherwise it builds a
 * menu item pointing at node/NID and saves it with menu_edit_item_save().
 * An existing item found under the system path or the node's alias is reused;
 * if none exists a new one is attached under a parent derived from the
 * directory part of $node->path.
 *
 * @param $profile array of settings for the import_html process
 * @param $node saved node object; label and menu are set on it
 * @param $datadoc XML doc of the node source (unused here)
 */
function menu_import_html_after_save($profile, &$node, $datadoc = NULL) {
  if (empty($profile['create_menus'])) {
    return;
  }

  $normal_path = "node/{$node->nid}";
  $node->label = $node->title;

  // Need to beware of stupid long titles - they can't fit in breadcrumbs and menus
  if (strlen($node->title) > IMPORT_HTML_MAX_LABEL_LENGTH) {
    // Fall back to the last path fragment. explode() replaces split(),
    // which is deprecated since PHP 5.3 and removed in PHP 7.
    $path_bits = explode('/', $node->path);
    $label = array_pop($path_bits);
    $label = preg_replace('/\?.*$/', '?', $label); // messiness from mirrored URLs with args in
    $node->label = str_replace('_', ' ', $label);
    // @TODO maybe adjust this title-munging algorithm to make better guesses
    drupal_set_message(t("When creating menu item, the title '%node_title' is too long to fit into a menu label (MAX=%MAX chars), truncating it to '%node_label' . This is not terrible, just unaesthetic. You may want to fix this later.", array('%node_title' => $node->title, '%MAX' => IMPORT_HTML_MAX_LABEL_LENGTH, '%node_label' => $node->label)));
  }

  // Check if a placeholder for the alias exists.
  // This can happen if the child items were created earlier and required a placeholder.
  // Try both canonic and aliased paths.
  $menu = menu_get_item_by_path($normal_path);
  if (!$menu) {
    $menu = menu_get_item_by_path($node->path);
  }

  if ($menu) {
    drupal_set_message(t("Discovered that a placeholder menu item for %path already exists. Filling that one out with the full details now", array('%path' => $node->path)));
  }
  else {
    // New menu item
    debug("Looking for parent menu of $node->path", 2);
    $pid = import_html_create_menu_path(dirname($node->path), '');
    if (!$pid) {
      $pid = $profile['menu_parent_id'];
    }
    $menu = array(
      'pid' => $pid,
      'type' => MENU_CREATED_BY_ADMIN | MENU_MODIFIABLE_BY_ADMIN | MENU_VISIBLE_IN_TREE | MENU_VISIBLE_IN_BREADCRUMB,
    );
  }

  $menu['title'] = $node->label;
  $menu['path'] = $normal_path;
  $node->menu = $menu;
  menu_edit_item_save($node->menu);
}
169
/**
 * Given a traditional URL path, return the parent menu item
 *
 * A helper function to menu_import_html_after_save()
 *
 * This is a recursive function that slices the given
 * (alias) path up until it finds a menu item it recognises.
 * Once found, it comes back down again building a path
 * until we end up with the new parent menu id.
 *
 * @param $path the path (aliased URL) we intend to construct
 * @param $title optional display title of the menu item at that path point;
 *   otherwise the last fragment of the path, minus any file suffix, is used
 * @return a menu id. Returns nothing (NULL) if the menu or path module is
 *   missing, if form errors exist, or if no parent could be resolved
 */
function import_html_create_menu_path($path, $title = '') {
  debug("Looking for a menu item matching path '$path' ", 3);
  if ((!module_exists('menu')) || (!module_exists('path'))) {
    return;
  }

  // Nothing to build if the item already exists.
  if ($pid = menu_get_mid_by_path($path)) {
    return $pid;
  }

  debug("Failed to find a menu item '$path', making it.", 2);

  $p_path = dirname($path); // parent path

  if ($p_path == dirname($p_path)) {
    // dirname() has stopped changing: we reached the top of the tree, either
    // through illegal paths or because the tree is being built bottom-up.
    // Attach to the configured default parent instead of looping forever.
    return import_html_variable('menu_parent_id');
  }

  if (!$pid = menu_get_mid_by_path($p_path)) {
    // Parent not found,
    // Need to make it up...
    $pid = import_html_create_menu_path($p_path);
    // That will have climbed up and come back down with its nearest new ancestor
  }

  if (!$pid) {
    debug("Failed to regressively build menu for '$path'. This should not happen. Dunno why we are here.");
    return;
  }

  // and attach self to it
  if (!$title) {
    $title = basename($path);
    // Strip a file suffix. The old test was strstr(".", $title), which has
    // haystack and needle swapped and so never matched.
    // A dot at position 0 is left alone so the title cannot become empty.
    $dot = strrpos($title, '.');
    if ($dot) {
      $title = substr($title, 0, $dot);
    }
  }
  $new = array(
    'title' => $title,
    'path' => $path,
    'pid' => $pid,
    'type' => MENU_CUSTOM_ITEM
  );
  debug("import_html_create_menu_path : Defining new menu item '$path' as a child of $pid", 3);

  if (form_get_errors() || menu_get_mid_by_path($path)) {
    debug_pre(form_get_errors());
    debug_pre(menu_get_menu(), 4);
    return;
  }

  menu_edit_item_save($new);
  $mid = menu_get_mid_by_path($path);

  debug("New Menu Item '". $new['title'] ."' id is :". $mid, 2);
  return $mid;
}
257
258
259
/**
 * Add path alias support to the import_html process
 *
 * Adds an old-style (legacy) alias to the node path if required, i.e. when
 * $profile['legacy_aliases'] is set and the node's old_path differs from
 * its path.
 *
 * @param $profile array of settings for the import_html process
 * @param $node saved node object; nid, path and old_path are read from it
 * @param $datadoc XML doc of the node source (unused here)
 */
function path_import_html_after_save($profile, &$node, $datadoc = NULL) {
  $wants_legacy_alias = $profile['legacy_aliases'] && ($node->old_path != $node->path);
  if (!$wants_legacy_alias) {
    return;
  }

  debug("Setting up navigation links for this item now. Directing {$node->old_path} to go to the system path ". drupal_get_normal_path($node->path), 2);

  $system_path = "node/{$node->nid}";

  // If the node's path is already the raw system path there is no alias to pair with.
  if ($system_path == $node->path) {
    drupal_set_message(t("Failed to resolve %node_path into a system path. Cannot create alias at this time.", array('%node_path' => $node->path)));
    return;
  }

  path_set_alias($system_path, $node->old_path);

  $replacements = array(
    '%normal_path' => $system_path,
    '!main_alias' => l($node->path, $node->path),
    '!legacy_alias' => l($node->old_path, $node->old_path)
  );
  drupal_set_message(t("This document (known internally as '%normal_path' ) should now be accessible via aliases as both '!main_alias' and '!legacy_alias' ", $replacements));
}
288
289
/**
 * Absorbs elements in the import document we recognise as being (probably)
 * taxonomy terms.
 *
 * Any element with rel="tag" is a start. The plaintext content of such an
 * element (or its 'title' attribute if it has no text) is searched for as a
 * term.
 *
 * both <a href rel='tag' > and <link rel="tag"> are good for me.
 *
 * A label of the form "Vocab:Term" targets the named vocabulary. Labels that
 * match no existing term are queued as free tags on the first vocabulary of
 * this node type that allows tagging, if there is one.
 *
 * Every rel value (tag or not) is also passed to
 * import_html_absorb_properties().
 *
 * @TODO this still needs to be expanded with real-world cases
 *
 * @param $profile array of settings for the import_html process (unused here)
 * @param $node node object being built; its taxonomy array is filled in
 * @param $datadoc XML doc of the node source that is scanned for @rel elements
 */
function taxonomy_import_html($profile, &$node, $datadoc = NULL) {
  $active_vocabs = taxonomy_get_vocabularies($node->type);
  if (!$active_vocabs) {
    return;
  }

  // Scan to see if any allow freetagging. After that it's guesswork which
  // one to target: take the first one found. Stays NULL when none allow tags.
  $prime_vocabulary = NULL;
  foreach ($active_vocabs as $candidate) {
    if ($candidate->tags) {
      $prime_vocabulary = $candidate;
      break;
    }
  }

  // Translate links with rel=tag into taxonomy terms
  $relationships = xml_query($datadoc, './/*[@rel]');
  if (empty($relationships)) {
    return;
  }

  foreach ($relationships as $link) {
    if (empty($link)) {
      continue;
    }
    // rels can be multiple - like classes. Split on space
    $reltypes = explode(' ', xml_getattribute($link, 'rel'));
    $label = trim(xml_textcontent($link));
    // Microformats may put their value into a 'title' attribute instead
    if (empty($label)) {
      $label = xml_getattribute($link, 'title');
    }
    if (empty($label)) {
      continue;
    }

    foreach ($reltypes as $reltype) {
      if ($reltype == 'tag') {
        // Some of this magic should probably shift into absorb_metas()
        // Still thinking of the best way to encode tags into the raw HTML

        // Must be an array before any ['tags'] entry is written into it.
        if (!isset($node->taxonomy) || !is_array($node->taxonomy)) {
          $node->taxonomy = array();
        }

        // Reset for every tag: a vocabulary left over from a previous tag
        // (or from the loop above) must not filter this tag's terms.
        $vocabulary = NULL;

        if (strpos($label, ':') !== FALSE) {
          // allow an extended format for this value
          // to support freetext vocab additions
          // <a rel='tag'>Author:Joe Brown</a> will create a term in the 'Author' vocab called "Joe Brown".
          debug("Splitting rel link '$label' up to make it a vocab term entry", 2);
          $vocab_term = explode(':', $label);
          $vocab_name = $vocab_term[0];
          // From here on the label is the bare term, also for the
          // import_html_absorb_properties() call below.
          $label = $vocab_term[1];
          $vocabulary = taxonomy_get_vocabulary_by_name($vocab_name);
          if ($vocabulary) {
            _taxonomy_import_html_append_freetag($node, $vocabulary->vid, $label);
          }
        }

        // Match against existing terms
        // Allow an optional 'S' on the end of terms when looking for a match. Just to be flexible.
        $terms = array_merge(taxonomy_get_term_by_name($label), taxonomy_get_term_by_name($label ."s"));

        if ($terms) {
          foreach ($terms as $term) {
            // If we did successfully identify the vocab earlier, filter on that. Otherwise let anything go
            if ($vocabulary && ($term->vid != $vocabulary->vid)) {
              continue;
            }
            $node->taxonomy[$term->tid] = $term;
          }
        }
        elseif ($prime_vocabulary) {
          // Couldn't retrieve it, so add it as a freetag where that is allowed.
          // NOTE(review): when a "Vocab:Term" label matched a vocabulary above,
          // the term is queued here a second time - confirm this is intended.
          _taxonomy_import_html_append_freetag($node, $prime_vocabulary->vid, $label);
        }
      } // is a 'tag'
      import_html_absorb_properties($node, $reltype, $label);
    } // inner rel tag type
  } // each rel
}

/**
 * Append one label to the comma-separated free-tag string of a vocabulary.
 *
 * Private helper for taxonomy_import_html(). Every label is prefixed with a
 * comma, so the first entry yields ",label", as the code always did.
 *
 * @param $node node object whose taxonomy['tags'] entry is extended
 * @param $vid vocabulary id the tag is queued for
 * @param $label the tag text to append
 */
function _taxonomy_import_html_append_freetag(&$node, $vid, $label) {
  $existing = isset($node->taxonomy['tags'][$vid]) ? $node->taxonomy['tags'][$vid] : '';
  $node->taxonomy['tags'][$vid] = $existing .",". $label;
}
373
374
375
376 /**
377 * Utility Functions which should exist but don't yet.
378 * out-of-module names sorry.
379 */
380
381
/**
 * Return the id of a menu item matching a given path or alias.
 *
 * Menu_get_item re-writes the path from alias to 'normal' path when it
 * initializes. This makes it impossible to retrieve menu items by name using the available API.
 *
 * Lookup order: the menu 'path index' by the given path, then by the system
 * path that the given path is an alias of, and finally the {menu} table
 * directly.
 *
 * @param $path a system path or a path alias
 * @return a menu ID, or nothing (NULL) when no menu item matches
 */
function menu_get_mid_by_path($path) {
  // Caching works against me when adding lots of aliases in one go. Wipe it :(
  drupal_lookup_path('wipe'); // This is not working?
  // $alias holds the system path (src) that $path is an alias (dst) for, if any.
  $alias = db_result(db_query("SELECT src FROM {url_alias} WHERE dst = '%s'", $path));

  $menu = menu_get_menu();
  if (!empty($menu['path index'][$path])) {
    return $menu['path index'][$path];
  }
  if ($alias && !empty($menu['path index'][$alias])) {
    return $menu['path index'][$alias];
  }

  // it's bad mojo to mess with the DB directly, but menu doesn't provide a lookup API.
  // Or a way to avoid caching. Do it by hand if I need a newly added menu.
  // String literals are single-quoted: double quotes are identifiers in
  // PostgreSQL and in MySQL's ANSI mode.
  $row = db_fetch_object(db_query("SELECT * FROM {menu} WHERE path != '' AND (path = '%s' OR path = '%s') ", $path, $alias));
  if ($row) {
    return $row->mid;
  }
  return NULL;
}
/**
 * Return the {menu} table row whose path exactly matches the given path.
 *
 * No alias resolution is done here; callers try each candidate path in turn.
 *
 * @param $path the path to look up
 * @return the result of db_fetch_array() for the first matching row
 */
function menu_get_item_by_path($path) {
  // The string literal is single-quoted: double quotes are identifiers in
  // PostgreSQL and in MySQL's ANSI mode.
  $result = db_query("SELECT * FROM {menu} WHERE path = '%s'", $path);
  return db_fetch_array($result);
}
417
418
/**
 * Find a vocabulary by its name.
 *
 * @param $name the vocabulary name to look for
 * @return the first vocabulary object whose name equals $name (loose
 *   comparison), or NULL when none matches
 */
function taxonomy_get_vocabulary_by_name($name) {
  foreach (taxonomy_get_vocabularies() as $candidate) {
    if ($candidate->name != $name) {
      continue;
    }
    return $candidate;
  }
  return NULL;
}

  ViewVC Help
Powered by ViewVC 1.1.2