/[drupal]/contributions/modules/import_html/import_html.module
ViewVC logotype

Contents of /contributions/modules/import_html/import_html.module

Parent Directory | Revision Log | View Revision Graph


Revision 1.72 - (show annotations) (download) (as text)
Sat Mar 14 04:30:48 2009 UTC (8 months, 2 weeks ago) by dman
Branch: MAIN
CVS Tags: HEAD
Changes since 1.71: +13 -5 lines
File MIME type: text/x-php
Trying to move everything into a fresh branch, tagged 6--1
1 <?php
2 /**
3 * @file Main drupal interface to import_html. This file contains Drupal hooks
4 * and some config/preferences. Actual processing functions are in
5 * import_html_process.inc
6 *
7 * Synopsis:
8 *
9 * Facility to import an existing, static HTML site structure into the Drupal
10 * CMS as structured nodes.
11 *
12 * Intent:
13 *
14 * Allow an admin to define a source directory of an existing
15 * traditional static HTML website, and import (as much as possible)
16 * the content and structure into a drupal site.
17 * Source files will be stripped of existing chrome and navigation
18 * elements before being inserted as nodes.
19 *
20 * See import_html_help.htm for more
21 *
22 * Main data extraction routines are in import_html_process.inc. Module-specific
23 * data extraction methods have been shifted into import_html_modules.inc. UI
24 * forms and themes have been shifted to import_html_ui.inc.
25 * Drupal hooks and some configs remain in this module file.
26 *
27 * import_html libraries can be used by external modules, specifically
28 * wrapper.module (private development) and static.module (experimental)
29 *
30 * @ingroup import_html Import HTML
31 * @author Dan Morrison http://coders.co.nz/
32 * @version $Id: import_html.module,v 1.55.2.2 2009/03/14 03:18:50 dman Exp $
33 *
34 */
35
// The module directory is taken from this file's own location. (An earlier
// drupal_get_path() lookup was immediately overwritten by this value, so it
// has been dropped.)
$import_html_path = dirname(__FILE__);
$import_html_library_path = $import_html_path .'/coders_php_library';

// Prepend both directories in a single call. The resulting search order is
// the same as before: bundled library directory first, then the module
// directory, then whatever include path was already configured.
set_include_path(
  $import_html_library_path . PATH_SEPARATOR .
  $import_html_path . PATH_SEPARATOR .
  get_include_path()
);
41
42 /**
43 * @name Debug Flag
44 * Used for testing only
45 * @{
46 */
// Load the debug helper only when no debug() function has been declared yet.
// The level and backtrace constants are defined in that same case.
if (function_exists('debug') === FALSE) {
  require_once 'debug.inc';

  define('DEBUGLEVEL', 0);
  define('DEBUGBACKTRACE', 8);
}
52 #debug_set_level(3);
53
54 /**
55 * @}
56 */
57
// The processing and UI code were split into separate include files in
// preparation for Drupal 6.
require_once 'import_html_process.inc';
require_once 'import_html_ui.inc';

// Supporting helper files, resolved through the include path.
require_once 'xml-transform.inc';
require_once 'file-routines.inc';
64
65
/**
 * @name $_import_html_file_classes
 * A poor-man's MIME list: maps a lowercase file suffix to a general file
 * class (html, image, resource or document).
 */

global $_import_html_file_classes;
$_import_html_file_classes = array(
  // Pages, whether static or server-generated.
  'htm'   => 'html',
  'html'  => 'html',
  'shtml' => 'html',
  'php'   => 'html',
  'asp'   => 'html',
  'aspx'  => 'html',
  'jsp'   => 'html',

  // Pictures.
  'gif' => 'image',
  'jpg' => 'image',
  'png' => 'image',

  // Files that pages depend on.
  'css' => 'resource',
  'js'  => 'resource',
  'swf' => 'resource',

  // Standalone documents.
  'pdf' => 'document',
  'txt' => 'document',
  'rtf' => 'document',
);
93
// Strategy codes for handling duplicates and other problems when importing.
define('IMPORT_HTML_SKIP', 0);
define('IMPORT_HTML_MERGE', 1);
define('IMPORT_HTML_GUESS', 2);
define('IMPORT_HTML_DEFAULT', 4);

// Upper bound on label length.
define('IMPORT_HTML_MAX_LABEL_LENGTH', 24);

// Base menu path under which every admin page of this module is registered.
define('IMPORT_HTML_ADMIN_PATH', 'admin/build/import_html');
103
/**
 * Implementation of hook_menu().
 *
 * Declares all the menu items relating to this module: the overview page
 * plus the settings, profile, site-import and demo pages beneath it.
 *
 * @param $may_cache
 *   When TRUE the (cacheable) menu items are returned. When FALSE this
 *   module contributes no items.
 * @return
 *   An array of menu item definitions; an empty array when $may_cache is
 *   FALSE.
 */
function import_html_menu($may_cache) {
  // Start from an array so the non-cached pass never reads an undefined
  // variable.
  $items = array();
  if (!$may_cache) {
    return $items;
  }

  $items[] = array(
    'path' => IMPORT_HTML_ADMIN_PATH,
    'title' => t('Import HTML'),
    'description' => t("Import/Export an entire site or directory to/from static HTML"),
    'callback' => 'system_admin_menu_block_page', // Overview menu thing
    'access' => user_access('access import_html'),
    'type' => MENU_NORMAL_ITEM,
  );
  $items[] = array(
    'path' => IMPORT_HTML_ADMIN_PATH .'/settings',
    'title' => t('Import HTML Settings'),
    'description' => t('Adjust the import_html options and settings.'),
    'callback' => 'drupal_get_form',
    'weight' => 1,
    'callback arguments' => array('import_html_admin_settings'),
    'access' => user_access('administer site configuration'),
    'type' => MENU_NORMAL_ITEM,
  );
  $items[] = array(
    'path' => IMPORT_HTML_ADMIN_PATH .'/profile',
    'title' => t('Import HTML Profiles'),
    'description' => t('Edit multiple import profiles.'),
    'callback' => 'import_html_profiles_page',
    'weight' => 2,
    'access' => user_access('administer site configuration'),
    'type' => MENU_NORMAL_ITEM,
  );
  $items[] = array(
    'path' => IMPORT_HTML_ADMIN_PATH .'/import_site',
    'title' => t('Import HTML Site'),
    'description' => t("Import/Export an entire site to/from static HTML"),
    'weight' => -1,
    'callback' => 'drupal_get_form',
    'callback arguments' => array('import_html_process_form'),
    'access' => user_access('access import_html'),
    'type' => MENU_NORMAL_ITEM,
  );
  $items[] = array(
    'path' => IMPORT_HTML_ADMIN_PATH .'/demo',
    'title' => t('Demo'),
    'description' => t('Demonstrate or test HTML Import on one file.'),
    'callback' => 'drupal_get_form',
    'callback arguments' => array('import_html_demo_form'),
    'access' => user_access('access import_html'),
    'type' => MENU_NORMAL_ITEM,
  );

  return $items;
}
161
/**
 * Implementation of hook_perm().
 *
 * @return
 *   The list of permission names this module defines.
 */
function import_html_perm() {
  $permissions = array();
  $permissions[] = 'access import_html';
  return $permissions;
}
170
/**
 * Supply the help text for this module's pages.
 *
 * @param $section
 *   string The path (help context) the text is being requested for.
 * @return
 *   string The help markup for that section, or FALSE when this module has
 *   nothing to say about it.
 */
function import_html_help($section) {
  switch ($section) {
    // The module listing entries share one short description.
    case 'admin/build/modules#description':
    case 'admin/build/modules/import_html':
      return t("Import/Export an entire site to/from static HTML");

    // The full help page is shipped as a static HTML file with the module.
    case 'admin/help#import_html':
      $help_file = drupal_get_path("module", "import_html") ."/docs/import_html_help.htm";
      return file_get_contents($help_file);

    // Both of these paths just point the admin at the help page.
    case IMPORT_HTML_ADMIN_PATH:
    case 'admin/settings/import_html':
      return l("DO check the help page, this is a complex process", 'admin/help/import_html');

    case 'admin/build/import_html/import_site':
      $links = array(
        '!help_link' => url('admin/help/import_html'),
        '!settings_link' => url(IMPORT_HTML_ADMIN_PATH .'/settings'),
        '!demo_link' => url(IMPORT_HTML_ADMIN_PATH .'/demo'),
      );
      return t("<p>
For background, remember to read <a href='!help_link'>the Import Html help page</a>.
The <a href='!settings_link'>settings page</a> contains the config options.
</p><p>
For a quick intro, try <a href='!demo_link'>a quick demo</a>.
</p><p>
The source website files must be directly available to this server.
This process will NOT yet import the structure of a client website to a remote server,
or spider all the resources of a remote site.
</p><p>
<b>Note</b> Big sites <i>will</i> take a long time to process,
and processing may timeout.
You can either increase the php timeout value and wait around,
or just do sections at a time by using the subsection parameter above.
</p>", $links);

    case 'admin/build/import_html/demo':
      return t("<p>
Enter one HTML filepath or URL to process as an import.
The retrieved data <em>will not</em> automatically become part of the
site unless you confirm it, it's just a demo of what data would be
extracted on a simple import.
</p><p>
The given file will be passed through the currently configured import process
and shown as a node-edit form, displaying all the deduced content in the appropriate fields.
</p><p>
If you see <em>too much</em> of the page in the 'body' area (nested navbars and layout)
then the XSL import template or selector needs to be made more specific.
If you see none, or not enough content in the body area, the template or
selector needs to be adjusted to encompass the text correctly.
</p><p>
If you get an error or no result, the input HTML is probably too invalid to work with.
</p><p>
Single demo imports do not have the full context information to work with,
so the menu or URL alias (and internal relinking) shown may not be representative of the real result.
</p>");
  }

  // No help text for any other section.
  return false;
}
237
238
239
/**
 * Implementation of hook_form_alter().
 *
 * When the page node form is rendered on the demo page, its action is
 * pointed at the normal node-add path so that submitting it behaves like a
 * regular node form submission.
 *
 * @param $form_id
 *   The id of the form being altered.
 * @param $form
 *   The form array, modified in place.
 */
function import_html_form_alter($form_id, &$form) {
  // Only the page node form is of interest.
  if ($form_id != 'page_node_form') {
    return;
  }

  // I need to pretend the demo page is a submit page
  $demo_action = url(IMPORT_HTML_ADMIN_PATH .'/demo');
  if ($form['#action'] == $demo_action) {
    $form['#action'] = url('node/add/page');
    debug("Redirecting the demo form to pretend to be a node_edit form", 2);
  }
}
250
251 ////////////////////////////////////////////
252 // End of Drupal core hooks.
253 // Module utilities below.
254
255
/**
 * A wrapper to variable_set, variable_get to encapsulate multiple import
 * 'profiles'.
 *
 * All profiles are stored together in the single 'import_html_profiles'
 * variable, keyed by profile id. This accessor reads (and optionally writes)
 * one named setting inside the currently active profile.
 *
 * Use INSTEAD OF variable_get() and it will return the active profile's vars.
 *
 * NOTE(review): the profile list is cached in a function-local static, and
 * import_html_profile() keeps its own separate static copy, so a write made
 * through one accessor is not visible through the other within the same
 * request - confirm whether callers rely on mixing the two.
 *
 * @param $var
 *   Name of the variable within the currently active profile to retrieve.
 * @param $val
 *   If set (not NULL), stores this value in the profile and saves all
 *   profiles.
 * @return
 *   The value of the setting, or NULL when the profile has no such setting.
 * @see import_html_current_profile_id()
 */
function import_html_variable($var, $val = NULL) {
  static $import_html_profiles;
  if (empty($import_html_profiles)) {
    $import_html_profiles = variable_get('import_html_profiles', array());
  }

  $profile_id = import_html_current_profile_id();

  // empty() rather than a bare index read: the profile may not exist yet and
  // that must not raise an undefined-index notice.
  if (empty($import_html_profiles[$profile_id])) {
    // Fill in defaults (should only be needed the first time, if that).
    drupal_set_message('import_html_variable initing profile from nowhere - should this ever happen?');
    $import_html_profiles[$profile_id] = import_html_profile_defaults();
  }

  if (isset($val)) {
    $import_html_profiles[$profile_id][$var] = $val;
    variable_set('import_html_profiles', $import_html_profiles);
  }

  // Unknown settings yield NULL explicitly instead of via an undefined index.
  if (isset($import_html_profiles[$profile_id][$var])) {
    return $import_html_profiles[$profile_id][$var];
  }
  return NULL;
}
293
/**
 * Accessor for the data of a named import_html profile.
 *
 * Returns the whole settings array of one profile, optionally replacing or
 * deleting it first.
 *
 * Note that $profile_id is handed to import_html_current_profile_id(), which
 * makes any non-empty id the active profile for the rest of the request.
 *
 * @param $profile_id
 *   Id of the profile to work on. The currently active one if not given.
 * @param $profile
 *   If set, saves this data back to the saved settings. Pass FALSE (or any
 *   other empty non-NULL value) to delete the profile.
 * @return
 *   The profile's settings array, or NULL when no such profile exists
 *   (including directly after a delete).
 */
function import_html_profile($profile_id = null, $profile = null) {
  $import_html_profile_id = import_html_current_profile_id($profile_id);

  static $import_html_profiles;
  if (empty($import_html_profiles)) {
    $import_html_profiles = variable_get('import_html_profiles', array());
  }

  if (isset($profile)) {
    if ($profile) {
      $import_html_profiles[$import_html_profile_id] = $profile;
    }
    else {
      // Allow a quiet delete
      unset($import_html_profiles[$import_html_profile_id]);
    }
    variable_set('import_html_profiles', $import_html_profiles);
  }

  // The profile may be missing (never created, or just deleted above); return
  // NULL for that case without touching an undefined index.
  if (isset($import_html_profiles[$import_html_profile_id])) {
    return $import_html_profiles[$import_html_profile_id];
  }
  return NULL;
}
316
317
/**
 * Get, and optionally switch, the active import profile id.
 *
 * The id is remembered in a static for the duration of the request. It is
 * first seeded from the 'import_html_current_profile' variable.
 *
 * @param $profile_id
 *   If given (non-empty), this becomes the active profile from now on.
 * @returns
 *   The active profile id; 'default' by default.
 */
function import_html_current_profile_id($profile_id = null) {
  static $active_id;

  // Seed from the stored setting while nothing usable is remembered yet.
  if (!$active_id) {
    $active_id = variable_get('import_html_current_profile', 'default');
  }

  // An explicit id overrides whatever was active.
  $active_id = $profile_id ? $profile_id : $active_id;

  return $active_id;
}
334
/**
 * Return an array containing the set of import_html settings to use as a
 * template for a new profile.
 *
 * @return
 *   An associative array of setting name => default value.
 */
function import_html_profile_defaults() {
  // The current account is needed for the 'import_user' default. Without
  // this declaration $user is an undefined local and the default is always
  // NULL.
  global $user;

  return array(
    'profile_id' => 'default', // required to be filled in by the system
    // Extraction prefs.
    'translation_template' => drupal_get_path('module', 'import_html') .'/templates/html2simplehtml.xsl',
    'content_tag_id' => 'main',
    'content_type' => 'page',
    'preferred_filter' => import_html_get_preferred_filter(),
    'default_document' => 'index.htm',
    'file_exclusions' => "^_\n/_\nCVS\n^\.\n/\.",
    'strip_tables' => FALSE,
    'strip_scripts' => FALSE,
    'force_tidy' => TRUE,
    // Replication options.
    'trim_suffixes' => TRUE,
    'legacy_aliases' => TRUE,
    'relink_files' => FALSE,
    'allow_bad_urls' => FALSE,
    'file_storage_path' => variable_get('file_directory_path', 'files') ."/imported/",
    'rewrite_links' => TRUE,
    'import_site_prefix' => 'imported/',
    // Subsettings.
    'import_category' => "",
    'import_status' => TRUE,
    'import_promote' => "",
    // Stays NULL (as before) when no user name is available.
    'import_user' => isset($user->name) ? $user->name : NULL,
    'create_menus' => TRUE,
    'menu_parent_id' => 1,
    // Advanced.
    'handle_duplicates' => IMPORT_HTML_MERGE,
    'handle_no_title' => IMPORT_HTML_GUESS,
    'debug_level' => 0,
    'keep_temp_files' => FALSE,
  );
}
374
/**
 * Find the input format to assign to imported nodes.
 *
 * Imported markup should not be filtered again, otherwise the edit page
 * defaults to 'filtered' and existing formatting gets badly stripped. The
 * chosen format id is normally stored in the import_html_preferred_filter
 * variable; when it is not there yet, the available formats are scanned for
 * one named 'Unfiltered HTML', and one is created if none is found.
 *
 * @param $as_list
 *   bool Set this to get an option list (format id => name) of the available
 *   filters instead of a single id.
 * @return
 *   The format id, or the option list when $as_list is set. Nothing is
 *   returned when user_access() is not available yet.
 */
function import_html_get_preferred_filter($as_list = FALSE) {
  static $fid;
  $want_id = !$as_list;

  // Fast path: the id was already worked out during this request.
  if ($want_id && $fid) {
    return $fid;
  }

  // Next best: the stored setting.
  $preferred_filter = variable_get('import_html_preferred_filter', 0);
  if ($preferred_filter && $want_id) {
    return $preferred_filter;
  }

  // As we are importing existing html, we don't even need line breaks done
  // for us, so the preferred filter is NONE AT ALL - Unfiltered HTML.

  // If this module is being enabled as part of an install profile we cannot
  // be sure that other core modules are even available.
  if (!function_exists('user_access')) {
    return;
  }

  // Look up the id of the 'Unfiltered HTML' format. It's almost always 3,
  // but that cannot be relied on.
  $formats = array();
  foreach (filter_formats() as $format) {
    $formats[$format->format] = $format->name;
    if ($format->name == 'Unfiltered HTML') {
      $fid = $format->format;
    }
  }

  if ($as_list) {
    return $formats;
  }

  if (!$fid) {
    // No unfiltered HTML option available... but it is needed.
    drupal_set_message("Creating new filter option - Unfiltered HTML");

    // filter_admin_add() can't be used as it rewrites the whole page, so
    // this goes to the database directly.
    db_query("INSERT INTO {filter_formats} (name) VALUES ('%s')", 'Unfiltered HTML');
    $lookup = db_query("SELECT format from {filter_formats} where name='%s'", 'Unfiltered HTML');
    $fid = db_result($lookup);
  }

  variable_set('import_html_preferred_filter', $fid);
  return $fid;
}
429
430
/**
 * Returns what general 'type' a file probably is, based on its MIME type if
 * that can be detected, otherwise on its suffix.
 *
 * This is mainly used for UI coloring, so is not totally canonical.
 * HTML-or-not is all that really counts.
 *
 * @param $filename
 *   Path of the file to classify. It may carry URL leftovers such as
 *   '?args' or '#fragment'; these are ignored when reading the suffix.
 * @returns
 *   One of the defined file classes (html|image|resource|document), or NULL
 *   if the suffix is unknown. A name with no suffix at all is assumed to be
 *   html.
 *
 * @see $_import_html_file_classes
 */
function import_html_guess_file_class($filename) {
  global $_import_html_file_classes;

  // First try mime
  if (function_exists('mime_content_type')) {
    $mime = mime_content_type($filename);
    // Detection returns FALSE on failure; only a real "type/subtype" string
    // is worth inspecting.
    if (is_string($mime) && strpos($mime, '/') !== FALSE) {
      // explode() replaces split(), which no longer exists in PHP 7+.
      list($mime_type, $mime_subtype) = explode('/', $mime, 2);
      if ($mime == 'text/html') {
        return $mime_subtype;
      }
      if ($mime_type == 'image') {
        return $mime_type;
      }
      $resource_mimes = array(
        'text/css',
        'application/x-shockwave-flash',
        'application/x-javascript',
        // Names commonly reported for scripts by newer MIME databases.
        'application/javascript',
        'text/javascript',
      );
      if (in_array($mime, $resource_mimes, TRUE)) {
        return 'resource';
      }
      if ($mime_type == 'application') {
        return 'document'; // gross generalization
      }
    }
  }

  // Some file mirrors (wget or myself) may have produced odd filenames;
  // strip URL args like # and ? off it
  $filename = preg_replace('|[\?\#].*$|', '', $filename);

  // Only the last path segment can carry the suffix: a dot in a directory
  // name must not be mistaken for one.
  $basename = basename($filename);
  $dot = strrpos($basename, '.');
  if ($dot === FALSE) {
    // assume no suffix at all is a html page
    return 'html';
  }

  $suffix = strtolower((string) substr($basename, $dot + 1));
  if (isset($_import_html_file_classes[$suffix])) {
    return $_import_html_file_classes[$suffix];
  }
  // Unknown suffix.
  return NULL;
}

  ViewVC Help
Powered by ViewVC 1.1.2