/[drupal]/contributions/tricks/blojsom2drupal/blojsom2drupal.php
ViewVC logotype

Contents of /contributions/tricks/blojsom2drupal/blojsom2drupal.php

Parent Directory | Revision Log | View Revision Graph


Revision 1.1 - (show annotations) (download) (as text)
Mon Dec 4 13:03:38 2006 UTC (2 years, 11 months ago) by bcampbell
Branch: MAIN
CVS Tags: HEAD
File MIME type: text/x-php
Initial commit of blojsom2drupal, a blog migration tool.
1 #!/usr/bin/php
2 <?php
3 /* vim:set expandtab tabstop=2 shiftwidth=2 autoindent smartindent: */
4
5 /*
6 blojsom2drupal.php - import blojsom blog data into drupal.
7 Copyright 2006 openDemocracy (http://www.opendemocracy.net)
8 by Ben Campbell (http://www.scumways.com)
9 */
10
11 /* assumes current dir is the main drupal dir */
12 require_once './includes/bootstrap.inc';
13 drupal_bootstrap(DRUPAL_BOOTSTRAP_FULL);
14
15
16
17 /*
18 NOTES TO SELF:
19
20 - can node_save() also store comments? ($node->comments member) check it out.
21 - need to remap paths to images on the oD server?
22
23 drupal stuff:
24 - a comment despamming module for drupal would be cool
25 - submit a patch to drupal with the custom comment_save fn and email all
26 the other module authors that have their own versions of the fn...
27 - Patch blog.module to handle anonymous blog posts more gracefully?
28
29 */
30
/* ---- configuration ---- */

/*
 * Global settings for an import run. These are only the defaults;
 * _blojsom2drupal_main() overrides them from the command line options.
 */
$_blojsom2drupal_config = array(
  'verbose'     => FALSE,
  'dryrun'      => FALSE,          /* if TRUE, don't modify drupal DB */
  'smmtag'      => '[ MORE TAG ]', /* showmemore tag */
  'srcdir'      => '',
  'usermap'     => array(),        /* username remap table */
  'categorymap' => array(),        /* category remap table */
  'terms'       => array(),        /* terms to apply to all imported posts */
);


/* run the tool (the function itself is defined further down the file) */
_blojsom2drupal_main();
45
46
47
48
/**
 * Command line entry point.
 *
 * Parses the command line options into the global $_blojsom2drupal_config
 * array, scans the source directory for posts and then either lists the
 * authors (-a), lists the categories (-l) or imports the posts.
 *
 * Prints the usage text and returns if no options (or -h) were given.
 * Calls die() if the -f option is missing: the source directory is needed
 * for listing as well as for importing, because both work on the scanned
 * post list.
 */
function _blojsom2drupal_main()
{
  global $_blojsom2drupal_config;
  $cfg = &$_blojsom2drupal_config;

  $opts = getopt("hdvalu:c:t:f:");
  if( !$opts || array_key_exists( 'h', $opts ) ) {
    print( "usage: blojsom2drupal [options] \n" .
      " -h help\n" .
      " -v verbose\n" .
      " -a list authors and exit\n" .
      " -l list categories and exit\n" .
      " -d dry run (don't write anything into drupal)\n" .
      " -u <remapfile> use file to remap usernames\n" .
      " -c <remapfile> use file to remap categories\n" .
      " -t <term(s)> taxonomy terms to apply to all imported posts\n" .
      " -f <srcdir> blojsom forum to import (required, also with -a or -l)\n"
    );

    return;
  }

  if( array_key_exists( 'd', $opts ) )
    $cfg['dryrun'] = TRUE;
  if( array_key_exists( 'v', $opts ) )
    $cfg['verbose'] = TRUE;
  if( array_key_exists( 'u', $opts ) )
    $cfg['usermap'] = _blojsom2drupal_load_mapfile( $opts['u'] );
  if( array_key_exists( 'c', $opts ) )
    $cfg['categorymap'] = _blojsom2drupal_load_mapfile( $opts['c'] );

  if( array_key_exists( 'f', $opts ) )
    $cfg['srcdir'] = $opts['f'];
  else
    die( "ERROR: no forum to import (missing -f option)\n" );

  if( array_key_exists( 't', $opts ) )
  {
    /*
     * -t takes a comma-separated list. getopt() hands back an array when
     * the option is repeated, so accept both forms. Whitespace around each
     * term is dropped (so "a, b" works) and empty entries are ignored.
     */
    $cfg['terms'] = array();
    foreach( (array)$opts['t'] as $chunk )
    {
      foreach( explode( ',', $chunk ) as $term )
      {
        $term = trim( $term );
        if( $term !== '' )
          $cfg['terms'][] = $term;
      }
    }
  }

  /* search for blojsom posts in the given dir */
  $postlist = _blojsom2drupal_scan( $cfg['srcdir'] );

  if( array_key_exists( 'a', $opts ) )
  {
    _blojsom2drupal_list_authors( $postlist );
    return;
  }

  if( array_key_exists( 'l', $opts ) )
  {
    _blojsom2drupal_list_categories( $postlist );
    return;
  }

  _blojsom2drupal_perform_import( $postlist );
}
105
106
/**
 * Print every distinct author name found in the given posts, one per line.
 *
 * Each post is loaded from disk and its author is resolved with
 * _blojsom2drupal_resolve_post_author(). Posts without an author are
 * skipped.
 *
 * @param $postlist array of post entries (each entry is an array with at
 *                  least a 'file' member)
 */
function _blojsom2drupal_list_authors( $postlist )
{
  /* author name => TRUE; used as a set so each name is printed only once */
  $seen = array();

  foreach( $postlist as $entry )
  {
    $loaded = _blojsom2drupal_load_post( $entry['file'] );
    $who = _blojsom2drupal_resolve_post_author( $loaded );
    if( !$who )
      continue;
    $seen[$who] = TRUE;
  }

  foreach( $seen as $who => $unused )
    print( $who . "\n" );
}
128
129
/**
 * Print every distinct category found in the given posts, one per line.
 *
 * Categories are printed in the order they are first seen. Posts with an
 * empty category are skipped.
 *
 * @param $postlist array of post entries (each entry is an array with a
 *                  'category' member)
 */
function _blojsom2drupal_list_categories( $postlist )
{
  /* category name => TRUE; used as a set so each name is printed only once */
  $seen = array();

  foreach( $postlist as $entry )
  {
    $name = $entry['category'];
    if( !$name )
      continue;
    $seen[$name] = TRUE;
  }

  foreach( $seen as $name => $unused )
    print( $name . "\n" );
}
149
150
151
/**
 * Import every blojsom post in $postlist into drupal.
 *
 * For each post this loads the post from disk, works out the author
 * (applying the username remap table), collects the taxonomy term names
 * (the configured global terms plus the remapped category), resolves
 * those names to term IDs and hands the result to
 * _blojsom2drupal_import_post().
 *
 * Calls die() if a term name matches no drupal term, or more than one.
 *
 * @param $postlist array of post entries (each entry is an array with
 *                  'file' and 'category' members)
 */
function _blojsom2drupal_perform_import( $postlist )
{
  global $_blojsom2drupal_config;
  $settings = &$_blojsom2drupal_config;
  $authormap = $settings['usermap'];
  $categorymap = $settings['categorymap'];

  foreach( $postlist as $entry )
  {
    $cat = $entry['category'];

    /* read in the blojsom post and all its metadata, comments etc.. */
    $post = _blojsom2drupal_load_post( $entry['file'] );

    /* the remap tables are keyed by lowercased name */
    $who = _blojsom2drupal_resolve_post_author( $post );
    $key = strtolower( $who );
    $post['author'] = array_key_exists( $key, $authormap ) ? $authormap[ $key ] : $who;

    /* start with the terms applied to every post, then add the category */
    $names = $settings['terms'];
    if( $cat )
    {
      $key = strtolower( $cat );
      if( array_key_exists( $key, $categorymap ) )
        $cat = $categorymap[ $key ];

      /* a category remapped to an empty string is dropped */
      if( $cat )
        $names[] = $cat;
    }

    /* each name must resolve to exactly one drupal term */
    $tids = array();
    foreach( $names as $t )
    {
      $candidates = taxonomy_get_term_by_name( $t );
      $hits = count( $candidates );
      if( $hits == 0 )
        die( "Couldn't resolve taxonomy term '$t'\n" );
      if( $hits > 1 )
        die( "Found multiple taxonomy terms for '$t'\n" );
      $tids[] = $candidates[0]->tid;
    }

    _blojsom2drupal_import_post( $post, $tids );
  }
}
207
208
209
/**
 * Load in a file to map strings to other strings
 *
 * file format is one mapping per line:
 *
 * src:dest
 *
 * comments begin with '#' and run to the end of the line.
 * tolerant of whitespace.
 * src or dest can be empty.
 * the first ':' on a line separates src from dest.
 * lines without a ':' are ignored.
 *
 * An empty file is valid and gives an empty map. Calls die() if the
 * file can't be read.
 *
 * @param $filename the file to load
 *
 * @return
 * returns an array of src=>dest mappings. The src keys are lowercased.
 */
function _blojsom2drupal_load_mapfile( $filename )
{
  $lines = file( $filename );
  /* file() gives FALSE on failure, but an empty array for an empty file */
  if( $lines === FALSE )
    die( "couldn't load $filename\n" );

  $map = array();
  foreach( $lines as $l )
  {
    /* filter out comments */
    $l = preg_replace( '/\s*[#].*$/', '', $l );
    $l = trim( $l );
    if( preg_match( '/\s*(.*?)\s*:\s*(.*?)\s*$/', $l, $m ) )
      $map[ strtolower( $m[1] ) ] = $m[2];
  }

  return $map;
}
245
246
/**
 * Look up the uid of a drupal user by name.
 *
 * Calls die() if the query itself fails.
 *
 * @param $username name of user to look up. The name is compared with a
 *                  plain SQL '=', so whether the match is case insensitive
 *                  presumably depends on the database collation (TODO
 *                  confirm).
 *
 * @return uid of user, or NULL if not found.
 */
function _blojsom2drupal_lookup_user( $username )
{
  $query = db_query( "SELECT uid FROM {users} WHERE name='%s'", $username );
  if( !$query )
    die( "user lookup failed\n" );

  $row = db_fetch_object( $query );
  return $row ? $row->uid : NULL;
}
265
266
/**
 * Work out who wrote a post.
 *
 * The 'blog-entry-author' value is returned as-is if the post has one.
 * Otherwise the post text is searched for a "<strong>by NAME</strong>"
 * byline, and the name is tidied up before being returned.
 *
 * TODO: the byline handling is specific to the openDemocracy dataset.
 * Should figure out some nice way to break it out (or remove it).
 *
 * @param $post the loaded post array ('text' is only needed when there is
 *              no 'blog-entry-author')
 *
 * @return author name ('' if none found)
 */
function _blojsom2drupal_resolve_post_author( $post )
{
  /* use blojsom username if it's there */
  if( array_key_exists( 'blog-entry-author', $post ) )
    return $post[ 'blog-entry-author' ];

  /* otherwise try and extract the author from the post text */
  if( !preg_match( '%<strong>\s*by\s+(.*?)\s*</strong>%i', $post['text'], $found ) )
    return '';

  /* cleanups, applied to the captured name in this order */
  $patterns = array(
    '/&nbsp;/i',
    '%<\s*br\s*/?\s*%i',  /* <br, <br/ (the closing '>' goes next) */
    '%>%i',               /* leftover cruft */
    '%of the%i',          /* leftover cruft */
    '%&amp;%i',
  );
  $replacements = array( '', '', '', '', '&' );

  return trim( preg_replace( $patterns, $replacements, $found[1] ) );
}
309
310
311
312
/**
 * Scan a directory for Blojsom posts.
 * Files ending in .html or .textile (case insensitive) are posts.
 * Subdirectories (representing categories) will also be scanned - the
 * name of the subdir will be used as the category for all the posts
 * found directly within it. Directories whose name begins with '.' (ie
 * '.', '..' and '.comments') are not scanned.
 *
 * Calls die() if a directory can't be opened.
 *
 * @param $dir Directory to scan
 * @param $category Category to put posts in (used for subdirs)
 *
 * @return
 * Returns an array of discovered posts. Each post is given as an array
 * with the following members:
 * 'file' - the full filename of the post
 * 'category' - the category (subdir) the post belongs to. '' for root.
 *
 */
function _blojsom2drupal_scan( $dir, $category='' )
{
  $found = array();

  /* without this check readdir() would be handed FALSE instead of a handle */
  $dh = opendir( $dir );
  if( $dh === FALSE )
    die( "couldn't open directory $dir\n" );

  while( FALSE !== ( $f = readdir( $dh ) ) ) {
    $full = "$dir/$f";

    if( is_dir( $full ) ) {
      /*
       * a subdir is a category - recurse down into it, unless it's a
       * dot-dir. A directory is never a post, whatever its name ends with.
       */
      if( $f[0] != '.' )
        $found = array_merge( $found, _blojsom2drupal_scan( $full, trim( $f ) ) );
    }
    elseif( preg_match( '/(\.html$)|(\.textile$)/i', $f ) ) {
      /* it's a blog post - add it to our list */
      $found[] = array( 'file'=>$full , 'category'=>$category );
    }
  }
  closedir( $dh );

  return $found;
}
354
355
/**
 * Read in a Blojsom post, along with its metafile and comments (if any).
 *
 * A post is a main file plus an optional metafile: same name, with the
 * extension replaced by '.meta'. Comments for a post live in a directory
 * named after the post's main file, under the '.comments' directory that
 * sits next to the post.
 *
 * Calls die() if the main file can't be read (or is empty).
 *
 * @param $filename Filename of the post to load
 *
 * @return
 * Returns an array with the following members:
 * 'timestamp' - time of post (php/unix timestamp). Taken from the
 *               'blog-entry-metadata-timestamp' metafile value (which is
 *               in milliseconds) if present, else the file's mtime.
 * 'title' - title of post (first line of the file)
 * 'text' - the body text of the post (the rest of the file)
 * 'comments' - array of comments for this post (empty array if none)
 * If the post has an associated metafile, any values in the metafile
 * will also be in the returned array. eg:
 * 'blog-entry-author' - author of the post
 */
function _blojsom2drupal_load_post( $filename )
{
  /* start off with the metafile values, if there's a metafile */
  $metafile = preg_replace( '/(\.[^.]*$)/i', '.meta', $filename );
  $post = file_exists( $metafile )
    ? _blojsom2drupal_load_metafile( $metafile )
    : array();

  /* timestamp from main file unless specified by metadata */
  $hasstamp = array_key_exists( 'blog-entry-metadata-timestamp', $post );
  $post['timestamp'] = $hasstamp
    ? $post['blog-entry-metadata-timestamp']/1000  /* from java timestamp */
    : filemtime( $filename );

  /* first line is title, rest is body text */
  $lines = file( $filename );
  if( !$lines )
    die( "couldn't load $filename\n" );
  $titleline = array_shift( $lines );
  $post['title'] = trim( $titleline );
  $post['text'] = trim( implode( '', $lines ) );

  /* pick up the comments, if a comments dir exists for this post */
  $commentsdir = dirname( $filename ) . '/.comments/' . basename( $filename );
  $post['comments'] = file_exists( $commentsdir )
    ? _blojsom2drupal_load_comments( $commentsdir )
    : array();

  return $post;
}
409
410
411
/**
 * parse a blojsom metafile
 *
 * Picks up lines of the form <name>=<value>. The name ends at the first
 * '=' on the line; whitespace around name and value is trimmed. Lines
 * without an '=' are ignored, as are comment lines (first non-blank
 * character '#' or '!', as in java properties files). No other properties
 * syntax (escapes, ':' separators, continuation lines) is handled.
 *
 * An empty metafile is valid and gives an empty array. Calls die() if
 * the file can't be read.
 *
 * @param $metafilename name of metafile to read
 *
 * @return array of values indexed by name.
 */
function _blojsom2drupal_load_metafile( $metafilename )
{
  $lines = file( $metafilename );
  /* file() gives FALSE on failure, but an empty array for an empty file */
  if( $lines === FALSE )
    die( "couldn't load $metafilename\n" );

  $meta = array();
  foreach( $lines as $l ) {
    /* skip comment lines, so a comment containing '=' isn't read as a value */
    $stripped = ltrim( $l );
    if( $stripped !== '' && ( $stripped[0] == '#' || $stripped[0] == '!' ) )
      continue;

    /* grab lines of form: <name>=<value> */
    $matches = array();
    if( preg_match( '/^(.*?)=(.*?)$/', $l, $matches ) ) {
      $name = trim( $matches[1] );
      $val = trim( $matches[2] );
      $meta[ $name ] = $val;
    }
  }
  return $meta;
}
436
437
438
/**
 * Collect all the comments for a Blojsom post.
 * Each comment is a separate .cmt file (and optional .meta file).
 * Each post has a comments dir containing all its comments.
 *
 * A .cmt file holds the author name, email and url on its first three
 * lines; the rest of the file is the comment text.
 *
 * Calls die() if the directory can't be opened, or if a comment file
 * can't be read (or is empty).
 *
 * @param $commentsdir directory of comments for the post
 *
 * @return
 * array of comments, ordered by the modification time of the .cmt file
 * (then by filename). The array keys are the sort keys used
 * ("<date>-<filename>").
 * Each comment is an array with the following members:
 * 'name' - author name
 * 'email' - author email address
 * 'url' - author homepage url
 * 'text' - comment text
 * 'timestamp' - time of comment (php/unix timestamp, from the file mtime)
 * If the comment has a metafile, its values will also be added (the
 * members listed above win if the names clash).
 */
function _blojsom2drupal_load_comments( $commentsdir )
{
  $comments = array();

  /* without this check readdir() would be handed FALSE instead of a handle */
  $dh = opendir( $commentsdir );
  if( $dh === FALSE )
    die( "couldn't open directory $commentsdir\n" );

  while( FALSE !== ( $f = readdir( $dh ) ) ) {
    if( !preg_match( '/\.cmt$/i', $f ) )
      continue;

    $filename = $commentsdir . '/' . $f;

    $cmt = array();

    /* load in metadata if any */
    $metafilename = preg_replace( '/(\.[^.]*$)/i', '.meta', $filename );
    if( file_exists( $metafilename ) ) {
      $cmt = $cmt + _blojsom2drupal_load_metafile( $metafilename );
    }

    $lines = file( $filename );
    if( !$lines )
      die( "couldn't load $filename\n" );

    $mtime = filemtime( $filename );

    $cmt[ 'name' ] = trim( array_shift( $lines ) );
    $cmt[ 'email' ] = trim( array_shift( $lines ) );
    $cmt[ 'url' ] = trim( array_shift( $lines ) );
    $cmt[ 'text' ] = trim( implode( '', $lines ) );
    $cmt[ 'timestamp' ] = $mtime;

    /* the filename keeps the key unique when two comments share an mtime */
    $key = date( "Y\-m\-d\-His", $mtime ) . "-" . $f;
    $comments[ $key ] = $cmt;
  }
  closedir( $dh );

  /* order comments by timestamp */
  ksort( $comments );
  return $comments;
}
492
493
/**
 * Import a single blojsom post into drupal.
 * (if the dryrun flag is set, this fn will do everything except
 * actually inserting stuff into the drupal DB)
 *
 * The post becomes a published 'blog' node, and each of its comments is
 * added to that node as a top-level comment from the anonymous user.
 *
 * Calls die() if the post has an author who isn't a registered drupal
 * user, or if a comment can't be saved.
 *
 * @param $post The post array (as returned by _blojsom2drupal_load_post(),
 *              with an extra 'author' member; an empty author means the
 *              anonymous user)
 * @param $terms Array of drupal taxonomy term IDs to apply to the
 * imported post.
 */
function _blojsom2drupal_import_post( $post, $terms )
{
  global $_blojsom2drupal_config;
  $cfg = &$_blojsom2drupal_config;

  if( $post['author'] ) {
    $uid = _blojsom2drupal_lookup_user( $post['author'] );
    if( !$uid )
      die( "user '{$post['author']}' is not registered in drupal!\n" );
  } else {
    $uid = 0; // anonymous user
  }

  /* use drupal break-indicator instead of the showmemore one (used by node_teaser()) */
  if( array_key_exists( 'smmtag', $cfg ) )
    $bodytext = str_replace( $cfg['smmtag'], '<!--break-->', $post['text'] );
  else
    $bodytext = $post['text'];

  $n = new StdClass();
  $n->type = 'blog';
  $n->title = $post['title'];
  $n->uid = $uid;
  $n->status = 1; /* published */
  $n->promote = 0;
  $n->created = $post['timestamp'];
  $n->changed = $post['timestamp'];
  $n->comment = 2; /* allowed to read/write comments */
  $n->moderate = 0;
  $n->body = $bodytext;
  $n->sticky = 0;
  $n->format = 3; /* Full HTML */
  $n->teaser = node_teaser( $n->body, $n->format ); /* generate summary text for node */

  if( $cfg['verbose'] ) {
    /* the key must be quoted: a bare comments is an undefined constant */
    printf( "%s: '%s' (%d cmts)\n",
      $post['author'] ? $post['author'] : 'anonymous',
      $n->title, count( $post['comments'] ) );
  }

  /* no node is saved on a dry run, so there's no node id; use 0 */
  $nid = 0;
  if( !$cfg['dryrun'] ) {
    node_save( $n );
    $nid = $n->nid;

    /* tag node with taxonomy terms */
    if( $terms )
      taxonomy_node_save( $nid, $terms );
  }

  /* now import any attached comments */

  foreach( $post['comments'] as $srccomment ) {
    $c = array(
      'pid' => 0, /* parent comment (0=none) */
      'nid' => $nid,
      'uid' => 0, /* TODO - try and resolve name field? */
      'subject' => NULL, /* blojsom comments have no title */
      'comment' => $srccomment['text'],
      'format' => 3, /* Full HTML */
      'hostname' => '', /* TODO - some comments do have IP address we could put here... */
      'timestamp' => $srccomment['timestamp'],
      'name' => $srccomment['name'],
      'mail' => $srccomment['email'],
      'homepage' => $srccomment['url'],
    );

    if( !$cfg['dryrun'] ) {
      $cid = _blojsom2drupal_comment_save($c);
      if( !$cid )
        die( "add comment failed\n" );
    }
  }
}
577
578
579
580
581
582
583 /* NOTE:
584 _blojsom2drupal_build_comment_thread()
585 and
586 _blojsom2drupal_comment_save()
587 are both cut&pasted from the jive2drupal module. Lots of other modules
588 seem to have similar functions, so maybe it's time to submit a patch
589 to the drupal core...
590 */
591
592
593
/**
 * Custom version of comment_save(), to add a new comment to a node.
 * The real comment_save() in comment.module blats over various fields we
 * want to bring over from the blojsom data. Like timestamp, user details etc...
 *
 * It'd be nice if comment.module provided some lower-level access, but it
 * doesn't.
 *
 * The comment is stored as published, with the thread field worked out
 * by _blojsom2drupal_build_comment_thread(). After a successful insert
 * the node's comment statistics are updated and the other modules are
 * told about the new comment.
 *
 * @param $edit
 * A comment array, with 'nid', 'pid', 'uid', 'subject', 'comment',
 * 'format', 'hostname', 'timestamp', 'name', 'mail' and 'homepage'
 * members.
 *
 * @return
 * If the comment is successfully saved the comment ID is returned. If the comment
 * is not saved, FALSE is returned.
 */
function _blojsom2drupal_comment_save($edit) {
  $thread = _blojsom2drupal_build_comment_thread( $edit );

  $edit['cid'] = db_next_id('{comments}_cid');

  $status = COMMENT_PUBLISHED;
  $score = 0; // 0 default value, comments get higher score depending on the author's roles
  $users = serialize(array(0 => 1)); // default value for everybody!!

  $result = db_query("INSERT INTO {comments} (cid, nid, pid, uid, subject, comment, format, hostname, timestamp, status, score, users, thread, name, mail, homepage) VALUES (%d, %d, %d, %d, '%s', '%s', %d, '%s', %d, %d, %d, '%s', '%s', '%s', '%s', '%s')", $edit['cid'], $edit['nid'], $edit['pid'], $edit['uid'], $edit['subject'], $edit['comment'], $edit['format'], $edit['hostname'], $edit['timestamp'], $status, $score, $users, $thread, $edit['name'], $edit['mail'], $edit['homepage']);

  // The caller tests for FALSE, so a failed insert has to be reported
  // (and must not go on to update statistics or notify the other modules).
  if (!$result) {
    return FALSE;
  }

  _comment_update_node_statistics($edit['nid']);

  // Tell the other modules a new comment has been submitted.
  comment_invoke_comment($edit, 'insert');

  return $edit['cid'];
}
627
628
629
630
/**
 * Work out the value of the thread field for a new comment.
 * NOTE: the logic is lifted from comment.module (see the comment in
 * comment_render() there for what the thread field is).
 *
 * @param $edit
 * The comment data ('nid' and 'pid' are used)
 *
 * @return
 * string to use as thread field
 */
function _blojsom2drupal_build_comment_thread( $edit )
{
  $nid = $edit['nid'];

  if ($edit['pid'] == 0) {
    // No parent comment (depth 0): take the highest thread value on the
    // node, drop its trailing "/", and go one past it.
    $top = db_result(db_query('SELECT MAX(thread) FROM {comments} WHERE nid = %d', $nid));
    $top = rtrim($top, '/');

    return int2vancode(vancode2int($top) + 1) .'/';
  }

  // A reply: the new thread value is the parent's thread plus one more
  // part, numbered after the parent's existing children.
  $parent = _comment_load($edit['pid']);

  // Strip the "/" from the end of the parent thread.
  $parent->thread = (string) rtrim((string) $parent->thread, '/');
  $prefix = $parent->thread;

  // Get the max value in _this_ thread.
  $highest = db_result(db_query("SELECT MAX(thread) FROM {comments} WHERE thread LIKE '%s.%%' AND nid = %d", $prefix, $nid));

  if ($highest == '') {
    // First child of this parent.
    return $prefix .'.'. int2vancode(0) .'/';
  }

  // Strip the "/" at the end of the thread, then pick out the part at
  // the depth just below the parent.
  $highest = rtrim($highest, '/');
  $parts = explode('.', $highest);
  $parent_depth = count(explode('.', $prefix));
  $last = $parts[$parent_depth];

  return $prefix .'.'. int2vancode(vancode2int($last) + 1) .'/';
}
690
691

  ViewVC Help
Powered by ViewVC 1.1.2