/[drupal]/contributions/modules/oai_pmh/oai_pmh.module
ViewVC logotype

Contents of /contributions/modules/oai_pmh/oai_pmh.module

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.7 - (show annotations) (download) (as text)
Sat Apr 11 20:37:20 2009 UTC (7 months, 2 weeks ago) by pletcher
Branch: MAIN
CVS Tags: HEAD
Changes since 1.6: +65 -46 lines
File MIME type: text/x-php
Initial D6 port
1 <?php
2 // $Id: oai_pmh.module,v 1.6 2009/02/03 11:20:52 pletcher Exp $
3
/**
 * Implementation of hook_help().
 *
 * @param $path
 *   The path for which help text is requested.
 * @param $arg
 *   Path arguments supplied by the help system; not used here.
 *
 * @return
 *   Help markup for the "admin/help#oai-pmh" path, or an empty string for
 *   every other path.
 */
function oai_pmh_help($path, $arg) {
  if ($path == 'admin/help#oai-pmh') {
    return '<p>'. t('Module for harvesting OAI-PMH data from a repository, and storing it locally for manipulation by Drupal') .'</p>';
  }

  return '';
}
15
/**
 * Implementation of hook_perm().
 *
 * @return
 *   List of the permission names defined by this module.
 */
function oai_pmh_perm() {
  $permissions = array();
  $permissions[] = 'administer oai pmh';

  return $permissions;
}
21
/**
 * Implementation of hook_menu().
 *
 * Registers the settings form plus four callback paths (harvest, identify,
 * reset, unpub).
 *
 * @return
 *   Menu item definitions keyed by path.
 */
function oai_pmh_menu() {
  return array(
    // Settings form, built by oai_pmh_admin().
    'admin/settings/oai_pmh' => array(
      'title' => 'OAI-PMH Settings',
      'description' => 'Settings page for OAI-PMH module',
      'page callback' => 'drupal_get_form',
      'page arguments' => array('oai_pmh_admin'),
      'access arguments' => array('administer oai pmh'),
      'type' => MENU_NORMAL_ITEM,
    ),
    // NOTE(review): this path runs a full harvest and only requires
    // 'access content' -- confirm that is intentional.
    'oai_pmh/harvest' => array(
      'title' => 'Harvesting from OAI repo',
      'page callback' => 'oai_pmh_cron',
      'access arguments' => array('access content'),
      'type' => MENU_CALLBACK,
    ),
    'oai_pmh/identify' => array(
      'title' => 'Getting Identify response from OAI repo',
      'page callback' => 'oai_pmh_get_identify',
      'access arguments' => array('administer oai pmh'),
      'type' => MENU_CALLBACK,
    ),
    'oai_pmh/reset' => array(
      'title' => 'Resetting Repo',
      'page callback' => 'oai_pmh_reset',
      'access arguments' => array('administer oai pmh'),
      'type' => MENU_CALLBACK,
    ),
    'oai_pmh/unpub' => array(
      'title' => 'Unpublish a set',
      'page callback' => 'oai_pmh_unpub',
      'access arguments' => array('administer oai pmh'),
      'type' => MENU_CALLBACK,
    ),
  );
}
61
/**
 * Page callback for "oai_pmh/unpub".
 *
 * At present this only runs the lookup query and dumps the raw query result
 * with print_r(); no node is changed.
 *
 * @param $hdl
 *   Value matched against field_set_value in {content_field_set}.
 */
function oai_pmh_unpub($hdl) {
  $sql = "SELECT nid FROM {content_field_set} WHERE field_set_value = '%s'";
  $matches = db_query($sql, $hdl);

  print_r($matches);
}
66
67
/**
 * Page callback for "oai_pmh/reset": forgets harvest progress and deletes
 * every harvested node.
 *
 * For each configured repository the last-fetch marker is rewound to the
 * stored earliest datestamp, so the next harvest starts from the beginning.
 * All nodes of type 'oai_pmh_record' are then deleted.
 *
 * The per-repository vocabulary id ('oai_pmh_<repo>_taxo') is deliberately
 * left alone: it is assigned when the repository is registered, and
 * overwriting it with a fixed number would point every repository at the same
 * (possibly non-existent) vocabulary.
 */
function oai_pmh_reset() {
  // The list is stored with a leading comma (",a,b"), hence the substr().
  $repos_str = variable_get('oai_pmh_repos', '');
  $repos = ($repos_str == '') ? array() : explode(',', substr($repos_str, 1));

  foreach ($repos as $repo) {
    $earliest = variable_get('oai_pmh_'. $repo .'_earlist_datestamp', FALSE);
    variable_set('oai_pmh_'. $repo .'_last_fetch', $earliest);
  }

  $result = db_query("SELECT n.nid FROM {node} n WHERE n.type = 'oai_pmh_record'");

  while ($row = db_fetch_array($result)) {
    node_delete($row['nid']);
  }
}
91
/**
 * Form builder for the OAI-PMH settings page.
 *
 * Shows, for every registered repository, its identifier, its base URL and
 * the sets that may be enabled for harvesting. A name left in the
 * 'oai_pmh_new' variable is turned into a new repository while the form is
 * built: a vocabulary is created for it and the name is appended to
 * 'oai_pmh_repos'.
 *
 * @return
 *   The form array wrapped by system_settings_form().
 */
function oai_pmh_admin() {
  $form = array();

  // The list is stored with a leading comma (",a,b"), hence the substr().
  $repos_str = variable_get('oai_pmh_repos', '');
  $repos = ($repos_str == '') ? array() : explode(',', substr($repos_str, 1));

  foreach ($repos as $repo) {
    $form['oai_pmh_'. $repo .'_name'] = array(
      '#type' => 'item',
      '#title' => t('Short Internal Identifier'),
      // The identifier is data, not interface text, so it is escaped rather
      // than passed through t().
      '#value' => check_plain($repo),
    );

    $form['oai_pmh_'. $repo .'_repo'] = array(
      '#type' => 'textfield',
      '#title' => t('URL of the OAI implementation'),
      '#default_value' => variable_get('oai_pmh_'. $repo .'_repo', ''),
      '#size' => 64,
      '#maxlength' => 128,
      '#description' => t("This is the repository."),
      '#required' => TRUE,
    );

    // Set names come from the remote repository; escape them before they are
    // used as checkbox labels.
    $sets = variable_get('oai_pmh_'. $repo .'_sets', array());
    foreach ($sets as $k => $v) {
      $sets[$k] = check_plain($v .' - '. $k);
    }

    $form['oai_pmh_'. $repo .'_enabled_sets'] = array(
      '#type' => 'checkboxes',
      '#title' => t('Sets'),
      '#default_value' => variable_get('oai_pmh_'. $repo .'_enabled_sets', array()),
      '#options' => $sets,
      '#description' => t('Determine which sets are harvested via OAI-PMH'),
    );
  }

  // Sanitise BEFORE testing for emptiness, so a whitespace-only name cannot
  // register a repository with an empty identifier.
  // Escape spaces, other chars to underscores.
  $new = preg_replace('/[^a-z0-9_-]+/i', '_', trim(variable_get('oai_pmh_new', '')));

  if ($new != '') {
    // Consume the request up front so a failure is not retried on every
    // page load.
    variable_set('oai_pmh_new', '');

    if (in_array($new, $repos, TRUE)) {
      drupal_set_message(t('A repository named %name already exists.', array('%name' => $new)), 'warning');
    }
    else {
      // Create new taxonomy, set the type, and return the vid.
      $vid = _oai_pmh_new_taxo($new);

      if ($vid === NULL) {
        drupal_set_message(t('Could not create a vocabulary for the repository %name.', array('%name' => $new)), 'error');
      }
      else {
        variable_set('oai_pmh_'. $new .'_taxo', $vid);
        variable_set('oai_pmh_repos', $repos_str .','. $new);

        $form['oai_pmh_'. $new .'_name'] = array(
          '#type' => 'item',
          '#title' => t('Short Identifier'),
          '#value' => $new,
          '#description' => t("This is a short name of the repository."),
        );

        $form['oai_pmh_'. $new .'_repo'] = array(
          '#type' => 'textfield',
          '#title' => t('URL of the OAI implementation'),
          '#default_value' => variable_get('oai_pmh_'. $new .'_repo', ''),
          '#size' => 64,
          '#maxlength' => 128,
          '#description' => t("This is the repository."),
          '#required' => TRUE,
        );
      }
    }
  }

  $form['oai_pmh_new'] = array(
    '#type' => 'textfield',
    '#title' => t('Add a new Repository'),
    '#default_value' => '',
    '#size' => 32,
    '#maxlength' => 164,
    '#description' => t("This is a short name of the new repository. Please only use alphanumeric, no spaces."),
  );

  return system_settings_form($form);
}
184
/**
 * Sends an OAI-PMH Identify request and stores what the repository reports.
 *
 * This should be run during install, and have an option to run again.
 *
 * For each repository the deleted-record policy, earliest datestamp,
 * datestamp granularity, supported compression and the list of sets are
 * saved in 'oai_pmh_<repo>_*' variables. Any malformed or unsupported
 * response ends the request with die().
 *
 * @param $repo
 *   Internal identifier of one repository, or '' to process every repository
 *   listed in 'oai_pmh_repos'.
 */
function oai_pmh_get_identify($repo='') {
  if ($repo == '') {
    // The list is stored with a leading comma (",a,b"). An empty list must
    // stay empty: explode() on '' would otherwise yield one repository
    // named ''.
    $repos_str = variable_get('oai_pmh_repos', '');
    $repos = ($repos_str == '') ? array() : explode(',', substr($repos_str, 1));
  }
  else {
    $repos = array($repo);
  }

  foreach ($repos as $repo) {
    $prefix = 'oai_pmh_'. $repo;

    $url = variable_get($prefix .'_repo', '') .'?verb=Identify';

    $xml = file_get_contents($url);

    // NOTE(review): utf8_encode() treats its input as ISO-8859-1; kept from
    // the original ("Sam says it's important") -- confirm it is still wanted
    // for repositories that already answer in UTF-8.
    $xml = utf8_encode($xml);

    // Returns FALSE on error.
    $s = simplexml_load_string($xml);

    if ($s === FALSE) {
      die('SimpleXML load string error: '. $xml);
    }

    $ident = $s->Identify;

    // Things which must come back, or die.
    // Protocol version.
    if (!isset($ident->protocolVersion)) {
      die('Incorrect Identify Response from Repository -- No protocolVersion');
    }
    if ((string) $ident->protocolVersion != '2.0') {
      die('Incorrect Identify Response from Repository -- Unsupport Protcool Version');
    }

    // Deleted-record policy.
    if (!isset($ident->deletedRecord)) {
      die('Incorrect Identify Response from Repository -- No deleteRecord');
    }
    variable_set($prefix .'_deleted_record', (string) $ident->deletedRecord);

    // Earliest datestamp.
    if (!isset($ident->earliestDatestamp)) {
      die('Incorrect Identify Response from Repository -- No Earlist Datestamp');
    }
    variable_set($prefix .'_earlist_datestamp', (string) $ident->earliestDatestamp);

    // Granularity, recognised by the length of the advertised pattern.
    if (!isset($ident->granularity)) {
      die('Incorrect Identify Response from Repository -- No Granularity');
    }
    $pattern_length = strlen((string) $ident->granularity);
    // Magic number from strlen(YYYY-MM-DD).
    if ($pattern_length == 10) {
      variable_set($prefix .'_granularity', 'days');
    }
    // Magic number from strlen(YYYY-MM-DDThh:mm:ssZ).
    elseif ($pattern_length == 20) {
      variable_set($prefix .'_granularity', 'seconds');
    }
    else {
      die('Incorrect Identify Response from Repository -- Unknown Granularity');
    }

    // Optional things, which are nice to have.
    // According to HTTP 1.1 RFC 2616 there is also the Lempel-Ziv-Welch
    // compression, which in theory could be supported. However, PHP doesn't
    // seem to play nice with it, so only gzip and deflate are recognised.
    $gzip = FALSE;
    $deflate = FALSE;
    if (isset($ident->compression)) {
      foreach ($ident->compression as $encoding) {
        $encoding = (string) $encoding;
        if ($encoding == 'gzip') {
          $gzip = TRUE;
        }
        elseif ($encoding == 'deflate') {
          $deflate = TRUE;
        }
      }
    }
    // Always write all three flags so values from an earlier Identify run
    // cannot go stale, and only claim compression when one of the two
    // encodings oai_pmh_get_repo() can handle is actually offered.
    variable_set($prefix .'_compression', $gzip || $deflate);
    variable_set($prefix .'_compression_gzip', $gzip);
    variable_set($prefix .'_compression_deflate', $deflate);

    // Get possible sets from our repo.
    variable_set($prefix .'_sets', oai_pmh_get_sets($repo));
  }
}
286
/**
 * Performs one HTTP GET against a repository and returns the response body.
 *
 * When the stored Identify data says the repository supports gzip or deflate,
 * the request goes through cURL with CURLOPT_ENCODING, which sends the
 * Accept-Encoding header and decodes the response. This also copes with a
 * server that answers uncompressed. Otherwise the body is fetched with
 * file_get_contents().
 *
 * @param $request
 *   Query string to append to the repository base URL, including the
 *   leading '?'.
 * @param $repo
 *   Internal identifier of the repository.
 *
 * @return
 *   The decoded response body. The uncompressed path returns whatever
 *   file_get_contents() returns, which is FALSE on failure. A cURL error
 *   ends the request with die().
 */
function oai_pmh_get_repo($request, $repo) {
  $url = variable_get('oai_pmh_'. $repo .'_repo', '') . $request;

  // Pick the encoding to ask for; gzip wins when both are available.
  $encoding = '';
  if (variable_get('oai_pmh_'. $repo .'_compression', FALSE)) {
    if (variable_get('oai_pmh_'. $repo .'_compression_gzip', FALSE)) {
      $encoding = 'gzip';
    }
    elseif (variable_get('oai_pmh_'. $repo .'_compression_deflate', FALSE)) {
      $encoding = 'deflate';
    }
  }

  if ($encoding == '') {
    // No usable compression, old school grab.
    return file_get_contents($url);
  }

  $curl = curl_init($url);
  curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
  curl_setopt($curl, CURLOPT_ENCODING, $encoding);

  // Do work.
  $data = curl_exec($curl);

  // Check work.
  if (curl_errno($curl)) {
    die(curl_error($curl));
  }

  // Clean up.
  curl_close($curl);

  return $data;
}
344
/**
 * Fetches the sets a repository offers via the OAI-PMH ListSets verb.
 *
 * Follows resumption tokens, so a set list delivered in several chunks is
 * collected completely. A repository that reports 'noSetHierarchy' (it has
 * no sets) yields an empty list instead of a fatal error. Any other
 * protocol error, or an unparsable response, ends the request with die().
 *
 * @param $repo
 *   Internal identifier of the repository.
 *
 * @return
 *   Array of set names keyed by setSpec; empty when the repository has no
 *   sets.
 */
function oai_pmh_get_sets($repo) {
  $sets = array();
  $request = '?verb=ListSets';

  do {
    $xml = oai_pmh_get_repo($request, $repo);

    $s = simplexml_load_string($xml);

    if ($s === FALSE) {
      die('SimpleXML load string error: '.$xml);
    }

    if (isset($s->error)) {
      if ('noSetHierarchy' == (string) $s->error['code']) {
        return array();
      }
      die('Error from server, code -- '. $s->error['code'] .
        ' -- value -- '. (string)$s->error);
    }

    foreach ($s->ListSets->set as $set) {
      $sets[(string) $set->setSpec] = (string) $set->setName;
    }

    // An absent or empty resumptionToken marks the last chunk.
    $token = '';
    if (isset($s->ListSets->resumptionToken)) {
      $token = trim((string) $s->ListSets->resumptionToken);
    }
    $request = '?verb=ListSets&resumptionToken='. rawurlencode($token);
  }
  while ($token != '');

  return $sets;
}
367
/**
 * Harvests all new records of one repository.
 *
 * Issues ListRecords requests starting at the stored last-fetch date,
 * falling back to the repository's earliest datestamp. One request is made
 * per enabled set, or a single unrestricted request when the repository has
 * no set configuration. Afterwards the time taken before fetching is stored
 * as the new last-fetch date.
 *
 * To force a full re-import, clear 'oai_pmh_<repo>_last_fetch' so that the
 * earliest datestamp is used again.
 *
 * @param $repo
 *   Internal identifier of the repository; required.
 *
 * @return
 *   Array with a single key 'store' holding the de-duplicated record objects
 *   produced by oai_pmh_do_work().
 */
function oai_pmh_main($repo='') {
  if ($repo == '') {
    die('No repo passed into oai_pmh_main()');
  }

  // Get current time before we start fetching. Fetching shouldn't take
  // long, but in theory there could be a really small window in which
  // a record could be created and not fetched.
  //
  // The trailing "Z" means UTC, so the stamp is built with gmdate(), and
  // minutes are "i" ("m" is the month).
  $granularity = variable_get('oai_pmh_'. $repo .'_granularity', 'days');
  if ('seconds' == $granularity) {
    $now = gmdate('Y-m-d\TH:i:s\Z');
  }
  elseif ('days' == $granularity) {
    $now = gmdate('Y-m-d');
  }
  else {
    die('Unsuported granularity returned');
  }

  // Get the last fetch date; if there is none use the earliest datestamp,
  // and if that is missing too, die.
  $from = variable_get('oai_pmh_'. $repo .'_last_fetch',
    variable_get('oai_pmh_'. $repo .'_earlist_datestamp', FALSE));

  if (!$from) {
    die('Unable to get a date to fetch from');
  }

  // array_merge() needs a real array to start from.
  $record_store = array('store' => array());

  $enabled_sets = variable_get('oai_pmh_'. $repo .'_enabled_sets', array());

  if (count($enabled_sets) > 0) {
    foreach ($enabled_sets as $k) {
      // Only string entries are treated as set specs; other values are
      // skipped.
      if (is_string($k)) {
        // Build request for first pass. Clean date to make url safe.
        $request = '?verb=ListRecords&from='. _oai_pmh_clean_url($from) .
          '&set='. $k .'&metadataPrefix=oai_dc';
        $merge = oai_pmh_do_work($repo, $request, $k);
        if (is_array($merge)) {
          $record_store['store'] = array_merge($record_store['store'], $merge);
        }
      }
    }
  }
  else {
    // No sets configured: harvest everything. No set tag is passed; the
    // third argument of oai_pmh_do_work() is a set spec, not a record list.
    $request = '?verb=ListRecords&from='. _oai_pmh_clean_url($from) .
      '&metadataPrefix=oai_dc';
    $merge = oai_pmh_do_work($repo, $request);
    if (is_array($merge)) {
      $record_store['store'] = array_merge($record_store['store'], $merge);
    }
  }

  // Store "current" time as our last fetched time for next iteration.
  variable_set('oai_pmh_'. $repo .'_last_fetch', $now);

  $record_store['store'] = array_array_unique($record_store['store']);

  return $record_store;
}
440
441
/**
 * Runs one ListRecords harvest, following resumption tokens, and converts
 * every returned record into a node-like object.
 *
 * A single 'badResumptionToken' error is tolerated: the collected records
 * are discarded and the original request is restarted. 'noRecordsMatch' is
 * not treated as an error. Any other protocol error, or an unparsable
 * response, ends the request with die().
 *
 * Records without a metadata section (deleted records) are skipped.
 *
 * @param $repo
 *   Internal identifier of the repository.
 * @param $request
 *   Query string of the first request, including the leading '?'.
 * @param $tag
 *   Optional setSpec the request is restricted to. When given, it is stored
 *   in field_record_set and the set's name is added to the record's tags.
 *
 * @return
 *   Array of stdClass objects ready for node_submit(); always an array.
 */
function oai_pmh_do_work($repo, $request, $tag = '') {
  // Oneshot var to allow a single bad resumption token.
  $bad_resumpt_token = FALSE;

  // Temp storage of results, will be merged with big array.
  $rs = array();

  // Save this in case of error later.
  $orig_request = $request;

  // Only a non-empty string identifies a set.
  $has_tag = is_string($tag) && $tag != '';
  $sets = $has_tag ? variable_get('oai_pmh_'. $repo .'_sets', array()) : array();
  if (!is_array($sets)) {
    $sets = array();
  }

  // Vocabulary that receives the free-tagging terms; '' disables tagging.
  $tax = variable_get('oai_pmh_'. $repo .'_taxo', '');

  do {
    $continue = FALSE;

    $xml = oai_pmh_get_repo($request, $repo);

    $s = simplexml_load_string($xml);

    if ($s === FALSE) {
      die('SimpleXML load string error: '. $xml);
    }

    // If error element is set, we have a problem. Blow up before the
    // foreach blows up for us. More info this way too.
    if (isset($s->error)) {
      $code = (string) $s->error['code'];

      // First bad resumption token: start over. The result list is reset to
      // an empty array (not unset) so the function still returns an array
      // if the retry yields nothing.
      if ('badResumptionToken' == $code && !$bad_resumpt_token) {
        $bad_resumpt_token = TRUE;
        $continue = TRUE;
        $rs = array();
        $request = $orig_request;
      }
      // A second bad token, or any other error, is fatal. noRecordsMatch is
      // ignored: it just means there are no new records.
      elseif ('noRecordsMatch' != $code) {
        die('Error from server, code -- '. $s->error['code'] .
          ' -- value -- '. (string)$s->error);
      }
    }
    else {
      foreach ($s->ListRecords->record as $item) {
        // Deleted records carry a header only; there is nothing to import.
        if (!isset($item->metadata) || (string) $item->header['status'] == 'deleted') {
          continue;
        }

        $oai_dc = $item->metadata->children('http://www.openarchives.org/OAI/2.0/oai_dc/');
        $dc_node = $oai_dc->children('http://purl.org/dc/elements/1.1/');

        $values = new stdClass();

        $values->type = 'oai_pmh_record';
        $values->title = (string) $dc_node->title;
        $values->body = (string) $dc_node->description;
        $values->status = 1;
        $values->name = 'admin';

        $values->field_record_identifier[0]['value'] = (string) $item->header->identifier;
        $values->field_record_date[0]['value'] = (string) $dc_node->date[0];
        $values->field_record_creator[0]['value'] = (string) $dc_node->creator;
        $values->field_record_language[0]['value'] = (string) $dc_node->language;
        $values->field_record_type[0]['value'] = (string) $dc_node->type;

        // Tags: every dc:subject, plus the set name when harvesting by set.
        $tags = array();
        if (isset($dc_node->subject)) {
          foreach ($dc_node->subject as $sub) {
            $tags[] = (string) $sub;
          }
        }

        if ($has_tag) {
          // Set membership is recorded for every record of the set, whether
          // or not the record has subjects.
          $values->field_record_set[0]['value'] = $tag;
          if (isset($sets[$tag])) {
            $tags[] = $sets[$tag];
          }
        }

        if ($tax != '' && !empty($tags)) {
          $values->taxonomy['tags'][$tax] = implode(',', $tags);
        }

        $rs[] = $values;
      }

      // If a resumption token is set, and it is non-empty, fetch the next
      // chunk. The last chunk comes back with an empty self-closing tag.
      $token = '';
      if (isset($s->ListRecords->resumptionToken)) {
        $token = (string) $s->ListRecords->resumptionToken;
      }
      if ('' != $token) {
        $continue = TRUE;
        $request = '?verb=ListRecords&resumptionToken='. rawurlencode($token);
      }
    }
  }
  while ($continue);

  return $rs;
}
555
/**
 * Saves harvested records as nodes, printing a progress line per record.
 *
 * Eventually deletion stuff will go here also.
 *
 * @param $arr
 *   Array whose 'store' key holds the record objects to save.
 */
function oai_pmh_insert($arr) {
  $records = $arr['store'];
  echo "### ". count($records) ."\n";

  $position = 0;
  foreach ($records as $record) {
    echo "#". $position ."\n";
    $position++;

    // node_save() needs a variable, so keep the prepared node in one.
    $node = node_submit($record);
    node_save($node);

    ob_flush();
  }
}
567
/**
 * Percent-encodes the characters that are unsafe inside a query value.
 *
 * strtr() with a map replaces everything in a single pass, so the '%'
 * introduced by one replacement is never encoded a second time, and a
 * literal '%' in the input is encoded as well.
 *
 * @param $str
 *   Raw value, e.g. a datestamp.
 *
 * @return
 *   The value with % / ? # = & : ; space and + percent-encoded.
 */
function _oai_pmh_clean_url($str) {
  $map = array(
    '%' => '%25',
    '/' => '%2F',
    '?' => '%3F',
    '#' => '%23',
    '=' => '%3D',
    '&' => '%26',
    ':' => '%3A',
    ';' => '%3B',
    ' ' => '%20',
    '+' => '%2B',
  );

  return strtr($str, $map);
}
578
/**
 * Creates the free-tagging vocabulary used for one repository's records.
 *
 * @param $name
 *   Internal identifier of the repository; used in the vocabulary name and
 *   description.
 *
 * @return
 *   The vid of the vocabulary, or NULL when it cannot be determined.
 */
function _oai_pmh_new_taxo($name) {
  $vocabulary_name = 'OAI PMH '. $name;

  // Create a new Taxonomy for the new repo.
  $taxo = array(
    'name' => $vocabulary_name,
    'description' => 'Taxonomy for the '. $name .' Repository',
    'help' => 'Auto-generated Taxonomy from OAI PMH Module',
    'relations' => 0,
    'hierarchy' => 0,
    'multiple' => 0,
    'tags' => 1,
    'weight' => 0,
    'nodes' => array(
      // This seems backwards, but the node type is the key.
      'oai_pmh_record' => 'type'
    ),
  );

  taxonomy_save_vocabulary($taxo);

  // taxonomy_save_vocabulary() takes the array by reference and fills in the
  // new vid; prefer it, because a lookup by name could hit an older
  // vocabulary that happens to share the name.
  if (!empty($taxo['vid'])) {
    return $taxo['vid'];
  }

  // Fallback: find the vocabulary by name among those attached to the type.
  foreach (taxonomy_get_vocabularies('oai_pmh_record') as $vocabulary) {
    if ($vocabulary->name == $vocabulary_name) {
      return $vocabulary->vid;
    }
  }

  return NULL;
}
608
/**
 * Implementation of hook_cron(); also the page callback of "oai_pmh/harvest".
 *
 * Refreshes the Identify data of all repositories, then harvests and saves
 * new records for every repository whose fetch interval has elapsed.
 * Progress output is printed inside a <pre> element.
 */
function oai_pmh_cron() {
  print "<pre>";
  set_time_limit(0);

  // The list is stored with a leading comma (",a,b"). An empty list must
  // stay empty: explode() on '' would otherwise yield one repository
  // named ''.
  $repos_str = variable_get('oai_pmh_repos', '');
  $repos = ($repos_str == '') ? array() : explode(',', substr($repos_str, 1));

  if (!empty($repos)) {
    oai_pmh_get_identify();
  }

  foreach ($repos as $repo) {
    $last = variable_get('oai_pmh_'. $repo .'_last_fetch', NULL);
    $gran = variable_get('oai_pmh_'. $repo .'_granularity', NULL);

    // Time between fetches in seconds: 6 hours for second granularity,
    // otherwise 24 hours.
    $default_interval = ($gran == 'seconds') ? 21600 : 86400;
    $interval = variable_get('oai_pmh_'. $repo .'_interval', $default_interval);

    // Not due yet: skip this repository, but keep going with the others.
    if ($last != NULL && strtotime($last) >= (time() - $interval)) {
      continue;
    }

    $ret = oai_pmh_main($repo);
    flush();
    // ob_flush() raises a notice when no output buffer is active.
    if (ob_get_level() > 0) {
      ob_flush();
    }
    oai_pmh_insert($ret);
  }

  print "</pre>";
}
639
/**
 * Removes duplicate elements from an array whose elements may be arrays or
 * objects.
 *
 * From php.net. Elements are compared through their serialized form; the
 * first occurrence of each value is kept under its original key.
 *
 * @param $my_array
 *   The array to de-duplicate. Anything that is not an array is returned
 *   unchanged.
 *
 * @return
 *   The array without duplicate elements, keys preserved.
 */
function array_array_unique($my_array) {
  if (!is_array($my_array)) {
    return $my_array;
  }

  $serialized = array_map('serialize', $my_array);
  $distinct = array_unique($serialized);

  return array_map('unserialize', $distinct);
}
659

  ViewVC Help
Powered by ViewVC 1.1.2