| 1 |
<?php
|
| 2 |
// $Id: oai_pmh.module,v 1.6 2009/02/03 11:20:52 pletcher Exp $
|
| 3 |
|
| 4 |
/**
 * Drupal hook_help() implementation for the oai_pmh module.
 *
 * @param $path
 *   Path for which help text is being requested.
 * @param $arg
 *   Not used by this implementation.
 *
 * @return
 *   A paragraph of help HTML for "admin/help#oai-pmh"; an empty string for
 *   every other path.
 */
function oai_pmh_help($path, $arg) {
  if ($path == 'admin/help#oai-pmh') {
    // The displayed sentence used to read "respoitory"; the spelling is fixed
    // here, which also changes the source string seen by translators.
    return '<p>'. t("Module for harvesting OAI-PMH data from a repository, and storing it locally for manipulation by Drupal") .'</p>';
  }

  return '';
}
|
| 15 |
|
| 16 |
/**
 * Drupal hook_perm() implementation.
 *
 * @return
 *   List containing the one permission name this module declares.
 */
function oai_pmh_perm() {
  $permissions = array();
  $permissions[] = 'administer oai pmh';

  return $permissions;
}
|
| 21 |
|
| 22 |
/**
 * Drupal hook_menu() implementation.
 *
 * Registers the settings form plus four bare callbacks (harvest, identify,
 * reset, unpub).
 *
 * NOTE(review): 'oai_pmh/harvest' only requires 'access content', so any
 * visitor holding that permission can start a harvest run -- confirm this is
 * intentional (e.g. for an external cron trigger).
 *
 * @return
 *   Menu item definitions keyed by path.
 */
function oai_pmh_menu() {
  // All administrative routes share the same access requirement.
  $admin_access = array('administer oai pmh');

  return array(
    'admin/settings/oai_pmh' => array(
      'title' => 'OAI-PMH Settings',
      'description' => 'Settings page for OAI-PMH module',
      'page callback' => 'drupal_get_form',
      'page arguments' => array('oai_pmh_admin'),
      'access arguments' => $admin_access,
      'type' => MENU_NORMAL_ITEM,
    ),
    'oai_pmh/harvest' => array(
      'title' => 'Harvesting from OAI repo',
      'page callback' => 'oai_pmh_cron',
      'access arguments' => array('access content'),
      'type' => MENU_CALLBACK,
    ),
    'oai_pmh/identify' => array(
      'title' => 'Getting Identify response from OAI repo',
      'page callback' => 'oai_pmh_get_identify',
      'access arguments' => $admin_access,
      'type' => MENU_CALLBACK,
    ),
    'oai_pmh/reset' => array(
      'title' => 'Resetting Repo',
      'page callback' => 'oai_pmh_reset',
      'access arguments' => $admin_access,
      'type' => MENU_CALLBACK,
    ),
    'oai_pmh/unpub' => array(
      'title' => 'Unpublish a set',
      'page callback' => 'oai_pmh_unpub',
      'access arguments' => $admin_access,
      'type' => MENU_CALLBACK,
    ),
  );
}
|
| 61 |
|
| 62 |
/**
 * Menu callback for 'oai_pmh/unpub': looks up the nodes belonging to a set.
 *
 * NOTE(review): despite its menu title ("Unpublish a set") this only runs the
 * lookup query and dumps the raw result handle; nothing is unpublished yet.
 *
 * @param $hdl
 *   Set value matched against {content_field_set}.field_set_value. Optional
 *   so that a request without a trailing path argument no longer fails on a
 *   missing argument.
 *
 * @return
 *   MENU_NOT_FOUND when no set value was supplied; otherwise nothing.
 */
function oai_pmh_unpub($hdl = '') {
  // Without a set value there is nothing to look up; let Drupal answer 404
  // instead of querying for an empty value.
  if ($hdl === '' || $hdl === NULL) {
    return MENU_NOT_FOUND;
  }

  $res = db_query("SELECT nid FROM {content_field_set} WHERE field_set_value = '%s'", $hdl);
  print_r($res);
}
|
| 66 |
|
| 67 |
|
| 68 |
/**
 * Menu callback for 'oai_pmh/reset': rewinds every repository and deletes all
 * harvested nodes.
 *
 * For each repository named in the 'oai_pmh_repos' variable the last-fetch
 * marker is moved back to the stored earliest datestamp, then every node of
 * type 'oai_pmh_record' is deleted.
 */
function oai_pmh_reset() {
  // The list is stored as ",name1,name2": drop the leading comma before
  // splitting, and treat an empty value as "no repositories".
  $repos_str = variable_get('oai_pmh_repos', '');
  $repos = ($repos_str == '') ? array() : explode(',', substr($repos_str, 1));

  foreach ($repos as $repo) {
    $earliest = variable_get('oai_pmh_'. $repo .'_earlist_datestamp', FALSE);

    if ($earliest === FALSE) {
      // Storing FALSE would shadow the fallback default that oai_pmh_main()
      // passes to variable_get(), leaving the harvester without a start date
      // even after the earliest datestamp becomes known. Remove the marker
      // instead.
      variable_del('oai_pmh_'. $repo .'_last_fetch');
    }
    else {
      variable_set('oai_pmh_'. $repo .'_last_fetch', $earliest);
    }

    // NOTE(review): this overwrites the vocabulary id recorded when the
    // repository was added with a hard-coded 5 -- looks like a development
    // leftover; confirm before relying on it.
    variable_set('oai_pmh_'. $repo .'_taxo', 5);
  }

  $result = db_query("SELECT n.nid FROM {node} n WHERE type = 'oai_pmh_record';");

  while ($n = db_fetch_array($result)) {
    node_delete($n['nid']);
  }
}
|
| 91 |
|
| 92 |
/**
 * Form builder for the OAI-PMH settings page.
 *
 * Shows, for every registered repository, its identifier, its URL and the
 * checkboxes selecting which sets to harvest. If a new repository name was
 * submitted on the previous save (variable 'oai_pmh_new'), it is registered
 * here: a vocabulary is created for it, its id is stored, and the name is
 * appended to 'oai_pmh_repos'.
 *
 * @return
 *   The form array wrapped by system_settings_form().
 */
function oai_pmh_admin() {
  $form = array();

  // The list is stored as ",name1,name2": drop the leading comma.
  $repos_str = variable_get('oai_pmh_repos', '');
  $repos = ($repos_str == '') ? array() : explode(',', substr($repos_str, 1));

  foreach ($repos as $repo) {
    $form['oai_pmh_'. $repo .'_name'] = array(
      '#type' => 'item',
      '#title' => t('Short Internal Identifier'),
      // The identifier is stored data, not interface text, so it is escaped
      // rather than passed through t().
      '#value' => check_plain($repo),
    );

    $form['oai_pmh_'. $repo .'_repo'] = array(
      '#type' => 'textfield',
      '#title' => t('URL of the OAI implementation'),
      '#default_value' => variable_get('oai_pmh_'. $repo .'_repo', ''),
      '#size' => 64,
      '#maxlength' => 128,
      '#description' => t("This is the repository."),
      '#required' => TRUE,
    );

    // Label each option "<set name> - <set spec>".
    $sets = variable_get('oai_pmh_'. $repo .'_sets', array());
    foreach ($sets as $k => $v) {
      $sets[$k] = $v .' - '. $k;
    }

    $form['oai_pmh_'. $repo .'_enabled_sets'] = array(
      '#type' => 'checkboxes',
      '#title' => t('Sets'),
      '#default_value' => variable_get('oai_pmh_'. $repo .'_enabled_sets', array()),
      '#options' => $sets,
      '#description' => t('Determine which sets are harvested via OAI-PMH'),
    );
  }

  $new = variable_get('oai_pmh_new', '');

  if ($new != '') {
    // Escape spaces, other chars to underscores
    $new = preg_replace('/[^a-z0-9_-]+/i', '_', trim($new));

    // Clear the pending request before doing anything that can fail.
    // Previously a failed vocabulary creation aborted the request first, so
    // the same failure repeated on every later visit to this page.
    variable_set('oai_pmh_new', '');

    if ($new == '') {
      // Nothing but whitespace was submitted; do not register an empty name.
    }
    elseif (in_array($new, $repos, TRUE)) {
      drupal_set_message(t('A repository named %name already exists.', array('%name' => $new)), 'warning');
    }
    else {
      // Create new taxonomy, set the type, and return the vid
      $vid = _oai_pmh_new_taxo($new);

      if ($vid === NULL) {
        drupal_set_message(t('Could not create a vocabulary for repository %name.', array('%name' => $new)), 'error');
      }
      else {
        $form['oai_pmh_'. $new .'_name'] = array(
          '#type' => 'item',
          '#title' => t('Short Identifier'),
          '#value' => $new,
          '#description' => t("This is a short name of the repository."),
        );

        $form['oai_pmh_'. $new .'_repo'] = array(
          '#type' => 'textfield',
          '#title' => t('URL of the OAI implementation'),
          '#default_value' => variable_get('oai_pmh_'. $new .'_repo', ''),
          '#size' => 64,
          '#maxlength' => 128,
          '#description' => t("This is the repository."),
          '#required' => TRUE,
        );

        variable_set('oai_pmh_'. $new .'_taxo', $vid);
        variable_set('oai_pmh_repos', $repos_str .','. $new);
      }
    }
  }

  $form['oai_pmh_new'] = array(
    '#type' => 'textfield',
    '#title' => t('Add a new Repository'),
    '#default_value' => '',
    '#size' => 32,
    '#maxlength' => 164,
    '#description' => t("This is a short name of the new repository. Please only use alphanumeric, no spaces."),
  );

  return system_settings_form($form);
}
|
| 184 |
|
| 185 |
// This should be run during install, and have an option to run again
|
| 186 |
/**
 * Sends the OAI-PMH "Identify" request and stores what the repository says.
 *
 * For each repository this records the deleted-record policy, the earliest
 * datestamp, the datestamp granularity ('days' or 'seconds'), the usable
 * compression encodings, and finally the list of sets. Any missing or
 * unsupported mandatory element aborts the request with die().
 *
 * @param $repo
 *   Short name of one repository, or '' to process every repository listed
 *   in the 'oai_pmh_repos' variable.
 */
function oai_pmh_get_identify($repo='') {
  if ($repo == '') {
    // The list is stored as ",name1,name2". An empty value must produce an
    // empty list; splitting it used to yield one repository named '' and the
    // request then died on a URL of just "?verb=Identify".
    $repos_str = variable_get('oai_pmh_repos', '');
    $repos = ($repos_str == '') ? array() : explode(',', substr($repos_str, 1));
  }
  else {
    $repos = array($repo);
  }

  foreach ($repos as $repo) {
    $url = variable_get('oai_pmh_'. $repo .'_repo', '') .'?verb=Identify';

    $xml = file_get_contents($url);

    // NOTE(review): utf8_encode() converts ISO-8859-1 to UTF-8; if the
    // repository already answers in UTF-8 this would double-encode non-ASCII
    // characters. Kept as-is -- confirm against a real response.
    $xml = utf8_encode($xml);

    // Returns FALSE on error
    $s = simplexml_load_string($xml);

    if (!$s) {
      die('SimpleXML load string error: '. $xml);
    }

    $ident = $s->Identify;

    // Mandatory elements: abort if any is missing.
    // Protocol version -- only 2.0 is accepted.
    if (!isset($ident->protocolVersion)) {
      die('Incorrect Identify Response from Repository -- No protocolVersion');
    }
    if ((string)$ident->protocolVersion != '2.0') {
      die('Incorrect Identify Response from Repository -- Unsupport Protcool Version');
    }

    // Deleted-record policy
    if (!isset($ident->deletedRecord)) {
      die('Incorrect Identify Response from Repository -- No deleteRecord');
    }
    variable_set('oai_pmh_'. $repo .'_deleted_record', (string)$ident->deletedRecord);

    // Earliest datestamp
    if (!isset($ident->earliestDatestamp)) {
      die('Incorrect Identify Response from Repository -- No Earlist Datestamp');
    }
    variable_set('oai_pmh_'. $repo .'_earlist_datestamp', (string)$ident->earliestDatestamp);

    // Granularity, recognised by the length of the advertised pattern:
    // strlen('YYYY-MM-DD') == 10, strlen('YYYY-MM-DDThh:mm:ssZ') == 20.
    if (!isset($ident->granularity)) {
      die('Incorrect Identify Response from Repository -- No Granularity');
    }
    $granularity_length = strlen((string)$ident->granularity);
    if ($granularity_length == 10) {
      variable_set('oai_pmh_'. $repo .'_granularity', 'days');
    }
    else if ($granularity_length == 20) {
      variable_set('oai_pmh_'. $repo .'_granularity', 'seconds');
    }
    else {
      die('Incorrect Identify Response from Repository -- Unknown Granularity');
    }

    // Optional: compression. Only gzip and deflate are recognised; other
    // advertised encodings are ignored.
    $gzip = FALSE;
    $deflate = FALSE;
    if (isset($ident->compression)) {
      foreach ($ident->compression as $encoding) {
        if ((string)$encoding == 'gzip') {
          $gzip = TRUE;
        }
        elseif ((string)$encoding == 'deflate') {
          $deflate = TRUE;
        }
      }
    }
    // Write all three flags on every run so that an encoding the repository
    // has stopped advertising does not stay enabled, and so that
    // "compression" is only on when a recognised encoding is available.
    variable_set('oai_pmh_'. $repo .'_compression', ($gzip || $deflate));
    variable_set('oai_pmh_'. $repo .'_compression_gzip', $gzip);
    variable_set('oai_pmh_'. $repo .'_compression_deflate', $deflate);

    // Get possible sets from our repo
    variable_set('oai_pmh_'. $repo .'_sets', oai_pmh_get_sets($repo));
  }
}
|
| 286 |
|
| 287 |
/**
 * Fetches one OAI-PMH response from a repository.
 *
 * @param $request
 *   Query string to append to the repository URL, e.g. '?verb=ListSets'.
 * @param $repo
 *   Short name of the repository; its base URL and compression flags are read
 *   from the 'oai_pmh_<repo>_*' variables.
 *
 * @return
 *   The response body as a string. Without compression this is whatever
 *   file_get_contents() returns (FALSE on failure). A cURL error aborts the
 *   request with die().
 */
function oai_pmh_get_repo($request, $repo) {
  $url = variable_get('oai_pmh_'. $repo .'_repo', '') . $request;

  // Pick the content encoding to ask for; gzip wins over deflate.
  $encoding = '';
  if (variable_get('oai_pmh_'. $repo .'_compression', FALSE)) {
    if (variable_get('oai_pmh_'. $repo .'_compression_gzip', FALSE)) {
      $encoding = 'gzip';
    }
    elseif (variable_get('oai_pmh_'. $repo .'_compression_deflate', FALSE)) {
      $encoding = 'deflate';
    }
  }

  // No usable compression: plain fetch. This also covers "compression is on
  // but neither encoding is flagged", which previously passed an undefined
  // header list to cURL and returned an empty string.
  if ($encoding == '') {
    return file_get_contents($url);
  }

  $curl = curl_init($url);
  curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
  // CURLOPT_ENCODING makes cURL send the Accept-Encoding header and decode
  // the body itself. The previous hand-rolled decoding stripped a fixed 10
  // byte gzip header and failed whenever the server answered uncompressed or
  // used optional gzip header fields.
  curl_setopt($curl, CURLOPT_ENCODING, $encoding);

  $data = curl_exec($curl);

  if (curl_errno($curl)) {
    $message = curl_error($curl);
    curl_close($curl);
    die($message);
  }

  curl_close($curl);

  return $data;
}
|
| 344 |
|
| 345 |
/**
 * Asks a repository for its sets via the OAI-PMH "ListSets" verb.
 *
 * NOTE(review): a resumptionToken in the ListSets response is not followed,
 * so only the first page of a paginated set list is returned.
 *
 * @param $repo
 *   Short name of the repository.
 *
 * @return
 *   Array of set names keyed by set spec; empty when the repository reports
 *   that it has no set hierarchy. Unparsable XML or any other error response
 *   aborts the request with die().
 */
function oai_pmh_get_sets($repo) {
  $request = '?verb=ListSets';
  $xml = oai_pmh_get_repo($request, $repo);

  $s = simplexml_load_string($xml);

  if (!$s) {
    die('SimpleXML load string error: '.$xml);
  }

  if (isset($s->error)) {
    // "noSetHierarchy" only means the repository does not organise records
    // into sets. That is not a failure: harvesting without sets is supported,
    // so report an empty list instead of aborting.
    if ('noSetHierarchy' == (string)$s->error['code']) {
      return array();
    }

    die('Error from server, code -- '. $s->error['code'] .
      ' -- value -- '. (string)$s->error);
  }

  $return = array();

  foreach ($s->ListSets->set as $set) {
    $return[(string)$set->setSpec] = (string)$set->setName;
  }

  return $return;
}
|
| 367 |
|
| 368 |
/**
 * Harvests all new records of one repository.
 *
 * Records are requested from the last fetch date (or, failing that, the
 * repository's earliest datestamp) onwards, once per enabled set, or once for
 * the whole repository when no sets are configured. Afterwards the time at
 * which harvesting started is stored as the new last fetch date.
 *
 * @param $repo
 *   Short name of the repository. Required; an empty value aborts with die().
 *
 * @return
 *   Array with a single key 'store' holding the de-duplicated node objects
 *   built by oai_pmh_do_work().
 */
function oai_pmh_main($repo='') {
  if ($repo == '') {
    die('No repo passed into oai_pmh_main()');
  }

  // Take the timestamp before fetching so that a record created while the
  // harvest is running is picked up by the next run rather than lost.
  // gmdate() is used because the 'Z' suffix denotes UTC, and minutes are 'i'
  // ('m' is the month, which the previous format used by mistake).
  $granularity = variable_get('oai_pmh_'. $repo .'_granularity', 'days');
  if ('seconds' == $granularity) {
    $now = gmdate('Y-m-d\TH:i:s\Z');
  }
  else if ('days' == $granularity) {
    $now = gmdate('Y-m-d');
  }
  else {
    die('Unsuported granularity returned');
  }

  // Start from the last fetch date; fall back to the earliest datestamp.
  // (To reimport everything, read only the earliest datestamp here.)
  $from = variable_get('oai_pmh_'. $repo .'_last_fetch',
    variable_get('oai_pmh_'. $repo .'_earlist_datestamp', FALSE));

  if (!$from) {
    die('Unable to get a date to fetch from');
  }

  // Must be an array up front: array_merge() needs array arguments.
  $record_store = array('store' => array());

  $enabled_sets = variable_get('oai_pmh_'. $repo .'_enabled_sets', array());

  if (count($enabled_sets) > 0) {
    foreach ($enabled_sets as $k) {
      // Entries that are not strings are skipped.
      // NOTE(review): presumably those are the unchecked checkboxes saved by
      // the settings form -- confirm.
      if (is_string($k)) {
        // Build request for first pass. Clean date to make url safe
        $request = '?verb=ListRecords&from='. _oai_pmh_clean_url($from) .
          '&set='. $k .'&metadataPrefix=oai_dc';
        $merge = oai_pmh_do_work($repo, $request, $k);
        if (is_array($merge)) {
          $record_store['store'] = array_merge($record_store['store'], $merge);
        }
      }
    }
  }
  else {
    $request = '?verb=ListRecords&from='. _oai_pmh_clean_url($from) .
      '&metadataPrefix=oai_dc';
    // No set is involved, so no tag is passed. The record store array used to
    // be passed in the $tag position, where it was later used as an array
    // index.
    $merge = oai_pmh_do_work($repo, $request);
    if (is_array($merge)) {
      $record_store['store'] = array_merge($record_store['store'], $merge);
    }
  }

  // Store the pre-fetch time as our last fetched time for next iteration
  variable_set('oai_pmh_'. $repo .'_last_fetch', $now);

  $record_store['store'] = array_array_unique($record_store['store']);

  return $record_store;
}
|
| 440 |
|
| 441 |
|
| 442 |
/**
 * Runs one ListRecords harvest, following resumption tokens to the end.
 *
 * Each returned record is turned into a node-like object of type
 * 'oai_pmh_record' carrying title, body and the Dublin Core identifier, date,
 * creator, language and type fields. Subjects (and the set name, when a set
 * is being harvested) become free-tagging terms in the repository's
 * vocabulary.
 *
 * @param $repo
 *   Short name of the repository.
 * @param $request
 *   Initial query string, e.g. '?verb=ListRecords&from=...'.
 * @param $tag
 *   Set spec being harvested, or '' when harvesting without a set. Anything
 *   that is not a string is treated as ''.
 *
 * @return
 *   Array of record objects; empty when the repository reports no matching
 *   records. Unparsable XML or an unexpected error response aborts the
 *   request with die().
 */
function oai_pmh_do_work($repo, $request, $tag = '') {
  // The tag is used as an array index below, so only strings are usable.
  if (!is_string($tag)) {
    $tag = '';
  }

  // Oneshot var to allow a single bad resumption token
  $bad_resumpt_token = FALSE;

  // Temp storage of results, will be merged with big array
  $rs = array();

  // Save this in case of error later.
  $orig_request = $request;

  // Set spec => set name map, only needed when a set is being harvested.
  $sets = array();
  if ($tag != '') {
    $sets = variable_get('oai_pmh_'. $repo .'_sets', array());
    if (!is_array($sets)) {
      $sets = array();
    }
  }

  // Vocabulary id receiving the tags; '' when none is configured.
  $tax = variable_get('oai_pmh_'. $repo .'_taxo', '');

  do {
    $continue = FALSE;

    $xml = oai_pmh_get_repo($request, $repo);

    $s = simplexml_load_string($xml);

    if (!$s) {
      die('SimpleXML load string error: '. $xml);
    }

    if (isset($s->error)) {
      $code = (string)$s->error['code'];

      if ('badResumptionToken' == $code && !$bad_resumpt_token) {
        // First bad token: start over from the original request and throw
        // away what was collected so far to avoid duplicates. The store is
        // reset to an empty array (not unset) so that the function still
        // returns an array if the retry yields nothing.
        $bad_resumpt_token = TRUE;
        $continue = TRUE;
        $rs = array();
        $request = $orig_request;
      }
      // Anything else is fatal, including a second bad token. noRecordsMatch
      // is not a real error, it just means there are no new records.
      else if ('noRecordsMatch' != $code) {
        die('Error from server, code -- '. $s->error['code'] .
          ' -- value -- '. (string)$s->error);
      }
    }
    else {
      foreach ($s->ListRecords->record as $item) {
        // A record without <metadata> has nothing to import.
        // NOTE(review): presumably these are deleted-record notices; deletion
        // is not handled anywhere yet -- confirm.
        if (!isset($item->metadata)) {
          continue;
        }

        $dc_node = $item->metadata->children('http://www.openarchives.org/OAI/2.0/oai_dc/')->children('http://purl.org/dc/elements/1.1/');

        $values = new stdClass();

        $values->type = 'oai_pmh_record';
        $values->title = (string)$dc_node->title;
        $values->body = (string)$dc_node->description;
        $values->status = 1;
        $values->name = 'admin';

        $values->field_record_identifier[0]['value'] =
          (string)$item->header->identifier;
        $values->field_record_date[0]['value'] = (string)$dc_node->date[0];
        $values->field_record_creator[0]['value'] =
          (string)$dc_node->creator;
        $values->field_record_language[0]['value'] =
          (string)$dc_node->language;
        $values->field_record_type[0]['value'] = (string)$dc_node->type;

        // Collect the free-tagging terms: every subject, then the set name.
        $terms = array();
        if (isset($dc_node->subject)) {
          foreach ($dc_node->subject as $sub) {
            $terms[] = (string)$sub;
          }
        }

        if ($tag != '') {
          // Set membership does not depend on the record having subjects, so
          // it is recorded for every record harvested through a set.
          $values->field_record_set[0]['value'] = $tag;
          if (isset($sets[$tag])) {
            $terms[] = $sets[$tag];
          }
        }

        if ($tax != '' && count($terms) > 0) {
          $values->taxonomy['tags'][$tax] = implode(',', $terms);
        }

        $rs[] = $values;
      }

      // A non-empty resumption token means more pages follow; the last page
      // carries an empty, self-closing token.
      if (isset($s->ListRecords->resumptionToken) &&
        '' != (string)$s->ListRecords->resumptionToken) {
        $continue = TRUE;
        $request = '?verb=ListRecords&resumptionToken='.
          _oai_pmh_clean_url((string)$s->ListRecords->resumptionToken);
      }
    }
  }
  while ($continue);

  return $rs;
}
|
| 555 |
|
| 556 |
# Eventually deletion stuff will go here also
|
| 557 |
/**
 * Saves every harvested record as a node, echoing progress as it goes.
 *
 * Output is the total count ("### <count>") followed by one "#<index>" line
 * per record, flushed after each save.
 *
 * @param $arr
 *   Array whose 'store' entry is the list of record objects to save.
 */
function oai_pmh_insert($arr) {
  $records = $arr['store'];

  echo "### ". count($records) ."\n";

  $position = 0;
  foreach ($records as $record) {
    echo "#". $position ."\n";
    $position++;

    // Prepare the object for saving, then write it.
    node_save(node_submit($record));

    ob_flush();
  }
}
|
| 567 |
|
| 568 |
/**
 * Percent-encodes a value so it can be used inside a query string.
 *
 * The previous hand-written version tried to escape '%' first but searched
 * for the literal text '/%/', so percent signs were never escaped and a value
 * such as '100%' produced a malformed URL. rawurlencode() covers every
 * character the old replacement table handled (including '%') and encodes
 * spaces as %20.
 *
 * @param $str
 *   Raw value, e.g. a datestamp or a resumption token.
 *
 * @return
 *   The percent-encoded string.
 */
function _oai_pmh_clean_url($str) {
  return rawurlencode((string)$str);
}
|
| 578 |
|
| 579 |
/**
 * Creates the free-tagging vocabulary for a newly added repository.
 *
 * The vocabulary is named "OAI PMH <name>" and attached to the
 * 'oai_pmh_record' node type. After saving, the vocabularies of that node
 * type are searched by name to find its id.
 *
 * @param $name
 *   Short name of the repository.
 *
 * @return
 *   The vid of the first vocabulary with the expected name, or NULL when none
 *   is found.
 */
function _oai_pmh_new_taxo($name) {
  $vocabulary_name = 'OAI PMH '. $name;

  $vocabulary = array(
    'name' => $vocabulary_name,
    'description' => 'Taxonomy for the '. $name .' Repository',
    'help' => 'Auto-generated Taxonomy from OAI PMH Module',
    'relations' => 0,
    'hierarchy' => 0,
    'multiple' => 0,
    'tags' => 1,
    'weight' => 0,
    // Keyed by node type; the value looks backwards but is what the original
    // author found the taxonomy code to expect.
    'nodes' => array(
      'oai_pmh_record' => 'type',
    ),
  );

  taxonomy_save_vocabulary($vocabulary);

  // Look the vocabulary up again by name to learn its id.
  foreach (taxonomy_get_vocabularies('oai_pmh_record') as $candidate) {
    if ($candidate->name == $vocabulary_name) {
      return $candidate->vid;
    }
  }

  return NULL;
}
|
| 608 |
|
| 609 |
/**
 * Harvests every repository that is due; also the 'oai_pmh/harvest' callback.
 *
 * Refreshes the Identify data, then for each repository checks whether the
 * configured interval has elapsed since the last fetch and, if so, harvests
 * and saves the new records. Progress is printed inside a <pre> element.
 */
function oai_pmh_cron() {
  print "<pre>";
  set_time_limit(0);

  // The list is stored as ",name1,name2"; an empty value means there is
  // nothing to do (splitting it would yield one repository named '').
  $repos_str = variable_get('oai_pmh_repos', '');
  $repos = ($repos_str == '') ? array() : explode(',', substr($repos_str, 1));

  if (count($repos) > 0) {
    oai_pmh_get_identify();
  }

  foreach ($repos as $repo) {
    $last = variable_get('oai_pmh_'. $repo .'_last_fetch', NULL);
    $gran = variable_get('oai_pmh_'. $repo .'_granularity', NULL);

    // Time between fetches in seconds: 6 hours for second granularity,
    // 24 hours otherwise (the day-granularity value, so the interval is
    // always defined).
    $default_interval = ($gran == 'seconds') ? 21600 : 86400;
    $interval = variable_get('oai_pmh_'. $repo .'_interval', $default_interval);

    // Not due yet: skip this repository only. (This used to be a "break",
    // which also skipped every repository after it.)
    if ($last != NULL && strtotime($last) >= (time() - $interval)) {
      continue;
    }

    $ret = oai_pmh_main($repo);
    flush();
    // ob_flush() raises a notice when no output buffer is active.
    if (ob_get_level() > 0) {
      ob_flush();
    }
    oai_pmh_insert($ret);
  }

  print "</pre>";
}
|
| 639 |
|
| 640 |
// From php.net. Allows us to get a unique array when elements are arrays.
|
| 641 |
// Stay classy php.
|
| 642 |
/**
 * array_unique() for arrays whose elements are themselves arrays or objects.
 *
 * Every element is serialized so that array_unique() can compare the
 * elements as strings, then the survivors are unserialized again. Keys of
 * the retained elements are preserved.
 *
 * @param $my_array
 *   The array to de-duplicate. Any non-array value is returned untouched.
 *
 * @return
 *   The array without duplicate elements.
 */
function array_array_unique($my_array) {
  if (!is_array($my_array)) {
    return $my_array;
  }

  // array_map() with a single array keeps the original keys.
  $serialized = array_map('serialize', $my_array);
  $distinct = array_unique($serialized);

  return array_map('unserialize', $distinct);
}
|
| 659 |
|