| 1 |
<?php
|
| 2 |
/* $Id: parser_common_syndication.module,v 1.5 2007/07/23 15:40:20 alexb Exp $ */
|
| 3 |
|
| 4 |
/**
|
| 5 |
* @file
|
| 6 |
* Parse the incoming URL with SimpleXML then provide a data structure of the feed
|
| 7 |
 * Requires PHP 5 because it relies on SimpleXML.
|
| 8 |
*/
|
| 9 |
|
| 10 |
/**
 * Implementation of hook_help().
 *
 * @param $section
 *   The help section being requested.
 * @return
 *   The translated module description when the modules administration page
 *   asks for it; nothing for any other section.
 */
function parser_common_syndication_help($section) {
  // Only the module description is provided; every other section falls
  // through without a return value.
  if ($section == 'admin/modules#description') {
    return t('Provide a common syndication parser for FeedAPI-compatible modules');
  }
}
|
| 20 |
|
| 21 |
/**
 * Implementation of hook_feedapi_compatible().
 *
 * Downloads the feed and checks whether its format is one this parser
 * recognises.
 *
 * @param $url
 *   The feed's url
 * @return
 *   a string - feed type if the parser is able to process it, FALSE if it's
 *   not compatible (SimpleXML missing, download failed or unknown format)
 */
function parser_common_syndication_feedapi_compatible($url) {
  if (!function_exists('simplexml_load_string')) {
    return FALSE;
  }

  $downloaded_string = _parser_common_syndication_download($url);
  // Nothing was downloaded, so there is nothing to inspect.
  if ($downloaded_string === FALSE) {
    return FALSE;
  }

  // The libxml option flags are only passed when the LIBXML constants exist
  // and PHP is at least 5.1.0; otherwise rely on @ alone to keep parse
  // warnings quiet.
  if (!defined('LIBXML_VERSION') || (version_compare(phpversion(), '5.1.0', '<'))) {
    $xml = @simplexml_load_string($downloaded_string, NULL);
  }
  else {
    $xml = @simplexml_load_string($downloaded_string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING);
  }

  if (_parser_common_syndication_feed_format_detect($xml) != FALSE) {
    // We don't have to choose between the types, because this module is only
    // able to parse one. array_shift() takes its argument by reference, so
    // the list has to live in a variable first.
    $types = parser_common_syndication_feedapi_type();
    return array_shift($types);
  }
  return FALSE;
}
|
| 46 |
|
| 47 |
/**
 * Implementation of hook_feedapi_parse().
 *
 * @param $feed
 *   The feed object; its url property tells where to download the feed from
 * @return stdClass
 *   The structured data extracted from the feed, or FALSE when the feed
 *   cannot be downloaded, is not well-formed XML or has an unknown format
 */
function parser_common_syndication_feedapi_parse($feed) {
  $raw_feed = _parser_common_syndication_download($feed->url);
  if ($raw_feed == FALSE) {
    return FALSE;
  }

  // Pass the libxml silencing flags only where they are available.
  $libxml_flags_usable = defined('LIBXML_VERSION') && !version_compare(phpversion(), '5.1.0', '<');
  if ($libxml_flags_usable) {
    @ $xml = simplexml_load_string($raw_feed, NULL, LIBXML_NOERROR | LIBXML_NOWARNING);
  }
  else {
    @ $xml = simplexml_load_string($raw_feed, NULL);
  }

  // Malformed XML gives us nothing to work with.
  if ($xml === FALSE || $xml == NULL) {
    return FALSE;
  }

  // Hand the document over to the parser that matches the detected format.
  switch (_parser_common_syndication_feed_format_detect($xml)) {
    case "atom1.0":
      return _parser_common_syndication_atom10_parse($xml);

    case "RSS2.0":
    case "RSS0.91":
    case "RSS0.92":
      return _parser_common_syndication_RSS20_parse($xml);

    case "RDF":
      return _parser_common_syndication_RDF10_parse($xml);
  }
  return FALSE;
}
|
| 85 |
|
| 86 |
/**
 * Implementation of hook_feedapi_type().
 *
 * Tells which feed types this module can handle.
 *
 * @return
 *   An array holding the name of each supported type
 */
function parser_common_syndication_feedapi_type() {
  $supported_types = array();
  $supported_types[] = "XML feed";
  return $supported_types;
}
|
| 96 |
|
| 97 |
/**
 * Work out which syndication format a SimpleXML document is in.
 *
 * @param $xml
 *   SimpleXML-preprocessed feed
 * @return
 *   a string naming the format ("atom1.0", "RSS2.0", "RSS0.91", "RSS0.92" or
 *   "RDF"), FALSE if $xml is not an object or the format is not recognised
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }

  $root_name = strtolower($xml->getName());
  $root_attributes = $xml->attributes();

  // Atom: a <feed> root that carries at least one <entry>.
  if ($root_name == "feed") {
    return isset($xml->entry) ? "atom1.0" : FALSE;
  }

  // RSS 1.0 / RDF: an <RDF> root that carries a <channel>.
  if ($root_name == "rdf") {
    return isset($xml->channel) ? "RDF" : FALSE;
  }

  // Plain RSS: the version attribute on the root decides the flavour.
  if ($root_name == "rss") {
    $known_versions = array(
      "2.0" => "RSS2.0",
      "0.91" => "RSS0.91",
      "0.92" => "RSS0.92",
    );
    foreach ($known_versions as $version => $format) {
      if ($root_attributes["version"] == $version) {
        return $format;
      }
    }
  }

  return FALSE;
}
|
| 128 |
|
| 129 |
/**
 * Call one of the possible feedapi_get hooks and pass back the downloaded data
 *
 * The first module implementing hook_feedapi_get() whose hook_feedapi_type()
 * result shares a type with this parser is asked to fetch the URL.
 *
 * @param $url
 *   The feed's url
 * @return
 *   string - the downloaded data, FALSE - if no suitable downloader exists or
 *   the download produced nothing
 */
function _parser_common_syndication_download($url) {
  $downloaded_string = "";
  $this_types = parser_common_syndication_feedapi_type();

  // Pick one module that is able to download this
  foreach (module_implements("feedapi_get") as $concrete_module) {
    $types = module_invoke($concrete_module, "feedapi_type");
    // A downloader that does not report its types as an array cannot be
    // matched; skip it instead of feeding a non-array to array_intersect().
    if (!is_array($types)) {
      continue;
    }
    // If the downloader can get this type of content
    if (count(array_intersect($this_types, $types)) > 0) {
      $downloaded_string = module_invoke($concrete_module, "feedapi_get", $url);
      break;
    }
  }

  // Cannot get the feed, pass the problem to one level upper. Anything that
  // is not a non-empty string counts as a failed download.
  if (!is_string($downloaded_string) || $downloaded_string == "") {
    return FALSE;
  }
  return $downloaded_string;
}
|
| 156 |
|
| 157 |
/**
 * Collect the markup and text carried by an Atom text element.
 *
 * @param $element
 *   SimpleXML element such as an entry's content or summary
 * @return
 *   string - the XML of every child element followed by the element's own text
 */
function _parser_common_syndication_atom10_text($element) {
  $text = '';
  foreach ($element->children() as $child) {
    $text .= $child->asXML();
  }
  return $text . (string) $element;
}

/**
 * Pick the most useful link of an Atom feed or entry.
 *
 * A link whose rel is "alternate" (or that has no rel at all, which means the
 * same thing in Atom) is preferred; otherwise the first link that has an href
 * is used.
 *
 * @param $element
 *   SimpleXML feed or entry element
 * @return
 *   string - the chosen href, "" when the element has no usable link
 */
function _parser_common_syndication_atom10_link($element) {
  $fallback = '';
  foreach ($element->link as $link) {
    $href = isset($link['href']) ? (string) $link['href'] : '';
    if ($href == '') {
      continue;
    }
    $rel = isset($link['rel']) ? strtolower((string) $link['rel']) : 'alternate';
    if ($rel == 'alternate') {
      return $href;
    }
    if ($fallback == '') {
      $fallback = $href;
    }
  }
  return $fallback;
}

/**
 * Parse atom feeds
 *
 * @param $feed_XML
 *   SimpleXML root element of the Atom feed
 * @return stdClass
 *   The feed's title, description, options->link and an items array. Each
 *   item has title, description and options (teaser, original_author,
 *   timestamp, original_url, guid, tags).
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->title) ? (string) $feed_XML->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->subtitle) ? (string) $feed_XML->subtitle : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = _parser_common_syndication_atom10_link($feed_XML);
  $parsed_source->items = array();

  // The feed-level author is used for entries that name no author themselves.
  $feed_author = isset($feed_XML->author->name) ? (string) $feed_XML->author->name : '';

  foreach ($feed_XML->entry as $news) {
    // Every per-entry value is assigned afresh below so that nothing carries
    // over from the previous entry.
    $guid = isset($news->id) ? (string) $news->id : NULL;

    // I don't know how standard this is, but sometimes the id is the URL
    $original_url = NULL;
    if ($guid !== NULL && valid_url($guid, TRUE)) {
      $original_url = $guid;
    }

    $tags = array();
    foreach ($news->category as $category) {
      $tags[] = (string) $category['term'];
    }

    $title = (string) $news->title;

    // The body comes from content, falling back to summary; the teaser is the
    // summary when there is one, otherwise it is cut from the body.
    $summary = isset($news->summary) ? _parser_common_syndication_atom10_text($news->summary) : NULL;
    if (isset($news->content)) {
      $body = _parser_common_syndication_atom10_text($news->content);
    }
    elseif ($summary !== NULL) {
      $body = $summary;
    }
    else {
      $body = '';
    }
    $teaser = ($summary !== NULL) ? $summary : node_teaser($body);

    // some src elements in some valid atom feeds contained no urls at all
    if (isset($news->content) && isset($news->content['src'])) {
      $content_src = (string) $news->content['src'];
      if (valid_url($content_src, TRUE)) {
        $original_url = $content_src;
      }
    }

    if (isset($news->source->author->name)) {
      $original_author = (string) $news->source->author->name;
    }
    elseif (isset($news->author->name)) {
      $original_author = (string) $news->author->name;
    }
    else {
      $original_author = $feed_author;
    }

    // A usable link element wins over the id and content src guesses.
    $entry_link = _parser_common_syndication_atom10_link($news);
    if ($entry_link != '' && valid_url($entry_link, TRUE)) {
      $original_url = $entry_link;
    }

    // Prefer the publication date, then the last-updated date, and only fall
    // back to "now" when neither can be parsed.
    $timestamp = FALSE;
    foreach (array('published', 'updated') as $date_element) {
      if (isset($news->$date_element)) {
        $timestamp = strtotime((string) $news->$date_element);
        if ($timestamp !== FALSE) {
          break;
        }
      }
    }
    if ($timestamp === FALSE) {
      $timestamp = time();
    }

    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    $item->options->tags = $tags;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}
|
| 274 |
|
| 275 |
/**
 * Read one child value of an RDF item as a string.
 *
 * @param $values
 *   Array obtained by casting a SimpleXML element to an array
 * @param $key
 *   Name of the child element
 * @return
 *   string - the value, "" when the child is missing. When the child occurs
 *   several times the first occurrence is used.
 */
function _parser_common_syndication_RDF10_value($values, $key) {
  if (!isset($values[$key])) {
    return '';
  }
  $value = $values[$key];
  if (is_array($value)) {
    $value = reset($value);
  }
  return (string) $value;
}

/**
 * Parse RSS1.0/RDF feeds
 *
 * @param $feed_XML
 *   SimpleXML root element of the RDF feed
 * @return stdClass
 *   The feed's title, description, options->link and an items array. Each
 *   item has title, description and options (teaser, original_author,
 *   timestamp, original_url, guid, tags).
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->channel->description) ? (string) $feed_XML->channel->description : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = isset($feed_XML->channel->link) ? (string) $feed_XML->channel->link : "";
  $parsed_source->items = array();

  // set category splitter (space is for del.icio.us feed)
  $category_splitter = ' ';

  // The channel title is the default original author of every item.
  $default_author = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : '';

  // versions prior 5.1.2 don't allow namespaces, so a single pass over the
  // item as a whole is made there.
  $no_namespace_support = version_compare(phpversion(), '5.1.2', '<');
  if ($no_namespace_support) {
    $namespaces = array('default' => NULL);
  }
  else {
    $namespaces = $feed_XML->getNamespaces(TRUE);
  }

  foreach ($feed_XML->item as $news) {
    // initialization
    $guid = $original_url = NULL;
    $title = $body = '';
    $timestamp = time();
    $tags = array();
    $original_author = $default_author;

    foreach ($namespaces as $ns_link) {
      // get about attribute as guid
      foreach ($news->attributes($ns_link) as $name => $value) {
        if ($name == 'about') {
          $guid = (string) $value;
        }
      }

      // get children for current namespace
      $ns = $no_namespace_support ? (array) $news : (array) $news->children($ns_link);

      // title
      $value = _parser_common_syndication_RDF10_value($ns, 'title');
      if ($value !== '') {
        $title = $value;
      }

      // description or dc:description; only used while no body has been
      // found, so it never replaces content:encoded.
      $value = _parser_common_syndication_RDF10_value($ns, 'description');
      if ($value !== '' && $body == '') {
        $body = $value;
      }

      // link
      $value = _parser_common_syndication_RDF10_value($ns, 'link');
      if ($value !== '') {
        $original_url = $value;
      }

      // dc:creator
      $value = _parser_common_syndication_RDF10_value($ns, 'creator');
      if ($value !== '') {
        $original_author = $value;
      }

      // dc:date; an unparsable date keeps the current-time default
      $value = _parser_common_syndication_RDF10_value($ns, 'date');
      if ($value !== '') {
        $parsed_date = strtotime($value);
        if ($parsed_date !== FALSE) {
          $timestamp = $parsed_date;
        }
      }

      // content:encoded always wins over a plain description
      $value = _parser_common_syndication_RDF10_value($ns, 'encoded');
      if ($value !== '') {
        $body = $value;
      }

      // dc:subject
      if (isset($ns['subject'])) {
        // there can be multiple category tags
        if (is_array($ns['subject'])) {
          foreach ($ns['subject'] as $cat) {
            $tags[] = is_object($cat) ? trim(strip_tags($cat->asXML())) : $cat;
          }
        }
        // or single tag
        elseif ((string) $ns['subject'] !== '') {
          $tags = explode($category_splitter, (string) $ns['subject']);
        }
      }
    }

    // description is not mandatory so use title if description not present
    if (!$body) {
      $body = $title;
    }

    // make teaser
    $teaser = node_teaser($body);

    // if there are no link tag but rdf:about is provided
    if (!$original_url && $guid) {
      $original_url = $guid;
    }

    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    // Categories belong in tags, as in the other parsers of this module.
    $item->options->tags = $tags;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}
|
| 405 |
|
| 406 |
/**
 * Read one child value of an RSS item as a string.
 *
 * @param $values
 *   Array obtained by casting a SimpleXML element to an array
 * @param $key
 *   Name of the child element
 * @return
 *   string - the value, "" when the child is missing. When the child occurs
 *   several times the first occurrence is used.
 */
function _parser_common_syndication_RSS20_value($values, $key) {
  if (!isset($values[$key])) {
    return '';
  }
  $value = $values[$key];
  if (is_array($value)) {
    $value = reset($value);
  }
  return (string) $value;
}

/**
 * Parse RSS2.0 feeds
 *
 * @param $feed_XML
 *   SimpleXML root element of the RSS feed
 * @return stdClass
 *   The feed's title, description, options->link and an items array. Each
 *   item has title, description and options (teaser, original_author,
 *   timestamp, original_url, guid, tags).
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->channel->description) ? (string) $feed_XML->channel->description : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = isset($feed_XML->channel->link) ? (string) $feed_XML->channel->link : "";
  $parsed_source->items = array();

  // The channel title stands in as the author of every item.
  $original_author = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : '';

  $items = $feed_XML->xpath('//item');
  if (!is_array($items)) {
    $items = array();
  }

  foreach ($items as $news) {
    // children() selects by namespace URI, not by prefix, so the content
    // module has to be named by its URI to reach content:encoded.
    $content = (array) $news->children('http://purl.org/rss/1.0/modules/content/');

    $news = (array) $news;

    $guid = _parser_common_syndication_RSS20_value($news, 'guid');
    if ($guid === '') {
      $guid = NULL;
    }

    $title = _parser_common_syndication_RSS20_value($news, 'title');

    $body = _parser_common_syndication_RSS20_value($news, 'description');
    // some sources use content:encoded as description i.e. PostNuke PageSetter module
    if ($body === '') {
      // content:encoded for PHP < 5.1.2
      $body = _parser_common_syndication_RSS20_value($news, 'encoded');
    }
    if ($body === '') {
      // content:encoded for PHP >= 5.1.2
      $body = _parser_common_syndication_RSS20_value($content, 'encoded');
    }
    if ($body === '') {
      $body = $title;
    }

    $teaser = node_teaser($body);

    $original_url = _parser_common_syndication_RSS20_value($news, 'link');
    if ($original_url === '') {
      $original_url = NULL;
    }

    // A missing or unparsable pubDate falls back to the current time.
    $pub_date = _parser_common_syndication_RSS20_value($news, 'pubDate');
    $timestamp = ($pub_date !== '') ? strtotime($pub_date) : FALSE;
    if ($timestamp === FALSE) {
      $timestamp = time();
    }

    // Every category element contributes; each one may itself be a
    // slash-separated path.
    $tags = array();
    if (isset($news['category'])) {
      $categories = is_array($news['category']) ? $news['category'] : array($news['category']);
      foreach ($categories as $category) {
        $category = (string) $category;
        if ($category !== '') {
          $tags = array_merge($tags, explode('/', $category));
        }
      }
    }

    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    $item->options->tags = $tags;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}
|