| 1 |
<?php
|
| 2 |
// $Id$
|
| 3 |
|
| 4 |
/**
|
| 5 |
* @file
|
| 6 |
* Downloading and parsing functions for Common Syndication Parser
|
| 7 |
*/
|
| 8 |
|
| 9 |
/**
 * Turn a feed into the structured object consumed by FeedAPI.
 *
 * @param $feed
 *   Either a feed object exposing a ->url property, or an already loaded
 *   SimpleXMLElement.
 * @return
 *   The object produced by the format-specific parser. When the download
 *   step yields FALSE or an object, that value is handed back untouched.
 *   FALSE is returned for malformed XML or an unrecognised feed format.
 */
function _parser_common_syndication_feedapi_parse($feed) {
  if ($feed instanceof SimpleXMLElement) {
    $document = $feed;
  }
  else {
    $raw = _parser_common_syndication_download($feed->url);
    // Nothing to parse: either the download failed (FALSE) or an object was
    // returned, which is passed one level up as it is.
    if (is_object($raw) || $raw === FALSE) {
      return $raw;
    }

    // The libxml option flags can only be used when libxml is available and
    // PHP is at least 5.1.0.
    $libxml_flags_usable = defined('LIBXML_VERSION') && !version_compare(phpversion(), '5.1.0', '<');
    if ($libxml_flags_usable) {
      @ $document = simplexml_load_string($raw, NULL, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA);
    }
    else {
      @ $document = simplexml_load_string($raw, NULL);
    }

    // Got a malformed XML.
    if (is_null($document) || $document === FALSE) {
      return FALSE;
    }
  }

  // Dispatch to the parser that matches the detected format.
  switch (_parser_common_syndication_feed_format_detect($document)) {
    case "atom1.0":
      return _parser_common_syndication_atom10_parse($document);

    case "RSS2.0":
    case "RSS0.91":
    case "RSS0.92":
      return _parser_common_syndication_RSS20_parse($document);

    case "RDF":
      return _parser_common_syndication_RDF10_parse($document);
  }
  return FALSE;
}
|
| 51 |
|
| 52 |
/**
 * Get the cached version of the <var>$url</var>
 *
 * @param $url
 *   The feed URL; its md5 hash is the name of the cache file.
 * @return
 *   The unserialized content of the cache file, or FALSE when the cache
 *   directory is not usable, the file does not exist or it cannot be read.
 */
function _parser_common_syndication_cache_get($url) {
  $cache_dir = _parser_common_syndication_sanitize_cache();
  // Without a usable cache directory the path would degrade to "/<md5>".
  if ($cache_dir === FALSE) {
    return FALSE;
  }

  $cache_file = $cache_dir .'/'. md5($url);
  if (!file_exists($cache_file)) {
    return FALSE;
  }

  $file_content = file_get_contents($cache_file);
  if ($file_content === FALSE) {
    return FALSE;
  }
  // NOTE(review): unserialize() is only appropriate here because the file is
  // written by _parser_common_syndication_cache_set(); keep the cache
  // directory out of reach of untrusted writers.
  return unserialize($file_content);
}
|
| 63 |
|
| 64 |
/**
 * Store the parsed feed into the cache
 *
 * The data is serialized into a file named after the md5 hash of the URL.
 * Nothing is written when the cache directory is not usable or the file
 * cannot be opened.
 *
 * @param $url
 *   The feed URL the data belongs to.
 * @param $parsed_feed
 *   The value to serialize into the cache file.
 */
function _parser_common_syndication_cache_set($url, $parsed_feed) {
  $cache_dir = _parser_common_syndication_sanitize_cache();
  // Without a usable cache directory the path would degrade to "/<md5>".
  if ($cache_dir === FALSE) {
    return;
  }

  $cache_file = $cache_dir .'/'. md5($url);
  $cache_fp = fopen($cache_file, 'w');
  // fwrite()/fclose() must not be called on a failed handle.
  if ($cache_fp === FALSE) {
    return;
  }
  fwrite($cache_fp, serialize($parsed_feed));
  fclose($cache_fp);
}
|
| 73 |
|
| 74 |
/**
 * Get the content from the given URL.
 *
 * Sends a conditional request (If-None-Match / If-Modified-Since) when
 * validators are stored for the URL, and stores the validators of the
 * response for the next call.
 *
 * @param $url
 *   A valid URL (not only web URLs).
 * @param $username
 *   If the URL use authentication, here you can supply the username for this.
 * @param $password
 *   If the URL use authentication, here you can supply the password for this.
 * @param $accept_invalid_cert
 *   When TRUE and cURL is used, the peer certificate is not verified.
 * @return
 *   The data pulled from the URL, the cached object (with ->from_cache set)
 *   when the server answers 304, or FALSE if no data could be retrieved.
 */
function _parser_common_syndication_feedapi_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE) {
  // Intra-pagedownload cache, avoid to download the same content twice within
  // one page download (it's possible, compatible and parse calls).
  static $download_cache = array();
  if (isset($download_cache[$url])) {
    return $download_cache[$url];
  }
  $has_etag = FALSE;
  $curl = _parser_common_syndication_use_curl();

  // Only download and parse data if really needs refresh.
  // Based on "Last-Modified" and "If-Modified-Since".
  $headers = array();
  $db_result = db_query("SELECT etag, last_modified FROM {parser_common_syndication} WHERE url = '%s'", md5($url));
  while ($validate = db_fetch_array($db_result)) {
    $has_etag = TRUE;
    if (!empty($validate['etag'])) {
      if ($curl) {
        $headers[] = 'If-None-Match: '. $validate['etag'];
      }
      else {
        $headers['If-None-Match'] = $validate['etag'];
      }
    }
    if (!empty($validate['last_modified'])) {
      if ($curl) {
        $headers[] = 'If-Modified-Since: '. $validate['last_modified'];
      }
      else {
        $headers['If-Modified-Since'] = $validate['last_modified'];
      }
    }
  }
  // The credentials must be sent on every request, also on the very first
  // one when no validators are stored yet. cURL gets them via CURLOPT_USERPWD.
  if (!$curl && !empty($username)) {
    $headers['Authorization'] = 'Basic '. base64_encode("$username:$password");
  }

  if ($curl) {
    $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';
    $result = new stdClass();
    $download = curl_init($url);
    curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
    if (!empty($username)) {
      curl_setopt($download, CURLOPT_USERPWD, "{$username}:{$password}");
    }
    curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($download, CURLOPT_HEADER, TRUE);
    curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
    if ($accept_invalid_cert) {
      curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
    }
    $data = curl_exec($download);
    if ($data === FALSE) {
      // Transport failure: behave as if an empty response had been received.
      $data = '';
    }
    $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
    $raw_headers = trim((string) substr($data, 0, $header_size));
    $result->data = substr($data, $header_size);

    // When redirects are followed cURL returns one header block per response;
    // only the last block describes the body that was actually received.
    $header_blocks = preg_split("/(?:\r\n|\n|\r){2,}/", $raw_headers);
    $header_lines = preg_split("/\r\n|\n|\r/", end($header_blocks));

    $result->headers = array();
    array_shift($header_lines); // skip HTTP response status
    foreach ($header_lines as $line) {
      $line = trim($line);
      if ($line === '') {
        break;
      }
      if (strpos($line, ':') === FALSE) {
        // Not a "Name: value" line, nothing to record.
        continue;
      }
      list($name, $value) = explode(':', $line, 2);
      if (isset($result->headers[$name]) && $name == 'Set-Cookie') {
        // RFC 2109: the Set-Cookie response header comprises the token Set-
        // Cookie:, followed by a comma-separated list of one or more cookies.
        $result->headers[$name] .= ','. trim($value);
      }
      else {
        $result->headers[$name] = trim($value);
      }
    }
    $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);

    curl_close($download);
  }
  else {
    $result = drupal_http_request($url, $headers);
  }

  $result->code = isset($result->code) ? $result->code : 200;
  // In this case return the cached data.
  if ($result->code == 304) {
    $cached_data = _parser_common_syndication_cache_get($url);
    if (is_object($cached_data)) {
      $cached_data->from_cache = TRUE;
      return $cached_data;
    }
    else {
      // It's a tragedy, this file has to be exist and contain good data.
      // In this case, repeat the stuff without cache.
      db_query("DELETE FROM {parser_common_syndication} WHERE url = '%s'", md5($url));
      return _parser_common_syndication_feedapi_get($url, $username, $password, $accept_invalid_cert);
    }
  }

  // HTTP header names are case-insensitive, so "Etag" or "last-modified" must
  // be recognised as well.
  $etag = '';
  $last_modified = '';
  if (isset($result->headers) && is_array($result->headers)) {
    foreach ($result->headers as $name => $value) {
      switch (strtolower($name)) {
        case 'etag':
          $etag = $value;
          break;

        case 'last-modified':
          $last_modified = $value;
          break;
      }
    }
  }

  if ($has_etag == TRUE) {
    db_query("UPDATE {parser_common_syndication} SET etag = '%s', last_modified = '%s' WHERE url = '%s'", $etag, $last_modified, md5($url));
  }
  else {
    db_query("INSERT INTO {parser_common_syndication} (etag, last_modified, url) VALUES ('%s', '%s', '%s')", $etag, $last_modified, md5($url));
  }

  // The result object carries no data at all when the request failed.
  $data = isset($result->data) ? $result->data : NULL;
  $download_cache[$url] = $data;
  return empty($data) ? FALSE : $data;
}
|
| 193 |
|
| 194 |
/**
 * Delete cache validating functions when feed is deleted
 *
 * Removes the stored ETag / Last-Modified row and the cache file that belong
 * to the URL of the deleted feed node.
 *
 * @param $node
 *   The node being acted upon; only nodes carrying ->feed->url are handled.
 * @param $op
 *   The operation; only 'delete' triggers any work.
 */
function parser_common_syndication_nodeapi(&$node, $op) {
  if (isset($node->feed) || feedapi_enabled_type($node->type)) {
    switch ($op) {
      case 'delete':
        // A feed-enabled node type does not guarantee an attached feed.
        if (empty($node->feed->url)) {
          break;
        }
        // The url column holds the md5 hash of the URL, not the URL itself.
        db_query("DELETE FROM {parser_common_syndication} WHERE url = '%s'", md5($node->feed->url));
        $cache_dir = _parser_common_syndication_sanitize_cache();
        if ($cache_dir === FALSE) {
          break;
        }
        $cache_filename = $cache_dir .'/'. md5($node->feed->url);
        if (file_exists($cache_filename)) {
          unlink($cache_filename);
        }
        break;
    }
  }
}
|
| 211 |
|
| 212 |
/**
 * Work out which syndication format a SimpleXML document uses.
 *
 * The decision is based on the lower-cased name of the root element, plus
 * the "version" attribute (RSS) or the presence of an entry / channel child
 * (Atom / RDF).
 *
 * @param $xml
 *   SimpleXML-preprocessed feed.
 * @return
 *   One of "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92", "RDF", or FALSE when
 *   the document is not an object or matches none of these formats.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }
  $attributes = $xml->attributes();
  $root = strtolower($xml->getName());

  if ($root == "feed") {
    return isset($xml->entry) ? "atom1.0" : FALSE;
  }
  if ($root == "rdf") {
    return isset($xml->channel) ? "RDF" : FALSE;
  }
  if ($root == "rss") {
    $rss_formats = array(
      "2.0" => "RSS2.0",
      "0.91" => "RSS0.91",
      "0.92" => "RSS0.92",
    );
    foreach ($rss_formats as $version => $format) {
      if ($attributes["version"] == $version) {
        return $format;
      }
    }
  }
  return FALSE;
}
|
| 243 |
|
| 244 |
/**
 * Call one of the possible feedapi_get hook and pass back the downloaded data
 *
 * When the downloaded document is HTML, the first <link> tag with a feed MIME
 * type is followed (autodiscovery) and the linked document is downloaded
 * instead. Script-like tags are stripped from the returned string.
 *
 * @param $url
 *   The URL to download. Credentials embedded in the URL are used for
 *   authentication.
 * @param $settings
 *   Optional array; the 'accept_invalid_cert' key is passed on to the
 *   download function.
 * @return
 *   string - the downloaded data, object - the cached data passed through
 *   from the download function, FALSE - if the URL is not reachable
 */
function _parser_common_syndication_download($url, $settings = NULL) {
  // Defined up front so they also exist for URLs that fail validation.
  $password = $username = NULL;
  if (valid_url($url, TRUE)) {
    // Handle password protected feeds.
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      $username = $url_parts['user'];
      $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
    }
  }

  $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
  $downloaded_string = _parser_common_syndication_feedapi_get($url, $username, $password, $accept_invalid_cert);

  // Cannot get the feed, pass the problem to one level up.
  if ($downloaded_string == FALSE) {
    return FALSE;
  }
  // The data comes from cache, just pass one level up.
  if (is_object($downloaded_string)) {
    return $downloaded_string;
  }

  // Do the autodiscovery at this level, pass back the real data.
  // Maybe it's HTML. If it's not HTML, not worth to take a look into the
  // downloaded string.
  if (strpos(strtolower($downloaded_string), "<html") !== FALSE) {
    $allowed_mime = array("text/xml", "application/rss+xml", "application/atom+xml", "application/rdf+xml", "application/xml");
    $html = $downloaded_string;
    $matches = array();
    // Get all the links tag
    preg_match_all('/<link\s+(.*?)\s*\/?>/si', $html, $matches);
    foreach ($matches[1] as $link) {
      $mime = array();
      // Get the type attribute and check if the mime type is allowed.
      preg_match_all('/type\s*=\s*("|\')([A-Za-z\/+]*)("|\')/si', $link, $mime);
      if (!in_array(array_pop($mime[2]), $allowed_mime)) {
        continue;
      }

      $href = array();
      // Get the href attribute. Everything up to the closing quote is taken,
      // so URLs containing "-", "&", "%" and the like are found as well.
      preg_match_all('/href\s*=\s*("|\')([^"\']*)("|\')/si', $link, $href);
      $rss_link = array_pop($href[2]);
      if (!is_string($rss_link) || strlen($rss_link) == 0) {
        continue;
      }
      // Attribute values are HTML-escaped.
      $rss_link = str_replace('&amp;', '&', $rss_link);
      if ($rss_link == $url) {
        continue;
      }

      // Handle base url related stuff.
      $parsed_url = parse_url($rss_link);
      if (!is_array($parsed_url) || !isset($parsed_url['host'])) {
        // It's relative so make it absolute. Prefer the HTML base tag of the
        // document, otherwise guess from the original URL.
        $base_tag = array();
        preg_match_all('/<base\s+[^>]*href\s*=\s*("|\')([^"\']*)("|\')/si', $html, $base_tag);
        $base_url = array_pop($base_tag[2]);
        $reference = (is_string($base_url) && strlen($base_url) > 0) ? parse_url($base_url) : FALSE;
        if (!is_array($reference) || !isset($reference['host'])) {
          $reference = parse_url($url);
        }
        if (is_array($reference) && isset($reference['host'])) {
          $root = (isset($reference['scheme']) ? $reference['scheme'] : 'http') .'://'. $reference['host'];
          if (isset($reference['port'])) {
            $root .= ':'. $reference['port'];
          }
          if (substr($rss_link, 0, 1) == '/') {
            // Absolute path on the same host.
            $rss_link = $root . $rss_link;
          }
          else {
            // Path relative to the directory of the referencing document.
            $directory = '/';
            if (!empty($reference['path']) && ($last_slash = strrpos($reference['path'], '/')) !== FALSE) {
              $directory = substr($reference['path'], 0, $last_slash + 1);
            }
            if (substr($directory, 0, 1) != '/') {
              $directory = '/'. $directory;
            }
            $rss_link = $root . $directory . $rss_link;
          }
        }
      }
      $downloaded_string = _parser_common_syndication_download($rss_link, $settings);
      break;
    }
    // The discovered feed may be unreachable (FALSE) or served from the cache
    // (object); neither can go through the string filters below.
    if (!is_string($downloaded_string)) {
      return $downloaded_string;
    }
  }

  // Ugly hack to be able to retrieve the xml:base property, no way to access
  // xml:lang inside <feed>
  $without_xml_base = preg_replace('/xml:base *=/', 'base=', $downloaded_string);
  if (is_string($without_xml_base)) {
    $downloaded_string = $without_xml_base;
  }

  // Filter out strange tags. Without this, the text would contain strange
  // stuff.
  // @todo: make sure that these are not important for feed element mapper
  $downloaded_string_filtered = preg_replace(array('@<script[^>]*?.*?</script>@si', '@<object[^>]*?.*?</object>@si', '@<embed[^>]*?.*?</embed>@si', '@<applet[^>]*?.*?</applet>@si', '@<noframes[^>]*?.*?</noframes>@si', '@<noscript[^>]*?.*?</noscript>@si', '@<noembed[^>]*?.*?</noembed>@si'), '', $downloaded_string);
  return empty($downloaded_string_filtered) ? $downloaded_string : $downloaded_string_filtered;
}
|
| 323 |
|
| 324 |
/**
 * Parse atom feeds.
 *
 * @param $feed_XML
 *   The SimpleXML document of the feed. The download step renames the
 *   xml:base attribute to "base", which is read here to resolve relative
 *   links.
 * @return
 *   An object with title, description, options->link and an items array; each
 *   item carries title, description and options (original_author, timestamp,
 *   original_url, guid, tags, domains).
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = new stdClass();

  // xpath() returns FALSE on error, and array functions that take their
  // argument by reference need a real variable.
  $base_nodes = $feed_XML->xpath("@base");
  $base = (is_array($base_nodes) && count($base_nodes) > 0) ? (string) reset($base_nodes) : '';
  if (!valid_url($base, TRUE)) {
    $base = FALSE;
  }

  // Detect the title
  $parsed_source->title = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";
  $parsed_source->options = new stdClass();

  $parsed_source->options->link = _parser_common_syndication_link($feed_XML->link);
  if (valid_url($parsed_source->options->link) && !valid_url($parsed_source->options->link, TRUE) && !empty($base)) {
    $parsed_source->options->link = $base . $parsed_source->options->link;
  }

  // The feed level author is the fallback for entries without an author.
  $feed_author = !empty($feed_XML->author->name) ? "{$feed_XML->author->name}" : '';

  $parsed_source->items = array();

  foreach ($feed_XML->entry as $news) {
    $original_url = NULL;

    $guid = !empty($news->id) ? "{$news->id}" : NULL;

    // I don't know how standard this is, but sometimes the id is the URL.
    if ($guid !== NULL && valid_url($guid, TRUE)) {
      $original_url = $guid;
    }

    $tags = array();
    $domains = array();
    if (isset($news->category)) {
      foreach ($news->category as $category) {
        // Append the term first so the index stored below points at it.
        $tags[] = "{$category['term']}";
        if (isset($category['scheme'])) {
          $domain = "{$category['scheme']}";
          if (!empty($domain)) {
            $domains[$domain][] = count($tags) - 1;
          }
        }
      }
    }
    $title = "{$news->title}";

    // The body is the serialized child markup followed by the text content,
    // taken from <content> or, if that is empty, from <summary>.
    $body = '';
    if (!empty($news->content)) {
      foreach ($news->content->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->content}";
    }
    else if (!empty($news->summary)) {
      foreach ($news->summary->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->summary}";
    }

    if (!empty($news->content['src'])) {
      // some src elements in some valid atom feeds contained no urls at all
      if (valid_url("{$news->content['src']}", TRUE)) {
        $original_url = "{$news->content['src']}";
      }
    }

    // Reset per entry so that no author leaks over from the previous entry.
    $original_author = $feed_author;
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    else if (!empty($news->author->name)) {
      $original_author = "{$news->author->name}";
    }

    // A <link> of the entry wins; the id / content src URL found above is
    // only the fallback for entries without a usable link.
    $link_url = trim(_parser_common_syndication_link($news->link));
    if ($link_url !== '') {
      $original_url = $link_url;
    }

    // <published> is optional in Atom 1.0 while <updated> is mandatory;
    // <issued> is kept for older feeds.
    if (isset($news->published)) {
      $date = "{$news->published}";
    }
    else if (isset($news->issued)) {
      $date = "{$news->issued}";
    }
    else if (isset($news->updated)) {
      $date = "{$news->updated}";
    }
    else {
      $date = '';
    }

    $item = new stdClass();
    $item->title = _parser_common_syndication_title($title, $body);
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->original_author = $original_author;
    $item->options->timestamp = _parser_common_syndication_parse_date($date);
    $item->options->original_url = trim((string) $original_url);
    if (valid_url($item->options->original_url) && !valid_url($item->options->original_url, TRUE) && !empty($base)) {
      $item->options->original_url = $base . $item->options->original_url;
    }
    $item->options->guid = $guid;
    $item->options->tags = $tags;
    $item->options->domains = $domains;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}
|
| 433 |
|
| 434 |
/**
 * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
 *
 * @param $feed_XML
 *   The SimpleXML document of the feed.
 * @return
 *   An object with title, description, options->link and an items array. The
 *   feed level values are empty strings when the feed has no channel.
 *
 * @see http://web.resource.org/rss/1.0/
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  // Declare some canonical standard prefixes for well-known namespaces:
  static $canonical_namespaces = array(
    'rdf'      => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs'     => 'http://www.w3.org/2000/01/rdf-schema#',
    'xsi'      => 'http://www.w3.org/2001/XMLSchema-instance#',
    'xsd'      => 'http://www.w3.org/2001/XMLSchema#',
    'owl'      => 'http://www.w3.org/2002/07/owl#',
    'dc'       => 'http://purl.org/dc/elements/1.1/',
    'dcterms'  => 'http://purl.org/dc/terms/',
    'dcmitype' => 'http://purl.org/dc/dcmitype/',
    'foaf'     => 'http://xmlns.com/foaf/0.1/',
    'rss'      => 'http://purl.org/rss/1.0/',
  );

  // Get all namespaces declared in the feed element, with special handling
  // for PHP versions prior to 5.1.2 as they don't handle namespaces.
  $namespaces = version_compare(phpversion(), '5.1.2', '<') ? array() : $feed_XML->getNamespaces(TRUE);

  // Start from an empty result so that a feed without <rss:channel> still
  // yields a well-formed object.
  $parsed_source = (object)array(
    'title'       => '',
    'description' => '',
    'options'     => (object)array('link' => ''),
    'items'       => array(),
  );

  // Process the <rss:channel> resource containing feed metadata:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->channel as $rss_channel) {
    $parsed_source->title = _parser_common_syndication_title((string)$rss_channel->title);
    $parsed_source->description = (string)$rss_channel->description;
    $parsed_source->options->link = (string)$rss_channel->link;
    break;
  }

  // Process each <rss:item> resource contained in the feed:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->item as $rss_item) {

    // Extract all available RDF statements from the feed item's RDF/XML
    // tags, allowing for both the item's attributes and child elements to
    // contain RDF properties:
    $rdf_data = array();
    foreach ($namespaces as $ns => $ns_uri) {
      // Note that we attempt to normalize the found property name
      // namespaces to well-known 'standard' prefixes where possible, as the
      // feed may in principle use any arbitrary prefixes and we should
      // still be able to correctly handle it.
      $ns_prefix = array_search($ns_uri, $canonical_namespaces);
      if (!$ns_prefix) {
        $ns_prefix = $ns;
      }
      foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
        $rdf_data[$ns_prefix .':'. $attr_name][] = (string)$attr_value;
      }
      foreach ($rss_item->children($ns_uri) as $rss_property) {
        $rdf_data[$ns_prefix .':'. $rss_property->getName()][] = (string)$rss_property;
      }
    }

    // Declaratively define mappings that determine how to construct the
    // object that gets passed back to FeedAPI:
    $item = _parser_common_syndication_RDF10_item($rdf_data, (object)array(
      'title'       => array('rss:title', 'dc:title'),
      'description' => array('rss:description', 'dc:description', 'content:encoded'),
      'options'     => (object)array(
        'guid'            => 'rdf:about',
        'timestamp'       => 'dc:date',
        'original_author' => array('dc:creator', 'dc:publisher'),
        'original_url'    => array('rss:link', 'rdf:about'),
        'tags'            => 'dc:subject',
      ),
    ));

    // Special handling for the title:
    $item->title = _parser_common_syndication_title($item->title, $item->description);

    // Parse any date/time values into Unix timestamps:
    $item->options->timestamp = _parser_common_syndication_parse_date($item->options->timestamp);

    // If no author name found, use the feed title:
    if (empty($item->options->original_author)) {
      $item->options->original_author = $parsed_source->title;
    }

    // Add every found RDF property to the FeedAPI item in order for Feed
    // Element Mapper to be able to map these properties:
    $item->rdf = (object)array();
    foreach ($rdf_data as $rdf_property => $rdf_value) {
      $rdf_property = str_replace(':', '_', $rdf_property); // looks nicer in the mapper UI
      $item->rdf->$rdf_property = $rdf_value;
    }

    $parsed_source->items[] = $item;
  }

  return $parsed_source;
}
|
| 529 |
|
| 530 |
/**
 * Look up the values of the first RDF property that has any.
 *
 * @param $rdf_data
 *   Array of value lists keyed by property name (e.g. 'dc:title').
 * @param $rdf_properties
 *   Array of property names in order of preference. The names may also be
 *   passed as separate arguments following $rdf_data.
 * @return
 *   The values of the first listed property whose entry in $rdf_data is not
 *   empty, with empty strings removed and the original keys preserved, or
 *   NULL when no listed property has data.
 */
function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  if (!is_array($rdf_properties)) {
    // Variadic call style: every argument after $rdf_data is a property name.
    $rdf_properties = array_slice(func_get_args(), 1);
  }

  foreach ($rdf_properties as $candidate) {
    if (!$candidate || empty($rdf_data[$candidate])) {
      continue;
    }
    // remove empty strings
    return array_filter($rdf_data[$candidate], 'strlen');
  }
  return NULL;
}
|
| 538 |
|
| 539 |
/**
 * Fill a mapping object with the values found in the RDF data.
 *
 * Every member of $mappings is replaced in place: nested objects are
 * processed recursively, anything else is treated as a property name (or
 * list of names) and replaced by the values found for it.
 *
 * @param $rdf_data
 *   Array of value lists keyed by property name.
 * @param $mappings
 *   Object whose members are property names, arrays of property names or
 *   nested mapping objects.
 * @return
 *   The mapping object with its members replaced. A value list with more than
 *   one element is kept as an array, a shorter list is reduced to the result
 *   of reset() on it, and a property without data stays NULL.
 */
function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  foreach (get_object_vars($mappings) as $field => $spec) {
    if (!is_object($spec)) {
      $found = _parser_common_syndication_RDF10_property($rdf_data, $spec);
      if (is_array($found) && count($found) <= 1) {
        $mappings->$field = reset($found);
      }
      else {
        $mappings->$field = $found;
      }
      continue;
    }
    $mappings->$field = _parser_common_syndication_RDF10_item($rdf_data, $spec);
  }
  return (object)$mappings;
}
|
| 551 |
|
| 552 |
/**
 * Parse RSS2.0 feeds.
 *
 * The dispatcher also routes RSS 0.91 and 0.92 feeds to this function.
 *
 * @param $feed_XML
 *   The SimpleXML document of the feed.
 * @return
 *   An object with title, description, options->link and an items array; each
 *   item carries title, description and options (original_author, timestamp,
 *   original_url, guid, domains, tags).
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title.
  $parsed_source->title = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
  // Detect the description.
  $parsed_source->description = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
  $parsed_source->options = new stdClass();
  // Detect the link.
  $parsed_source->options->link = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
  $parsed_source->items = array();

  // The channel title is used as the author of every item.
  $original_author = !empty($feed_XML->channel->title) ? "{$feed_XML->channel->title}" : '';
  $namespaces_supported = version_compare(phpversion(), '5.1.2', '>');

  // xpath() returns FALSE on error.
  $rss_items = $feed_XML->xpath('//item');
  if (!is_array($rss_items)) {
    $rss_items = array();
  }

  foreach ($rss_items as $news) {
    $category = $news->xpath('category');
    // Get children for current namespace.
    $content = array();
    if ($namespaces_supported) {
      $content = (array)$news->children('http://purl.org/rss/1.0/modules/content/');
    }
    $news = (array) $news;
    $news['category'] = is_array($category) ? $category : array();

    $guid = isset($news['guid']) ? "{$news['guid']}" : NULL;
    $title = isset($news['title']) ? "{$news['title']}" : '';

    // Start every item with a clean body so that an item without description
    // does not inherit the text of the previous item.
    $body = NULL;
    if (isset($news['description'])) {
      $body = "{$news['description']}";
    }
    // Some sources use content:encoded as description i.e. PostNuke
    // PageSetter module. The longer text wins.
    if (isset($news['encoded'])) { // content:encoded for PHP < 5.1.2.
      if (strlen((string) $body) < strlen("{$news['encoded']}")) {
        $body = "{$news['encoded']}";
      }
    }
    if (isset($content['encoded'])) { // content:encoded for PHP >= 5.1.2.
      if (strlen((string) $body) < strlen("{$content['encoded']}")) {
        $body = "{$content['encoded']}";
      }
    }
    if (!isset($body)) {
      $body = $title;
    }

    $original_url = !empty($news['link']) ? "{$news['link']}" : NULL;

    $additional_taxonomies = array();
    $additional_taxonomies['RSS Categories'] = array();
    $additional_taxonomies['RSS Domains'] = array();
    foreach ($news['category'] as $category) {
      $additional_taxonomies['RSS Categories'][] = "{$category}";
      if (isset($category['domain'])) {
        $domain = "{$category['domain']}";
        if (!empty($domain)) {
          if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
            $additional_taxonomies['RSS Domains'][$domain] = array();
          }
          // Index of the category that was just appended.
          $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
        }
      }
    }

    $item = new stdClass();
    $item->title = _parser_common_syndication_title($title, $body);
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->original_author = $original_author;
    if (isset($news['pubDate'])) {
      $item->options->timestamp = _parser_common_syndication_parse_date($news['pubDate']);
    }
    else {
      $item->options->timestamp = time();
    }
    $item->options->original_url = trim((string) $original_url);
    $item->options->guid = $guid;
    $item->options->domains = $additional_taxonomies['RSS Domains'];
    $item->options->tags = $additional_taxonomies['RSS Categories'];
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}
|
| 655 |
|
| 656 |
/**
 * Return the cache directory, trying to create it when it is not usable.
 *
 * @return
 *   The path of the cache directory below the files directory, or FALSE when
 *   no writable location is available.
 */
function _parser_common_syndication_sanitize_cache() {
  $cache_location = file_directory_path() .'/parser_common_syndication_cache';

  // Nothing to do when the directory is already in place and writable.
  if (is_writable($cache_location) && is_dir($cache_location)) {
    return $cache_location;
  }

  $cache_location = file_create_path($cache_location);
  $missing = !file_exists($cache_location);
  if ($missing && is_writable(file_directory_path())) {
    mkdir($cache_location);
  }
  return is_writable($cache_location) ? $cache_location : FALSE;
}
|
| 672 |
|
| 673 |
/**
 * Convert a date string taken from a feed into a Unix timestamp.
 *
 * strtotime() is tried first; when it fails the string is parsed as a W3C
 * date/time value.
 *
 * @param $date_str
 *   The date string in various formats.
 * @return
 *   The timestamp of the string or the current time if can't be parsed
 */
function _parser_common_syndication_parse_date($date_str) {
  $timestamp = strtotime($date_str);

  // Failure is reported as FALSE, or as -1 by old PHP versions.
  $strtotime_failed = ($timestamp === FALSE || $timestamp == -1);
  if ($strtotime_failed) {
    $timestamp = _parser_common_syndication_parse_w3cdtf($date_str);
  }

  if ($timestamp === FALSE) {
    return time();
  }
  return $timestamp;
}
|
| 688 |
|
| 689 |
/**
 * Parse the W3C date/time format, a subset of ISO 8601.
 *
 * PHP date parsing functions do not handle this format.
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * Accepted shape: YYYY-MM-DDThh:mm[:ss][Z|+hh[:]mm|-hh[:]mm]. A value without
 * time zone designator is interpreted as GMT.
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function _parser_common_syndication_parse_w3cdtf($date_str) {
  // Groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 seconds,
  // 7 offset sign, 8 offset hours, 9 offset minutes, 10 "Z".
  if (!preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(?::(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', (string) $date_str, $match)) {
    return FALSE;
  }

  // Trailing optional groups that did not take part in the match are missing
  // from $match, so every optional part needs a default.
  $seconds = isset($match[6]) ? (int) $match[6] : 0;

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

  // "Z" (zulu time, aka GMT) and a missing designator need no correction.
  $tz_mod = isset($match[7]) ? $match[7] : '';
  if ($tz_mod == '+' || $tz_mod == '-') {
    $offset_secs = (((int) $match[8] * 60) + (int) $match[9]) * 60;
    // Is timezone ahead of GMT? If yes, subtract offset.
    if ($tz_mod == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }
  return $epoch;
}
|
| 728 |
|
| 729 |
/**
 * Extract the link that points to the original content (back to site or original article)
 *
 * The href of the first "alternate" link is returned. A link without rel
 * attribute counts as "alternate", as required by the Atom specification.
 * When there is no alternate link at all, the href of the last link is used.
 *
 * @param $links
 *   Array of SimpleXML objects
 * @return
 *   The href of the selected link, or an empty string when there is no link
 *   or the selected link has no href.
 */
function _parser_common_syndication_link($links) {
  $to_link = '';
  if (!is_array($links) && !($links instanceof Traversable)) {
    return $to_link;
  }

  foreach ($links as $link) {
    $attributes = $link->attributes();
    $to_link = isset($attributes["href"]) ? "{$attributes["href"]}" : "";
    // A missing rel must be interpreted as rel="alternate".
    $rel = isset($attributes["rel"]) ? "{$attributes["rel"]}" : 'alternate';
    if ($rel == 'alternate') {
      break;
    }
  }
  return $to_link;
}
|
| 750 |
|
| 751 |
/**
 * Prepare raw data to be a title
 *
 * @param $title
 *   The title found in the feed; returned untouched when it is not empty.
 * @param $body
 *   Optional text the title is derived from when $title is empty.
 * @return
 *   The given title, or the first three words (fewer when the body is
 *   shorter) of the body joined by single spaces.
 */
function _parser_common_syndication_title($title, $body = FALSE) {
  if (empty($title) && !empty($body)) {
    // Explode to words and use the first 3 words. Empty pieces produced by
    // leading or trailing separators are skipped.
    $words = preg_split("/[\s,]+/", $body, -1, PREG_SPLIT_NO_EMPTY);
    if (is_array($words)) {
      $title = implode(' ', array_slice($words, 0, 3));
    }
  }
  return $title;
}
|