/[drupal]/contributions/modules/feedapi/parser_common_syndication/parser_common_syndication.inc
ViewVC logotype

Contents of /contributions/modules/feedapi/parser_common_syndication/parser_common_syndication.inc

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (show annotations) (download) (as text)
Fri Sep 18 09:47:37 2009 UTC (2 months, 1 week ago) by aronnovak
Branch: MAIN
CVS Tags: HEAD
Branch point for: DRUPAL-6--1
File MIME type: text/x-php
#258434 by mustafau - Split parser_common_syndication.module
1 <?php
2 // $Id$
3
4 /**
5 * @file
6 * Downloading and parsing functions for Common Syndication Parser
7 */
8
/**
 * Turn a feed into the structured object expected by FeedAPI.
 *
 * @param $feed
 *   Either an already parsed SimpleXMLElement, or a feed object whose
 *   ->url property points to the document to download.
 * @return
 *   The parsed feed (stdClass), the cached parsed feed object handed back by
 *   the download layer, or FALSE when the document cannot be downloaded,
 *   is malformed XML, or is in an unsupported format.
 */
function _parser_common_syndication_feedapi_parse($feed) {
  $xml = $feed;

  if (!($feed instanceof SimpleXMLElement)) {
    $raw = _parser_common_syndication_download($feed->url);
    // FALSE means "nothing to parse"; an object is a cached result that is
    // passed straight through.
    if ($raw === FALSE || is_object($raw)) {
      return $raw;
    }

    // The libxml option flags are only usable on PHP >= 5.1.0 with libxml.
    $has_libxml_flags = defined('LIBXML_VERSION') && !version_compare(phpversion(), '5.1.0', '<');
    if ($has_libxml_flags) {
      $xml = @simplexml_load_string($raw, NULL, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA);
    }
    else {
      $xml = @simplexml_load_string($raw, NULL);
    }

    // Got a malformed XML.
    if ($xml === FALSE || is_null($xml)) {
      return FALSE;
    }
  }

  // Dispatch to the parser matching the detected format.
  switch (_parser_common_syndication_feed_format_detect($xml)) {
    case "atom1.0":
      return _parser_common_syndication_atom10_parse($xml);

    case "RSS2.0":
    case "RSS0.91":
    case "RSS0.92":
      return _parser_common_syndication_RSS20_parse($xml);

    case "RDF":
      return _parser_common_syndication_RDF10_parse($xml);
  }
  return FALSE;
}
51
/**
 * Get the cached version of the <var>$url</var>
 *
 * @param $url
 *   The feed URL; its md5 hash is the cache file name.
 * @return
 *   The unserialized cache content, or FALSE when the cache directory is
 *   unusable, the cache file is missing or it cannot be read.
 */
function _parser_common_syndication_cache_get($url) {
  $cache_dir = _parser_common_syndication_sanitize_cache();
  // Without a usable cache directory the path would degrade to "/<md5>" at
  // the filesystem root, so bail out early.
  if ($cache_dir === FALSE) {
    return FALSE;
  }

  $cache_file = $cache_dir .'/'. md5($url);
  if (!is_file($cache_file)) {
    return FALSE;
  }

  $file_content = file_get_contents($cache_file);
  if ($file_content === FALSE) {
    return FALSE;
  }
  // NOTE(review): unserialize() is only safe as long as nothing but this
  // module can write into the cache directory.
  return unserialize($file_content);
}
63
/**
 * Store the parsed feed into the cache
 *
 * @param $url
 *   The feed URL; its md5 hash is the cache file name.
 * @param $parsed_feed
 *   The value to store; it is written in serialized form.
 */
function _parser_common_syndication_cache_set($url, $parsed_feed) {
  $cache_dir = _parser_common_syndication_sanitize_cache();
  // No usable cache directory: skip caching instead of writing to "/<md5>".
  if ($cache_dir === FALSE) {
    return;
  }

  $cache_file = $cache_dir .'/'. md5($url);
  $cache_fp = fopen($cache_file, 'w');
  // fopen() already raised a warning; do not pile fwrite()/fclose() errors
  // on top of it.
  if ($cache_fp === FALSE) {
    return;
  }
  fwrite($cache_fp, serialize($parsed_feed));
  fclose($cache_fp);
}
73
/**
 * Get the content from the given URL.
 *
 * Sends a conditional request (If-None-Match / If-Modified-Since) based on
 * the validators stored in {parser_common_syndication}, keyed by md5($url).
 *
 * @param $url
 *   A valid URL (not only web URLs).
 * @param $username
 *   If the URL use authentication, here you can supply the username for this.
 * @param $password
 *   If the URL use authentication, here you can supply the password for this.
 * @param $accept_invalid_cert
 *   TRUE to skip SSL peer verification (only honoured on the cURL path).
 * @return
 *   The data pulled from the URL, the cached parsed feed object (with
 *   ->from_cache set) when the server answers 304, or FALSE if there is no
 *   data.
 */
function _parser_common_syndication_feedapi_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE) {
  // Intra-page download cache: avoid downloading the same content twice
  // within one page request (compatible and parse calls may both ask for it).
  static $download_cache = array();
  if (isset($download_cache[$url])) {
    return $download_cache[$url];
  }
  $has_etag = FALSE;
  $curl = _parser_common_syndication_use_curl();

  // Only download and parse data if really needs refresh.
  // Based on "Last-Modified" and "If-Modified-Since".
  // cURL wants "Name: value" strings, drupal_http_request() a name => value
  // map, hence the two header shapes below.
  $headers = array();
  $db_result = db_query("SELECT etag, last_modified FROM {parser_common_syndication} WHERE url = '%s'", md5($url));
  while ($validate = db_fetch_array($db_result)) {
    $has_etag = TRUE;
    if (!empty($validate['etag'])) {
      if ($curl) {
        $headers[] = 'If-None-Match: '. $validate['etag'];
      }
      else {
        $headers['If-None-Match'] = $validate['etag'];
      }
    }
    if (!empty($validate['last_modified'])) {
      if ($curl) {
        $headers[] = 'If-Modified-Since: '. $validate['last_modified'];
      }
      else {
        $headers['If-Modified-Since'] = $validate['last_modified'];
      }
    }
  }
  // Credentials must be sent on every request, including the very first one
  // when no validators are stored yet, so this lives outside the loop.
  if (!empty($username) && !$curl) {
    $headers['Authorization'] = 'Basic '. base64_encode("$username:$password");
  }

  if ($curl) {
    $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';
    $result = new stdClass();
    $download = curl_init($url);
    curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
    if (!empty($username)) {
      curl_setopt($download, CURLOPT_USERPWD, "{$username}:{$password}");
    }
    curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($download, CURLOPT_HEADER, TRUE);
    curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
    if ($accept_invalid_cert) {
      curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
    }
    $data = curl_exec($download);
    if ($data === FALSE) {
      // Transport failure: keep the stored validators untouched.
      curl_close($download);
      return FALSE;
    }
    $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
    $result->data = (string) substr($data, $header_size);

    // When redirects are followed, the header section holds one block per
    // response; only the last block describes the returned body.
    $header_blocks = preg_split("/\r\n\r\n|\n\n|\r\r/", trim((string) substr($data, 0, $header_size)));
    $header_lines = preg_split("/\r\n|\n|\r/", array_pop($header_blocks));

    $result->headers = array();
    array_shift($header_lines); // skip HTTP response status
    foreach ($header_lines as $line) {
      $line = trim($line);
      if ($line === '' || strpos($line, ':') === FALSE) {
        continue;
      }
      list($name, $value) = explode(':', $line, 2);
      if (isset($result->headers[$name]) && $name == 'Set-Cookie') {
        // RFC 2109: the Set-Cookie response header comprises the token Set-
        // Cookie:, followed by a comma-separated list of one or more cookies.
        $result->headers[$name] .= ','. trim($value);
      }
      else {
        $result->headers[$name] = trim($value);
      }
    }
    $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);

    curl_close($download);
  }
  else {
    $result = drupal_http_request($url, $headers);
  }

  $result->code = isset($result->code) ? $result->code : 200;
  // In this case return the cached data.
  if ($result->code == 304) {
    $cached_data = _parser_common_syndication_cache_get($url);
    if (is_object($cached_data)) {
      $cached_data->from_cache = TRUE;
      return $cached_data;
    }
    else {
      // The cache file should exist and hold good data. It does not, so
      // forget the validators and repeat the request unconditionally, with
      // the same certificate policy as the original call.
      db_query("DELETE FROM {parser_common_syndication} WHERE url = '%s'", md5($url));
      return _parser_common_syndication_feedapi_get($url, $username, $password, $accept_invalid_cert);
    }
  }

  // Make sure both validators exist, defaulting to empty strings.
  if (!isset($result->headers) || !is_array($result->headers)) {
    $result->headers = array();
  }
  $result->headers += array('ETag' => '', 'Last-Modified' => '');

  if ($has_etag == TRUE) {
    db_query("UPDATE {parser_common_syndication} SET etag = '%s', last_modified = '%s' WHERE url = '%s'", $result->headers['ETag'], $result->headers['Last-Modified'], md5($url));
  }
  else {
    db_query("INSERT INTO {parser_common_syndication} (etag, last_modified, url) VALUES ('%s', '%s', '%s')", $result->headers['ETag'], $result->headers['Last-Modified'], md5($url));
  }

  // drupal_http_request() does not always set ->data (e.g. on errors).
  $data = isset($result->data) ? $result->data : '';
  $download_cache[$url] = $data;
  return empty($data) ? FALSE : $data;
}
193
/**
 * Delete cache validating functions when feed is deleted
 *
 * Removes the stored ETag/Last-Modified row and the cache file that belong
 * to the feed URL of the node being deleted.
 *
 * @param $node
 *   The node the operation is performed on.
 * @param $op
 *   The node operation; only 'delete' is handled.
 */
function parser_common_syndication_nodeapi(&$node, $op) {
  if ($op != 'delete') {
    return;
  }
  if (!(isset($node->feed) || feedapi_enabled_type($node->type))) {
    return;
  }
  // A feed-enabled node type does not guarantee that a feed is attached.
  if (empty($node->feed->url)) {
    return;
  }

  // Rows are keyed by md5($url) everywhere else in this file, so the raw URL
  // would never match.
  $url_hash = md5($node->feed->url);
  db_query("DELETE FROM {parser_common_syndication} WHERE url = '%s'", $url_hash);

  $cache_dir = _parser_common_syndication_sanitize_cache();
  if ($cache_dir !== FALSE) {
    $cache_filename = $cache_dir .'/'. $url_hash;
    if (file_exists($cache_filename)) {
      unlink($cache_filename);
    }
  }
}
211
/**
 * Work out which syndication format a SimpleXML document uses.
 *
 * @param $xml
 *   SimpleXML-preprocessed feed.
 * @return
 *   One of "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92" or "RDF", or FALSE when
 *   the argument is not an object or the format is not recognised.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }
  $attributes = $xml->attributes();
  $root = strtolower($xml->getName());

  switch ($root) {
    case "feed":
      // Atom is only recognised when at least one entry is present.
      return isset($xml->entry) ? "atom1.0" : FALSE;

    case "rdf":
      return isset($xml->channel) ? "RDF" : FALSE;

    case "rss":
      $version = $attributes["version"];
      $labels = array("2.0" => "RSS2.0", "0.91" => "RSS0.91", "0.92" => "RSS0.92");
      foreach ($labels as $number => $label) {
        if ($version == $number) {
          return $label;
        }
      }
      return FALSE;
  }
  return FALSE;
}
243
/**
 * Call one of the possible feedapi_get hook and pass back the downloaded data
 *
 * If the downloaded document is HTML, the first <link> tag advertising a feed
 * MIME type is followed (autodiscovery) and that document is returned instead.
 *
 * @param $url
 *   The URL to download. Credentials embedded as user:pass@ are extracted and
 *   passed to the download function.
 * @param $settings
 *   Optional array; the 'accept_invalid_cert' key is forwarded to the
 *   download function.
 * @return
 *   string - the downloaded data, FALSE - if the URL is not reachable,
 *   object - the cached parsed feed passed through from the download function.
 */
function _parser_common_syndication_download($url, $settings = NULL) {
  // Handle password protected feeds.
  $password = $username = NULL;
  if (valid_url($url, TRUE)) {
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      $username = $url_parts['user'];
      $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
    }
  }

  $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
  $downloaded_string = _parser_common_syndication_feedapi_get($url, $username, $password, $accept_invalid_cert);

  // Cannot get the feed, pass the problem to one level up.
  if ($downloaded_string == FALSE) {
    return FALSE;
  }
  // The data comes from cache, just pass one level up.
  elseif (is_object($downloaded_string)) {
    return $downloaded_string;
  }

  // Do the autodiscovery at this level, pass back the real data.
  // Maybe it's HTML. If it's not HTML, not worth to take a look into the downloaded string.
  if (strpos(strtolower($downloaded_string), "<html") !== FALSE) {
    $allowed_mime = array("text/xml", "application/rss+xml", "application/atom+xml", "application/rdf+xml", "application/xml");
    $matches = array();
    // Get all the links tag
    preg_match_all('/<link\s+(.*?)\s*\/?>/si', $downloaded_string, $matches);
    foreach ($matches[1] as $link) {
      $mime = array();
      // Get the type attribute and check if the mime type is allowed.
      preg_match_all('/type\s*=\s*("|\')([A-Za-z\/+]*)("|\')/si', $link, $mime);
      if (!in_array(array_pop($mime[2]), $allowed_mime)) {
        continue;
      }
      $href = array();
      // Get the href attribute. The character class includes "-", "%", "&",
      // ";", "~" and "," because they are common in real feed URLs.
      preg_match_all('/href\s*=\s*("|\')([-=#\?_:.0-9A-Za-z\/+%&;~,]*)("|\')/si', $link, $href);
      $rss_link = array_pop($href[2]);
      if (!is_string($rss_link) || strlen($rss_link) == 0) {
        continue;
      }
      // Attribute values are HTML-escaped.
      $rss_link = str_replace('&amp;', '&', $rss_link);
      if ($rss_link == $url) {
        continue;
      }

      // Handle base url related stuff.
      $parsed_url = parse_url($rss_link);
      if (!isset($parsed_url['host'])) {
        // It's relative so make it absolute. The <base> tag lives in the
        // document head, not inside the <link> tag, so search the document.
        $base_tag = array();
        preg_match_all('/<base\s+href\s*=\s*("|\')([-_:.0-9A-Za-z\/+]*)("|\')/si', $downloaded_string, $base_tag);
        $base_url = array_pop($base_tag[2]);
        if (is_string($base_url) && strlen($base_url) > 0) {
          // Get from the HTML base tag.
          $rss_link = $base_url . $rss_link;
        }
        else {
          // Guess from the original URL.
          $original_url = parse_url($url);
          $path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
          if (substr($path, 0, 1) != '/') {
            // Path relative to the directory of the page that was downloaded.
            $directory = isset($original_url['path']) ? preg_replace('@/[^/]*$@', '/', $original_url['path']) : '/';
            if ($directory == '') {
              $directory = '/';
            }
            $path = $directory . $path;
          }
          $rss_link = $original_url['scheme'] .'://'. $original_url['host'];
          if (isset($original_url['port'])) {
            $rss_link .= ':'. $original_url['port'];
          }
          $rss_link .= $path;
          if (isset($parsed_url['query'])) {
            $rss_link .= '?'. $parsed_url['query'];
          }
          if (isset($parsed_url['fragment'])) {
            $rss_link .= '#'. $parsed_url['fragment'];
          }
        }
      }
      // The recursive call already post-processes its result and may return
      // FALSE or a cached object, so hand it back unchanged.
      return _parser_common_syndication_download($rss_link, $settings);
    }
  }

  // Ugly hack to be able to retrieve the xml:base property, no way to access xml:lang inside <feed>
  $downloaded_string = preg_replace('/xml:base *=/', 'base=', $downloaded_string);

  // Filter out strange tags. Without this, the text would contain strange stuff.
  // @todo: make sure that these are not important for feed element mapper
  $downloaded_string_filtered = preg_replace(array('@<script[^>]*?.*?</script>@si', '@<object[^>]*?.*?</object>@si', '@<embed[^>]*?.*?</embed>@si', '@<applet[^>]*?.*?</applet>@si', '@<noframes[^>]*?.*?</noframes>@si', '@<noscript[^>]*?.*?</noscript>@si', '@<noembed[^>]*?.*?</noembed>@si'), '', $downloaded_string);
  // preg_replace() yields NULL on failure; fall back to the unfiltered text.
  return empty($downloaded_string_filtered) ? $downloaded_string : $downloaded_string_filtered;
}
323
/**
 * Parse atom feeds.
 *
 * @param $feed_XML
 *   The SimpleXMLElement of the <feed> root. The download step renames
 *   xml:base to base, which is why a plain "base" attribute is read here.
 * @return
 *   stdClass with title, description, options->link and items. Every item
 *   carries title, description and options (original_author, timestamp,
 *   original_url, guid, tags, domains).
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = new stdClass();

  // xpath() may return FALSE; array_shift() also needs a real variable.
  $base_attributes = $feed_XML->xpath("@base");
  $base = (is_array($base_attributes) && count($base_attributes) > 0) ? (string) reset($base_attributes) : '';
  if (!valid_url($base, TRUE)) {
    $base = FALSE;
  }

  // Detect the title
  $parsed_source->title = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";
  $parsed_source->options = new stdClass();

  $parsed_source->options->link = _parser_common_syndication_link($feed_XML->link);
  // A relative (but otherwise valid) link is resolved against the base.
  if (valid_url($parsed_source->options->link) && !valid_url($parsed_source->options->link, TRUE) && !empty($base)) {
    $parsed_source->options->link = $base . $parsed_source->options->link;
  }

  $parsed_source->items = array();

  foreach ($feed_XML->entry as $news) {
    $original_url = NULL;

    $guid = !empty($news->id) ? "{$news->id}" : NULL;

    // I don't know how standard this is, but sometimes the id is the URL.
    if (valid_url($guid, TRUE)) {
      $original_url = $guid;
    }

    $tags = array();
    $domains = array();
    foreach ($news->category as $category) {
      // Append first, so that the index stored for the domain points at this
      // very category.
      $tags[] = "{$category['term']}";
      if (isset($category['scheme'])) {
        $domain = "{$category['scheme']}";
        if (!empty($domain)) {
          if (!isset($domains[$domain])) {
            $domains[$domain] = array();
          }
          $domains[$domain][] = count($tags) - 1;
        }
      }
    }
    $title = "{$news->title}";

    // Prefer the full content, fall back to the summary. Child elements
    // (inline XHTML) are serialized, then the element's own text is appended.
    $body = '';
    if (!empty($news->content)) {
      foreach ($news->content->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->content}";
    }
    elseif (!empty($news->summary)) {
      foreach ($news->summary->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->summary}";
    }

    if (!empty($news->content['src'])) {
      // some src elements in some valid atom feeds contained no urls at all
      if (valid_url("{$news->content['src']}", TRUE)) {
        $original_url = "{$news->content['src']}";
      }
    }

    // Author lookup order: source author, entry author, feed author. Reset
    // per entry so that a value never leaks over from the previous entry.
    $original_author = '';
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    elseif (!empty($news->author->name)) {
      $original_author = "{$news->author->name}";
    }
    elseif (!empty($feed_XML->author->name)) {
      $original_author = "{$feed_XML->author->name}";
    }

    // A <link> wins over the id/src guesses, but only if it yields a URL.
    $link_url = _parser_common_syndication_link($news->link);
    if (strlen($link_url) > 0) {
      $original_url = $link_url;
    }

    // published (Atom 1.0) and issued (Atom 0.3) first; updated/modified are
    // the last resort before the date parser falls back to the current time.
    $date_string = '';
    foreach (array('published', 'issued', 'updated', 'modified') as $date_field) {
      if (!empty($news->$date_field)) {
        $date_string = "{$news->$date_field}";
        break;
      }
    }

    $item = new stdClass();
    $item->title = _parser_common_syndication_title($title, $body);
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->original_author = $original_author;
    $item->options->timestamp = _parser_common_syndication_parse_date($date_string);
    $item->options->original_url = trim((string) $original_url);
    if (valid_url($item->options->original_url) && !valid_url($item->options->original_url, TRUE) && !empty($base)) {
      $item->options->original_url = $base . $item->options->original_url;
    }
    $item->options->guid = $guid;
    $item->options->tags = $tags;
    $item->options->domains = $domains;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}
433
/**
 * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
 *
 * @see http://web.resource.org/rss/1.0/
 *
 * @param $feed_XML
 *   The SimpleXMLElement of the <rdf:RDF> root.
 * @return
 *   stdClass with title, description, options->link and items. Each item
 *   additionally carries ->rdf, an object holding every RDF property found
 *   on the item (property names use "_" instead of ":").
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  // Declare some canonical standard prefixes for well-known namespaces:
  static $canonical_namespaces = array(
    'rdf'      => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs'     => 'http://www.w3.org/2000/01/rdf-schema#',
    'xsi'      => 'http://www.w3.org/2001/XMLSchema-instance#',
    'xsd'      => 'http://www.w3.org/2001/XMLSchema#',
    'owl'      => 'http://www.w3.org/2002/07/owl#',
    'dc'       => 'http://purl.org/dc/elements/1.1/',
    'dcterms'  => 'http://purl.org/dc/terms/',
    'dcmitype' => 'http://purl.org/dc/dcmitype/',
    'foaf'     => 'http://xmlns.com/foaf/0.1/',
    'rss'      => 'http://purl.org/rss/1.0/',
  );

  // Get all namespaces declared in the feed element, with special handling
  // for PHP versions prior to 5.1.2 as they don't handle namespaces.
  $namespaces = version_compare(phpversion(), '5.1.2', '<') ? array() : $feed_XML->getNamespaces(TRUE);

  // Start from an empty result so that a feed without <rss:channel> still
  // produces a well-formed object instead of failing on the first item.
  $parsed_source = (object)array(
    'title'       => '',
    'description' => '',
    'options'     => (object)array('link' => ''),
    'items'       => array(),
  );

  // Process the <rss:channel> resource containing feed metadata (only the
  // first channel is used):
  foreach ($feed_XML->children($canonical_namespaces['rss'])->channel as $rss_channel) {
    $parsed_source->title = _parser_common_syndication_title((string)$rss_channel->title);
    $parsed_source->description = (string)$rss_channel->description;
    $parsed_source->options->link = (string)$rss_channel->link;
    break;
  }

  // Process each <rss:item> resource contained in the feed:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->item as $rss_item) {

    // Extract all available RDF statements from the feed item's RDF/XML
    // tags, allowing for both the item's attributes and child elements to
    // contain RDF properties:
    $rdf_data = array();
    foreach ($namespaces as $ns => $ns_uri) {
      // Note that we attempt to normalize the found property name
      // namespaces to well-known 'standard' prefixes where possible, as the
      // feed may in principle use any arbitrary prefixes and we should
      // still be able to correctly handle it.
      $ns_prefix = array_search($ns_uri, $canonical_namespaces);
      if (!$ns_prefix) {
        $ns_prefix = $ns;
      }
      foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
        $rdf_data[$ns_prefix .':'. $attr_name][] = (string)$attr_value;
      }
      foreach ($rss_item->children($ns_uri) as $rss_property) {
        $rdf_data[$ns_prefix .':'. $rss_property->getName()][] = (string)$rss_property;
      }
    }

    // Declaratively define mappings that determine how to construct the
    // object that gets passed back to FeedAPI:
    $item = _parser_common_syndication_RDF10_item($rdf_data, (object)array(
      'title'       => array('rss:title', 'dc:title'),
      'description' => array('rss:description', 'dc:description', 'content:encoded'),
      'options'     => (object)array(
        'guid'            => 'rdf:about',
        'timestamp'       => 'dc:date',
        'original_author' => array('dc:creator', 'dc:publisher'),
        'original_url'    => array('rss:link', 'rdf:about'),
        'tags'            => 'dc:subject',
      ),
    ));

    // Special handling for the title:
    $item->title = _parser_common_syndication_title($item->title, $item->description);

    // Parse any date/time values into Unix timestamps:
    $item->options->timestamp = _parser_common_syndication_parse_date($item->options->timestamp);

    // If no author name found, use the feed title:
    if (empty($item->options->original_author)) {
      $item->options->original_author = $parsed_source->title;
    }

    // Add every found RDF property to the FeedAPI item in order for Feed
    // Element Mapper to be able to map these properties:
    $item->rdf = new stdClass();
    foreach ($rdf_data as $rdf_property => $rdf_value) {
      $rdf_property = str_replace(':', '_', $rdf_property); // looks nicer in the mapper UI
      $item->rdf->$rdf_property = $rdf_value;
    }

    $parsed_source->items[] = $item;
  }

  return $parsed_source;
}
529
/**
 * Pick the values of the first RDF property that actually has content.
 *
 * @param $rdf_data
 *   Array keyed by property name ("prefix:name"), each entry being an array
 *   of string values.
 * @param $rdf_properties
 *   Array of candidate property names in order of preference. The names may
 *   also be passed as separate string arguments.
 * @return
 *   The non-empty values of the first candidate that has any (original keys
 *   are preserved), or NULL when no candidate has a non-empty value.
 */
function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  $rdf_properties = is_array($rdf_properties) ? $rdf_properties : array_slice(func_get_args(), 1);
  foreach ($rdf_properties as $rdf_property) {
    if ($rdf_property && !empty($rdf_data[$rdf_property])) {
      // Remove empty strings.
      $values = array_filter((array) $rdf_data[$rdf_property], 'strlen');
      // A property that is present but blank must not shadow the next
      // candidate.
      if (!empty($values)) {
        return $values;
      }
    }
  }
  return NULL;
}
538
/**
 * Fill a mapping object with the values found in the RDF data.
 *
 * Every property of $mappings names one or more RDF properties to look up;
 * nested objects are resolved recursively. The passed object is modified in
 * place and also returned.
 *
 * @param $rdf_data
 *   Array of RDF property values keyed by property name.
 * @param $mappings
 *   Object whose properties are RDF property names (string or array of
 *   strings) or nested mapping objects.
 * @return
 *   The mapping object with each property replaced by the looked-up value:
 *   the array of values when there are several, otherwise the single value.
 */
function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  foreach (get_object_vars($mappings) as $field => $source) {
    if (is_object($source)) {
      $mappings->$field = _parser_common_syndication_RDF10_item($rdf_data, $source);
      continue;
    }

    $found = _parser_common_syndication_RDF10_property($rdf_data, $source);
    // Collapse arrays holding at most one value to that value.
    if (is_array($found) && count($found) <= 1) {
      $found = reset($found);
    }
    $mappings->$field = $found;
  }
  return (object)$mappings;
}
551
/**
 * Parse RSS2.0 feeds.
 *
 * Also used for RSS 0.91 and 0.92 documents.
 *
 * @param $feed_XML
 *   The SimpleXMLElement of the <rss> root.
 * @return
 *   stdClass with title, description, options->link and items. Every item
 *   carries title, description and options (original_author, timestamp,
 *   original_url, guid, domains, tags).
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title.
  $parsed_source->title = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
  // Detect the description.
  $parsed_source->description = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
  $parsed_source->options = new stdClass();
  // Detect the link.
  $parsed_source->options->link = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
  $parsed_source->items = array();

  // The channel title doubles as the author of every item.
  $original_author = !empty($feed_XML->channel->title) ? "{$feed_XML->channel->title}" : '';

  // xpath() returns FALSE on error.
  $news_items = $feed_XML->xpath('//item');
  if (!is_array($news_items)) {
    $news_items = array();
  }

  foreach ($news_items as $news) {
    $category = $news->xpath('category');
    // Get children for current namespace. Reset per item so that a value
    // from the previous item can never leak into this one.
    $content = array();
    if (version_compare(phpversion(), '5.1.2', '>')) {
      $content = (array)$news->children('http://purl.org/rss/1.0/modules/content/');
    }
    $news = (array) $news;
    $news['category'] = $category;

    $guid = isset($news['guid']) ? "{$news['guid']}" : NULL;
    $title = isset($news['title']) ? "{$news['title']}" : '';

    // NULL means "no body found yet"; the longest of description and
    // content:encoded wins.
    $body = NULL;
    if (isset($news['description'])) {
      $body = "{$news['description']}";
    }
    // Some sources use content:encoded as description i.e. PostNuke PageSetter module.
    if (isset($news['encoded'])) { // content:encoded for PHP < 5.1.2.
      if (strlen((string) $body) < strlen("{$news['encoded']}")) {
        $body = "{$news['encoded']}";
      }
    }
    if (isset($content['encoded'])) { // content:encoded for PHP >= 5.1.2.
      if (strlen((string) $body) < strlen("{$content['encoded']}")) {
        $body = "{$content['encoded']}";
      }
    }
    if (!isset($body)) {
      $body = $title;
    }

    $original_url = !empty($news['link']) ? "{$news['link']}" : NULL;

    // 'RSS Domains' maps a domain to the indexes of its categories in
    // 'RSS Categories'.
    $additional_taxonomies = array();
    $additional_taxonomies['RSS Categories'] = array();
    $additional_taxonomies['RSS Domains'] = array();
    if (is_array($news['category'])) {
      foreach ($news['category'] as $category) {
        $additional_taxonomies['RSS Categories'][] = "{$category}";
        if (isset($category['domain'])) {
          $domain = "{$category['domain']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
              $additional_taxonomies['RSS Domains'][$domain] = array();
            }
            $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
          }
        }
      }
    }

    $item = new stdClass();
    $item->title = _parser_common_syndication_title($title, $body);
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->original_author = $original_author;
    if (isset($news['pubDate'])) {
      $item->options->timestamp = _parser_common_syndication_parse_date($news['pubDate']);
    }
    else {
      $item->options->timestamp = time();
    }
    $item->options->original_url = trim((string) $original_url);
    $item->options->guid = $guid;
    $item->options->domains = $additional_taxonomies['RSS Domains'];
    $item->options->tags = $additional_taxonomies['RSS Categories'];
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}
655
/**
 * Locate the cache directory, creating it when it does not exist yet.
 *
 * @return
 *   The path of the writable cache directory below the Drupal files
 *   directory, or FALSE when no writable directory is available.
 */
function _parser_common_syndication_sanitize_cache() {
  $directory = file_directory_path() .'/parser_common_syndication_cache';

  // Fast path: the directory is already in place and writable.
  if (is_writeable($directory) && is_dir($directory)) {
    return $directory;
  }

  $directory = file_create_path($directory);
  $is_missing = !file_exists($directory);
  if ($is_missing && is_writable(file_directory_path())) {
    mkdir($directory);
  }

  return is_writeable($directory) ? $directory : FALSE;
}
672
/**
 * Convert a date string found in a feed into a Unix timestamp.
 *
 * strtotime() is tried first; if it fails, the string is handed to the W3C
 * date/time parser.
 *
 * @param $date_str
 *   The date string in various formats.
 * @return
 *   The timestamp of the string or the current time if can't be parsed
 */
function _parser_common_syndication_parse_date($date_str) {
  $timestamp = strtotime($date_str);

  // Older PHP versions signal failure with -1 instead of FALSE.
  $unparsed = ($timestamp === FALSE || $timestamp == -1);
  if ($unparsed) {
    $timestamp = _parser_common_syndication_parse_w3cdtf($date_str);
  }

  if ($timestamp === FALSE) {
    return time();
  }
  return $timestamp;
}
688
/**
 * Parse the W3C date/time format, a subset of ISO 8601.
 *
 * Handles YYYY-MM-DDThh:mm, optionally followed by :ss, fractional seconds
 * and a time zone designator (Z or +hh:mm / -hh:mm). A missing designator is
 * treated as GMT.
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function _parser_common_syndication_parse_w3cdtf($date_str) {
  // Groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 seconds,
  // 7 offset sign, 8 offset hours, 9 offset minutes, 10 "Z".
  $pattern = '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(?::(\d{2})(?:\.\d+)?)?(?:([-+])(\d{2}):?(\d{2})|(Z))?/';
  if (!preg_match($pattern, (string) $date_str, $match)) {
    return FALSE;
  }

  // Trailing groups that did not take part in the match are absent from
  // $match, so every optional group is read defensively.
  $seconds = (isset($match[6]) && $match[6] !== '') ? (int) $match[6] : 0;

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

  // Z is zulu time, aka GMT: no correction. Otherwise apply the offset.
  if (isset($match[7]) && $match[7] !== '') {
    $tz_hour = isset($match[8]) ? (int) $match[8] : 0;
    $tz_min = isset($match[9]) ? (int) $match[9] : 0;
    $offset_secs = (($tz_hour * 60) + $tz_min) * 60;
    // Is timezone ahead of GMT? If yes, subtract offset.
    if ($match[7] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }
  return $epoch;
}
728
/**
 * Extract the link that points to the original content (back to site or original article)
 *
 * The first link whose relation is "alternate" wins. A link without a rel
 * attribute counts as "alternate", as required by RFC 4287. When there is no
 * alternate link, the href of the last link that has one is returned.
 *
 * @param $links
 *   Array of SimpleXML objects
 * @return
 *   The URL as a string, or an empty string when no link carries an href.
 */
function _parser_common_syndication_link($links) {
  $to_link = '';
  if (!is_array($links) && !($links instanceof Traversable)) {
    return $to_link;
  }

  foreach ($links as $link) {
    $attributes = $link->attributes();
    // A link without a target must not wipe out an URL found earlier.
    if (!isset($attributes["href"])) {
      continue;
    }
    $to_link = (string) $attributes["href"];
    $rel = isset($attributes["rel"]) ? (string) $attributes["rel"] : 'alternate';
    if ($rel == 'alternate') {
      break;
    }
  }
  return $to_link;
}
750
/**
 * Prepare raw data to be a title
 *
 * @param $title
 *   The title found in the feed; returned unchanged when it is not empty.
 * @param $body
 *   Optional text used to build a title when $title is empty.
 * @return
 *   $title, or - when $title is empty and $body is not - up to the first
 *   three words of $body joined by single spaces.
 */
function _parser_common_syndication_title($title, $body = FALSE) {
  if (empty($title) && !empty($body)) {
    // Split on whitespace and commas, ignoring empty pieces, and use at most
    // the first 3 words. A limit of 4 keeps the rest of the body in one
    // unused piece.
    $words = preg_split("/[\s,]+/", $body, 4, PREG_SPLIT_NO_EMPTY);
    $title = implode(' ', array_slice($words, 0, 3));
  }
  return $title;
}

  ViewVC Help
Powered by ViewVC 1.1.2