/[drupal]/contributions/modules/feedapi/parser_common_syndication/parser_common_syndication.module
ViewVC logotype

Contents of /contributions/modules/feedapi/parser_common_syndication/parser_common_syndication.module

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.6 - (show annotations) (download) (as text)
Mon Jul 23 17:27:08 2007 UTC (2 years, 4 months ago) by aronnovak
Branch: MAIN
CVS Tags: DRUPAL-5--0-1, HEAD
Branch point for: DRUPAL-5
Changes since 1.5: +5 -5 lines
File MIME type: text/x-php
Fixing the flow of creating the feed. $feed - $url parameter swap was fixed.
1 <?php
2 /* $Id: parser_common_syndication.module,v 1.5 2007/07/23 15:40:20 alexb Exp $ */
3
4 /**
5 * @file
6 * Parse the incoming URL with SimpleXML then provide a data structure of the feed
7 * Require PHP5 because of SimpleXML
8 */
9
/**
 * Implementation of hook_help().
 *
 * @param $section
 *   The help section that is being requested.
 * @return
 *   The translated module description when the module listing description is
 *   requested; nothing for every other section.
 */
function parser_common_syndication_help($section) {
  if ($section == 'admin/modules#description') {
    return t('Provide a common syndication parser for FeedAPI-compatible modules');
  }
}
20
/**
 * Implementation of hook_feedapi_compatible().
 *
 * Downloads the feed, parses it with SimpleXML and checks whether the result
 * is one of the formats this module recognizes.
 *
 * @param $url
 *   The feed's url
 * @return
 *   a string - feed type if the parser is able to process it, FALSE if it's
 *   not compatible (SimpleXML is missing, the download failed or the format
 *   was not recognized)
 */
function parser_common_syndication_feedapi_compatible($url) {
  if (!function_exists('simplexml_load_string')) {
    return FALSE;
  }

  $downloaded_string = _parser_common_syndication_download($url);
  // Nothing could be downloaded, so there is nothing to detect.
  if ($downloaded_string == FALSE) {
    return FALSE;
  }

  // The libxml option flags are only passed when libxml constants exist and
  // PHP is at least 5.1.0.
  if (!defined('LIBXML_VERSION') || version_compare(phpversion(), '5.1.0', '<')) {
    $xml = @simplexml_load_string($downloaded_string, NULL);
  }
  else {
    $xml = @simplexml_load_string($downloaded_string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING);
  }

  if (_parser_common_syndication_feed_format_detect($xml) != FALSE) {
    // We don't have to choose between the types, because this module is only
    // able to parse one. array_shift() takes its argument by reference, so the
    // list has to live in a variable first.
    $types = parser_common_syndication_feedapi_type();
    return array_shift($types);
  }
  return FALSE;
}
46
/**
 * Implementation of hook_feedapi_parse().
 *
 * @param $feed
 *   The feed object; its url property is the address that gets downloaded.
 * @return stdClass
 *   The structured data extracted from the feed, or FALSE when SimpleXML is
 *   not available, the feed has no url, the download failed, the XML is
 *   malformed or the format is not recognized.
 */
function parser_common_syndication_feedapi_parse($feed) {
  // Same availability check as hook_feedapi_compatible(): without SimpleXML
  // the calls below would be fatal errors.
  if (!function_exists('simplexml_load_string')) {
    return FALSE;
  }
  if (!is_object($feed) || empty($feed->url)) {
    return FALSE;
  }

  $downloaded_string = _parser_common_syndication_download($feed->url);
  if ($downloaded_string == FALSE) {
    return FALSE;
  }

  // The libxml option flags are only passed when libxml constants exist and
  // PHP is at least 5.1.0.
  if (!defined('LIBXML_VERSION') || version_compare(phpversion(), '5.1.0', '<')) {
    $xml = @simplexml_load_string($downloaded_string, NULL);
  }
  else {
    $xml = @simplexml_load_string($downloaded_string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING);
  }

  // We got a malformed XML
  if ($xml === FALSE || $xml == NULL) {
    return FALSE;
  }

  $feed_type = _parser_common_syndication_feed_format_detect($xml);
  if ($feed_type == "atom1.0") {
    return _parser_common_syndication_atom10_parse($xml);
  }
  // The RSS 0.91 and 0.92 formats are handled by the RSS 2.0 parser.
  if ($feed_type == "RSS2.0" || $feed_type == "RSS0.91" || $feed_type == "RSS0.92") {
    return _parser_common_syndication_RSS20_parse($xml);
  }
  if ($feed_type == "RDF") {
    return _parser_common_syndication_RDF10_parse($xml);
  }
  return FALSE;
}
85
/**
 * Implementation of hook_feedapi_type().
 *
 * Lists the feed types this module is able to handle.
 *
 * @return
 *   An array of type names; this module handles exactly one.
 */
function parser_common_syndication_feedapi_type() {
  $supported_types = array();
  $supported_types[] = "XML feed";
  return $supported_types;
}
96
/**
 * Work out which syndication format a SimpleXML document is written in.
 *
 * @param $xml
 *   SimpleXML-preprocessed feed
 * @return
 *   One of "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92" or "RDF"; FALSE when
 *   $xml is not an object or the format is not recognized.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }

  $root_name = strtolower($xml->getName());

  // Atom: a <feed> root that contains at least one <entry>.
  if ($root_name == "feed" && isset($xml->entry)) {
    return "atom1.0";
  }

  // RSS: the version attribute of the <rss> root decides the flavour.
  if ($root_name == "rss") {
    $root_attributes = $xml->attributes();
    $version = $root_attributes["version"];
    $known_versions = array(
      "2.0" => "RSS2.0",
      "0.91" => "RSS0.91",
      "0.92" => "RSS0.92",
    );
    foreach ($known_versions as $number => $format) {
      if ($version == $number) {
        return $format;
      }
    }
    return FALSE;
  }

  // RDF: an <RDF> root that contains a <channel>.
  if ($root_name == "rdf" && isset($xml->channel)) {
    return "RDF";
  }

  return FALSE;
}
128
/**
 * Call one of the possible feedapi_get hook and pass back the downloaded data
 *
 * The first module implementing hook_feedapi_get() whose hook_feedapi_type()
 * list shares a type with this parser is used; no other module is tried
 * afterwards.
 *
 * @param $url
 *   The address that is handed to the downloader module.
 * @return
 *   string - the downloaded data, FALSE - if the URL is not reachable or no
 *   suitable downloader exists
 */
function _parser_common_syndication_download($url) {
  $downloaded_string = "";
  $this_types = parser_common_syndication_feedapi_type();

  // Pick one module that is able to download this
  foreach (module_implements("feedapi_get") as $concrete_module) {
    $types = module_invoke($concrete_module, "feedapi_type");
    // A downloader that does not declare its types cannot be matched; skip it
    // instead of handing a non-array to array_intersect().
    if (!is_array($types)) {
      continue;
    }
    // If the downloader can get this type of content
    if (count(array_intersect($this_types, $types)) > 0) {
      $downloaded_string = module_invoke($concrete_module, "feedapi_get", $url);
      break;
    }
  }

  // Cannot get the feed, pass the problem to one level upper
  if ($downloaded_string == "") {
    return FALSE;
  }
  return $downloaded_string;
}
156
/**
 * Serialize an Atom element: its child elements as XML followed by its text.
 *
 * @param $element
 *   The SimpleXML element (content or summary) to serialize.
 * @return
 *   The concatenated markup as a string.
 */
function _parser_common_syndication_atom10_markup($element) {
  $markup = '';
  foreach ($element->children() as $child) {
    $markup .= $child->asXML();
  }
  return $markup . (string) $element;
}

/**
 * Parse atom feeds
 *
 * @param $feed_XML
 *   SimpleXML-preprocessed Atom feed.
 * @return stdClass
 *   The feed with title, description, options->link and an items array. Each
 *   item has title, description and options (teaser, original_author,
 *   timestamp, original_url, guid, tags).
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->title) ? (string) $feed_XML->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->subtitle) ? (string) $feed_XML->subtitle : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = "";
  if (count($feed_XML->link) > 0) {
    $link = $feed_XML->link;
    $link = $link->attributes();
    $parsed_source->options->link = isset($link["href"]) ? (string) $link["href"] : "";
  }

  $parsed_source->items = array();

  foreach ($feed_XML->entry as $news) {
    // Reset everything per entry so that nothing leaks over from the previous
    // entry when an element is missing.
    $original_url = NULL;
    $body = '';
    $original_author = '';
    $tags = array();

    $guid = $news->id ? "{$news->id}" : NULL;

    // I don't know how standard this is, but sometimes the id is the URL
    if ($guid !== NULL && valid_url($guid, TRUE)) {
      $original_url = $guid;
    }

    if ($news->category) {
      foreach ($news->category as $category) {
        $tags[] = "{$category['term']}";
      }
    }

    $title = "{$news->title}";

    // The body comes from the content, or from the summary when there is no
    // content.
    if ($news->content) {
      $body = _parser_common_syndication_atom10_markup($news->content);
    }
    else if ($news->summary) {
      $body = _parser_common_syndication_atom10_markup($news->summary);
    }

    if ($news->content['src']) {
      // some src elements in some valid atom feeds contained no urls at all
      if (valid_url("{$news->content['src']}", TRUE)) {
        $original_url = "{$news->content['src']}";
      }
    }

    if ($news->summary) {
      $teaser = _parser_common_syndication_atom10_markup($news->summary);
    }
    else {
      $teaser = node_teaser($body);
    }

    // Author precedence: the entry's source, the entry itself, then the feed.
    if ($news->source->author->name) {
      $original_author = "{$news->source->author->name}";
    }
    else if ($news->author->name) {
      $original_author = "{$news->author->name}";
    }
    else if ($feed_XML->author->name) {
      $original_author = "{$feed_XML->author->name}";
    }

    // The entry's own link wins over the id and the content src.
    if ($news->link['href'] && valid_url("{$news->link['href']}", TRUE)) {
      $original_url = "{$news->link['href']}";
    }

    // published is optional in Atom while updated is mandatory, so fall back
    // to updated before giving up and using the current time.
    $timestamp = FALSE;
    foreach (array('published', 'updated') as $date_element) {
      if ($news->$date_element) {
        $timestamp = strtotime((string) $news->$date_element);
        if ($timestamp !== FALSE) {
          break;
        }
      }
    }
    if ($timestamp === FALSE) {
      $timestamp = time();
    }

    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    // An empty array when the entry has no categories.
    $item->options->tags = $tags;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}
274
/**
 * Read one child value of an RDF item as a string.
 *
 * @param $elements
 *   The children of an item cast to an array.
 * @param $name
 *   The element name to look up.
 * @return
 *   The value as a string; the first value when the element is repeated; an
 *   empty string when the element is missing.
 */
function _parser_common_syndication_RDF10_value($elements, $name) {
  if (!isset($elements[$name])) {
    return '';
  }
  $value = $elements[$name];
  if (is_array($value)) {
    $value = reset($value);
  }
  return (string) $value;
}

/**
 * Parse RSS1.0/RDF feeds
 *
 * @param $feed_XML
 *   SimpleXML-preprocessed RDF feed.
 * @return stdClass
 *   The feed with title, description, options->link and an items array. Each
 *   item has title, description and options (teaser, original_author,
 *   timestamp, original_url, guid, tags).
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->channel->description) ? (string) $feed_XML->channel->description : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = isset($feed_XML->channel->link) ? (string) $feed_XML->channel->link : "";
  $parsed_source->items = array();

  // set category splitter (space is for del.icio.us feed)
  $category_splitter = ' ';

  // The channel title is the default original author; empty when the channel
  // has no title.
  $oa = $feed_XML->channel->title ? (string) $feed_XML->channel->title : '';

  // versions prior 5.1.2 don't allow namespaces
  $namespace_support = !version_compare(phpversion(), '5.1.2', '<');
  if ($namespace_support) {
    $namespaces = $feed_XML->getNamespaces(TRUE);
  }
  else {
    $namespaces = array('default' => NULL);
  }

  foreach ($feed_XML->item as $news) {
    // initialization
    $guid = $original_url = NULL;
    $title = $body = '';
    $timestamp = time();
    $categories = array();
    $original_author = $oa;

    foreach ($namespaces as $ns_link) {
      // get about attribute as guid; stored as a plain string rather than as
      // a SimpleXML object
      foreach ($news->attributes($ns_link) as $name => $value) {
        if ($name == 'about') {
          $guid = (string) $value;
        }
      }

      // get children for current namespace
      if ($namespace_support) {
        $ns = (array) $news->children($ns_link);
      }
      else {
        $ns = (array) $news;
      }

      // title
      $value = _parser_common_syndication_RDF10_value($ns, 'title');
      if ($value !== '') {
        $title = $value;
      }

      // description or dc:description, only while no body has been found yet
      $value = _parser_common_syndication_RDF10_value($ns, 'description');
      if ($value !== '' && $body == '') {
        $body = $value;
      }

      // link
      $value = _parser_common_syndication_RDF10_value($ns, 'link');
      if ($value !== '') {
        $original_url = $value;
      }

      // dc:creator
      $value = _parser_common_syndication_RDF10_value($ns, 'creator');
      if ($value !== '') {
        $original_author = $value;
      }

      // dc:date; an unparsable date keeps the current time
      $value = _parser_common_syndication_RDF10_value($ns, 'date');
      if ($value !== '') {
        $parsed_date = strtotime($value);
        if ($parsed_date !== FALSE) {
          $timestamp = $parsed_date;
        }
      }

      // content:encoded always replaces the body
      $value = _parser_common_syndication_RDF10_value($ns, 'encoded');
      if ($value !== '') {
        $body = $value;
      }

      // dc:subject
      if (!empty($ns['subject'])) {
        // there can be multiple category tags
        if (is_array($ns['subject'])) {
          foreach ($ns['subject'] as $cat) {
            if (is_object($cat)) {
              $categories[] = trim(strip_tags($cat->asXML()));
            }
            else {
              $categories[] = $cat;
            }
          }
        }
        else { // or single tag
          $categories = explode($category_splitter, (string) $ns['subject']);
        }
      }
    }

    // description is not mandatory so use title if description not present
    if (!$body) {
      $body = $title;
    }

    // make teaser
    $teaser = node_teaser($body);

    // if there are no link tag but rdf:about is provided
    if (!$original_url && $guid) {
      $original_url = $guid;
    }

    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    // Categories are exposed as tags, like in the Atom and RSS 2.0 parsers;
    // an empty array when the item has none.
    $item->options->tags = $categories;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}
405
/**
 * Read one child value of an RSS item as a string.
 *
 * @param $elements
 *   The children of an item cast to an array.
 * @param $name
 *   The element name to look up.
 * @return
 *   The value as a string; the first value when the element is repeated; an
 *   empty string when the element is missing.
 */
function _parser_common_syndication_RSS20_value($elements, $name) {
  if (!isset($elements[$name])) {
    return '';
  }
  $value = $elements[$name];
  if (is_array($value)) {
    $value = reset($value);
  }
  return (string) $value;
}

/**
 * Parse RSS2.0 feeds
 *
 * @param $feed_XML
 *   SimpleXML-preprocessed RSS feed.
 * @return stdClass
 *   The feed with title, description, options->link and an items array. Each
 *   item has title, description and options (teaser, original_author,
 *   timestamp, original_url, guid, tags).
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {
  $parsed_source = new stdClass();
  // Detect the title
  $parsed_source->title = isset($feed_XML->channel->title) ? (string) $feed_XML->channel->title : "";
  // Detect the description
  $parsed_source->description = isset($feed_XML->channel->description) ? (string) $feed_XML->channel->description : "";
  $parsed_source->options = new stdClass();
  // Detect the link
  $parsed_source->options->link = isset($feed_XML->channel->link) ? (string) $feed_XML->channel->link : "";
  $parsed_source->items = array();

  // Every item is credited to the channel title; empty when the channel has
  // no title.
  $original_author = $feed_XML->channel->title ? (string) $feed_XML->channel->title : '';

  $found_items = $feed_XML->xpath('//item');
  if (!is_array($found_items)) {
    $found_items = array();
  }

  foreach ($found_items as $news) {
    // children() expects the namespace URI, not the "content" prefix.
    $content = (array) $news->children('http://purl.org/rss/1.0/modules/content/');

    $news = (array) $news;

    // Values are cast to plain strings so that no SimpleXML object ends up in
    // the returned item.
    $guid = _parser_common_syndication_RSS20_value($news, 'guid');
    if ($guid === '') {
      $guid = NULL;
    }

    $title = _parser_common_syndication_RSS20_value($news, 'title');

    $body = _parser_common_syndication_RSS20_value($news, 'description');
    // some sources use content:encoded as description i.e. PostNuke PageSetter module
    if ($body === '') {
      // content:encoded for PHP < 5.1.2
      $body = _parser_common_syndication_RSS20_value($news, 'encoded');
    }
    if ($body === '') {
      // content:encoded for PHP >= 5.1.2
      $body = _parser_common_syndication_RSS20_value($content, 'encoded');
    }
    if ($body === '') {
      $body = $title;
    }

    $teaser = node_teaser($body);

    $original_url = _parser_common_syndication_RSS20_value($news, 'link');
    if ($original_url === '') {
      $original_url = NULL;
    }

    $timestamp = strtotime(_parser_common_syndication_RSS20_value($news, 'pubDate'));
    if ($timestamp === FALSE) {
      $timestamp = time();
    }

    // Only the first category of an item is used; it is split on "/".
    $tags = array();
    if (!empty($news['category'])) {
      $tags = explode('/', _parser_common_syndication_RSS20_value($news, 'category'));
    }

    $item = new stdClass();
    $item->title = $title;
    $item->description = $body;
    $item->options = new stdClass();
    $item->options->teaser = $teaser;
    $item->options->original_author = $original_author;
    $item->options->timestamp = $timestamp;
    $item->options->original_url = $original_url;
    $item->options->guid = $guid;
    // An empty array when the item has no category.
    $item->options->tags = $tags;
    $parsed_source->items[] = $item;
  }
  return $parsed_source;
}

  ViewVC Help
Powered by ViewVC 1.1.2