/[drupal]/contributions/sandbox/alex_b/feedapi_aggregator/feedapi_aggregator_parser.inc
ViewVC logotype

Contents of /contributions/sandbox/alex_b/feedapi_aggregator/feedapi_aggregator_parser.inc

Parent Directory | Revision Log | View Revision Graph


Revision 1.3 - (show annotations) (download) (as text)
Tue Jul 24 13:58:02 2007 UTC (2 years, 4 months ago) by alexb
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +3 -2 lines
File MIME type: text/x-php
Fixed: coding standards
1 <?php
2 // $Id$
3
/**
 * Download a feed and, when it has content, parse it and store its metadata.
 *
 * Sends a conditional GET built from the feed's stored ETag / modification
 * time, then acts on the HTTP status code:
 * - 304: only reports that there is nothing new.
 * - 301: adopts the redirect URL, logs it, then continues as for 200.
 * - 200, 302, 307: parses the body; on success updates the feed's row in
 *   {aggregator_feed}, clears the cache, logs and reports the new content.
 * - anything else: logs and reports the feed as broken.
 *
 * Reads and rewrites the globals $channel and $image, which are filled in by
 * feedapi_aggregator_parse_feed().
 *
 * @param $feed
 *   Array describing the feed. Keys read here: 'url', 'title', 'fid' and,
 *   optionally, 'etag' and 'modified'.
 * @return
 *   The parsed source object returned by feedapi_aggregator_parse_feed() when
 *   the feed was fetched and parsed successfully; otherwise nothing (NULL).
 */
function feedapi_aggregator_refresh($feed) {
  global $channel, $image;

  // Generate conditional GET headers. A feed that was never fetched may not
  // carry these keys at all, hence the existence checks.
  $headers = array();
  if (!empty($feed['etag'])) {
    $headers['If-None-Match'] = $feed['etag'];
  }
  if (!empty($feed['modified'])) {
    $headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $feed['modified']) .' GMT';
  }

  // Request feed.
  $result = drupal_http_request($feed['url'], $headers);

  // Process HTTP response code.
  switch ($result->code) {
    case 304:
      // Not modified. The 'checked' timestamp is deliberately not updated
      // here; the note left in this spot says feedapi takes care of it.
      drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title'])));
      break;

    case 301:
      $feed['url'] = $result->redirect_url;
      watchdog('aggregator', t('Updated URL for feed %title to %url.', array('%title' => $feed['title'], '%url' => $feed['url'])));
      // Intentional fall-through: after recording the new URL the response is
      // processed exactly like a successful one.

    case 200:
    case 302:
    case 307:
      // Filter the input data:
      if ($parsed_source = feedapi_aggregator_parse_feed($result->data, $feed)) {
        // Default to 0 / '' so the UPDATE below never receives undefined
        // values when the server omits the validator headers.
        $modified = 0;
        if (!empty($result->headers['Last-Modified'])) {
          $modified = strtotime($result->headers['Last-Modified']);
        }
        $etag = isset($result->headers['ETag']) ? $result->headers['ETag'] : '';

        // Prepare the channel data:
        foreach ($channel as $key => $value) {
          $channel[$key] = trim($value);
        }
        $channel_link = isset($channel['LINK']) ? $channel['LINK'] : '';
        $channel_description = isset($channel['DESCRIPTION']) ? $channel['DESCRIPTION'] : '';

        // Prepare the image data (if any):
        foreach ($image as $key => $value) {
          $image[$key] = trim($value);
        }

        if (!empty($image['LINK']) && !empty($image['URL']) && !empty($image['TITLE'])) {
          // Note, we should really use theme_image() here but that only works
          // with local images; it won't work with images fetched with a URL
          // unless PHP version > 5.
          $image = '<a href="'. check_url($image['LINK']) .'" class="feed-image"><img src="'. check_url($image['URL']) .'" alt="'. check_plain($image['TITLE']) .'" /></a>';
        }
        else {
          $image = NULL;
        }

        // Update the feed data:
        db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $channel_link, $channel_description, $image, $etag, $modified, $feed['fid']);

        // Clear the cache:
        cache_clear_all();

        watchdog('aggregator', t('There is new syndicated content from %site.', array('%site' => $feed['title'])));
        drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed['title'])));

        return $parsed_source;
      }
      break;

    default:
      watchdog('aggregator', t('The feed from %site seems to be broken, due to "%error".', array('%site' => $feed['title'], '%error' => $result->code .' '. $result->error)), WATCHDOG_WARNING);
      drupal_set_message(t('The feed from %site seems to be broken, because of error "%error".', array('%site' => $feed['title'], '%error' => $result->code .' '. $result->error)));
  }
}
85
86
/**
 * Call-back function used by the XML parser: handles an opening tag.
 *
 * Keeps track of the parsing context in globals:
 * - $element: the container currently being read ('ITEM', 'IMAGE', ...).
 * - $item: index of the current item; incremented on every ITEM / ENTRY.
 * - $tag: name of the tag that was opened last.
 * For LINK tags with REL="alternate" the HREF attribute is stored as the
 * link of the current item, or of the channel when outside an item.
 *
 * @param $parser
 *   The XML parser invoking the callback (unused).
 * @param $name
 *   Tag name as reported by the parser; compared against upper-case names.
 * @param $attributes
 *   Array of the tag's attributes, keyed by attribute name.
 */
function feedapi_aggregator_element_start($parser, $name, $attributes) {
  global $item, $element, $tag, $items, $channel;

  switch ($name) {
    case 'IMAGE':
    case 'TEXTINPUT':
    case 'CONTENT':
    case 'SUMMARY':
    case 'TAGLINE':
    case 'SUBTITLE':
    case 'LOGO':
    case 'INFO':
      $element = $name;
      break;

    case 'ID':
      if ($element != 'ITEM') {
        $element = $name;
      }
      // NOTE(review): there is no break here, so an ID tag also runs the LINK
      // handling below. That only has an effect if the ID tag carries a
      // REL="alternate" attribute; the fall-through is kept as-is so existing
      // behavior does not change - confirm whether it is intended.

    case 'LINK':
      // Plain <link>text</link> tags carry no attributes at all, so both
      // attributes have to be checked for existence before they are read.
      if (isset($attributes['REL']) && $attributes['REL'] == 'alternate' && isset($attributes['HREF'])) {
        if ($element == 'ITEM') {
          $items[$item]['LINK'] = $attributes['HREF'];
        }
        else {
          $channel['LINK'] = $attributes['HREF'];
        }
      }
      break;

    case 'ITEM':
      $element = $name;
      $item += 1;
      break;

    case 'ENTRY':
      // Entries are handled exactly like items from here on.
      $element = 'ITEM';
      $item += 1;
      break;
  }

  $tag = $name;
}
130
/**
 * Call-back function used by the XML parser: handles a closing tag.
 *
 * Resets the global $element context to an empty string when one of the
 * container tags listed below is closed, or when an ID tag is closed while
 * ID is the current context. Every other closing tag leaves $element as is.
 *
 * @param $parser
 *   The XML parser invoking the callback (unused).
 * @param $name
 *   Name of the tag being closed.
 */
function feedapi_aggregator_element_end($parser, $name) {
  global $element;

  // Tags whose closing always ends the current context.
  $context_tags = array('IMAGE', 'TEXTINPUT', 'ITEM', 'ENTRY', 'CONTENT', 'INFO');

  $ends_context = in_array($name, $context_tags);
  $ends_id = ($name == 'ID' && $element == 'ID');

  if ($ends_context || $ends_id) {
    $element = '';
  }
}
152
/**
 * Call-back function used by the XML parser: handles character data.
 *
 * Appends $data to the slot selected by the global parsing context:
 * - ITEM: $items[$item][$tag]
 * - IMAGE, LOGO: $image[$tag]
 * - LINK: $items[$item][$tag], only when $data is non-empty
 * - CONTENT / SUMMARY: $items[$item]['CONTENT'] / $items[$item]['SUMMARY']
 * - TAGLINE, SUBTITLE: $channel['DESCRIPTION']
 * - INFO, ID, TEXTINPUT: ignored
 * - anything else: $channel[$tag]
 * The parser may deliver the text of one tag in several chunks, which is why
 * the data is appended rather than assigned.
 *
 * @param $parser
 *   The XML parser invoking the callback (unused).
 * @param $data
 *   The chunk of character data.
 */
function feedapi_aggregator_element_data($parser, $data) {
  global $channel, $element, $items, $item, $image, $tag;

  // Pick the destination by reference first. Binding a reference creates a
  // missing key silently, so the append at the bottom never reads an
  // undefined index on the first chunk of a tag.
  switch ($element) {
    case 'ITEM':
      $target = &$items[$item][$tag];
      break;

    case 'IMAGE':
    case 'LOGO':
      $target = &$image[$tag];
      break;

    case 'LINK':
      if (!$data) {
        return;
      }
      $target = &$items[$item][$tag];
      break;

    case 'CONTENT':
      $target = &$items[$item]['CONTENT'];
      break;

    case 'SUMMARY':
      $target = &$items[$item]['SUMMARY'];
      break;

    case 'TAGLINE':
    case 'SUBTITLE':
      $target = &$channel['DESCRIPTION'];
      break;

    case 'INFO':
    case 'ID':
    case 'TEXTINPUT':
      // The sub-element is not supported. However, we must recognize
      // it or its contents will end up in the item array.
      return;

    default:
      $target = &$channel[$tag];
  }

  $target .= $data;
}
191
/**
 * Parse the W3C date/time format, a subset of ISO 8601. PHP date parsing
 * functions do not handle this format.
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * Accepted shape: YYYY-MM-DDThh:mm, optionally followed by :ss, optionally
 * followed by a time zone given as "Z" or as +hh:mm / -hh:mm (the colon in
 * the offset is optional). A date without a time zone is treated as GMT.
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function feedapi_aggregator_parse_w3cdtf($date_str) {
  // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":ss",
  // 7 seconds, 8 offset sign, 9 offset hours, 10 offset minutes, 11 "Z".
  if (!preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', (string) $date_str, $match)) {
    return FALSE;
  }

  // preg_match() leaves trailing groups that did not take part in the match
  // out of $match altogether; pad them so every index can be read safely.
  $match += array_fill(0, 12, '');

  // Calculate the epoch for the given date assuming GMT. The seconds are in
  // group 7; group 6 still contains the leading colon.
  $epoch = gmmktime((int) $match[4], (int) $match[5], (int) $match[7], (int) $match[2], (int) $match[3], (int) $match[1]);

  // Only an explicit numeric offset shifts the result. "Z" (zulu time, aka
  // GMT) and a missing time zone both leave the sign group empty.
  if ($match[8] !== '') {
    $offset_secs = (((int) $match[9] * 60) + (int) $match[10]) * 60;
    // Is the time zone ahead of GMT? Then subtract the offset.
    if ($match[8] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }

  return $epoch;
}
228
/**
 * Parse feed XML into a feedapi "parsed source" object.
 *
 * Resets the parsing globals, runs the XML parser with the
 * feedapi_aggregator_element_* call-backs, and converts the collected channel
 * and item data into objects.
 *
 * @param $data
 *   The feed XML. Taken by reference; it is handed to the XML parser as is.
 * @param $feed
 *   Array describing the feed. 'title' is used in error messages and 'link',
 *   when present, is the fallback URL for items that have no link of their
 *   own.
 * @return
 *   0 if the XML could not be parsed. Otherwise an object with:
 *   - title, description: channel title / description ("" when absent).
 *   - options->link: channel link ("" when absent).
 *   - items: array of objects, in reverse document order, each with title,
 *     description, options->original_author, options->timestamp,
 *     options->original_url and options->guid.
 */
function feedapi_aggregator_parse_feed(&$data, $feed) {
  global $items, $image, $channel;

  // Unset the global variables before we use them:
  unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
  $items = array();
  $image = array();
  $channel = array();

  // Parse the data:
  $xml_parser = drupal_xml_parser_create($data);
  xml_set_element_handler($xml_parser, 'feedapi_aggregator_element_start', 'feedapi_aggregator_element_end');
  xml_set_character_data_handler($xml_parser, 'feedapi_aggregator_element_data');

  if (!xml_parse($xml_parser, $data, 1)) {
    // Collect the error details first, then release the parser so it is
    // freed on the failure path as well.
    $error = xml_error_string(xml_get_error_code($xml_parser));
    $line = xml_get_current_line_number($xml_parser);
    xml_parser_free($xml_parser);

    watchdog('aggregator', t('The feed from %site seems to be broken, due to an error "%error" on line %line.', array('%site' => $feed['title'], '%error' => $error, '%line' => $line)), WATCHDOG_WARNING);
    drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', array('%site' => $feed['title'], '%error' => $error, '%line' => $line)), 'error');
    return 0;
  }
  xml_parser_free($xml_parser);

  // We reverse the array such that we store the first item last, and the
  // last item first. In the database, the newest item should be at the top.
  $items = array_reverse($items);

  // Create a feedapi parsed source object.
  $parsed_source = new stdClass();
  $parsed_source->title = isset($channel['TITLE']) ? $channel['TITLE'] : "";
  $parsed_source->description = isset($channel['DESCRIPTION']) ? $channel['DESCRIPTION'] : "";
  $parsed_source->options = new stdClass();
  $parsed_source->options->link = isset($channel['LINK']) ? $channel['LINK'] : "";
  $parsed_source->items = array();

  // Item keys that may hold the publication date, in order of preference.
  $date_keys = array(
    'PUBDATE',          // RSS 2.0
    'DC:DATE',          // Dublin core
    'DCTERMS:ISSUED',   // Dublin core
    'DCTERMS:CREATED',  // Dublin core
    'DCTERMS:MODIFIED', // Dublin core
    'ISSUED',           // Atom XML
    'CREATED',          // Atom XML
    'MODIFIED',         // Atom XML
    'PUBLISHED',        // Atom XML
    'UPDATED',          // Atom XML
  );

  foreach ($items as $item) {
    // Prepare the item:
    foreach ($item as $key => $value) {
      $item[$key] = trim($value);
    }

    // Resolve the item's title. If no title is found, we use up to 40
    // characters of the description ending at a word boundary but not
    // splitting potential entities.
    if (!empty($item['TITLE'])) {
      $title = $item['TITLE'];
    }
    else {
      $title_source = isset($item['DESCRIPTION']) ? $item['DESCRIPTION'] : '';
      $title = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($title_source, 40));
    }

    // Resolve the item's link, falling back to the feed's link.
    if (!empty($item['LINK'])) {
      $link = $item['LINK'];
    }
    else {
      $link = isset($feed['link']) ? $feed['link'] : NULL;
    }

    // The GUID is optional; it stays NULL when the item has none.
    $guid = !empty($item['GUID']) ? $item['GUID'] : NULL;

    // Atom feeds have a CONTENT and/or SUMMARY tag instead of a DESCRIPTION
    // tag.
    if (!empty($item['CONTENT:ENCODED'])) {
      $item['DESCRIPTION'] = $item['CONTENT:ENCODED'];
    }
    else if (!empty($item['SUMMARY'])) {
      $item['DESCRIPTION'] = $item['SUMMARY'];
    }
    else if (!empty($item['CONTENT'])) {
      $item['DESCRIPTION'] = $item['CONTENT'];
    }

    // Try to resolve and parse the item's publication date. If no date is
    // found, we use the current date instead.
    $date = 'now';
    foreach ($date_keys as $date_key) {
      if (!empty($item[$date_key])) {
        $date = $item[$date_key];
        break;
      }
    }

    // strtotime() reports failure as FALSE (or -1 before PHP 5.1.0); both
    // are caught by this check.
    $timestamp = strtotime($date);
    if ($timestamp === FALSE || $timestamp <= 0) {
      $timestamp = feedapi_aggregator_parse_w3cdtf($date); // Returns FALSE on failure.
      if (!$timestamp) {
        $timestamp = time(); // Better than nothing.
      }
    }

    // Add item to feedapi items array.
    $parsed_item = new stdClass();
    $parsed_item->title = $title;
    $parsed_item->description = isset($item['DESCRIPTION']) ? $item['DESCRIPTION'] : NULL;
    $parsed_item->options = new stdClass();
    $parsed_item->options->original_author = isset($item['AUTHOR']) ? $item['AUTHOR'] : NULL;
    $parsed_item->options->timestamp = $timestamp;
    $parsed_item->options->original_url = $link;
    $parsed_item->options->guid = $guid;
    $parsed_source->items[] = $parsed_item;
  }

  return $parsed_source;
}

  ViewVC Help
Powered by ViewVC 1.1.2