Stripping CVS keywords
[project/feeds.git] / libraries / http_request.inc
1 <?php
2
3 /**
4 * @file
5 * Download via HTTP.
6 *
7 * Support caching, HTTP Basic Authentication, detection of RSS/Atom feeds,
8 * redirects.
9 */
10
/**
 * PCRE for finding the link tags in html.
 *
 * The pattern matches the literal "<link" followed by zero or more attributes
 * and the end of the tag. Capture groups:
 * - 1: the raw attribute string, including its leading whitespace. Attribute
 *   values may be double-quoted, single-quoted or unquoted.
 * - 2: the tail of the tag, which is either ">" + content + "</link>" or an
 *   optional "/" followed by ">".
 * - 3: the content between ">" and "</link>", when that form matched.
 * - 4: the "/" of a self-closing tag, when present.
 *
 * The class [\x09\x0A\x0B\x0C\x0D\x20] used throughout is the whitespace set
 * tab, LF, VT, FF, CR and space. Modifiers: "s" lets "." span newlines and
 * "i" makes the match case-insensitive.
 */
define('HTTP_REQUEST_PCRE_LINK_TAG', '/<link((?:[\x09\x0A\x0B\x0C\x0D\x20]+[^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?)*)[\x09\x0A\x0B\x0C\x0D\x20]*(>(.*)<\/link>|(\/)?>)/si');
15
/**
 * PCRE for matching all the attributes in a tag.
 *
 * Each match is one attribute preceded by whitespace. Capture groups:
 * - 1: the attribute name.
 * - 2: the value when it is written in double quotes.
 * - 3: the value when it is written in single quotes.
 * - 4: the value when it is written without quotes.
 *
 * The value part is optional, and because groups 2 to 4 are alternatives at
 * most one of them is non-empty for a given match. The pattern has no
 * modifiers, so matching is case-sensitive.
 */
define('HTTP_REQUEST_PCRE_TAG_ATTRIBUTES', '/[\x09\x0A\x0B\x0C\x0D\x20]+([^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*)(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"([^"]*)"|\'([^\']*)\'|([^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?/');
20
/**
 * Exception type used to report a failed cURL transfer.
 *
 * Adds nothing to the base Exception; it exists so that callers can catch
 * cURL failures separately from other exceptions.
 */
class HRCurlException extends Exception {
}
25
/**
 * Discovers the RSS or Atom feed behind a URL.
 *
 * Downloads $url. If the response itself looks like a feed, $url is returned.
 * Otherwise the body is treated as HTML and searched for feed <link> tags;
 * the first one that can be turned into an absolute URL is returned.
 *
 * @param $url
 *   The URL to inspect. Credentials embedded in an absolute URL
 *   (scheme://user:pass@host/) are used for HTTP authentication.
 * @param $settings
 *   Optional array of settings. The only key read here is
 *   'accept_invalid_cert', which is passed on to http_request_get().
 *
 * @return
 *   string - the discovered feed URL, FALSE - if the URL is not reachable or
 *   there are no feeds.
 */
function http_request_get_common_syndication($url, $settings = NULL) {
  // Initialise the credentials up front so they are defined even when $url is
  // not a valid absolute URL and the block below is skipped.
  $username = $password = NULL;
  if (valid_url($url, TRUE)) {
    // Handle password protected feeds.
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      $username = $url_parts['user'];
      // A URL may carry a user name without a password.
      $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
    }
  }

  $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
  $download = http_request_get($url, $username, $password, $accept_invalid_cert);

  // Cannot get the feed, return.
  // http_request_get() always returns 200 even if its 304.
  if ($download->code != 200) {
    return FALSE;
  }

  // Work on a copy of the body so that nothing done here can affect the
  // object held in http_request_get()'s static cache.
  $downloaded_string = $download->data;

  // HTTP header names are case-insensitive and the response may not carry a
  // Content-Type at all, so look it up defensively.
  $content_type = '';
  if (!empty($download->headers) && is_array($download->headers)) {
    foreach ($download->headers as $name => $value) {
      if (strcasecmp($name, 'Content-Type') == 0) {
        $content_type = $value;
        break;
      }
    }
  }

  // If this happens to be a feed then just return the url.
  if (http_request_is_feed($content_type, $downloaded_string)) {
    return $url;
  }

  $discovered_feeds = http_request_find_feeds($downloaded_string);
  foreach ($discovered_feeds as $feed_url) {
    $absolute = http_request_create_absolute_url($feed_url, $url);
    if (!empty($absolute)) {
      // @TODO: something more intelligent?
      return $absolute;
    }
  }

  // Nothing usable was discovered; return FALSE as documented.
  return FALSE;
}
72
/**
 * Get the content from the given URL.
 *
 * Results are cached twice: in a static variable for the duration of the
 * current request, and through cache_set() so that later requests can send
 * "If-None-Match" / "If-Modified-Since" and reuse the stored copy on a 304.
 * The transfer uses cURL when http_request_use_curl() allows it and
 * drupal_http_request() otherwise.
 *
 * @param $url
 *   A valid URL (not only web URLs).
 * @param $username
 *   If the URL use authentication, here you can supply the username for this.
 * @param $password
 *   If the URL use authentication, here you can supply the password for this.
 * @param $accept_invalid_cert
 *   If TRUE, peer certificate verification is switched off for the cURL
 *   transfer. Defaults to FALSE.
 *
 * @return
 *   A stdClass object that describes the data downloaded from $url. The
 *   object's data property contains the actual document at the URL, code the
 *   HTTP status (or -1003 for a scheme cURL is not allowed to fetch) and
 *   headers the response headers. headers always has 'ETag' and
 *   'Last-Modified' keys. from_cache is set to TRUE when a stored copy is
 *   returned after a 304.
 *
 * @throws HRCurlException
 *   When cURL reports an error for the transfer.
 */
function http_request_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE) {
  // Intra-page download cache: avoid downloading the same content twice within
  // one page request.
  static $download_cache = array();
  if (isset($download_cache[$url])) {
    return $download_cache[$url];
  }
  $curl = http_request_use_curl();
  $cid = 'feeds_http_download_' . md5($url);

  // Only download and parse data if really needs refresh.
  // Based on "Last-Modified" and "If-Modified-Since".
  // cURL expects a list of "Name: value" strings, drupal_http_request() an
  // array keyed by header name; hence the two shapes below.
  $headers = array();
  if ($cache = cache_get($cid)) {
    $last_result = $cache->data;
    $last_headers = $last_result->headers;

    if (!empty($last_headers['ETag'])) {
      if ($curl) {
        $headers[] = 'If-None-Match: ' . $last_headers['ETag'];
      }
      else {
        $headers['If-None-Match'] = $last_headers['ETag'];
      }
    }
    if (!empty($last_headers['Last-Modified'])) {
      if ($curl) {
        $headers[] = 'If-Modified-Since: ' . $last_headers['Last-Modified'];
      }
      else {
        $headers['If-Modified-Since'] = $last_headers['Last-Modified'];
      }
    }
  }

  // Credentials have to accompany every request, not only those made while a
  // cache entry exists; otherwise the very first download of a protected feed
  // is sent unauthenticated. cURL gets them through CURLOPT_USERPWD below.
  if (!empty($username) && !$curl) {
    $headers['Authorization'] = 'Basic ' . base64_encode("$username:$password");
  }

  if ($curl) {
    $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';
    $result = new stdClass();

    // Only download via cURL if we can validate the scheme to be either http or
    // https.
    // Validate in PHP, CURLOPT_PROTOCOLS is only supported with cURL 7.19.4
    $uri = parse_url($url);
    if (isset($uri['scheme']) && $uri['scheme'] != 'http' && $uri['scheme'] != 'https') {
      $result->error = 'invalid schema ' . $uri['scheme'];
      // This corresponds to drupal_http_request()
      $result->code = -1003;
    }
    else {
      $download = curl_init($url);
      curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
      if (!empty($username)) {
        curl_setopt($download, CURLOPT_USERPWD, "{$username}:{$password}");
      }
      curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
      curl_setopt($download, CURLOPT_HEADER, TRUE);
      curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
      curl_setopt($download, CURLOPT_ENCODING, '');
      curl_setopt($download, CURLOPT_TIMEOUT, variable_get('http_request_timeout', 15));
      if ($accept_invalid_cert) {
        curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
      }

      $data = curl_exec($download);
      $error = curl_error($download);
      if ($data === FALSE || $error) {
        // Read the error number and release the handle before throwing so the
        // handle is not leaked.
        $errno = curl_errno($download);
        curl_close($download);
        throw new HRCurlException(t('cURL error (@code) @error for @url', array('@code' => $errno, '@error' => $error, '@url' => $url)), $errno);
      }
      $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
      $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);
      curl_close($download);

      $raw_header = substr($data, 0, $header_size);
      $result->data = substr($data, $header_size);

      // With CURLOPT_FOLLOWLOCATION the output holds one header block per
      // response in the redirect chain, separated by blank lines. Only the
      // last block describes the body that was actually returned.
      $header_blocks = preg_split("/(?:\r?\n){2,}/", trim($raw_header), -1, PREG_SPLIT_NO_EMPTY);
      $header_lines = $header_blocks ? preg_split("/\r\n|\n|\r/", end($header_blocks)) : array();

      $result->headers = array();
      // Skip the HTTP response status line.
      array_shift($header_lines);
      foreach ($header_lines as $line) {
        $line = trim($line);
        // Ignore anything that is not a "Name: value" pair.
        if ($line === '' || strpos($line, ':') === FALSE) {
          continue;
        }
        list($name, $value) = explode(':', $line, 2);
        if (isset($result->headers[$name]) && $name == 'Set-Cookie') {
          // RFC 2109: the Set-Cookie response header comprises the token Set-
          // Cookie:, followed by a comma-separated list of one or more cookies.
          $result->headers[$name] .= ',' . trim($value);
        }
        else {
          $result->headers[$name] = trim($value);
        }
      }
    }
  }
  else {
    $result = drupal_http_request($url, array('headers' => $headers));
  }

  $result->code = isset($result->code) ? $result->code : 200;

  // In case of 304 Not Modified try to return cached data.
  if ($result->code == 304) {
    if (isset($last_result)) {
      $last_result->from_cache = TRUE;
      return $last_result;
    }
    else {
      // It's a tragedy, this file must exist and contain good data.
      // In this case, clear cache and repeat, keeping the caller's certificate
      // setting for the second attempt.
      cache_clear_all($cid, 'cache');
      return http_request_get($url, $username, $password, $accept_invalid_cert);
    }
  }

  // Guarantee the two validator headers exist under their canonical names.
  // Header names are case-insensitive, so a value that arrived as e.g. "etag"
  // is copied across; otherwise the conditional request above would never be
  // sent for that server.
  if (!isset($result->headers)) {
    $result->headers = array();
  }
  foreach (array('ETag', 'Last-Modified') as $canonical) {
    if (!isset($result->headers[$canonical])) {
      $result->headers[$canonical] = '';
      foreach ($result->headers as $name => $value) {
        if ($name !== $canonical && strcasecmp($name, $canonical) == 0) {
          $result->headers[$canonical] = $value;
          break;
        }
      }
    }
  }

  // Set caches.
  cache_set($cid, $result);
  $download_cache[$url] = $result;

  return $result;
}
212
/**
 * Tells whether downloads should go through cURL.
 *
 * cURL is used only when the extension is loaded and neither safe_mode nor
 * open_basedir is in effect (presumably because cURL cannot follow redirects
 * under those settings - TODO confirm).
 *
 * @return
 *   TRUE if curl is available, FALSE otherwise.
 */
function http_request_use_curl() {
  if (!function_exists('curl_init')) {
    return FALSE;
  }
  if (ini_get('safe_mode')) {
    return FALSE;
  }
  $open_basedir = ini_get("open_basedir");
  return empty($open_basedir);
}
223
/**
 * Removes the stored download for one URL from the 'cache' bin.
 *
 * @param $url
 *   The URL whose cached download should be dropped. The cache ID is
 *   'feeds_http_download_' followed by the MD5 hash of this URL.
 */
function http_request_clear_cache($url) {
  $cid = 'feeds_http_download_'. md5($url);
  cache_clear_all($cid, 'cache');
}
230
/**
 * Decides from the Content-Type whether a response is a feed.
 *
 * Only the media type is considered: anything from the first ';' onwards
 * (e.g. a charset parameter) is ignored, and the comparison is
 * case-insensitive. Any media type containing "xml" counts as a feed.
 *
 * @param string $content_type
 *   The Content-Type header.
 *
 * @param string $data
 *   The actual data from the http request. Currently not inspected.
 *
 * @return boolean
 *   Returns TRUE if this is a parsable feed.
 */
function http_request_is_feed($content_type, $data) {
  // Keep the part before the first ';', i.e. the bare media type.
  $pieces = explode(';', $content_type, 2);
  $media_type = strtolower($pieces[0]);

  // @TODO: Sometimes the content-type can be text/html but still be a valid
  // feed.
  return strpos($media_type, 'xml') !== FALSE;
}
257
/**
 * Finds potential feed tags in the HTML document.
 *
 * A <link> tag qualifies when its rel attribute is "alternate" and its type
 * attribute contains "xml" (both compared case-insensitively) and it has an
 * href. Attribute values may be double-quoted, single-quoted or unquoted.
 *
 * @param string $html
 *   The html string to search.
 *
 * @return array()
 *   An array of href to feeds, entity-decoded and in their original letter
 *   case, in document order.
 */
function http_request_find_feeds($html) {
  $matches = array();
  preg_match_all(HTTP_REQUEST_PCRE_LINK_TAG, $html, $matches);
  // Group 1 holds the raw attribute string of every <link> tag. It is missing
  // if the PCRE call failed.
  $links = isset($matches[1]) ? $matches[1] : array();
  $valid_links = array();

  // Build up all the links information.
  foreach ($links as $link_tag) {
    $attributes = array();
    $candidate = array();

    preg_match_all(HTTP_REQUEST_PCRE_TAG_ATTRIBUTES, $link_tag, $attributes, PREG_SET_ORDER);
    foreach ($attributes as $attribute) {
      // $attribute[1] is the attribute name. The value is in group 2, 3 or 4
      // depending on whether it was double-quoted, single-quoted or unquoted;
      // unmatched trailing groups are absent from the match.
      $value = '';
      foreach (array(2, 3, 4) as $group) {
        if (isset($attribute[$group]) && $attribute[$group] !== '') {
          $value = $attribute[$group];
          break;
        }
      }
      if (!empty($attribute[1]) && $value !== '') {
        // Only the name is lower-cased here. The value keeps its case because
        // an href is a URL, and URL paths and queries are case-sensitive.
        $candidate[drupal_strtolower($attribute[1])] = decode_entities($value);
      }
    }

    // Examine candidate to see if it s a feed.
    // @TODO: could/should use http_request_is_feed ??
    if (isset($candidate['rel']) && drupal_strtolower($candidate['rel']) == 'alternate') {
      if (isset($candidate['href']) && isset($candidate['type']) && strpos(drupal_strtolower($candidate['type']), 'xml') !== FALSE) {
        // All tests pass, its a valid candidate.
        $valid_links[] = $candidate['href'];
      }
    }
  }

  return $valid_links;
}
300
/**
 * Create an absolute url.
 *
 * An already absolute $url is returned unchanged. A relative $url is resolved
 * against $base_url: scheme, credentials, host and port come from the base,
 * "." and ".." segments are collapsed, and a query string or fragment on $url
 * is carried over untouched.
 *
 * @param string $url
 *   The href to transform.
 *
 * @param $base_url
 *   The url to be used as the base for a relative $url.
 *
 * @return string
 *   an absolute url, or FALSE if $url is not valid or the result does not
 *   validate as an absolute URL.
 */
function http_request_create_absolute_url($url, $base_url) {
  $url = trim($url);
  if (valid_url($url, TRUE)) {
    // Valid absolute url already.
    return $url;
  }

  // Only relative urls that validate can be turned into absolute ones.
  if (!valid_url($url, FALSE)) {
    return FALSE;
  }

  $parsed_url = parse_url($base_url);
  if (!is_array($parsed_url)) {
    // The base is too malformed to resolve against.
    return FALSE;
  }

  // A scheme-relative reference ("//host/path") names its own host and only
  // borrows the scheme of the base.
  if (substr($url, 0, 2) == '//' && isset($parsed_url['scheme'])) {
    $absolute_url = $parsed_url['scheme'] . ':' . $url;
    return valid_url($absolute_url, TRUE) ? $absolute_url : FALSE;
  }

  // Split a query string and/or fragment off the reference so that the dot
  // segment handling below only ever touches the path.
  $path_length = strcspn($url, '?#');
  $suffix = (string) substr($url, $path_length);
  $reference_path = (string) substr($url, 0, $path_length);
  $base_path = isset($parsed_url['path']) ? $parsed_url['path'] : '/';

  if ($reference_path === '') {
    // Query-only or fragment-only reference: the base path is kept, and a
    // fragment-only reference keeps the base query as well.
    $merged = $base_path;
    if ($suffix !== '' && $suffix[0] == '#' && isset($parsed_url['query'])) {
      $suffix = '?' . $parsed_url['query'] . $suffix;
    }
  }
  elseif ($reference_path[0] == '/') {
    // Path-absolute reference: replaces the base path entirely.
    $merged = $reference_path;
  }
  else {
    // Relative path: resolve against the directory of the base, i.e.
    // everything up to and including its last "/". dirname() is not used
    // because it would also strip the last directory of a base ending in "/".
    $slash = strrpos($base_path, '/');
    $directory = ($slash === FALSE) ? '' : substr($base_path, 0, $slash + 1);
    $merged = $directory . $reference_path;
  }

  // Collapse "." and ".." with a stack so that consecutive ".." segments each
  // remove one real directory. Empty segments are dropped; a "0" segment is a
  // legitimate path component and is kept.
  $raw_segments = explode('/', $merged);
  $last_segment = end($raw_segments);
  $segments = array();
  foreach ($raw_segments as $segment) {
    if ($segment === '' || $segment === '.') {
      continue;
    }
    if ($segment === '..') {
      array_pop($segments);
      continue;
    }
    $segments[] = $segment;
  }
  $path = implode('/', $segments);
  // Keep a trailing slash: "/feed/" and "/feed" may be different resources.
  if ($path !== '' && in_array($last_segment, array('', '.', '..'), TRUE)) {
    $path .= '/';
  }

  // Build the prefix to the path.
  $absolute_url = '';
  if (isset($parsed_url['scheme'])) {
    $absolute_url = $parsed_url['scheme'] . '://';
  }

  if (isset($parsed_url['user'])) {
    $absolute_url .= $parsed_url['user'];
    if (isset($parsed_url['pass'])) {
      $absolute_url .= ':' . $parsed_url['pass'];
    }
    $absolute_url .= '@';
  }
  if (isset($parsed_url['host'])) {
    $absolute_url .= $parsed_url['host'];
    if (isset($parsed_url['port'])) {
      $absolute_url .= ':' . $parsed_url['port'];
    }
    $absolute_url .= '/';
  }

  $absolute_url .= $path . $suffix;

  if (valid_url($absolute_url, TRUE)) {
    return $absolute_url;
  }
  return FALSE;
}