7 * Support caching, HTTP Basic Authentication, detection of RSS/Atom feeds,
/**
 * PCRE for finding the link tags in html.
 *
 * Capture group 1 holds the raw attribute string of each <link> tag (leading
 * whitespace included). The pattern is case-insensitive and lets "." match
 * newlines.
 */
define('HTTP_REQUEST_PCRE_LINK_TAG', '/<link((?:[\x09\x0A\x0B\x0C\x0D\x20]+[^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?)*)[\x09\x0A\x0B\x0C\x0D\x20]*(>(.*)<\/link>|(\/)?>)/si');
/**
 * PCRE for matching all the attributes in a tag.
 *
 * Capture group 1 is the attribute name. The value lands in group 2 when it is
 * double-quoted, group 3 when it is single-quoted and group 4 when it is
 * unquoted.
 */
define('HTTP_REQUEST_PCRE_TAG_ATTRIBUTES', '/[\x09\x0A\x0B\x0C\x0D\x20]+([^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*)(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"([^"]*)"|\'([^\']*)\'|([^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?/');
/**
 * For cUrl specific errors.
 *
 * Thrown by http_request_get() when a cURL transfer reports an error; the
 * exception code carries the cURL error number.
 */
class HRCurlException extends Exception {}
/**
 * Discover RSS or atom feeds at the given URL.
 *
 * If the document at the given URL is an HTML document, the function attempts
 * to discover RSS or Atom feeds advertised through its <link> tags.
 *
 * @param string $url
 *   The URL to inspect. Credentials embedded in the URL (user:pass@host) are
 *   extracted and handed to http_request_get().
 * @param array $settings
 *   (optional) Supports the key 'accept_invalid_cert'; when truthy it is
 *   forwarded to http_request_get().
 *
 * @return
 *   string - the discovered feed, FALSE - if the URL is not reachable or there
 *   are no feeds.
 */
function http_request_get_common_syndication($url, $settings = NULL) {
  // Initialise up front: these are passed to http_request_get() even when the
  // URL does not validate or carries no credentials.
  $username = NULL;
  $password = NULL;

  if (valid_url($url, TRUE)) {
    // Handle password protected feeds.
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      $username = $url_parts['user'];
      // A user name may be given without a password.
      $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
    }
  }

  $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
  $download = http_request_get($url, $username, $password, $accept_invalid_cert);

  // Cannot get the feed, return.
  // http_request_get() always returns 200 even if its 304.
  if ($download->code != 200) {
    return FALSE;
  }

  // Drop the data into a separate variable so all manipulations of the html
  // will not affect the actual object that exists in the static cache.
  // @see http_request_get.
  $downloaded_string = $download->data;

  // The response is not guaranteed to carry a Content-Type header.
  $content_type = isset($download->headers['Content-Type']) ? $download->headers['Content-Type'] : '';

  // If this happens to be a feed then just return the url.
  if (http_request_is_feed($content_type, $downloaded_string)) {
    return $url;
  }

  $discovered_feeds = http_request_find_feeds($downloaded_string);
  foreach ($discovered_feeds as $feed_url) {
    $absolute = http_request_create_absolute_url($feed_url, $url);
    if (!empty($absolute)) {
      // @TODO: something more intelligent?
      return $absolute;
    }
  }

  // Nothing usable was advertised by the document.
  return FALSE;
}
/**
 * Get the content from the given URL.
 *
 * Results are memoised per URL for the duration of the request and are also
 * stored through cache_set() so that later requests can send conditional
 * headers (If-None-Match / If-Modified-Since).
 *
 * @param string $url
 *   A valid URL (not only web URLs).
 * @param string $username
 *   If the URL use authentication, here you can supply the username for this.
 * @param string $password
 *   If the URL use authentication, here you can supply the password for this.
 * @param bool $accept_invalid_cert
 *   When TRUE and cURL is in use, peer certificate verification is disabled.
 *
 * @return stdClass
 *   A stdClass object that describes the data downloaded from $url. The object's
 *   data property contains the actual document at the URL.
 *
 * @throws HRCurlException
 *   When the cURL transfer reports an error.
 */
function http_request_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE) {
  // Intra-pagedownload cache, avoid to download the same content twice within one page download (it's possible, compatible and parse calls).
  static $download_cache = array();
  if (isset($download_cache[$url])) {
    return $download_cache[$url];
  }

  $curl = http_request_use_curl();

  // Only download and parse data if really needs refresh.
  // Based on "Last-Modified" and "If-Modified-Since".
  // cURL expects a list of "Name: value" strings, drupal_http_request() an
  // associative array, hence the two notations below.
  $headers = array();
  if ($cache = cache_get('feeds_http_download_' . md5($url))) {
    $last_result = $cache->data;
    $last_headers = $last_result->headers;

    if (!empty($last_headers['ETag'])) {
      if ($curl) {
        $headers[] = 'If-None-Match: ' . $last_headers['ETag'];
      }
      else {
        $headers['If-None-Match'] = $last_headers['ETag'];
      }
    }
    if (!empty($last_headers['Last-Modified'])) {
      if ($curl) {
        $headers[] = 'If-Modified-Since: ' . $last_headers['Last-Modified'];
      }
      else {
        $headers['If-Modified-Since'] = $last_headers['Last-Modified'];
      }
    }
  }

  // cURL receives the credentials through CURLOPT_USERPWD instead. The header
  // is sent regardless of the cache state so that the very first (uncached)
  // request is authenticated as well.
  if (!empty($username) && !$curl) {
    $headers['Authorization'] = 'Basic ' . base64_encode("$username:$password");
  }

  if ($curl) {
    $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';
    $result = new stdClass();

    // Only download via cURL if we can validate the scheme to be either http or
    // https.
    // Validate in PHP, CURLOPT_PROTOCOLS is only supported with cURL 7.19.4
    $uri = parse_url($url);
    if (isset($uri['scheme']) && $uri['scheme'] != 'http' && $uri['scheme'] != 'https') {
      $result->error = 'invalid schema ' . $uri['scheme'];
      // This corresponds to drupal_http_request()
      $result->code = -1003;
    }
    else {
      $download = curl_init($url);
      curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
      if (!empty($username)) {
        curl_setopt($download, CURLOPT_USERPWD, "{$username}:{$password}");
      }
      curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
      curl_setopt($download, CURLOPT_HEADER, TRUE);
      curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
      curl_setopt($download, CURLOPT_ENCODING, '');
      curl_setopt($download, CURLOPT_TIMEOUT, variable_get('http_request_timeout', 15));
      if ($accept_invalid_cert) {
        curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
      }

      $data = curl_exec($download);
      if (curl_error($download)) {
        // Read the error details first, then release the handle so it does not
        // leak when the exception propagates.
        $curl_errno = curl_errno($download);
        $curl_error = curl_error($download);
        curl_close($download);
        throw new HRCurlException(t('cURL error (@code) @error for @url', array('@code' => $curl_errno, '@error' => $curl_error, '@url' => $url)), $curl_errno);
      }

      $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
      $raw_headers = trim((string) substr($data, 0, $header_size));
      $result->data = (string) substr($data, $header_size);

      // With CURLOPT_FOLLOWLOCATION every hop contributes its own header
      // block, separated by an empty line. Only the last block describes the
      // body that was actually returned.
      $header_blocks = preg_split("/\r\n\r\n|\n\n|\r\r/", $raw_headers);
      $header_lines = preg_split("/\r\n|\n|\r/", end($header_blocks));

      $result->headers = array();
      // Skip HTTP response status.
      array_shift($header_lines);
      foreach ($header_lines as $line) {
        $line = trim($line);
        // Ignore anything that is not a "Name: value" pair.
        if ($line === '' || strpos($line, ':') === FALSE) {
          continue;
        }
        list($name, $value) = explode(':', $line, 2);
        $value = trim($value);
        if (isset($result->headers[$name]) && $name == 'Set-Cookie') {
          // RFC 2109: the Set-Cookie response header comprises the token Set-
          // Cookie:, followed by a comma-separated list of one or more cookies.
          $result->headers[$name] .= ',' . $value;
        }
        else {
          $result->headers[$name] = $value;
        }
      }

      $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);

      curl_close($download);
    }
  }
  else {
    $result = drupal_http_request($url, array('headers' => $headers));
  }

  $result->code = isset($result->code) ? $result->code : 200;

  // In case of 304 Not Modified try to return cached data.
  if ($result->code == 304) {
    if (isset($last_result)) {
      $last_result->from_cache = TRUE;
      return $last_result;
    }
    else {
      // It's a tragedy, this file must exist and contain good data.
      // In this case, clear cache and repeat. All arguments are forwarded so
      // the retry uses the same certificate policy as the first attempt.
      cache_clear_all('feeds_http_download_' . md5($url), 'cache');
      return http_request_get($url, $username, $password, $accept_invalid_cert);
    }
  }

  // Guarantee that the headers consulted on the next conditional request exist.
  if (!isset($result->headers)) {
    $result->headers = array();
  }
  if (!isset($result->headers['ETag'])) {
    $result->headers['ETag'] = '';
  }
  if (!isset($result->headers['Last-Modified'])) {
    $result->headers['Last-Modified'] = '';
  }

  // Set caches.
  cache_set('feeds_http_download_' . md5($url), $result);
  $download_cache[$url] = $result;

  return $download_cache[$url];
}
/**
 * Decides if it's possible to use cURL or not.
 *
 * cURL is used only when the extension is loaded, safe_mode is off and no
 * open_basedir restriction is configured.
 *
 * @return bool
 *   TRUE if curl is available, FALSE otherwise.
 */
function http_request_use_curl() {
  $basedir = ini_get("open_basedir");
  return function_exists('curl_init') && !ini_get('safe_mode') && empty($basedir);
}
/**
 * Clear cache for a specific URL.
 *
 * Removes the entry that http_request_get() stores for the URL, using the
 * same 'feeds_http_download_' . md5($url) cache ID.
 *
 * @param string $url
 *   The URL whose cached download should be discarded.
 */
function http_request_clear_cache($url) {
  cache_clear_all('feeds_http_download_' . md5($url), 'cache');
}
/**
 * Returns if the provided $content_type is a feed.
 *
 * Only the media type is inspected: parameters after the first ";" (such as
 * the charset) are ignored and the comparison is case-insensitive.
 *
 * @param string $content_type
 *   The Content-Type header.
 *
 * @param string $data
 *   The actual data from the http request. Currently not inspected.
 *
 * @return bool
 *   Returns TRUE if this is a parsable feed.
 */
function http_request_is_feed($content_type, $data) {
  // The header may be missing altogether, in which case there is nothing to
  // inspect.
  if (!is_string($content_type)) {
    return FALSE;
  }

  // Strip parameters such as "; charset=utf-8".
  $pos = strpos($content_type, ';');
  if ($pos !== FALSE) {
    $content_type = substr($content_type, 0, $pos);
  }

  $content_type = strtolower($content_type);
  if (strpos($content_type, 'xml') !== FALSE) {
    return TRUE;
  }

  // @TODO: Sometimes the content-type can be text/html but still be a valid
  // feed.
  return FALSE;
}
/**
 * Finds potential feed tags in the HTML document.
 *
 * A <link> tag qualifies when its rel attribute is "alternate" and its type
 * attribute contains "xml"; both comparisons are case-insensitive.
 *
 * @param string $html
 *   The html string to search.
 *
 * @return array
 *   An array of href to feeds, in document order. Entities in the href are
 *   decoded, its letter case is preserved.
 */
function http_request_find_feeds($html) {
  $matches = array();
  preg_match_all(HTTP_REQUEST_PCRE_LINK_TAG, $html, $matches);
  // Group 1 is absent when the match itself failed.
  $links = isset($matches[1]) ? $matches[1] : array();
  $valid_links = array();

  // Build up all the links information.
  foreach ($links as $link_tag) {
    $attributes = array();
    $candidate = array();

    preg_match_all(HTTP_REQUEST_PCRE_TAG_ATTRIBUTES, $link_tag, $attributes, PREG_SET_ORDER);
    foreach ($attributes as $attribute) {
      // Find the key value pairs, attribute[1] is key. The value is captured
      // in attribute[2], attribute[3] or attribute[4] depending on whether it
      // was double-quoted, single-quoted or unquoted.
      $value = '';
      foreach (array(2, 3, 4) as $group) {
        if (isset($attribute[$group]) && $attribute[$group] !== '') {
          $value = $attribute[$group];
          break;
        }
      }
      if (!empty($attribute[1]) && !empty($value)) {
        // Keys are normalised to lower case; values keep their case because
        // URLs are case-sensitive.
        $candidate[drupal_strtolower($attribute[1])] = decode_entities($value);
      }
    }

    // Examine candidate to see if it s a feed.
    // @TODO: could/should use http_request_is_feed ??
    if (isset($candidate['rel']) && drupal_strtolower($candidate['rel']) == 'alternate') {
      if (isset($candidate['href']) && isset($candidate['type']) && strpos(drupal_strtolower($candidate['type']), 'xml') !== FALSE) {
        // All tests pass, its a valid candidate.
        $valid_links[] = $candidate['href'];
      }
    }
  }

  return $valid_links;
}
/**
 * Create an absolute url.
 *
 * @param string $url
 *   The href to transform.
 *
 * @param string $base_url
 *   The url to be used as the base for a relative $url.
 *
 * @return
 *   string - the absolute url, FALSE - if $url is neither a valid absolute nor
 *   a valid relative url, or if the combined result does not validate.
 */
function http_request_create_absolute_url($url, $base_url) {
  $url = trim($url);
  if (valid_url($url, TRUE)) {
    // Valid absolute url already.
    return $url;
  }

  // Turn relative url into absolute.
  if ($url !== '' && valid_url($url, FALSE)) {
    $parsed_url = parse_url($base_url);
    if ($parsed_url === FALSE) {
      return FALSE;
    }
    // A bare "http://example.com" has no path component.
    $base_path = isset($parsed_url['path']) ? $parsed_url['path'] : '/';

    if ($url[0] == '/') {
      // Root-relative: replaces the base path entirely.
      $combined = $url;
    }
    else {
      // Relative to the directory of the base document, i.e. everything up to
      // and including the last slash of the base path.
      $slash = strrpos($base_path, '/');
      $directory = ($slash === FALSE) ? '' : substr($base_path, 0, $slash + 1);
      $combined = $directory . $url;
    }

    // Keep the query string and fragment out of the segment handling.
    $suffix = '';
    $cut = strcspn($combined, '?#');
    if ($cut < strlen($combined)) {
      $suffix = substr($combined, $cut);
      $combined = substr($combined, 0, $cut);
    }

    // Resolve "." and ".." with a stack so that consecutive ".." segments
    // each remove one real segment.
    $segments = array();
    foreach (explode('/', $combined) as $segment) {
      if ($segment === '' || $segment === '.') {
        continue;
      }
      if ($segment === '..') {
        array_pop($segments);
        continue;
      }
      $segments[] = $segment;
    }
    $path = implode('/', $segments);
    // Preserve a trailing slash, it can be significant to the server.
    if ($path !== '' && substr($combined, -1) == '/') {
      $path .= '/';
    }
    $path .= $suffix;

    // Build the prefix to the path.
    $absolute_url = '';
    if (isset($parsed_url['scheme'])) {
      $absolute_url = $parsed_url['scheme'] . '://';
    }

    if (isset($parsed_url['user'])) {
      $absolute_url .= $parsed_url['user'];
      if (isset($parsed_url['pass'])) {
        $absolute_url .= ':' . $parsed_url['pass'];
      }
      $absolute_url .= '@';
    }
    if (isset($parsed_url['host'])) {
      $absolute_url .= $parsed_url['host'];
      if (isset($parsed_url['port'])) {
        $absolute_url .= ':' . $parsed_url['port'];
      }
      $absolute_url .= '/';
    }

    $absolute_url .= $path;

    if (valid_url($absolute_url, TRUE)) {
      return $absolute_url;
    }
  }

  return FALSE;
}