| 1 |
<?php
|
| 2 |
// $Id: robots_parser.module,v 1.1 2008/05/23 19:11:26 mustafau Exp $
|
| 3 |
|
| 4 |
/**
|
| 5 |
* @file
|
| 6 |
* Functions for reading and parsing /robots.txt files.
|
| 7 |
*/
|
| 8 |
|
| 9 |
/**
 * Drupal is allowed to access any URL from the host.
 */
define('ROBOTS_PARSER_ALLOW_ALL', 1);

/**
 * Drupal is not allowed to access any URL from the host.
 */
define('ROBOTS_PARSER_DISALLOW_ALL', -1);

/**
 * Either there is no /robots.txt file or it is not accessible.
 *
 * In this case it is assumed that Drupal is allowed.
 */
define('ROBOTS_PARSER_NOT_IMPLEMENTED', 0);
|
| 25 |
|
| 26 |
/**
 * Check if Drupal is allowed to access the URL.
 *
 * The /robots.txt rules of the URL's host are fetched through
 * robots_parser_import(). The path of the URL (including the query string,
 * if any) is then compared against every Disallow prefix.
 *
 * @param $url
 *   The URL that is being checked.
 * @return
 *   TRUE if Drupal is allowed to access the URL, FALSE otherwise. URLs
 *   without a host component are always allowed, because there is no
 *   /robots.txt file to consult for them.
 */
function robots_parser_allowed($url) {
  $uri = parse_url($url);

  // parse_url() returns FALSE on seriously malformed URLs and omits 'host'
  // for relative ones; in both cases there is nothing to look up.
  if (!is_array($uri) || empty($uri['host'])) {
    return TRUE;
  }

  $robots_parser = robots_parser_import($uri['host']);

  if (is_int($robots_parser)) {
    // Only an explicit "disallow all" blocks access; ALLOW_ALL and
    // NOT_IMPLEMENTED both mean that Drupal may proceed.
    return $robots_parser !== ROBOTS_PARSER_DISALLOW_ALL;
  }

  if (is_array($robots_parser)) {
    // Construct the path to act on.
    $path = isset($uri['path']) ? $uri['path'] : '/';
    if (isset($uri['query'])) {
      $path .= '?'. $uri['query'];
    }

    foreach ($robots_parser as $disallow) {
      $length = strlen($disallow);
      // An empty Disallow value does not disallow anything, so skip it.
      // Otherwise the rule matches when it is a (case-insensitive) prefix of
      // the path; the whole rule is compared, not all but its last character.
      if ($length > 0 && strncasecmp($disallow, $path, $length) == 0) {
        return FALSE;
      }
    }
  }

  return TRUE;
}
|
| 63 |
|
| 64 |
/**
 * Import the /robots.txt file associated with the given host.
 *
 * A previously cached HTTP response is used when one is available; otherwise
 * the file is requested from http://$host/robots.txt. Successful (200)
 * responses that did not come from the cache are stored as temporary cache
 * entries.
 *
 * @param $host
 *   The hostname to download the /robots.txt from.
 * @return
 *   Either an array representing the downloaded /robots.txt (as returned by
 *   robots_parser_parse()) or one of the Robots parser constants (one of
 *   ROBOTS_PARSER_ALLOW_ALL, ROBOTS_PARSER_DISALLOW_ALL,
 *   ROBOTS_PARSER_NOT_IMPLEMENTED).
 */
function robots_parser_import($host) {
  $cid = 'robots_parser:'. $host;

  // Fetch the cache entry first and test it afterwards. Writing
  // "$cache = cache_get() && ..." would assign the boolean result of the
  // "&&" expression, because "&&" binds tighter than "=".
  $cache = cache_get($cid);
  $cached = $cache && !empty($cache->data);

  if ($cached) {
    $result = $cache->data;
  }
  else {
    $result = drupal_http_request("http://$host/robots.txt");
  }

  if (isset($result->error)) {
    if (($result->code > 99) && ($result->code < 1000)) {
      // @todo Output should be more user friendly.
      watchdog('robots_parser', '%url: %error', array('%url' => "$host/robots.txt", '%error' => $result->error), WATCHDOG_WARNING);
    }
    if ($result->code == 401 || $result->code == 403) {
      // Access to /robots.txt itself is restricted: treat the whole host as
      // off limits.
      return ROBOTS_PARSER_DISALLOW_ALL;
    }
    elseif ($result->code >= 400) {
      // Any other client or server error: no usable /robots.txt, so nothing
      // is disallowed.
      return ROBOTS_PARSER_ALLOW_ALL;
    }
  }
  elseif ($result->code == 200) {
    // Only store responses that were freshly downloaded.
    if (!$cached) {
      cache_set($cid, $result, 'cache', CACHE_TEMPORARY);
    }

    return robots_parser_parse($result->data);
  }

  return ROBOTS_PARSER_NOT_IMPLEMENTED;
}
|
| 107 |
|
| 108 |
/**
 * Parse content coming from a /robots.txt file into an array.
 *
 * Only records addressed to "*" or "Drupal" (compared case-insensitively) are
 * taken into account. Consecutive User-agent lines are treated as one record,
 * so the rules that follow them apply if any of the listed agents matches.
 * Trailing "# comments" are stripped from values, and empty Disallow values
 * are dropped because they do not disallow anything.
 *
 * @param $data
 *   Content of a /robots.txt file.
 * @return
 *   An array of the non-empty Disallow path prefixes that apply to Drupal,
 *   in the order in which they appear in the /robots.txt file.
 */
function robots_parser_parse($data) {
  $info = array();
  // TRUE while the record being read is addressed to this crawler.
  $applies = FALSE;
  // TRUE while reading a run of consecutive User-agent lines.
  $in_agent_run = FALSE;

  // Every part of the pattern is confined to a single line: an empty value
  // must not swallow the following line, and a line without a colon must not
  // be merged into the key of the next one.
  $pattern = '@
    ^[ \t]*                 # Start at the beginning of a line, ignoring leading blanks
    ([^#:\r\n]+?)           # Key names cannot contain hash signs, colons or line breaks
    [ \t]*:[ \t]*           # Key and value are separated by a colon (ignoring blanks)
    ([^#\r\n]*?)            # Value, up to a trailing comment or the end of the line
    [ \t]*(?:\#[^\r\n]*)?   # Optional trailing comment
    \r?$                    # Stop at the end of the same line
    @mx';

  if (preg_match_all($pattern, (string) $data, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $match) {
      // Fetch the key and value string.
      $key = isset($match[1]) ? $match[1] : '';
      $value = isset($match[2]) ? $match[2] : '';

      if (strcasecmp($key, 'User-agent') == 0) {
        $is_drupal = strcasecmp($value, '*') == 0 || strcasecmp($value, 'Drupal') == 0;
        // A User-agent line directly after another one extends the same
        // record; otherwise it starts a new record.
        $applies = $in_agent_run ? ($applies || $is_drupal) : $is_drupal;
        $in_agent_run = TRUE;
      }
      else {
        $in_agent_run = FALSE;
        if ($applies && $value !== '' && strcasecmp($key, 'Disallow') == 0) {
          $info[] = $value;
        }
      }
    }
  }

  return $info;
}
|