/[drupal]/contributions/modules/robots_parser/robots_parser.module
ViewVC logotype

Contents of /contributions/modules/robots_parser/robots_parser.module

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.2 - (show annotations) (download) (as text)
Wed May 28 20:02:40 2008 UTC (17 months, 3 weeks ago) by mustafau
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +26 -6 lines
File MIME type: text/x-php
Doxygen improvements.
1 <?php
2 // $Id: robots_parser.module,v 1.1 2008/05/23 19:11:26 mustafau Exp $
3
4 /**
5 * @file
6 * Functions for reading and parsing /robots.txt files.
7 */
8
/**
 * The host has no usable /robots.txt (missing or not accessible).
 *
 * Callers treat this the same as "allowed".
 */
define('ROBOTS_PARSER_NOT_IMPLEMENTED', 0);

/**
 * The host permits Drupal to fetch every URL it serves.
 */
define('ROBOTS_PARSER_ALLOW_ALL', 1);

/**
 * The host forbids Drupal from fetching any URL it serves.
 */
define('ROBOTS_PARSER_DISALLOW_ALL', -1);
25
/**
 * Check if Drupal is allowed to access the URL.
 *
 * The /robots.txt rules of the URL's host are looked up through
 * robots_parser_import() and every Disallow value is treated as a path
 * prefix: the URL is refused as soon as its path (plus query string, if any)
 * starts with one of them.
 *
 * @param $url
 *   The URL that is being checked.
 * @return
 *   TRUE if Drupal is allowed to access the URL, FALSE otherwise.
 */
function robots_parser_allowed($url) {
  $uri = parse_url($url);

  // A /robots.txt file belongs to a host. Without one there is nothing to
  // consult, so apply the module's default of allowing access instead of
  // requesting "http:///robots.txt".
  if (empty($uri['host'])) {
    return TRUE;
  }

  $robots_parser = robots_parser_import($uri['host']);

  if (!is_array($robots_parser)) {
    // One of the ROBOTS_PARSER_* constants: only DISALLOW_ALL refuses access;
    // ALLOW_ALL and NOT_IMPLEMENTED both allow it.
    return $robots_parser !== ROBOTS_PARSER_DISALLOW_ALL;
  }

  // Construct the path to act on.
  $path = isset($uri['path']) ? $uri['path'] : '/';
  if (isset($uri['query'])) {
    $path .= '?'. $uri['query'];
  }

  foreach ($robots_parser as $disallow) {
    // An empty Disallow value forbids nothing.
    if ($disallow === '') {
      continue;
    }
    // Compare the whole rule; comparing one character less would turn
    // "/a" into "/" and block the entire host. The comparison stays
    // case-insensitive, which can only block more, never less.
    if (strncasecmp($disallow, $path, strlen($disallow)) == 0) {
      return FALSE;
    }
  }

  return TRUE;
}
63
/**
 * Import the /robots.txt file associated with the given host.
 *
 * A successful (HTTP 200) response is stored in the 'cache' table under
 * "robots_parser:$host" and reused on later calls; otherwise the file is
 * requested from http://$host/robots.txt.
 *
 * @param $host
 *   The hostname to download the /robots.txt from.
 * @return
 *   Either an array representing the downloaded /robots.txt (see
 *   robots_parser_parse()) or one of the Robots parser constants:
 *   - ROBOTS_PARSER_DISALLOW_ALL if the request failed with 401 or 403.
 *   - ROBOTS_PARSER_ALLOW_ALL if it failed with any other code >= 400.
 *   - ROBOTS_PARSER_NOT_IMPLEMENTED in every remaining case.
 */
function robots_parser_import($host) {
  $cid = 'robots_parser:'. $host;

  // Fetch first, test afterwards. Writing the assignment inside the condition
  // as "$cache = cache_get(...) && ..." assigns the boolean result of "&&" to
  // $cache, so the cached response would never be used.
  $cache = cache_get($cid);
  $cached = $cache && !empty($cache->data);

  if ($cached) {
    $result = $cache->data;
  }
  else {
    $result = drupal_http_request("http://$host/robots.txt");
  }

  // Guard against a response object that carries no status code.
  $code = isset($result->code) ? $result->code : 0;

  if (isset($result->error)) {
    if (($code > 99) && ($code < 1000)) {
      // @todo Output should be more user friendly.
      watchdog('robots_parser', '%url: %error', array('%url' => "$host/robots.txt", '%error' => $result->error), WATCHDOG_WARNING);
    }
    if ($code == 401 || $code == 403) {
      return ROBOTS_PARSER_DISALLOW_ALL;
    }
    elseif ($code >= 400) {
      return ROBOTS_PARSER_ALLOW_ALL;
    }
  }
  elseif ($code == 200) {
    // Only store responses that were freshly downloaded.
    if (!$cached) {
      cache_set($cid, $result, 'cache', CACHE_TEMPORARY);
    }

    return robots_parser_parse($result->data);
  }

  return ROBOTS_PARSER_NOT_IMPLEMENTED;
}
107
/**
 * Parse content coming from a /robots.txt file into an array.
 *
 * The file is read line by line. Only "Disallow" lines belonging to a group
 * whose "User-agent" is "*" or "Drupal" (case-insensitive) are collected;
 * every other directive is ignored.
 *
 * @param $data
 *   Content of a /robots.txt file.
 * @return
 *   An array representing the /robots.txt file: the non-empty Disallow
 *   values that apply to Drupal, in file order. Empty Disallow values, which
 *   forbid nothing, are left out.
 */
function robots_parser_parse($data) {
  $info = array();
  // TRUE while the current group names "*" or "Drupal" as a user agent.
  $applies = FALSE;
  // TRUE while reading the run of User-agent lines that opens a group.
  $reading_agents = FALSE;

  // Work on one line at a time so that neither a key nor a value can ever
  // spill over into the following line.
  foreach (preg_split('/\r\n|\r|\n/', (string) $data) as $line) {
    // Everything from a hash sign to the end of the line is a comment.
    $hash = strpos($line, '#');
    if ($hash !== FALSE) {
      $line = substr($line, 0, $hash);
    }

    // Key/value pairs are separated by the first colon on the line.
    $colon = strpos($line, ':');
    if ($colon === FALSE) {
      continue;
    }
    $key = trim(substr($line, 0, $colon));
    $value = trim(substr($line, $colon + 1));

    if (strcasecmp($key, 'User-agent') == 0) {
      // Consecutive User-agent lines share the rules that follow them, so
      // only the first line of such a run starts a new group.
      if (!$reading_agents) {
        $applies = FALSE;
        $reading_agents = TRUE;
      }
      if ($value === '*' || strcasecmp($value, 'Drupal') == 0) {
        $applies = TRUE;
      }
    }
    elseif (strcasecmp($key, 'Disallow') == 0 || strcasecmp($key, 'Allow') == 0) {
      // A rule line closes the list of user agents of the group. Allow rules
      // are recognised for grouping only; they are not evaluated.
      $reading_agents = FALSE;
      if ($applies && $value !== '' && strcasecmp($key, 'Disallow') == 0) {
        $info[] = $value;
      }
    }
  }

  return $info;
}

  ViewVC Help
Powered by ViewVC 1.1.2