| 1 |
<?php
|
| 2 |
/* $Id:*/
|
| 3 |
|
| 4 |
/**
|
| 5 |
* @file
|
| 6 |
* Insert tags into header to protect against Google Proxy Attack
|
| 7 |
*/
|
| 8 |
|
| 9 |
/**
 * Implementation of hook_help().
 *
 * No help text is currently returned for any section; the text for the
 * settings page is kept below, commented out.
 *
 * @param $section
 *   The path for which help is being requested.
 */
function antiproxyhack_help($section) {
  if ($section == 'admin/settings/antiproxyhack') {
    // Help text for the settings page is disabled for now:
    // return t('From here you can manage all the settings for antiproxyhack.');
  }
}
|
| 18 |
|
| 19 |
/**
 * Implementation of hook_menu().
 *
 * When $may_cache is true, two items are registered: the settings form at
 * admin/settings/antiproxyhack and the update callback at aph/update. When it
 * is false, no items are returned and the proxy check is run instead.
 *
 * @param $may_cache
 *   Whether the returned items may be cached.
 *
 * @return
 *   An array of menu items (empty when $may_cache is false).
 */
function antiproxyhack_menu($may_cache) {
  if (!$may_cache) {
    // Non-cacheable pass: run the proxy check for this request.
    antiproxyhack_metaRobotsExcludeProxies();
    return array();
  }

  // Administration page holding the module settings form.
  $settings_page = array(
    'title' => t('Google Proxy Attack'),
    'path' => 'admin/settings/antiproxyhack',
    'callback' => 'drupal_get_form',
    'callback arguments' => array('antiproxyhack_settings_form'),
    'description' => t('Configure Google Proxy Attack settings.'),
    'access' => user_access('administer antiproxyhack'),
    'type' => MENU_NORMAL_ITEM,
  );

  // Callback that triggers a refresh of the spider data.
  $update_callback = array(
    'path' => 'aph/update',
    'callback' => 'antiproxyhack_update_all',
    'access' => user_access('administer antiproxyhack'),
    'type' => MENU_CALLBACK,
  );

  return array($settings_page, $update_callback);
}
|
| 44 |
|
| 45 |
/**
 * Builds the module settings form.
 *
 * The form has two fieldsets: one selecting the robots to detect and the meta
 * tag markup to insert, and one selecting the detection methods. The element
 * keys double as the names of the variables read by variable_get().
 *
 * @return
 *   The form array, passed through system_settings_form().
 */
function antiproxyhack_settings_form() {
  $form = array(
    'robot_settings' => array(
      '#type' => 'fieldset',
      '#title' => t('Robot Settings'),
      '#weight' => -5,
      'antiproxyhack_robots' => array(
        '#type' => 'checkboxes',
        '#options' => array(
          'google' => 'Google',
          'yahoo' => 'Yahoo!',
          'msn' => 'MSN',
          'ask' => 'Ask.com',
        ),
        '#default_value' => variable_get('antiproxyhack_robots', 0),
        '#title' => t('Robots to Detect'),
      ),
      'antiproxyhack_meta_tags' => array(
        '#type' => 'textfield',
        '#default_value' => variable_get('antiproxyhack_meta_tags', '<meta name="robots" content="noindex,nofollow" />'),
        '#title' => t('Meta tags to insert when illicit proxy scans site'),
      ),
    ),
    'robot_checking' => array(
      '#type' => 'fieldset',
      '#title' => t('Robot Confidence Checks'),
      '#weight' => -4,
      'antiproxyhack_confidence_checks' => array(
        '#type' => 'checkboxes',
        '#options' => array(
          'check_uas' => 'Check User Agent',
          'check_ips' => 'Check IP Range',
        ),
        '#default_value' => variable_get('antiproxyhack_confidence_checks', 0),
        '#title' => t('Methods of Detection'),
        '#description' => t('Please select the types of checking you would like to conduct to determine if a given user agent is a "bad" robot.'),
      ),
    ),
  );

  return system_settings_form($form);
}
|
| 86 |
|
| 87 |
/**
 * Implementation of hook_cron().
 *
 * Delegates to antiproxyhack_update_all() and hands its result back.
 */
function antiproxyhack_cron() {
  $update_result = antiproxyhack_update_all();
  return $update_result;
}
|
| 90 |
|
| 91 |
/**
 * Scores how strongly the current request matches the stored spider data.
 *
 * A check is performed only when the caller asks for it ($check_uas /
 * $check_ips) AND it is enabled in the 'antiproxyhack_confidence_checks'
 * setting. A user agent match adds 2 and an IP match adds 3, so the result is
 * one of 0, 2, 3 or 5.
 *
 * @param $spider_name
 *   Spider to match against; an empty value matches any spider.
 * @param $check_uas
 *   Whether to compare the request's user agent.
 * @param $check_ips
 *   Whether to compare the request's remote address.
 * @param $use_user_defined_data
 *   When false, only records whose is_user_defined flag is 'N' are considered.
 * @param $ignore_bad_uas
 *   When true, records stored under the spider name 'bad' are excluded.
 *
 * @return
 *   The integer confidence score.
 */
function antiproxyhack_is_spider($spider_name = '', $check_uas = true, $check_ips = true, $use_user_defined_data = true, $ignore_bad_uas = true) {
  // default confidence level to 0
  $confidence = 0;

  // The setting is saved by a checkboxes element; default to an empty array so
  // an unconfigured site is never indexed like a string.
  $confidence_checks = variable_get('antiproxyhack_confidence_checks', array());
  if (!is_array($confidence_checks)) {
    $confidence_checks = array();
  }

  $user_defined_filter = $use_user_defined_data ? '' : 'N';
  $excluded_spider = $ignore_bad_uas ? 'bad' : '';

  // An empty value would make antiproxyhack_get() drop the value filter and
  // match every record, so requests without these headers are never matched.
  $user_agent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
  $remote_addr = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '';

  // matching user agent?
  if ($check_uas && !empty($confidence_checks['check_uas']) && !empty($user_agent)) {
    if (antiproxyhack_get(0, $spider_name, 'UA', $user_agent, '', $user_defined_filter, $excluded_spider)) {
      $confidence += 2;
    }
  }

  // matching IP?
  if ($check_ips && !empty($confidence_checks['check_ips']) && !empty($remote_addr)) {
    if (antiproxyhack_get(0, $spider_name, 'IP', '', $remote_addr, $user_defined_filter, $excluded_spider)) {
      $confidence += 3;
    }
  }

  // return confidence level
  return $confidence;
}
|
| 108 |
|
| 109 |
/**
 * Retrieves cloaking data rows filtered by the supplied parameters.
 *
 * Every parameter is optional; a falsy value means "do not filter on this
 * column". All values are passed to db_query() as placeholder arguments rather
 * than being interpolated into the SQL string.
 *
 * @param $id
 *   Row id to match.
 * @param $spider_name
 *   Spider name to match exactly.
 * @param $record_type
 *   Record type to match exactly (this file uses 'UA' and 'IP').
 * @param $value
 *   Value to match exactly.
 * @param $wildcard_value
 *   Value that must either equal the stored value or start with the stored
 *   value followed by a dot (e.g. a full IP against a stored IP prefix).
 * @param $is_user_defined_data
 *   is_user_defined flag to match exactly.
 * @param $not_spider_name
 *   Spider name whose records are excluded.
 *
 * @return
 *   An array of matching rows, each an associative array.
 */
function antiproxyhack_get($id = 0, $spider_name = '', $record_type = '', $value = '', $wildcard_value = '', $is_user_defined_data = '', $not_spider_name = '') {
  // by default, retrieve all records; "SELECT *" instead of
  // "SELECT <table>.*" so the query still works with a table prefix.
  $q = "SELECT * FROM {antiproxyhack_cloak_data} WHERE TRUE";
  $args = array();

  // add filters
  if ($id) {
    $q .= " AND id = %d";
    $args[] = (int) $id;
  }
  if ($spider_name) {
    $q .= " AND spider_name = '%s'";
    $args[] = $spider_name;
  }
  if ($record_type) {
    $q .= " AND record_type = '%s'";
    $args[] = $record_type;
  }
  if ($value) {
    $q .= " AND value = '%s'";
    $args[] = $value;
  }
  if ($wildcard_value) {
    // "%%" is a literal percent sign once db_query() has processed the string.
    $q .= " AND ('%s' = value OR '%s' LIKE CONCAT(value, '.%%'))";
    $args[] = $wildcard_value;
    $args[] = $wildcard_value;
  }
  if ($is_user_defined_data) {
    $q .= " AND is_user_defined = '%s'";
    $args[] = $is_user_defined_data;
  }
  if ($not_spider_name) {
    $q .= " AND spider_name <> '%s'";
    $args[] = $not_spider_name;
  }

  $result = db_query($q, $args);
  $rows = array();
  while ($row = db_fetch_array($result)) {
    $rows[] = $row;
  }
  return $rows;
}
|
| 150 |
|
| 151 |
/**
 * Refreshes the whole spider database from iplists.com.
 *
 * The update is attempted only when the last recorded update is more than
 * 7 days old, and the spider lists are re-downloaded only when the remote
 * version string differs from the stored one. In this file the function is
 * called from antiproxyhack_cron() and registered as the 'aph/update' menu
 * callback.
 *
 * @param $delete_user_defined_data
 *   Passed through to antiproxyhack_update_cloaking_DB(); when true, user
 *   defined records are deleted along with the downloaded ones.
 *
 * @return
 *   false when no update was performed, otherwise a status message string.
 */
function antiproxyhack_update_all($delete_user_defined_data = false) {
  // retrieve last update information from database; "SELECT *" instead of
  // "SELECT <table>.*" so the query still works with a table prefix.
  $updated = db_fetch_array(db_query("SELECT * FROM {antiproxyhack_cloak_update}"));

  // There is no row before the first update, so guard the array access.
  $db_version = (is_array($updated) && isset($updated['version'])) ? $updated['version'] : NULL;
  $updated_on = (is_array($updated) && isset($updated['updated_on'])) ? $updated['updated_on'] : NULL;

  // if the latest update is more recent than 7 days (604800 seconds), don't
  // attempt an update
  if (isset($updated_on) && (strtotime($updated_on) > strtotime("-604800 seconds"))) {
    return false;
  }

  // read the latest iplists version
  $version_url = 'http://www.iplists.com/nw/version.php';

  $ch = curl_init();
  curl_setopt($ch, CURLOPT_URL, $version_url);
  curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
  curl_setopt($ch, CURLOPT_TIMEOUT, 60);
  $latest_version = curl_exec($ch);
  curl_close($ch);

  // if no updated version information was retrieved (request failed or the
  // response was empty), abort
  if (!$latest_version) {
    // return false to indicate an update wasn't performed
    return false;
  }

  // save the update data; the version string comes from a remote server, so
  // it is passed as a placeholder argument instead of being interpolated.
  db_query("DELETE FROM {antiproxyhack_cloak_update}");
  db_query("INSERT INTO {antiproxyhack_cloak_update} (version, updated_on) VALUES ('%s', NOW())", $latest_version);

  // if we already have the current data, don't attempt an update
  if ($latest_version == $db_version) {
    return false;
  }

  // update the database, one list per spider
  $sources = array(
    'google' => 'http://www.iplists.com/nw/google.txt',
    'yahoo' => 'http://www.iplists.com/nw/inktomi.txt',
    'msn' => 'http://www.iplists.com/nw/msn.txt',
    'ask' => 'http://www.iplists.com/nw/askjeeves.txt',
    'altavista' => 'http://www.iplists.com/nw/altavista.txt',
    'lycos' => 'http://www.iplists.com/nw/lycos.txt',
    'wisenut' => 'http://www.iplists.com/nw/wisenut.txt',
  );
  foreach ($sources as $spider_name => $list_url) {
    antiproxyhack_update_cloaking_DB($spider_name, $list_url, $delete_user_defined_data);
  }

  return "Database updated successfully.";
}
|
| 216 |
|
| 217 |
/**
 * Replaces the stored data of one spider with the list found at a URL.
 *
 * The list is downloaded with cURL and parsed into user agents and IPs. If
 * either list comes back empty nothing is changed. Otherwise the spider's
 * existing records are deleted and the new ones inserted while the table is
 * locked.
 *
 * @param $spider_name
 *   Name under which the records are stored.
 * @param $url
 *   Location of the list to download.
 * @param $delete_user_defined_data
 *   When false, only records flagged 'N' (not user defined) are deleted.
 */
function antiproxyhack_update_cloaking_DB($spider_name, $url, $delete_user_defined_data = false) {
  // Patterns for the two kinds of line extracted from the downloaded list.
  $ua_regex = '/^# UA "(.*)"$/m';
  $ip_regex = '/^([0-9.]+)$/m';

  // Fetch the list with cURL.
  // NOTE: additional settings are required when accessing the web through a proxy
  $curl = curl_init();
  curl_setopt($curl, CURLOPT_URL, $url);
  curl_setopt($curl, CURLOPT_HEADER, 1);
  curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
  curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
  curl_setopt($curl, CURLOPT_TIMEOUT, 60);
  $response = curl_exec($curl);
  curl_close($curl);

  // Split the response into its user agent and IP lists.
  $parsed = antiproxyhack_parse_list_URL($response, $ua_regex, $ip_regex);

  // Cancel the update unless both lists were retrieved.
  if (!($parsed['ua_list'] && $parsed['ip_list'])) {
    return;
  }

  // Lock the cloak data table to avoid concurrency problems.
  db_query('LOCK TABLES {antiproxyhack_cloak_data} WRITE');

  // Remove the spider's existing records.
  $user_defined_filter = $delete_user_defined_data ? '' : 'N';
  antiproxyhack_delete_spider_data($spider_name, $user_defined_filter);

  // Insert the user agents first, then the IPs.
  $records = array(
    'UA' => $parsed['ua_list'],
    'IP' => $parsed['ip_list'],
  );
  foreach ($records as $record_type => $entries) {
    foreach ($entries as $entry) {
      antiproxyhack_insert_spider_data($spider_name, $record_type, $entry);
    }
  }

  // Release the table lock.
  db_query('UNLOCK TABLES');
}
|
| 260 |
|
| 261 |
/**
 * Inserts a new row of data into the cloaking table.
 *
 * Values are handed to db_query() as placeholder arguments instead of being
 * escaped with mysql_escape_string() and interpolated into the SQL string.
 *
 * @param $spider_name
 *   Name of the spider the record belongs to.
 * @param $record_type
 *   Record type (this file uses 'UA' and 'IP').
 * @param $value
 *   The user agent string or IP to store.
 * @param $is_user_defined
 *   Flag stored in the is_user_defined column; defaults to 'N'.
 */
function antiproxyhack_insert_spider_data($spider_name, $record_type, $value, $is_user_defined = 'N') {
  db_query(
    "INSERT INTO {antiproxyhack_cloak_data} (spider_name, record_type, value, is_user_defined) VALUES ('%s', '%s', '%s', '%s')",
    $spider_name,
    $record_type,
    $value,
    $is_user_defined
  );
}
|
| 276 |
|
| 277 |
/**
 * Deletes the cloaking data stored for the given spider.
 *
 * Values are handed to db_query() as placeholder arguments instead of being
 * escaped with mysql_escape_string() and interpolated into the SQL string.
 *
 * @param $spider_name
 *   Name of the spider whose records are deleted.
 * @param $is_user_defined
 *   When non-empty, only records with this is_user_defined flag are deleted.
 */
function antiproxyhack_delete_spider_data($spider_name, $is_user_defined = '') {
  $q = "DELETE FROM {antiproxyhack_cloak_data} WHERE spider_name = '%s'";
  $args = array($spider_name);

  // Optionally restrict the delete to one is_user_defined flag value.
  if ($is_user_defined) {
    $q .= " AND is_user_defined = '%s'";
    $args[] = $is_user_defined;
  }

  db_query($q, $args);
}
|
| 291 |
|
| 292 |
/**
 * Extracts the user agent and IP lists from downloaded list data.
 *
 * @param $data
 *   The raw text to scan.
 * @param $ua_regex
 *   Pattern whose first capture group yields a user agent.
 * @param $ip_regex
 *   Pattern whose first capture group yields an IP.
 *
 * @return
 *   An array with the keys 'ua_list' and 'ip_list', each holding the list of
 *   first-group captures (an empty array when nothing was captured).
 */
function antiproxyhack_parse_list_URL($data, $ua_regex, $ip_regex) {
  $ua_matches = array();
  $ip_matches = array();

  // Cast so a failed download (false) is scanned as an empty string.
  $data = (string) $data;
  preg_match_all($ua_regex, $data, $ua_matches);
  preg_match_all($ip_regex, $data, $ip_matches);

  // Index 1 is missing when a pattern has no capture group or matching failed.
  return array(
    'ua_list' => isset($ua_matches[1]) ? $ua_matches[1] : array(),
    'ip_list' => isset($ip_matches[1]) ? $ip_matches[1] : array(),
  );
}
|
| 297 |
|
| 298 |
/**
 * Verifies a claimed spider through reverse and forward DNS lookups.
 *
 * Only use this when the spider was not found via the IPLists cloaking
 * database. The outcome is stored as a user-defined IP record: under the
 * spider's own name when verification succeeds, under 'bad' when it fails.
 *
 * @param $ua
 *   A two-element array: the spider name and a regular expression the
 *   reverse-resolved host name must match.
 *
 * @return
 *   true when the remote address is verified as the spider, false otherwise.
 */
function antiproxyhack_bot_verify_by_DNS($ua = array('google', '#.*\.googlebot\.com$#')) {
  // Addresses already recorded as bad are rejected straight away.
  if (antiproxyhack_is_spider('bad', false, true, true, false)) {
    return false;
  }

  // Check only the UA, since this function is only called if the cloaking DB
  // doesn't handle it. Without a matching UA it does not even claim to be a bot.
  if (!antiproxyhack_is_spider($ua[0], true, false)) {
    return false;
  }

  $remote_ip = $_SERVER['REMOTE_ADDR'];

  // Reverse lookup: the host name must match the spider's domain pattern.
  $reverse_host = gethostbyaddr($remote_ip);
  if (preg_match($ua[1], $reverse_host)) {
    // Forward lookup: the host name must resolve back to the connecting IP.
    if ($remote_ip == gethostbyname($reverse_host)) {
      antiproxyhack_insert_spider_data($ua[0], 'IP', $remote_ip, 'Y');
      return true;
    }
  }

  // Either the reverse or the forward lookup disagreed: remember the address
  // as bad and reject it.
  antiproxyhack_insert_spider_data('bad', 'IP', $remote_ip, 'Y');
  return false;
}
|
| 339 |
|
| 340 |
/**
 * Adds a robots meta tag when a request claims to be a spider but is not one.
 *
 * For each listed spider whose user agent matches the request, the request is
 * accepted if its IP is in the stored data or it passes DNS verification;
 * otherwise the configured meta tag markup is added to the HTML head.
 *
 * @param $auto_modify_content
 *   Not used by the current implementation.
 * @param $uas
 *   List of two-element arrays: spider name and host name regular expression.
 * @param $meta_tag
 *   Markup used when the 'antiproxyhack_meta_tags' variable is not set.
 * @param $passlist_regex
 *   Optional pattern, e.g. #become|lycos|somestupidbot#; user agents matching
 *   it are never investigated.
 *
 * @return
 *   false when the user agent is on the passlist or the spider was verified,
 *   true when the meta tag was added, and the integer 2 when the user agent
 *   matched none of the listed spiders.
 */
function antiproxyhack_metaRobotsExcludeProxies($auto_modify_content = true, $uas = array(array('google', '#.*\.googlebot\.com$#'), array('yahoo', '#.*\.yahoo\.net$#'), array('msn', '#.*\.live\.com$#'), array('ask', '#.*\.ask\.com$#')), $meta_tag = '<meta name="robots" content="noindex,nofollow" />', $passlist_regex = '') {
  // if it's on our passlist
  if ($passlist_regex) {
    // The header may be absent; treat that as an empty user agent.
    $user_agent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    if (preg_match($passlist_regex, $user_agent)) {
      return false;
    }
  }

  foreach ($uas as $u) {
    // only investigate when it's a bot according to its UA
    if (!antiproxyhack_is_spider($u[0], true, false)) {
      continue;
    }

    // a bot according to IPLists / our user-defined list, or according to DNS
    if (antiproxyhack_is_spider($u[0], false, true) || antiproxyhack_bot_verify_by_DNS($u)) {
      return false;
    }

    // claims to be a bot but could not be verified: insert the meta tag
    drupal_set_html_head(variable_get('antiproxyhack_meta_tags', $meta_tag));
    return true;
  }

  // it's not a bot according to UA; 2 is the value the original expression
  // "true + 1" evaluated to, kept so the return value is unchanged.
  return 2;
}
|