| 1 |
<?php
|
| 2 |
// $Id$
|
| 3 |
|
| 4 |
/*************************************************
|
| 5 |
|
| 6 |
Snoopy - the PHP net client
|
| 7 |
Author: Monte Ohrt <monte@ispi.net>
|
| 8 |
Copyright (c): 1999-2000 ispi, all rights reserved
|
| 9 |
Version: 0.94
|
| 10 |
|
| 11 |
This program is free software; you can redistribute it and/or
|
| 12 |
modify it under the terms of the GNU General Public License
|
| 13 |
as published by the Free Software Foundation; either version 2
|
| 14 |
of the License, or (at your option) any later version.
|
| 15 |
|
| 16 |
This program is distributed in the hope that it will be useful,
|
| 17 |
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 18 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 19 |
GNU General Public License for more details.
|
| 20 |
|
| 21 |
You should have received a copy of the GNU General Public License
|
| 22 |
along with this program; if not, write to the Free Software
|
| 23 |
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
| 24 |
|
| 25 |
You may contact the author of Snoopy by e-mail at:
|
| 26 |
monte@ispi.net
|
| 27 |
|
| 28 |
Or, write to:
|
| 29 |
Monte Ohrt
|
| 30 |
CTO, ispi
|
| 31 |
237 S. 70th suite 220
|
| 32 |
Lincoln, NE 68510
|
| 33 |
|
| 34 |
The latest version of Snoopy can be obtained from:
|
| 35 |
http://snoopy.sourceforge.com
|
| 36 |
|
| 37 |
*************************************************/
|
| 38 |
|
| 39 |
class Snoopy
|
| 40 |
{
|
| 41 |
/**** Public variables ****/
|
| 42 |
|
| 43 |
/* user definable vars */
|
| 44 |
|
| 45 |
var $host               = "www.php.net";    // host name we are connecting to
var $port               = 80;               // port we are connecting to
var $proxy_host         = "";               // proxy host to use
var $proxy_port         = "";               // proxy port to use
var $agent              = "Snoopy v0.94";   // value sent as the User-Agent header
var $referer            = "";               // value sent as the Referer header

// Cookies to send, as name => value, e.g. $cookies["username"]="joe";
var $cookies            = array();

// Extra request headers, as name => value,
// e.g. $rawheaders["Content-type"]="text/html";
var $rawheaders         = array();

var $maxredirs          = 5;                // maximum http redirection depth. 0 = disallow
var $lastredirectaddr   = "";               // address of the last redirect that was followed
var $offsiteok          = true;             // allow redirects that leave the current site
var $maxframes          = 0;                // maximum frame content depth. 0 = disallow

// Expand links to fully qualified URLs.
// This only applies to fetchlinks() or submitlinks().
var $expandlinks        = true;

// Pass cookies that were set back through redirects.
// NOTE: this currently does not respect dates, domains or paths.
var $passcookies        = true;

var $user               = "";               // user for http authentication
var $pass               = "";               // password for http authentication

// value sent as the Accept header
var $accept             = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";

var $results            = "";               // fetched content ends up here

var $error              = "";               // error messages are stored here
var $response_code      = "";               // status line returned from the server
var $headers            = array();          // headers returned from the server
var $maxlength          = 500000;           // maximum body length to read

// Timeout on read operations, in seconds (supported only since PHP 4 Beta 4).
// Set to 0 to disallow timeouts.
var $read_timeout       = 0;
var $timed_out          = false;            // set if a read operation timed out
var $status             = 0;                // http request status

// Full system path to the cURL binary, used for fetching SSL content.
// Set to false if cURL is not installed (see http://curl.haxx.se for
// installation details). Snoopy does *not* use the cURL library functions
// built into php, as these were not stable as of this Snoopy release.
var $curl_path          = "/usr/local/bin/curl";

/**** Private variables ****/

var $_maxlinelen        = 4096;             // maximum line length when reading headers

var $_httpmethod        = "GET";            // default http request method
var $_httpversion       = "HTTP/1.0";       // default http request version
var $_submit_method     = "POST";           // default submit method
var $_submittype        = "application/x-www-form-urlencoded";  // default submit type
var $_redirectaddr      = false;            // set when the fetched page is a redirect
var $_redirectdepth     = 0;                // incremented on every followed redirect
var $_frameurls         = array();          // frame src urls still to be fetched
var $_framedepth        = 0;                // incremented as frames are fetched

var $_isproxy           = false;            // set if using a proxy server
var $_fp_timeout        = 30;               // timeout for socket connection
|
| 112 |
|
| 113 |
/*======================================================================*\
|
| 114 |
Function: fetch
|
| 115 |
Purpose: fetch the contents of a web page
|
| 116 |
(and possibly other protocols in the
|
| 117 |
future like ftp, nntp, gopher, etc.)
|
| 118 |
Input: $URI the location of the page to fetch
|
| 119 |
Output: $this->results the output text from the fetch
|
| 120 |
\*======================================================================*/
|
| 121 |
|
| 122 |
function fetch($URI)
{
    // Fetch $URI over http (via _connect/_httprequest) or https (via
    // _httpsrequest and the external cURL binary), then follow a pending
    // redirect and pull in frame sources, both within the configured limits.
    //
    // Returns false for an unsupported scheme, a failed _connect(), or a
    // missing cURL binary; true otherwise. The fetched content is left in
    // $this->results by the request helpers.

    $URI_PARTS = parse_url($URI);
    if (!is_array($URI_PARTS))
        $URI_PARTS = array();   // parse_url() returns false for seriously malformed URLs

    if (!empty($URI_PARTS["user"]))
        $this->user = $URI_PARTS["user"];
    if (!empty($URI_PARTS["pass"]))
        $this->pass = $URI_PARTS["pass"];

    $scheme = isset($URI_PARTS["scheme"]) ? $URI_PARTS["scheme"] : "";
    if ($scheme !== "http" && $scheme !== "https")
    {
        // not a valid protocol
        // (message is single-quoted, so the trailing \n is emitted literally;
        // kept unchanged for compatibility)
        $this->error = 'Invalid protocol "'.$scheme.'"\n';
        return false;
    }

    // https is only available through the external cURL binary
    if ($scheme === "https" && (!$this->curl_path || !is_executable($this->curl_path)))
        return false;

    $this->host = isset($URI_PARTS["host"]) ? $URI_PARTS["host"] : "";
    if (!empty($URI_PARTS["port"]))
        $this->port = $URI_PARTS["port"];

    if ($this->_isproxy)
    {
        // using proxy, send entire URI
        $target = $URI;
    }
    else
    {
        // no proxy, send only the path (plus query string, if any)
        $target = isset($URI_PARTS["path"]) ? $URI_PARTS["path"] : "";
        if (isset($URI_PARTS["query"]) && $URI_PARTS["query"] !== "")
            $target .= "?".$URI_PARTS["query"];
    }

    if ($scheme === "http")
    {
        if (!$this->_connect($fp))
            return false;
        $this->_httprequest($target, $fp, $URI, $this->_httpmethod);
        $this->_disconnect($fp);
    }
    else
    {
        $this->_httpsrequest($target, $URI, $this->_httpmethod);
    }

    /* url was redirected, check if we've hit the max depth */
    if ($this->_redirectaddr && $this->maxredirs > $this->_redirectdepth)
    {
        // "On this site" means same host over http or https. The host must be
        // followed by a port, path, query, fragment or end of string so that
        // "example.com.evil.org" does not pass as "example.com".
        $onsite = "~^https?://".preg_quote($this->host, "~").'(?:[:/?#]|$)~i';

        // only follow redirect if it's on this site, or offsiteok is true
        if ($this->offsiteok || preg_match($onsite, $this->_redirectaddr))
        {
            /* follow the redirect */
            $this->_redirectdepth++;
            $this->lastredirectaddr = $this->_redirectaddr;
            $this->fetch($this->_redirectaddr);
        }
    }

    if ($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
    {
        // work on a copy: the nested fetch() calls refill $this->_frameurls
        $frameurls = $this->_frameurls;
        $this->_frameurls = array();

        foreach ($frameurls as $frameurl)
        {
            if ($this->_framedepth >= $this->maxframes)
                break;
            $this->fetch($frameurl);
            $this->_framedepth++;
        }
    }

    return true;
}
|
| 253 |
|
| 254 |
/*======================================================================*\
|
| 255 |
Function: submit
|
| 256 |
Purpose: submit an http form
|
| 257 |
Input: $URI the location to post the data
|
| 258 |
$formvars the formvars to use.
|
| 259 |
format: $formvars["var"] = "val";
|
| 260 |
Output: $this->results the text output from the post
|
| 261 |
\*======================================================================*/
|
| 262 |
|
| 263 |
function submit($URI, $formvars="")
{
    // Submit $formvars to $URI using $this->_submit_method and
    // $this->_submittype, over http (via _connect/_httprequest) or https
    // (via _httpsrequest and the external cURL binary). A pending redirect is
    // followed by re-submitting the same form variables; frame sources are
    // fetched with fetch(). Both are bounded by the configured limits.
    //
    // Returns false for an unsupported scheme, a failed _connect(), or a
    // missing cURL binary; true otherwise.

    // No form variables given: send an empty body rather than a bogus "0=" field.
    if ($formvars === "" || $formvars === null)
        $formvars = array();
    settype($formvars, "array");

    $postdata = "";
    foreach ($formvars as $key => $val)
    {
        if (is_array($val) || is_object($val))
        {
            // NOTE(review): nested values are posted under their own inner
            // keys, not under $key -- original behaviour, kept as is.
            foreach ($val as $cur_key => $cur_val)
                $postdata .= urlencode((string)$cur_key)."=".urlencode((string)$cur_val)."&";
        }
        else
            $postdata .= urlencode((string)$key)."=".urlencode((string)$val)."&";
    }

    $URI_PARTS = parse_url($URI);
    if (!is_array($URI_PARTS))
        $URI_PARTS = array();   // parse_url() returns false for seriously malformed URLs

    if (!empty($URI_PARTS["user"]))
        $this->user = $URI_PARTS["user"];
    if (!empty($URI_PARTS["pass"]))
        $this->pass = $URI_PARTS["pass"];

    $scheme = isset($URI_PARTS["scheme"]) ? $URI_PARTS["scheme"] : "";
    if ($scheme !== "http" && $scheme !== "https")
    {
        // not a valid protocol
        // (message is single-quoted, so the trailing \n is emitted literally;
        // kept unchanged for compatibility)
        $this->error = 'Invalid protocol "'.$scheme.'"\n';
        return false;
    }

    // https is only available through the external cURL binary
    if ($scheme === "https" && (!$this->curl_path || !is_executable($this->curl_path)))
        return false;

    $this->host = isset($URI_PARTS["host"]) ? $URI_PARTS["host"] : "";
    if (!empty($URI_PARTS["port"]))
        $this->port = $URI_PARTS["port"];

    if ($this->_isproxy)
    {
        // using proxy, send entire URI
        $target = $URI;
    }
    else
    {
        // no proxy, send only the path (plus query string, if any)
        $target = isset($URI_PARTS["path"]) ? $URI_PARTS["path"] : "";
        if (isset($URI_PARTS["query"]) && $URI_PARTS["query"] !== "")
            $target .= "?".$URI_PARTS["query"];
    }

    if ($scheme === "http")
    {
        if (!$this->_connect($fp))
            return false;
        $this->_httprequest($target, $fp, $URI, $this->_submit_method, $this->_submittype, $postdata);
        $this->_disconnect($fp);
    }
    else
    {
        $this->_httpsrequest($target, $URI, $this->_submit_method, $this->_submittype, $postdata);
    }

    /* url was redirected, check if we've hit the max depth */
    if ($this->_redirectaddr && $this->maxredirs > $this->_redirectdepth)
    {
        // Only a redirect without any scheme is relative and needs expanding.
        // An absolute redirect to another scheme (http -> https) must be
        // left alone, otherwise it gets glued onto the current host.
        if (!preg_match('~^[a-z][a-z0-9+.\-]*://~i', $this->_redirectaddr))
            $this->_redirectaddr = $this->_expandlinks($this->_redirectaddr, $scheme."://".$this->host);

        // "On this site" means same host over http or https. The host must be
        // followed by a port, path, query, fragment or end of string so that
        // "example.com.evil.org" does not pass as "example.com".
        $onsite = "~^https?://".preg_quote($this->host, "~").'(?:[:/?#]|$)~i';

        // only follow redirect if it's on this site, or offsiteok is true
        if ($this->offsiteok || preg_match($onsite, $this->_redirectaddr))
        {
            /* follow the redirect */
            $this->_redirectdepth++;
            $this->lastredirectaddr = $this->_redirectaddr;
            $this->submit($this->_redirectaddr, $formvars);
        }
    }

    if ($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
    {
        // work on a copy: the nested fetch() calls refill $this->_frameurls
        $frameurls = $this->_frameurls;
        $this->_frameurls = array();

        foreach ($frameurls as $frameurl)
        {
            if ($this->_framedepth >= $this->maxframes)
                break;
            $this->fetch($frameurl);
            $this->_framedepth++;
        }
    }

    return true;
}
|
| 413 |
|
| 414 |
/*======================================================================*\
|
| 415 |
Function: fetchlinks
|
| 416 |
Purpose: fetch the links from a web page
|
| 417 |
Input: $URI where you are fetching from
|
| 418 |
Output: $this->results an array of the URLs
|
| 419 |
\*======================================================================*/
|
| 420 |
|
| 421 |
function fetchlinks($URI)
{
    // Fetch $URI and replace $this->results with the links found in it.
    // With framed content $this->results is an array of documents; each
    // document is replaced by its own array of links.
    // Returns false when fetch() fails, true otherwise.

    if (!$this->fetch($URI))
        return false;

    if (is_array($this->results))
    {
        for ($x = 0; $x < count($this->results); $x++)
        {
            $this->results[$x] = $this->_striplinks($this->results[$x]);
            // Expand per document, as submitlinks() does. Handing the whole
            // array of link arrays to _expandlinks() would make preg_replace
            // stringify each inner array.
            if ($this->expandlinks)
                $this->results[$x] = $this->_expandlinks($this->results[$x], $URI);
        }
    }
    else
    {
        $this->results = $this->_striplinks($this->results);
        if ($this->expandlinks)
            $this->results = $this->_expandlinks($this->results, $URI);
    }

    return true;
}
|
| 441 |
|
| 442 |
/*======================================================================*\
|
| 443 |
Function: fetchform
|
| 444 |
Purpose: fetch the form elements from a web page
|
| 445 |
Input: $URI where you are fetching from
|
| 446 |
Output: $this->results the resulting html form
|
| 447 |
\*======================================================================*/
|
| 448 |
|
| 449 |
function fetchform($URI)
{
    // Fetch $URI and reduce $this->results to the form markup it contains
    // (one entry per document when framed content was fetched).
    // Returns false when fetch() fails, true otherwise.

    if (!$this->fetch($URI))
        return false;

    if (!is_array($this->results))
    {
        $this->results = $this->_stripform($this->results);
        return true;
    }

    $idx = 0;
    while ($idx < count($this->results))
    {
        $this->results[$idx] = $this->_stripform($this->results[$idx]);
        $idx++;
    }

    return true;
}
|
| 468 |
|
| 469 |
|
| 470 |
/*======================================================================*\
|
| 471 |
Function: fetchtext
|
| 472 |
Purpose: fetch the text from a web page, stripping the links
|
| 473 |
Input: $URI where you are fetching from
|
| 474 |
Output: $this->results the text from the web page
|
| 475 |
\*======================================================================*/
|
| 476 |
|
| 477 |
function fetchtext($URI)
{
    // Fetch $URI and reduce $this->results to plain text
    // (one entry per document when framed content was fetched).
    // Returns false when fetch() fails, true otherwise.

    if (!$this->fetch($URI))
        return false;

    if (!is_array($this->results))
    {
        $this->results = $this->_striptext($this->results);
        return true;
    }

    $idx = 0;
    while ($idx < count($this->results))
    {
        $this->results[$idx] = $this->_striptext($this->results[$idx]);
        $idx++;
    }

    return true;
}
|
| 493 |
|
| 494 |
/*======================================================================*\
|
| 495 |
Function: submitlinks
|
| 496 |
Purpose: grab links from a form submission
|
| 497 |
Input: $URI where you are submitting from
|
| 498 |
Output: $this->results an array of the links from the post
|
| 499 |
\*======================================================================*/
|
| 500 |
|
| 501 |
function submitlinks($URI,$formvars)
{
    // Submit $formvars to $URI and replace $this->results with the links
    // found in the response, expanded against $URI when $this->expandlinks
    // is set. Returns false when submit() fails, true otherwise.

    if (!$this->submit($URI, $formvars))
        return false;

    if (!is_array($this->results))
    {
        $links = $this->_striplinks($this->results);
        $this->results = $links;
        if ($this->expandlinks)
            $this->results = $this->_expandlinks($this->results, $URI);
        return true;
    }

    // framed content: handle each fetched document separately
    $idx = 0;
    while ($idx < count($this->results))
    {
        $this->results[$idx] = $this->_striplinks($this->results[$idx]);
        if ($this->expandlinks)
            $this->results[$idx] = $this->_expandlinks($this->results[$idx], $URI);
        $idx++;
    }

    return true;
}
|
| 525 |
|
| 526 |
/*======================================================================*\
|
| 527 |
Function: submittext
|
| 528 |
Purpose: grab text from a form submission
|
| 529 |
Input: $URI where you are submitting from
|
| 530 |
Output: $this->results the text from the web page
|
| 531 |
\*======================================================================*/
|
| 532 |
|
| 533 |
function submittext($URI,$formvars)
{
    // Submit $formvars to $URI and reduce $this->results to plain text.
    // Returns false when submit() fails, true otherwise.
    //
    // NOTE(review): when $this->expandlinks is set the stripped text is also
    // passed through _expandlinks(), although the comment on $expandlinks
    // says it only applies to fetchlinks()/submitlinks() -- confirm whether
    // this is intended.

    if (!$this->submit($URI, $formvars))
        return false;

    if (!is_array($this->results))
    {
        $text = $this->_striptext($this->results);
        $this->results = $text;
        if ($this->expandlinks)
            $this->results = $this->_expandlinks($this->results, $URI);
        return true;
    }

    // framed content: handle each fetched document separately
    $idx = 0;
    while ($idx < count($this->results))
    {
        $this->results[$idx] = $this->_striptext($this->results[$idx]);
        if ($this->expandlinks)
            $this->results[$idx] = $this->_expandlinks($this->results[$idx], $URI);
        $idx++;
    }

    return true;
}
|
| 557 |
|
| 558 |
/*======================================================================*\
|
| 559 |
Function: _striplinks
|
| 560 |
Purpose: strip the hyperlinks from an html document
|
| 561 |
Input: $document document to strip.
|
| 562 |
Output: $match an array of the links
|
| 563 |
\*======================================================================*/
|
| 564 |
|
| 565 |
function _striplinks($document)
{
    // Collect the href values of all <a href=...> tags in $document.
    // Returns an array of links (empty when none are found): first the
    // quoted hrefs in document order, then the unquoted ones.

    preg_match_all("'<a\s+href\s*=\s*            # find <a href=
                    ([\"\'])?                    # find single or double quote
                    (?(1) (.*?)\\1 | ([^\s\>]+)) # if quote found, match up to next matching
                                                 # quote, otherwise match up to next space
                    'isx",$document,$links);

    // always hand back an array, even when the document has no links
    $match = array();

    // catenate the non-empty matches from the conditional subpattern:
    // group 2 holds quoted hrefs, group 3 unquoted ones
    foreach (array(2, 3) as $group)
    {
        foreach ($links[$group] as $val)
        {
            if (!empty($val))
                $match[] = $val;
        }
    }

    // return the links
    return $match;
}
|
| 591 |
|
| 592 |
/*======================================================================*\
|
| 593 |
Function: _stripform
|
| 594 |
Purpose: strip the form elements from an html document
|
| 595 |
Input: $document document to strip.
|
| 596 |
Output: $match the matched form tags, joined by newlines
|
| 597 |
\*======================================================================*/
|
| 598 |
|
| 599 |
function _stripform($document)
{
    // Pull the FORM, INPUT, SELECT and OPTION tags (an OPTION together with
    // the text that follows it) out of $document and return them as one
    // string, one match per line.

    $form_tags = "'<\/?(FORM|INPUT|SELECT|(OPTION))[^<>]*>(?(2)(.*(?=<\/?(option|select)[^<>]*>[\r\n]*)|(?=[\r\n]*))|(?=[\r\n]*))'Usi";

    preg_match_all($form_tags, $document, $found);

    // $found[0] holds the complete text of every match
    return implode("\n", $found[0]);
}
|
| 609 |
|
| 610 |
|
| 611 |
|
| 612 |
/*======================================================================*\
|
| 613 |
Function: _striptext
|
| 614 |
Purpose: strip the text from an html document
|
| 615 |
Input: $document document to strip.
|
| 616 |
Output: $text the resulting text
|
| 617 |
\*======================================================================*/
|
| 618 |
|
| 619 |
function _striptext($document)
{
    // Reduce an html document to plain text: drop <script> blocks and all
    // tags, collapse whitespace that follows a line break, and decode a
    // handful of common entities. Returns the resulting text.

    // I didn't use preg eval (//e) since that is only available in PHP 4.0.
    // so, list your entities one by one here. I included some of the
    // more common ones.
    //
    // Each pattern below is paired with the replacement at the same
    // position in $replace.

    $search = array("'<script[^>]*?>.*?</script>'si",  // strip out javascript
                    "'<[\/\!]*?[^<>]*?>'si",           // strip out html tags
                    "'([\r\n])[\s]+'",                 // strip out white space
                    "'&(quote?|#34);'i",               // replace html entities; the entity
                                                       // is &quot; -- the old misspelling
                                                       // &quote; is still accepted
                    "'&(amp|#38);'i",
                    "'&(lt|#60);'i",
                    "'&(gt|#62);'i",
                    "'&(nbsp|#160);'i",
                    "'&(iexcl|#161);'i",
                    "'&(cent|#162);'i",
                    "'&(pound|#163);'i",
                    "'&(copy|#169);'i"
                    );
    $replace = array(   "",
                        "",
                        "\\1",      // keep the line break itself
                        "\"",
                        "&",
                        "<",
                        ">",
                        " ",
                        chr(161),
                        chr(162),
                        chr(163),
                        chr(169));

    $text = preg_replace($search,$replace,$document);

    return $text;
}
|
| 656 |
|
| 657 |
/*======================================================================*\
|
| 658 |
Function: _expandlinks
|
| 659 |
Purpose: expand each link into a fully qualified URL
|
| 660 |
Input: $links the links to qualify
|
| 661 |
$URI the full URI to get the base from
|
| 662 |
Output: $expandedLinks the expanded links
|
| 663 |
\*======================================================================*/
|
| 664 |
|
| 665 |
function _expandlinks($links,$URI)
{
    // Turn each link in $links (a string or an array of strings) into a
    // fully qualified URL, using $URI as the base. Returns the expanded
    // link(s) in the same shape preg_replace() gives back.

    // base = $URI without its query string ...
    $base = preg_match("/^[^\?]+/",$URI,$found) ? $found[0] : "";

    // ... and without a trailing "name.ext" path component. Only the part
    // after "scheme://authority" is inspected, so that a host with exactly
    // one dot ("http://example.com") is not mistaken for a file name.
    if (preg_match('|^([a-z][a-z0-9+.\-]*://[^/]*)(.*)$|i',$base,$parts))
        $base = $parts[1].preg_replace("|/[^\/\.]+\.[^\/\.]+$|","",$parts[2]);
    else
        $base = preg_replace("|/[^\/\.]+\.[^\/\.]+$|","",$base);

    $search = array(    "|^http://".preg_quote($this->host)."|i",   // drop the current host prefix
                        "|^(?!http://)(\/)?(?!mailto:)|i",          // prefix non-absolute links with the base
                        "|/\./|",                                   // collapse "/./"
                        "|/[^\/]+/\.\./|"                           // collapse "/dir/../"
                    );

    $replace = array(   "",
                        $base."/",
                        "/",
                        "/"
                    );

    $expandedLinks = preg_replace($search,$replace,$links);

    return $expandedLinks;
}
|
| 687 |
|
| 688 |
/*======================================================================*\
|
| 689 |
Function: _httprequest
|
| 690 |
Purpose: go get the http data from the server
|
| 691 |
Input: $url the url to fetch
|
| 692 |
$fp the current open file pointer
|
| 693 |
$URI the full URI
|
| 694 |
$body body contents to send if any (POST)
|
| 695 |
Output:
|
| 696 |
\*======================================================================*/
|
| 697 |
|
| 698 |
function _httprequest($url,$fp,$URI,$http_method,$content_type="",$body="")
{
    // Send one http request over the already opened socket $fp and read the
    // response.
    //
    //   $url          request target for the request line (path or full URI)
    //   $fp           open socket to the server or proxy
    //   $URI          the full URI being fetched
    //   $http_method  request method, e.g. "GET" or "POST"
    //   $content_type value for the Content-type header, if any
    //   $body         request body, if any
    //
    // Fills in $this->headers, $this->response_code, $this->status,
    // $this->_redirectaddr, $this->_frameurls and $this->results.
    // Returns false (with $this->status = -100) on a read timeout,
    // true otherwise.

    if($this->passcookies && $this->_redirectaddr)
        $this->setcookies();

    $URI_PARTS = parse_url($URI);
    $scheme = (is_array($URI_PARTS) && isset($URI_PARTS["scheme"])) ? $URI_PARTS["scheme"] : "http";

    if(empty($url))
        $url = "/";

    $eol = "\r\n";  // http header lines are terminated by CRLF

    $headers = $http_method." ".$url." ".$this->_httpversion.$eol;
    if(!empty($this->agent))
        $headers .= "User-Agent: ".$this->agent.$eol;
    if(!empty($this->host))
        $headers .= "Host: ".$this->host.$eol;
    if(!empty($this->accept))
        $headers .= "Accept: ".$this->accept.$eol;
    if(!empty($this->referer))
        $headers .= "Referer: ".$this->referer.$eol;
    if(!empty($this->cookies))
    {
        if(!is_array($this->cookies))
            $this->cookies = array($this->cookies);

        // foreach always starts at the first element, so cookies are also
        // sent on the requests that follow a redirect
        foreach($this->cookies as $cookieKey => $cookieVal)
            $headers .= "Cookie: ".$cookieKey."=".$cookieVal.$eol;
    }
    if(!empty($this->rawheaders))
    {
        if(!is_array($this->rawheaders))
            $this->rawheaders = (array)$this->rawheaders;
        foreach($this->rawheaders as $headerKey => $headerVal)
            $headers .= $headerKey.": ".$headerVal.$eol;
    }
    if(!empty($content_type))
        $headers .= "Content-type: $content_type".$eol;
    if((string)$body !== "")
        $headers .= "Content-length: ".strlen($body).$eol;
    if(!empty($this->user) || !empty($this->pass))
        $headers .= "Authorization: BASIC ".base64_encode($this->user.":".$this->pass).$eol;

    $headers .= $eol;

    // set the read timeout if needed
    if ($this->read_timeout > 0)
        socket_set_timeout($fp, $this->read_timeout);
    $this->timed_out = false;

    fwrite($fp,$headers.$body,strlen($headers.$body));

    $this->_redirectaddr = false;
    $this->headers = array();

    while(($currentHeader = fgets($fp,$this->_maxlinelen)) !== false)
    {
        if ($this->read_timeout > 0 && $this->_check_timeout($fp))
        {
            $this->status=-100;
            return false;
        }

        // a blank line ends the header section
        if($currentHeader == "\r\n" || $currentHeader == "\n")
            break;

        // if a header begins with Location: or URI:, set the redirect
        // (header names are case-insensitive)
        if(preg_match("/^(?:Location|URI):\s*(.*)/i",chop($currentHeader),$matches))
        {
            // get URL portion of the redirect
            $location = $matches[1];
            // look for :// in the Location header to see if hostname is included
            if(!preg_match("|\:\/\/|",$location))
            {
                // no host in the path, so prepend
                $this->_redirectaddr = $scheme."://".$this->host.":".$this->port;
                // eliminate double slash
                if(!preg_match("|^/|",$location))
                    $this->_redirectaddr .= "/".$location;
                else
                    $this->_redirectaddr .= $location;
            }
            else
                $this->_redirectaddr = $location;
        }

        if(preg_match("|^HTTP/|",$currentHeader))
        {
            if(preg_match("|^HTTP/[^\s]*\s(.*?)\s|",$currentHeader, $status))
            {
                $this->status= $status[1];
            }
            $this->response_code = $currentHeader;
        }

        $this->headers[] = $currentHeader;
    }

    // Read the body. A single fread() on a socket returns as soon as some
    // data is available, so keep reading until EOF or $this->maxlength.
    $results = "";
    while(!feof($fp) && strlen($results) < $this->maxlength)
    {
        $chunk = fread($fp, $this->maxlength - strlen($results));
        if($chunk === false || $chunk === "")
            break;  // read error or timeout
        $results .= $chunk;
    }

    if ($this->read_timeout > 0 && $this->_check_timeout($fp))
    {
        $this->status=-100;
        return false;
    }

    // check if there is a redirect meta tag

    if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]+URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
    {
        $this->_redirectaddr = $this->_expandlinks($match[1],$URI);
    }

    // have we hit our frame depth and is there frame src to fetch?
    if(($this->_framedepth < $this->maxframes) && preg_match_all("'<frame[\s]*src[\s]*=[\'\"]?([^\'\"\>]+)'i",$results,$match))
    {
        // $this->results starts out as a string; appending with [] to a
        // string is a fatal error, so switch to an array first
        if(!is_array($this->results))
            $this->results = array();
        $this->results[] = $results;
        for($x=0; $x<count($match[1]); $x++)
            $this->_frameurls[] = $this->_expandlinks($match[1][$x],$scheme."://".$this->host);
    }
    // have we already fetched framed content?
    elseif(is_array($this->results))
        $this->results[] = $results;
    // no framed content
    else
        $this->results = $results;

    return true;
}
|
| 823 |
|
| 824 |
/*======================================================================*\
|
| 825 |
Function: _httpsrequest
|
| 826 |
Purpose: go get the https data from the server using curl
|
| 827 |
Input: $url the url to fetch
|
| 828 |
$URI the full URI
|
| 829 |
$body body contents to send if any (POST)
|
| 830 |
Output:
|
| 831 |
\*======================================================================*/
|
| 832 |
|
| 833 |
function _httpsrequest($url,$URI,$http_method,$content_type="",$body="")
|
| 834 |
{
|
| 835 |
if($this->passcookies && $this->_redirectaddr)
|
| 836 |
$this->setcookies();
|
| 837 |
|
| 838 |
$headers = array();
|
| 839 |
|
| 840 |
$URI_PARTS = parse_url($URI);
|
| 841 |
if(empty($url))
|
| 842 |
$url = "/";
|
| 843 |
$headers[] = $http_method." ".$url." ".$this->_httpversion;
|
| 844 |
if(!empty($this->agent))
|
| 845 |
$headers[] = "User-Agent: ".$this->agent;
|
| 846 |
if(!empty($this->host))
|
| 847 |
$headers[] = "Host: ".$this->host;
|
| 848 |
if(!empty($this->accept))
|
| 849 |
$headers[] = "Accept: ".$this->accept;
|
| 850 |
if(!empty($this->referer))
|
| 851 |
$headers[] = "Referer: ".$this->referer;
|
| 852 |
if(!empty($this->cookies))
|
| 853 |
{
|
| 854 |
if(!is_array($this->cookies))
|
| 855 |
$this->cookies = (array)$this->cookies;
|
| 856 |
|
| 857 |
while(list($cookieKey,$cookieVal) = each($this->cookies))
|
| 858 |
$headers[] = "Cookie: ".$cookieKey."=".$cookieVal;
|
| 859 |
}
|
| 860 |
if(!empty($this->rawheaders))
|
| 861 |
{
|
| 862 |
if(!is_array($this->rawheaders))
|
| 863 |
$this->rawheaders = (array)$this->rawheaders;
|
| 864 |
while(list($headerKey,$headerVal) = each($this->rawheaders))
|
| 865 |
$headers[] = $headerKey.": ".$headerVal;
|
| 866 |
}
|
| 867 |
if(!empty($content_type))
|
| 868 |
$headers[] = "Content-type: $content_type";
|
| 869 |
if(!empty($body))
|
| 870 |
$headers[] = "Content-length: ".strlen($body);
|
| 871 |
if(!empty($this->user) || !empty($this->pass))
|
| 872 |
$headers[] = "Authorization: BASIC ".base64_encode($this->user.":".$this->pass);
|
| 873 |
|
| 874 |
for($curr_header = 0; $curr_header < count($headers); $curr_header++)
|
| 875 |
$cmdline_params .= " -H \"".$headers[$curr_header]."\"";
|
| 876 |
|
| 877 |
if(!empty($body))
|
| 878 |
$cmdline_params .= " -d \"$body\"";
|
| 879 |
|
| 880 |
if($this->read_timeout > 0)
|
| 881 |
$cmdline_params .= " -m ".$this->read_timeout;
|
| 882 |
|
| 883 |
$headerfile = uniqid(time());
|
| 884 |
|
| 885 |
exec($this->curl_path." -D \"/tmp/$headerfile\"".$cmdline_params." ".$URI,$results,$return);
|
| 886 |
|
| 887 |
if($return)
|
| 888 |
{
|
| 889 |
$this->error = "Error: cURL could not retrieve the document, error $return.";
|
| 890 |
return false;
|
| 891 |
}
|
| 892 |
|
| 893 |
|
| 894 |
$results = implode("\n",$results);
|
| 895 |
|
| 896 |
$result_headers = file("/tmp/$headerfile");
|
| 897 |
|
| 898 |
$this->_redirectaddr = false;
|
| 899 |
unset($this->headers);
|
| 900 |
|
| 901 |
for($currentHeader = 0; $currentHeader < count($result_headers); $currentHeader++)
|
| 902 |
{
|
| 903 |
|
| 904 |
// if a header begins with Location: or URI:, set the redirect
|
| 905 |
if(preg_match("/^(Location: |URI: )/i",$result_headers[$currentHeader]))
|
| 906 |
{
|
| 907 |
// get URL portion of the redirect
|
| 908 |
preg_match("/^(Location: |URI:)(.*)/",chop($result_headers[$currentHeader]),$matches);
|
| 909 |
// look for :// in the Location header to see if hostname is included
|
| 910 |
if(!preg_match("|\:\/\/|",$matches[2]))
|
| 911 |
{
|
| 912 |
// no host in the path, so prepend
|
| 913 |
$this->_redirectaddr = $URI_PARTS["scheme"]."://".$this->host.":".$this->port;
|
| 914 |
// eliminate double slash
|
| 915 |
if(!preg_match("|^/|",$matches[2]))
|
| 916 |
$this->_redirectaddr .= "/".$matches[2];
|
| 917 |
else
|
| 918 |
$this->_redirectaddr .= $matches[2];
|
| 919 |
}
|
| 920 |
else
|
| 921 |
$this->_redirectaddr = $matches[2];
|
| 922 |
}
|
| 923 |
|
| 924 |
if(preg_match("|^HTTP/|",$result_headers[$currentHeader]))
|
| 925 |
$this->response_code = $result_headers[$currentHeader];
|
| 926 |
|
| 927 |
$this->headers[] = $result_headers[$currentHeader];
|
| 928 |
}
|
| 929 |
|
| 930 |
// check if there is a redirect meta tag
|
| 931 |
|
| 932 |
if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]+URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
|
| 933 |
{
|
| 934 |
$this->_redirectaddr = $this->_expandlinks($match[1],$URI);
|
| 935 |
}
|
| 936 |
|
| 937 |
// have we hit our frame depth and is there frame src to fetch?
|
| 938 |
if(($this->_framedepth < $this->maxframes) && preg_match_all("'<frame[\s].*?src[\s]*=[\'\"]?([^\'\"\>]+)'i",$results,$match))
|
| 939 |
{
|
| 940 |
$this->results[] = $results;
|
| 941 |
for($x=0; $x<count($match[1]); $x++)
|
| 942 |
$this->_frameurls[] = $this->_expandlinks($match[1][$x],$URI_PARTS["scheme"]."://".$this->host);
|
| 943 |
}
|
| 944 |
// have we already fetched framed content?
|
| 945 |
elseif(is_array($this->results))
|
| 946 |
$this->results[] = $results;
|
| 947 |
// no framed content
|
| 948 |
else
|
| 949 |
$this->results = $results;
|
| 950 |
|
| 951 |
unlink("/tmp/$headerfile");
|
| 952 |
|
| 953 |
return true;
|
| 954 |
}
|
| 955 |
|
| 956 |
/*======================================================================*\
|
| 957 |
Function: setcookies()
|
| 958 |
Purpose: set cookies for a redirection
|
| 959 |
\*======================================================================*/
|
| 960 |
function setcookies()
{
    // Scans the stored response headers ($this->headers) for Set-Cookie
    // lines and records each name/value pair in $this->cookies, keyed by
    // cookie name. Takes no input and returns nothing.

    // The header list may be unset when no response has been parsed yet;
    // there is nothing to collect in that case.
    if(empty($this->headers) || !is_array($this->headers))
        return;

    foreach($this->headers as $header)
    {
        // Header lines may still carry their trailing CR/LF, so the value
        // class excludes \r and \n in addition to ';'. Without that, a
        // cookie that has no attributes after its value would keep the
        // line ending as part of the stored value.
        if(preg_match('/^set-cookie:[\s]+([^=]+)=([^;\r\n]+)/i', $header, $match))
            $this->cookies[$match[1]] = $match[2];
    }
}
|
| 968 |
|
| 969 |
|
| 970 |
/*======================================================================*\
|
| 971 |
Function: _check_timeout
|
| 972 |
Purpose: checks whether timeout has occurred
|
| 973 |
Input: $fp file pointer
|
| 974 |
\*======================================================================*/
|
| 975 |
function _check_timeout($fp)
{
    // Reports whether the stream $fp has timed out. The stream status is
    // only consulted when a read timeout is configured (read_timeout > 0).
    // On a timeout the timed_out flag is set on the object and true is
    // returned; in every other case the result is false.
    if (!($this->read_timeout > 0)) {
        return false;
    }

    $stream_info = socket_get_status($fp);
    if (!$stream_info["timed_out"]) {
        return false;
    }

    $this->timed_out = true;
    return true;
}
|
| 986 |
|
| 987 |
/*======================================================================*\
|
| 988 |
Function: _connect
|
| 989 |
Purpose: make a socket connection
|
| 990 |
Input: $fp file pointer
|
| 991 |
\*======================================================================*/
|
| 992 |
|
| 993 |
function _connect(&$fp)
{
    // Opens a socket connection and hands the handle back through $fp
    // (passed by reference). When both proxy_host and proxy_port are set
    // the connection goes to the proxy and _isproxy is flagged; otherwise
    // it goes to host:port.
    // Returns true on success. On failure returns false, with
    // $this->status holding the error number reported by fsockopen() and
    // $this->error holding a matching message.
    if(!empty($this->proxy_host) && !empty($this->proxy_port))
    {
        $this->_isproxy = true;
        $host = $this->proxy_host;
        $port = $this->proxy_port;
    }
    else
    {
        $host = $this->host;
        $port = $this->port;
    }

    $this->status = 0;

    $fp = fsockopen(
        $host,
        $port,
        $errno,
        $errstr,
        $this->_fp_timeout
    );

    if($fp)
    {
        // socket connection succeeded
        return true;
    }

    // socket connection failed
    $this->status = $errno;
    switch($errno)
    {
        // Each case ends with break: without it every case falls through
        // to default and the specific message is overwritten by the
        // generic one.
        case -3:
            $this->error="socket creation failed (-3)";
            break;
        case -4:
            $this->error="dns lookup failure (-4)";
            break;
        case -5:
            $this->error="connection refused or timed out (-5)";
            break;
        default:
            $this->error="connection failed (".$errno.")";
            break;
    }
    return false;
}
|
| 1039 |
|
| 1040 |
/*======================================================================*\
|
| 1041 |
Function: _disconnect
|
| 1042 |
Purpose: disconnect a socket connection
|
| 1043 |
Input: $fp file pointer
|
| 1044 |
\*======================================================================*/
|
| 1045 |
|
| 1046 |
function _disconnect($fp)
{
    // Closes the file pointer $fp and passes the result of fclose()
    // straight back to the caller.
    $closed = fclose($fp);
    return $closed;
}
|
| 1050 |
|
| 1051 |
}
|
| 1052 |
|
| 1053 |
?>
|