<?php
// $Id$

/*******************************************************************************
Version: 0.98 ($Rev: 117 $)
Website: http://sourceforge.net/projects/simplehtmldom/
Author: S.C. Chen (me578022@gmail.com)
Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
Contributions by: Yousuke Kumakura (Attribute filters)
Licensed under The MIT License
Redistributions of files must retain the above copyright notice.
*******************************************************************************/

// Node types, stored in simple_html_dom_node::$nodetype.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT',    3);
define('HDOM_TYPE_ENDTAG',  4);
define('HDOM_TYPE_ROOT',    5);
define('HDOM_TYPE_UNKNOWN', 6);

// Quote style recorded for each parsed attribute value.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO',     3);

// Keys of the simple_html_dom_node::$info bookkeeping array.
define('HDOM_INFO_BEGIN',    0);
define('HDOM_INFO_END',      1);
define('HDOM_INFO_QUOTE',    2);
define('HDOM_INFO_SPACE',    3);
define('HDOM_INFO_TEXT',     4);
define('HDOM_INFO_INNER',    5);
define('HDOM_INFO_OUTER',    6);
define('HDOM_INFO_ENDSPACE', 7);

// helper functions
// -----------------------------------------------------------------------------
// get dom from file
/**
 * Build a DOM from a file or URL.
 *
 * Every argument is forwarded unchanged to file_get_contents(); the data it
 * returns is parsed with tag and attribute names lowercased.
 *
 * @return simple_html_dom
 */
function file_get_dom() {
    $arguments = func_get_args();
    $contents  = call_user_func_array('file_get_contents', $arguments);

    $document = new simple_html_dom;
    $document->load($contents, true);

    return $document;
}

// get dom from string
/**
 * Build a DOM from an HTML string.
 *
 * @param string $str       HTML source to parse
 * @param bool   $lowercase passed through to simple_html_dom::load()
 * @return simple_html_dom
 */
function str_get_dom($str, $lowercase=true) {
    $document = new simple_html_dom;
    $document->load($str, $lowercase);

    return $document;
}

// simple html dom node
// -----------------------------------------------------------------------------
class simple_html_dom_node {
    /** @var string tag name; stays 'text' until the parser assigns another one */
    public $tag = 'text';
    /** @var int one of the HDOM_TYPE_* constants */
    public $nodetype = HDOM_TYPE_TEXT;
    /** @var array attribute name => value; true marks a value-less attribute, null/false a removed one */
    public $attr = array();
    /** @var simple_html_dom_node|null enclosing node */
    public $parent = null;
    /** @var simple_html_dom_node[] direct child elements only */
    public $children = array();
    /** @var simple_html_dom|null owning document */
    public $dom = null;
    /** @var simple_html_dom_node[] every direct child node (elements, text, comments, stray end tags) */
    public $nodes = array();
    /**
     * Parser bookkeeping, keyed by the HDOM_INFO_* constants:
     *  - BEGIN:    index of this node in the document's flat node list (-1 for the root)
     *  - END:      index of the matching end-tag node, or 0 when there is none
     *  - TEXT:     raw text of text/comment/unknown nodes
     *  - ENDSPACE: whitespace (and optional '/') in front of the closing '>'
     *  - QUOTE:    per-attribute quote style (HDOM_QUOTE_*)
     *  - SPACE:    per-attribute whitespace triple: before the name, before '=', after '='
     *  - INNER / OUTER: present only after innertext / outertext was assigned
     *
     * @var array
     */
    public $info = array(
        HDOM_INFO_BEGIN=>-1,
        HDOM_INFO_END=>0,
        HDOM_INFO_TEXT=>'',
        HDOM_INFO_ENDSPACE=>'',
        HDOM_INFO_QUOTE=>array(),
        HDOM_INFO_SPACE=>array()
    );

    /**
     * @param simple_html_dom|null $dom document this node belongs to
     */
    public function __construct($dom=null) {
        $this->dom = $dom;
    }

    /**
     * Drop every property so the parent/child/document reference cycles are
     * broken (works around the PHP 5 circular-reference memory leak).
     * The node must not be used afterwards.
     */
    public function clear() {
        unset($this->tag);
        unset($this->nodetype);
        unset($this->attr);
        unset($this->parent);
        unset($this->children);
        unset($this->nodes);
        unset($this->dom);
        unset($this->info);
    }

    /**
     * @return simple_html_dom_node|null the enclosing node
     */
    public function parent() {
        return $this->parent;
    }

    /**
     * @param int $idx -1 for the whole list, otherwise a position in it
     * @return simple_html_dom_node[]|simple_html_dom_node|null all child
     *         elements, the child element at $idx, or null when $idx is unused
     */
    public function children($idx=-1) {
        if ($idx==-1) return $this->children;
        if (isset($this->children[$idx])) return $this->children[$idx];
        return null;
    }

    /**
     * @return simple_html_dom_node|null first child element
     */
    public function first_child() {
        if (count($this->children)>0) return $this->children[0];
        return null;
    }

    /**
     * @return simple_html_dom_node|null last child element
     */
    public function last_child() {
        if (($count=count($this->children))>0) return $this->children[$count-1];
        return null;
    }

    /**
     * @return simple_html_dom_node|null the element following this one in the
     *         parent's child list; null when there is none or when this node
     *         is not in that list (text and comment nodes are not)
     */
    public function next_sibling() {
        if ($this->parent===null) return null;

        $siblings = $this->parent->children;
        $idx = array_search($this, $siblings, true);
        if ($idx===false) return null;

        return isset($siblings[$idx+1]) ? $siblings[$idx+1] : null;
    }

    /**
     * @return simple_html_dom_node|null the element preceding this one in the
     *         parent's child list; null when there is none or when this node
     *         is not in that list
     */
    public function prev_sibling() {
        if ($this->parent===null) return null;

        $siblings = $this->parent->children;
        $idx = array_search($this, $siblings, true);
        // a node that is not listed has no position, hence no predecessor
        // (the old index walk fell through and returned the last child)
        if ($idx===false || $idx<1) return null;

        return $siblings[$idx-1];
    }

    /**
     * Inner HTML of the node.
     *
     * @return string the assigned innertext if any; the restored raw text for
     *         text/comment/unknown nodes; otherwise the concatenated outertext
     *         of all direct child nodes
     */
    public function innertext() {
        if (isset($this->info[HDOM_INFO_INNER])) return $this->info[HDOM_INFO_INNER];

        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT:
            case HDOM_TYPE_COMMENT:
            case HDOM_TYPE_UNKNOWN:
                return $this->dom->restore_noise($this->info[HDOM_INFO_TEXT]);
        }

        $ret = '';
        foreach ($this->nodes as $n)
            $ret .= $n->outertext();
        return $ret;
    }

    /**
     * Outer HTML of the node (begin tag, content, end tag).
     *
     * @return string
     */
    public function outertext() {
        if ($this->tag=='root') return $this->dom->save();
        if (isset($this->info[HDOM_INFO_OUTER])) return $this->info[HDOM_INFO_OUTER];

        // render begin tag
        $ret = $this->dom->nodes[$this->info[HDOM_INFO_BEGIN]]->makeup();

        // render inner text
        if (isset($this->info[HDOM_INFO_INNER])) {
            $ret .= $this->info[HDOM_INFO_INNER];
        }
        else {
            foreach ($this->nodes as $n)
                $ret .= $n->outertext();
        }

        // render end tag; END is 0 when the element has none
        if ($this->info[HDOM_INFO_END])
            $ret .= $this->dom->nodes[$this->info[HDOM_INFO_END]]->makeup($this->tag);

        return $ret;
    }

    /**
     * Text content of the node with markup removed.
     *
     * @return string the assigned innertext if any; '' for comments, unknown
     *         nodes and script/style elements; otherwise the concatenated
     *         plaintext of all direct child nodes
     */
    public function plaintext() {
        if (isset($this->info[HDOM_INFO_INNER])) return $this->info[HDOM_INFO_INNER];

        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->info[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT: return '';
            case HDOM_TYPE_UNKNOWN: return '';
        }
        if (strcasecmp($this->tag, 'script')==0) return '';
        if (strcasecmp($this->tag, 'style')==0) return '';

        $ret = '';
        foreach ($this->nodes as $n)
            $ret .= $n->plaintext();
        return $ret;
    }

    /**
     * Render this single node: its raw text, an end tag, or a begin tag with
     * all live attributes and their recorded spacing and quoting.
     *
     * @param string|null $tag tag name to print; defaults to the node's own
     * @return string
     */
    public function makeup($tag=null) {
        if ($tag===null) $tag = $this->tag;

        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT:
            case HDOM_TYPE_COMMENT:
            case HDOM_TYPE_UNKNOWN:
                return $this->dom->restore_noise($this->info[HDOM_INFO_TEXT]);
            case HDOM_TYPE_ENDTAG:
                return '</'.$tag.'>';
        }

        $ret = '<'.$tag;
        // $i follows the attribute position because the SPACE and QUOTE lists
        // are parallel to $attr and indexed by position, not by name
        $i = -1;
        foreach ($this->attr as $key=>$val) {
            ++$i;

            // skip removed attribute
            if ($val===null || $val===false) continue;

            $space = $this->info[HDOM_INFO_SPACE][$i];
            $ret .= $space[0];

            // no value attr: nowrap, checked selected...
            if ($val===true) {
                $ret .= $key;
                continue;
            }

            $quote = '';
            switch ($this->info[HDOM_INFO_QUOTE][$i]) {
                case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                case HDOM_QUOTE_SINGLE: $quote = '\''; break;
            }
            $ret .= $key.$space[1].'='.$space[2].$quote.$val.$quote;
        }
        return $ret . $this->info[HDOM_INFO_ENDSPACE] . '>';
    }

    /**
     * Find descendants by CSS selector.
     *
     * Supports comma separated groups, descendant chains ("div a"), tag, #id,
     * .class and [attr], [attr=val], [attr!=val], [attr^=val], [attr$=val],
     * [attr*=val]. Results come back in document order without duplicates.
     *
     * @param string $selector selector text; '*' returns the child elements
     * @param int    $idx      negative for every match, otherwise the match to return
     * @return simple_html_dom_node[]|simple_html_dom_node|null
     */
    public function find($selector, $idx=-1) {
        $selector = trim($selector);
        if ($selector=='*') return $this->children;

        $selectors = $this->parse_selector($selector);
        if (($count=count($selectors))==0) return array();
        $found_keys = array();

        // find each selector
        for ($c=0; $c<$count; ++$c) {
            // depth of THIS group; using the first group's depth for all of
            // them made "div a, p" read parts that do not exist
            if (($level=count($selectors[$c]))==0) return array();
            $head = array($this->info[HDOM_INFO_BEGIN]=>1);

            // handle descendant selectors, no recursive!
            for ($l=0; $l<$level; ++$l) {
                $ret = array();
                foreach ($head as $k=>$v) {
                    // the root is not part of the flat node list; its index is -1
                    $n = ($k==-1) ? $this->dom->root : $this->dom->nodes[$k];
                    $n->seek($selectors[$c][$l], $ret);
                }
                $head = $ret;
            }

            foreach ($head as $k=>$v) {
                if (!isset($found_keys[$k]))
                    $found_keys[$k] = 1;
            }
        }

        // node indexes grow in document order, so sorting keys sorts the result
        ksort($found_keys);

        $found = array();
        foreach ($found_keys as $k=>$v)
            $found[] = $this->dom->nodes[$k];

        // return nth-element or array
        if ($idx<0) return $found;
        return (isset($found[$idx])) ? $found[$idx] : null;
    }

    /**
     * Split selector text into groups (comma separated) of descendant steps.
     *
     * @param string $selector_string
     * @return array one entry per group; every entry is a list of
     *         array(tag, attribute name or null, value or null, operator)
     */
    protected function parse_selector($selector_string) {
        // pattern of CSS selectors, modified from mootools
        $pattern = "/([A-Za-z0-9_\\-:]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[(\w+)(?:([!*^$]?=)[\"']?([^\"']*)[\"']?)?])?/";

        // handle multiple selectors; explode() because the POSIX-regex
        // split() no longer exists as of PHP 7
        $selector_list = explode(',', $selector_string);
        $selectors = array();

        foreach ($selector_list as $selector) {
            $result = array();
            preg_match_all($pattern, trim($selector), $matches, PREG_SET_ORDER);

            foreach ($matches as $m) {
                // every part of the pattern is optional, so it also yields
                // empty matches between the real steps
                if ($m[0]=='') continue;

                $tag = $m[1];
                $key = null;
                $val = null;
                $exp = '=';

                if (!empty($m[2])) {$key='id'; $val=$m[2];}
                if (!empty($m[3])) {$key='class'; $val=$m[3];}
                if (!empty($m[4])) {$key=$m[4];}
                if (!empty($m[5])) {$exp=$m[5];}
                if (!empty($m[6])) {$val=$m[6];}

                // convert to lowercase
                if ($this->dom->lowercase) {
                    $tag = strtolower($tag);
                    if ($key!==null) $key = strtolower($key);
                }

                $result[] = array($tag, $key, $val, $exp);
            }
            $selectors[] = $result;
        }
        return $selectors;
    }

    /**
     * Collect every node inside this one that satisfies a single selector step.
     *
     * @param array $selector array(tag, key, val, exp) from parse_selector()
     * @param array $ret      receives node index => 1 for each match
     */
    protected function seek($selector, &$ret) {
        list($tag, $key, $val, $exp) = $selector;

        // without an end tag of its own the node is scanned up to the parent's end
        $end = $this->info[HDOM_INFO_END];
        if ($end==0)
            $end = $this->parent->info[HDOM_INFO_END]-1;

        for ($i=$this->info[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
            $node = $this->dom->nodes[$i];
            if ($node->nodetype==HDOM_TYPE_ENDTAG) continue;
            $pass = true;

            // compare tag
            if ($tag && $tag!=$node->tag) {$pass=false;}
            // compare key
            if ($pass && $key && !(isset($node->attr[$key]))) {$pass=false;}
            // compare value
            if ($pass && $key && $val) {
                $check = $this->match($exp, $val, $node->attr[$key]);

                // handle multiple class
                if (!$check && strcasecmp($key, 'class')==0) {
                    foreach (explode(' ', $node->attr[$key]) as $k) {
                        $check = $this->match($exp, $val, $k);
                        if ($check) break;
                    }
                }

                if (!$check)
                    $pass = false;
            }

            if ($pass)
                $ret[$i] = 1;
        }
        unset($node);
    }

    /**
     * Test an attribute value against a selector value.
     *
     * @param string $exp     operator: '=', '!=', '^=', '$=' or '*='
     * @param string $pattern value from the selector
     * @param mixed  $value   value of the node's attribute
     * @return bool true as well when the operator is not one of the above
     */
    protected function match($exp, $pattern, $value) {
        switch ($exp) {
            case '=':
                return $value===$pattern;
            case '!=':
                return $value!==$pattern;
            case '^=':
                return (bool) preg_match("/^".preg_quote($pattern,'/')."/", $value);
            case '$=':
                return (bool) preg_match("/".preg_quote($pattern,'/')."$/", $value);
            case '*=':
                return (bool) preg_match("/".preg_quote($pattern,'/')."/", $value);
        }
        return true;
    }

    /**
     * @return string same as outertext()
     */
    public function __toString() {
        return $this->outertext();
    }

    /**
     * Read an attribute, or one of outertext / innertext / plaintext.
     *
     * @param string $name
     * @return mixed the attribute value when it is set and not null; otherwise
     *         the requested text, or for any other name a bool telling
     *         whether that attribute key exists at all
     */
    public function __get($name) {
        if (isset($this->attr[$name])) return $this->attr[$name];
        switch ($name) {
            case 'outertext': return $this->outertext();
            case 'innertext': return $this->innertext();
            case 'plaintext': return $this->plaintext();
            default: return array_key_exists($name, $this->attr);
        }
    }

    /**
     * Assign outertext or innertext, or set an attribute. Assigning
     * plaintext is accepted but has no effect.
     *
     * @param string $name
     * @param mixed  $value null or false removes the attribute from the output
     */
    public function __set($name, $value) {
        switch ($name) {
            case 'outertext':
                $this->info[HDOM_INFO_OUTER] = $value;
                return;
            case 'innertext':
                $this->info[HDOM_INFO_INNER] = $value;
                return;
            case 'plaintext':
                // plaintext is derived from the child nodes and cannot be stored
                return;
        }
        // a new attribute needs its slot in the parallel spacing/quote lists;
        // a removed one (stored as null) still occupies its slot
        if (!array_key_exists($name, $this->attr)) {
            $this->info[HDOM_INFO_SPACE][] = array(' ', '', '');
            $this->info[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        }
        $this->attr[$name] = $value;
    }

    /**
     * @param string $name
     * @return bool true for outertext / innertext / plaintext and for every
     *         attribute key that exists, value-less and removed ones included
     */
    public function __isset($name) {
        switch ($name) {
            case 'outertext': return true;
            case 'innertext': return true;
            case 'plaintext': return true;
        }
        // no value attr: nowrap, checked selected...
        return array_key_exists($name, $this->attr);
    }

    // camel naming conventions
    public function getAttribute($name) {return $this->__get($name);}
    public function setAttribute($name, $value) {$this->__set($name, $value);}
    public function hasAttribute($name) {return $this->__isset($name);}
    public function removeAttribute($name) {$this->__set($name, null);}
    public function getElementById($id) {return $this->find("#$id", 0);}
    public function getElementsById($id, $idx=-1) {return $this->find("#$id", $idx);}
    public function getElementByTagName($name) {return $this->find($name, 0);}
    public function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
    public function parentNode() {return $this->parent();}
    public function childNodes($idx=-1) {return $this->children($idx);}
    public function firstChild() {return $this->first_child();}
    public function lastChild() {return $this->last_child();}
    public function nextSibling() {return $this->next_sibling();}
    public function previousSibling() {return $this->prev_sibling();}
}

// simple html dom parser
// -----------------------------------------------------------------------------
class simple_html_dom {
    /** @var simple_html_dom_node[] every parsed node in document order (the root is not included) */
    public $nodes = array();
    /** @var simple_html_dom_node|null synthetic root node */
    public $root = null;
    /** @var bool whether tag and attribute names are lowercased while parsing */
    public $lowercase = false;
    /** @var string document text, with noise replaced by placeholders */
    protected $html = '';
    /** @var simple_html_dom_node|null element currently open while parsing */
    protected $parent = null;
    /** @var int byte offset of the read cursor in $html */
    protected $pos;
    /** @var string|null character under the cursor, null past the end */
    protected $char;
    /** @var int byte length of $html */
    protected $size;
    /** @var int index the next created node will get in $nodes */
    protected $index;
    /** @var callable|null invoked with every visited node by save() */
    public $callback = null;
    /** @var array placeholder key => original text taken out by remove_noise() */
    protected $noise = array();
    // use isset instead of in_array, performance boost about 30%...
    protected $token_blank = array(' '=>1, "\t"=>1, "\r"=>1, "\n"=>1);
    protected $token_equal = array(' '=>1, '='=>1, '/'=>1, '>'=>1, '<'=>1);
    protected $token_slash = array(' '=>1, '/'=>1, '>'=>1, "\r"=>1, "\n"=>1, "\t"=>1);
    protected $token_attr = array(' '=>1, '>'=>1);
    // elements that never become the current parent
    protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
    // end tags that may close several open elements at once
    protected $block_tags = array('div'=>1, 'span'=>1, 'table'=>1, 'form'=>1, 'dl'=>1, 'ol'=>1);
    // opening tag => open parent tags which that opening tag closes implicitly
    protected $optional_closing_tags = array(
        'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
        'th'=>array('th'=>1),
        'td'=>array('td'=>1),
        'ul'=>array('ul'=>1, 'li'=>1),
        'li'=>array('li'=>1),
        'dt'=>array('dt'=>1, 'dd'=>1),
        'dd'=>array('dd'=>1, 'dt'=>1),
        'p'=>array('p'=>1),
    );

    /**
     * Parse an HTML string, replacing any previously loaded document.
     *
     * The contents of comments, style, script, pre and code elements and of
     * server-side script blocks are swapped for placeholders first, so the
     * tokenizer never looks inside them; they are put back on output.
     *
     * @param string $str       HTML source
     * @param bool   $lowercase lowercase tag and attribute names
     */
    public function load($str, $lowercase=true) {
        // prepare
        $this->prepare($str, $lowercase);
        // strip out comments
        $this->remove_noise("'<!--(.*?)-->'is", false, false);
        // strip out <style> tags
        $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false, false);
        $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false, false);
        // strip out <script> tags
        $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is", false, false);
        $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is", false, false);
        // strip out <pre> tags
        $this->remove_noise("'<\s*pre[^>]*>(.*?)<\s*/\s*pre\s*>'is", false, false);
        // strip out <code> tags
        $this->remove_noise("'<\s*code[^>]*>(.*?)<\s*/\s*code\s*>'is", false, false);
        // strip out server side scripts
        $this->remove_noise("'(<\?)(.*?)(\?>)'is", false, false);
        // parsing
        while ($this->parse());
        $this->root->info[HDOM_INFO_END] = $this->index;
    }

    /**
     * Load a document from a file or URL. All arguments are forwarded to
     * file_get_contents(); names are lowercased.
     */
    public function load_file() {
        $args = func_get_args();
        $this->load(call_user_func_array('file_get_contents', $args), true);
    }

    /**
     * Register a callback that save() invokes with each node it visits.
     *
     * @param callable $function_name
     */
    public function set_callback($function_name) {
        $this->callback = $function_name;
    }

    /**
     * Serialize the document.
     *
     * @param string $filepath when not '', the result is also written there
     * @return string
     */
    public function save($filepath='') {
        $ret = '';
        $count = count($this->nodes);

        for ($i=0; $i<$count; ++$i) {
            $node = $this->nodes[$i];

            // trigger callback
            if ($this->callback!==null)
                call_user_func($this->callback, $node);

            // outertext defined: it stands in for the node and, when the
            // element has an end tag, for everything up to and including it
            if (isset($node->info[HDOM_INFO_OUTER])) {
                $ret .= $node->info[HDOM_INFO_OUTER];
                if ($node->info[HDOM_INFO_END]>0)
                    $i = $node->info[HDOM_INFO_END];
                continue;
            }

            $ret .= $node->makeup();

            // innertext defined: it stands in for the content; resume at the end tag
            if (isset($node->info[HDOM_INFO_INNER]) && $node->info[HDOM_INFO_END]>0) {
                $ret .= $node->info[HDOM_INFO_INNER];
                if ($node->info[HDOM_INFO_END]-1>$i)
                    $i = $node->info[HDOM_INFO_END]-1;
            }
        }
        if ($filepath!=='') file_put_contents($filepath, $ret);
        return $ret;
    }

    /**
     * Find nodes by CSS selector, starting at the root.
     *
     * @param string $selector
     * @param int    $idx negative for every match, otherwise the match to return
     * @return simple_html_dom_node[]|simple_html_dom_node|null
     */
    public function find($selector, $idx=-1) {
        return $this->root->find($selector, $idx);
    }

    /**
     * Reset all parser state and create a fresh root for $str.
     *
     * @param string $str
     * @param bool   $lowercase
     */
    public function prepare($str, $lowercase=true) {
        $this->clear();
        $this->noise = array();
        $this->nodes = array();
        $this->html = $str;
        $this->lowercase = $lowercase;
        $this->index = 0;
        $this->pos = 0;
        $this->root = new simple_html_dom_node($this);
        $this->root->tag = 'root';
        $this->root->nodetype = HDOM_TYPE_ROOT;
        $this->parent = $this->root;
        // set the length of content
        $this->size = strlen($str);
        // null marks "past the end", which is where an empty document starts
        $this->char = ($this->size>0) ? $this->html[0] : null;
    }

    /**
     * Clear every node and drop the references held by the document (works
     * around the PHP 5 circular-reference memory leak).
     */
    public function clear() {
        foreach ($this->nodes as $n) {
            $n->clear();
            unset($n);
        }

        if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);}
        if (isset($this->root)) {$this->root->clear(); unset($this->root);}
        unset($this->html);
        unset($this->noise);
    }

    /**
     * Consume the next token: a run of text in front of the next '<', or,
     * when the cursor already is on '<', a tag.
     *
     * @return simple_html_dom_node|null the created node, null at end of input
     */
    public function parse() {
        $s = $this->copy_until_char('<');
        if ($s=='') return $this->read_tag();

        // text
        $node = new simple_html_dom_node($this);
        $this->nodes[] = $node;
        $node->info[HDOM_INFO_BEGIN] = $this->index;
        $node->info[HDOM_INFO_TEXT] = $s;
        $node->parent = $this->parent;
        $this->parent->nodes[] = $node;

        ++$this->index;
        return $node;
    }

    /**
     * Read one tag at the cursor and link the resulting node into the tree.
     * Produces end-tag nodes, comment / unknown nodes ('<!...'), text nodes
     * for anything that is not a valid tag name, and element nodes including
     * their attributes.
     *
     * @return simple_html_dom_node|null null when the cursor is not on '<'
     */
    protected function read_tag() {
        if ($this->char!='<') {
            $this->root->info[HDOM_INFO_END] = $this->index;
            return null;
        }
        $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next

        $node = new simple_html_dom_node($this);
        $this->nodes[] = $node;
        $node->info[HDOM_INFO_BEGIN] = $this->index;
        ++$this->index;

        // end tag
        if ($this->char=='/') {
            $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
            $this->skip($this->token_blank);
            $node->nodetype = HDOM_TYPE_ENDTAG;
            $node->tag = $this->copy_until_char('>');
            $tag_lower = strtolower($node->tag);
            if ($this->lowercase) $node->tag = $tag_lower;

            // mapping parent node
            // The root has no parent and must never be popped: a literal
            // end tag named "root" in the input would otherwise leave the
            // parser without a current parent.
            $closes_parent = $this->parent->parent!==null
                && strtolower($this->parent->tag)===$tag_lower;

            if (!$closes_parent) {
                if (isset($this->block_tags[$tag_lower])) {
                    // close unterminated elements up to the matching ancestor
                    $this->parent->info[HDOM_INFO_END] = 0;
                    while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                        $this->parent = $this->parent->parent;
                }
                else {
                    // stray end tag: keep it as a child so it is printed back
                    $node->info[HDOM_INFO_END] = $this->index-1;
                    $node->info[HDOM_INFO_TEXT] = '</' . $node->tag . '>';
                    $this->parent->nodes[] = $node;
                }
                $this->parent->info[HDOM_INFO_END] = $this->index-1;
            }
            else {
                $this->parent->info[HDOM_INFO_END] = $this->index-1;
                $this->parent = $this->parent->parent;
            }

            $node->parent = $this->parent;
            $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
            return $node;
        }

        $node->tag = $this->copy_until($this->token_slash);
        $node->parent = $this->parent;

        // doctype, cdata & comments...
        if (isset($node->tag[0]) && $node->tag[0]=='!') {
            $node->info[HDOM_INFO_TEXT] = '<' . $node->tag . $this->copy_until_char('>');

            if (isset($node->tag[2]) && $node->tag[1]=='-' && $node->tag[2]=='-') {
                $node->nodetype = HDOM_TYPE_COMMENT;
                $node->tag = 'comment';
            } else {
                $node->nodetype = HDOM_TYPE_UNKNOWN;
                $node->tag = 'unknown';
            }

            if ($this->char=='>') $node->info[HDOM_INFO_TEXT].='>';
            $this->parent->nodes[] = $node;
            $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
            return $node;
        }

        // text
        if (!preg_match("/^[A-Za-z0-9_\\-:]+$/", $node->tag)) {
            $node->info[HDOM_INFO_TEXT] = '<' . $node->tag . $this->copy_until_char('>');
            if ($this->char=='>') $node->info[HDOM_INFO_TEXT].='>';
            $this->parent->nodes[] = $node;
            $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
            return $node;
        }

        // begin tag
        $node->nodetype = HDOM_TYPE_ELEMENT;
        $tag_lower = strtolower($node->tag);
        if ($this->lowercase) $node->tag = $tag_lower;

        // handle optional closing tags
        if (isset($this->optional_closing_tags[$tag_lower]) ) {
            while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
                $this->parent->info[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            $node->parent = $this->parent;
        }
        $this->parent->children[] = $node;
        $this->parent->nodes[] = $node;

        $guard = 0; // prevent infinity loop
        $space = array($this->copy_skip($this->token_blank), '', '');

        // handle attributes
        do {
            if ($this->char!==null && $space[0]=='') break;
            $name = $this->copy_until($this->token_equal);

            // cursor did not move since the last round: step over the character
            if ($guard==$this->pos) {
                $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
                continue;
            }
            $guard = $this->pos;

            // handle endless '<'
            if ($this->pos>=$this->size-1 && $this->char!='>') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->info[HDOM_INFO_END] = 0;
                $node->info[HDOM_INFO_TEXT] = '<'.$node->tag . $space[0] . $name;
                $node->tag = 'text';
                return $node;
            }

            if ($name!='/' && $name!='') {
                $space[1] = $this->copy_skip($this->token_blank);
                if ($this->lowercase) $name = strtolower($name);
                if ($this->char=='=') {
                    $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
                    $this->parse_attr($node, $name, $space);
                }
                else {
                    //no value attr: nowrap, checked selected...
                    $node->info[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                    $node->attr[$name] = true;
                    if ($this->char!='>') $this->char = $this->html[--$this->pos]; // prev
                }
                $node->info[HDOM_INFO_SPACE][] = $space;
                $space = array($this->copy_skip($this->token_blank), '', '');
            }
            else
                break;
        } while ($this->char!='>' && $this->char!='/');

        $node->info[HDOM_INFO_ENDSPACE] = $space[0];

        // check self closing
        if ($this->copy_until_char_escape('>')=='/') {
            $node->info[HDOM_INFO_ENDSPACE] .= '/';
            $node->info[HDOM_INFO_END] = 0;
        }
        else {
            // reset parent
            if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
        }
        $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
        return $node;
    }

    /**
     * Read an attribute value (double quoted, single quoted or bare) at the
     * cursor and store it on the node together with its quote style.
     *
     * @param simple_html_dom_node $node
     * @param string               $name  attribute name
     * @param array                $space spacing triple; slot 2 receives the
     *                                    whitespace between '=' and the value
     */
    protected function parse_attr($node, $name, &$space) {
        $space[2] = $this->copy_skip($this->token_blank);
        switch ($this->char) {
            case '"':
                $node->info[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
                $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
                $value = $this->copy_until_char_escape('"');
                $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
                break;
            case '\'':
                $node->info[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
                $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
                $value = $this->copy_until_char_escape('\'');
                $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
                break;
            default:
                $node->info[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $value = $this->copy_until($this->token_attr);
        }
        $node->attr[$name] = $this->restore_noise($value);
    }

    /**
     * Advance the cursor past every character that is a key of $chars.
     *
     * @param array $chars character => 1 lookup set
     */
    protected function skip($chars) {
        while ($this->char!==null) {
            if (!isset($chars[$this->char])) return;
            $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
        }
    }

    /**
     * Like skip(), but return the characters that were passed over.
     *
     * @param array $chars character => 1 lookup set
     * @return string
     */
    protected function copy_skip($chars) {
        $ret = '';
        while ($this->char!==null) {
            if (!isset($chars[$this->char])) return $ret;
            $ret .= $this->char;
            $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
        }
        return $ret;
    }

    /**
     * Copy characters up to (not including) the first one that is a key of
     * $chars, or up to the end of input.
     *
     * @param array $chars character => 1 lookup set
     * @return string
     */
    protected function copy_until($chars) {
        $ret = '';
        while ($this->char!==null) {
            if (isset($chars[$this->char])) return $ret;
            $ret .= $this->char;
            $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next
        }
        return $ret;
    }

    /**
     * Copy everything up to (not including) the next occurrence of $char and
     * leave the cursor on it; without an occurrence the rest of the input is
     * consumed.
     *
     * @param string $char
     * @return string '' when the cursor already is on $char or past the end
     */
    protected function copy_until_char($char) {
        if ($this->char===null) return '';

        if (($pos = strpos($this->html, $char, $this->pos))===false) {
            $ret = substr($this->html, $this->pos, $this->size-$this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $ret;
        }

        if ($pos==$this->pos) return '';

        $ret = substr($this->html, $this->pos, $pos-$this->pos);
        $this->char = $this->html[$pos];
        $this->pos = $pos;
        return $ret;
    }

    /**
     * Same as copy_until_char(), except that an occurrence of $char directly
     * preceded by a backslash does not end the copy.
     *
     * @param string $char
     * @return string
     */
    protected function copy_until_char_escape($char) {
        if ($this->char===null) return '';

        $start = $this->pos;
        while (1) {
            if (($pos = strpos($this->html, $char, $start))===false) {
                $ret = substr($this->html, $this->pos, $this->size-$this->pos);
                $this->char = null;
                $this->pos = $this->size;
                return $ret;
            }

            if ($pos==$this->pos) return '';

            if ($this->html[$pos-1]==='\\') {
                $start = $pos+1;
                continue;
            }

            $ret = substr($this->html, $this->pos, $pos-$this->pos);
            $this->char = $this->html[$pos];
            $this->pos = $pos;
            return $ret;
        }
    }

    /**
     * Replace every match of $pattern in the document with a placeholder and
     * remember the replaced text for restore_noise().
     *
     * A placeholder is '___noise___' followed by the entry number padded with
     * spaces to five characters, so the key length stays fixed for up to
     * 100000 entries (the former three characters broke at entry 1000).
     *
     * @param string $pattern         PCRE pattern; capture group 1 is used
     *                                when $remove_tag is false
     * @param bool   $remove_tag      replace the whole match instead of group 1
     * @param bool   $remove_contents remember '' instead of the replaced text
     */
    public function remove_noise($pattern, $remove_tag=true, $remove_contents=true) {
        $count = preg_match_all($pattern, $this->html, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
        $idx = ($remove_tag) ? 0 : 1;

        // back to front, so the offsets of earlier matches stay valid
        for ($i=$count-1; $i>-1; --$i) {
            $key = '___noise___'.sprintf('% 5d', count($this->noise));
            $this->noise[$key] = ($remove_contents) ? '' : $matches[$i][$idx][0];
            $this->html = substr_replace($this->html, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
        }

        // reset the length of content
        $this->size = strlen($this->html);
        if ($this->size>0)
            $this->char = $this->html[0];
    }

    /**
     * Put the text removed by remove_noise() back into $text.
     *
     * Restored text is scanned again because it can itself contain
     * placeholders (a comment inside a script block, for example). Marker
     * text that is not a known placeholder is left as it is, and the number
     * of substitutions is capped by the number of stored entries, so input
     * that merely looks like a placeholder cannot keep the loop running.
     *
     * @param string $text
     * @return string
     */
    public function restore_noise($text) {
        $marker = '___noise___';
        $marker_len = strlen($marker);
        $key_len = $marker_len+5;
        // every placeholder was put into the document exactly once
        $budget = empty($this->noise) ? 0 : count($this->noise);
        $offset = 0;

        while (($pos=strpos($text, $marker, $offset))!==false) {
            $key = substr($text, $pos, $key_len);
            if ($budget>0 && isset($this->noise[$key])) {
                $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+$key_len);
                $offset = $pos;
                --$budget;
            }
            else {
                $offset = $pos+$marker_len;
            }
        }
        return $text;
    }

    /**
     * @return string same as save()
     */
    public function __toString() {
        return $this->save();
    }

    /**
     * @param string $name 'outertext', 'innertext' or 'plaintext'
     * @return string|null the requested text of the whole document, null for
     *         any other name
     */
    public function __get($name) {
        switch ($name) {
            case 'outertext': return $this->save();
            case 'innertext': return $this->root->innertext();
            case 'plaintext': return $this->root->plaintext();
        }
        return null;
    }

    // camel naming conventions
    public function childNodes($idx=-1) {return $this->root->childNodes($idx);}
    public function firstChild() {return $this->root->first_child();}
    public function lastChild() {return $this->root->last_child();}
    public function getElementById($id) {return $this->find("#$id", 0);}
    public function getElementsById($id, $idx=-1) {return $this->find("#$id", $idx);}
    public function getElementByTagName($name) {return $this->find($name, 0);}
    public function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
    public function loadFile() {$args = func_get_args(); call_user_func_array(array($this, 'load_file'), $args);}
}
?>