/*
 * HTML Parser By John Resig (ejohn.org)
 * Original code by Erik Arvidsson, Mozilla Public License
 * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
 *
 * // Use like so:
 * HTMLParser(htmlString, {
 *     start: function(tag, attrs, unary) {},
 *     end: function(tag) {},
 *     chars: function(text) {},
 *     comment: function(text) {}
 * });
 *
 * // or to get an XML string:
 * HTMLtoXML(htmlString);
 *
 * // or to get an XML DOM Document
 * HTMLtoDOM(htmlString);
 *
 * // or to inject into an existing document/DOM node
 * HTMLtoDOM(htmlString, document);
 * HTMLtoDOM(htmlString, document.body);
 *
 */
| 25 |
|
| 26 |
(function(){
|
| 27 |
|
| 28 |
// Regular Expressions for parsing tags and attributes
|
| 29 |
var startTag = /^<(\w+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
|
| 30 |
endTag = /^<\/(\w+)[^>]*>/,
|
| 31 |
attr = /(\w+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;
|
| 32 |
|
| 33 |
// Empty Elements - HTML 4.01
|
| 34 |
var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");
|
| 35 |
|
| 36 |
// Block Elements - HTML 4.01
|
| 37 |
var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");
|
| 38 |
|
| 39 |
// Inline Elements - HTML 4.01
|
| 40 |
var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");
|
| 41 |
|
| 42 |
// Elements that you can, intentionally, leave open
|
| 43 |
// (and which close themselves)
|
| 44 |
var closeSelf = makeMap("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr");
|
| 45 |
|
| 46 |
// Attributes that have their values filled in disabled="disabled"
|
| 47 |
var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");
|
| 48 |
|
| 49 |
// Special Elements (can contain anything)
|
| 50 |
var special = makeMap("script,style");
|
| 51 |
|
| 52 |
var HTMLParser = this.HTMLParser = function( html, handler ) {
|
| 53 |
var index, chars, match, stack = [], last = html;
|
| 54 |
stack.last = function(){
|
| 55 |
return this[ this.length - 1 ];
|
| 56 |
};
|
| 57 |
|
| 58 |
while ( html ) {
|
| 59 |
chars = true;
|
| 60 |
|
| 61 |
// Make sure we're not in a script or style element
|
| 62 |
if ( !stack.last() || !special[ stack.last() ] ) {
|
| 63 |
|
| 64 |
// Comment
|
| 65 |
if ( html.indexOf("<!--") == 0 ) {
|
| 66 |
index = html.indexOf("-->");
|
| 67 |
|
| 68 |
if ( index >= 0 ) {
|
| 69 |
if ( handler.comment )
|
| 70 |
handler.comment( html.substring( 4, index ) );
|
| 71 |
html = html.substring( index + 3 );
|
| 72 |
chars = false;
|
| 73 |
}
|
| 74 |
|
| 75 |
// end tag
|
| 76 |
} else if ( html.indexOf("</") == 0 ) {
|
| 77 |
match = html.match( endTag );
|
| 78 |
|
| 79 |
if ( match ) {
|
| 80 |
html = html.substring( match[0].length );
|
| 81 |
match[0].replace( endTag, parseEndTag );
|
| 82 |
chars = false;
|
| 83 |
}
|
| 84 |
|
| 85 |
// start tag
|
| 86 |
} else if ( html.indexOf("<") == 0 ) {
|
| 87 |
match = html.match( startTag );
|
| 88 |
|
| 89 |
if ( match ) {
|
| 90 |
html = html.substring( match[0].length );
|
| 91 |
match[0].replace( startTag, parseStartTag );
|
| 92 |
chars = false;
|
| 93 |
}
|
| 94 |
}
|
| 95 |
|
| 96 |
if ( chars ) {
|
| 97 |
index = html.indexOf("<");
|
| 98 |
|
| 99 |
var text = index < 0 ? html : html.substring( 0, index );
|
| 100 |
html = index < 0 ? "" : html.substring( index );
|
| 101 |
|
| 102 |
if ( handler.chars )
|
| 103 |
handler.chars( text );
|
| 104 |
}
|
| 105 |
|
| 106 |
} else {
|
| 107 |
html = html.replace(new RegExp("(.*)<\/" + stack.last() + "[^>]*>"), function(all, text){
|
| 108 |
text = text.replace(/<!--(.*?)-->/g, "$1")
|
| 109 |
.replace(/<!\[CDATA\[(.*?)]]>/g, "$1");
|
| 110 |
|
| 111 |
if ( handler.chars )
|
| 112 |
handler.chars( text );
|
| 113 |
|
| 114 |
return "";
|
| 115 |
});
|
| 116 |
|
| 117 |
parseEndTag( "", stack.last() );
|
| 118 |
}
|
| 119 |
|
| 120 |
if ( html == last )
|
| 121 |
throw "Parse Error: " + html;
|
| 122 |
last = html;
|
| 123 |
}
|
| 124 |
|
| 125 |
// Clean up any remaining tags
|
| 126 |
parseEndTag();
|
| 127 |
|
| 128 |
function parseStartTag( tag, tagName, rest, unary ) {
|
| 129 |
if ( block[ tagName ] ) {
|
| 130 |
while ( stack.last() && inline[ stack.last() ] ) {
|
| 131 |
parseEndTag( "", stack.last() );
|
| 132 |
}
|
| 133 |
}
|
| 134 |
|
| 135 |
if ( closeSelf[ tagName ] && stack.last() == tagName ) {
|
| 136 |
parseEndTag( "", tagName );
|
| 137 |
}
|
| 138 |
|
| 139 |
unary = empty[ tagName ] || !!unary;
|
| 140 |
|
| 141 |
if ( !unary )
|
| 142 |
stack.push( tagName );
|
| 143 |
|
| 144 |
if ( handler.start ) {
|
| 145 |
var attrs = [];
|
| 146 |
|
| 147 |
rest.replace(attr, function(match, name) {
|
| 148 |
var value = arguments[2] ? arguments[2] :
|
| 149 |
arguments[3] ? arguments[3] :
|
| 150 |
arguments[4] ? arguments[4] :
|
| 151 |
fillAttrs[name] ? name : "";
|
| 152 |
|
| 153 |
attrs.push({
|
| 154 |
name: name,
|
| 155 |
value: value,
|
| 156 |
escaped: value.replace(/(^|[^\\])"/g, '$1\\\"') //"
|
| 157 |
});
|
| 158 |
});
|
| 159 |
|
| 160 |
if ( handler.start )
|
| 161 |
handler.start( tagName, attrs, unary );
|
| 162 |
}
|
| 163 |
}
|
| 164 |
|
| 165 |
function parseEndTag( tag, tagName ) {
|
| 166 |
// If no tag name is provided, clean shop
|
| 167 |
if ( !tagName )
|
| 168 |
var pos = 0;
|
| 169 |
|
| 170 |
// Find the closest opened tag of the same type
|
| 171 |
else
|
| 172 |
for ( var pos = stack.length - 1; pos >= 0; pos-- )
|
| 173 |
if ( stack[ pos ] == tagName )
|
| 174 |
break;
|
| 175 |
|
| 176 |
if ( pos >= 0 ) {
|
| 177 |
// Close all the open elements, up the stack
|
| 178 |
for ( var i = stack.length - 1; i >= pos; i-- )
|
| 179 |
if ( handler.end )
|
| 180 |
handler.end( stack[ i ] );
|
| 181 |
|
| 182 |
// Remove the open elements from the stack
|
| 183 |
stack.length = pos;
|
| 184 |
}
|
| 185 |
}
|
| 186 |
};
|
| 187 |
|
| 188 |
this.HTMLtoXML = function( html ) {
|
| 189 |
var results = "";
|
| 190 |
|
| 191 |
HTMLParser(html, {
|
| 192 |
start: function( tag, attrs, unary ) {
|
| 193 |
results += "<" + tag;
|
| 194 |
|
| 195 |
for ( var i = 0; i < attrs.length; i++ )
|
| 196 |
results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';
|
| 197 |
|
| 198 |
results += (unary ? "/" : "") + ">";
|
| 199 |
},
|
| 200 |
end: function( tag ) {
|
| 201 |
results += "</" + tag + ">";
|
| 202 |
},
|
| 203 |
chars: function( text ) {
|
| 204 |
results += text;
|
| 205 |
},
|
| 206 |
comment: function( text ) {
|
| 207 |
results += "<!--" + text + "-->";
|
| 208 |
}
|
| 209 |
});
|
| 210 |
|
| 211 |
return results;
|
| 212 |
};
|
| 213 |
|
| 214 |
this.HTMLtoDOM = function( html, doc ) {
|
| 215 |
// There can be only one of these elements
|
| 216 |
var one = makeMap("html,head,body,title");
|
| 217 |
|
| 218 |
// Enforce a structure for the document
|
| 219 |
var structure = {
|
| 220 |
link: "head",
|
| 221 |
base: "head"
|
| 222 |
};
|
| 223 |
|
| 224 |
if ( !doc ) {
|
| 225 |
if ( typeof DOMDocument != "undefined" )
|
| 226 |
doc = new DOMDocument();
|
| 227 |
else if ( typeof document != "undefined" && document.implementation && document.implementation.createDocument )
|
| 228 |
doc = document.implementation.createDocument("", "", null);
|
| 229 |
else if ( typeof ActiveX != "undefined" )
|
| 230 |
doc = new ActiveXObject("Msxml.DOMDocument");
|
| 231 |
|
| 232 |
} else
|
| 233 |
doc = doc.ownerDocument ||
|
| 234 |
doc.getOwnerDocument && doc.getOwnerDocument() ||
|
| 235 |
doc;
|
| 236 |
|
| 237 |
var elems = [],
|
| 238 |
documentElement = doc.documentElement ||
|
| 239 |
doc.getDocumentElement && doc.getDocumentElement();
|
| 240 |
|
| 241 |
// If we're dealing with an empty document then we
|
| 242 |
// need to pre-populate it with the HTML document structure
|
| 243 |
if ( !documentElement && doc.createElement ) (function(){
|
| 244 |
var html = doc.createElement("html");
|
| 245 |
var head = doc.createElement("head");
|
| 246 |
head.appendChild( doc.createElement("title") );
|
| 247 |
html.appendChild( head );
|
| 248 |
html.appendChild( doc.createElement("body") );
|
| 249 |
doc.appendChild( html );
|
| 250 |
})();
|
| 251 |
|
| 252 |
// Find all the unique elements
|
| 253 |
if ( doc.getElementsByTagName )
|
| 254 |
for ( var i in one )
|
| 255 |
one[ i ] = doc.getElementsByTagName( i )[0];
|
| 256 |
|
| 257 |
// If we're working with a document, inject contents into
|
| 258 |
// the body element
|
| 259 |
var curParentNode = one.body;
|
| 260 |
|
| 261 |
HTMLParser( html, {
|
| 262 |
start: function( tagName, attrs, unary ) {
|
| 263 |
// If it's a pre-built element, then we can ignore
|
| 264 |
// its construction
|
| 265 |
if ( one[ tagName ] ) {
|
| 266 |
curParentNode = one[ tagName ];
|
| 267 |
return;
|
| 268 |
}
|
| 269 |
|
| 270 |
var elem = doc.createElement( tagName );
|
| 271 |
|
| 272 |
for ( var attr in attrs )
|
| 273 |
elem.setAttribute( attrs[ attr ].name, attrs[ attr ].value );
|
| 274 |
|
| 275 |
if ( structure[ tagName ] && typeof one[ structure[ tagName ] ] != "boolean" )
|
| 276 |
one[ structure[ tagName ] ].appendChild( elem );
|
| 277 |
|
| 278 |
else if ( curParentNode && curParentNode.appendChild )
|
| 279 |
curParentNode.appendChild( elem );
|
| 280 |
|
| 281 |
if ( !unary ) {
|
| 282 |
elems.push( elem );
|
| 283 |
curParentNode = elem;
|
| 284 |
}
|
| 285 |
},
|
| 286 |
end: function( tag ) {
|
| 287 |
elems.length -= 1;
|
| 288 |
|
| 289 |
// Init the new parentNode
|
| 290 |
curParentNode = elems[ elems.length - 1 ];
|
| 291 |
},
|
| 292 |
chars: function( text ) {
|
| 293 |
curParentNode.appendChild( doc.createTextNode( text ) );
|
| 294 |
},
|
| 295 |
comment: function( text ) {
|
| 296 |
// create comment node
|
| 297 |
}
|
| 298 |
});
|
| 299 |
|
| 300 |
return doc;
|
| 301 |
};
|
| 302 |
|
| 303 |
function makeMap(str){
|
| 304 |
var obj = {}, items = str.split(",");
|
| 305 |
for ( var i = 0; i < items.length; i++ )
|
| 306 |
obj[ items[i] ] = true;
|
| 307 |
return obj;
|
| 308 |
}
|
| 309 |
})();
|