/[drupal]/contributions/modules/liquid/mw_sanitizer.inc
ViewVC logotype

Contents of /contributions/modules/liquid/mw_sanitizer.inc

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.3 - (show annotations) (download) (as text)
Thu Feb 8 21:02:10 2007 UTC (2 years, 9 months ago) by sorenp
Branch: MAIN
CVS Tags: DRUPAL-5--0-1, HEAD
Branch point for: DRUPAL-5
Changes since 1.2: +1 -1 lines
File MIME type: text/x-php
The next development version of liquid. Updated for Drupal 5.1 (new files)
1 <?php
2
3 // $Id$
4
5 /**
6 * (X)HTML sanitizer for MediaWiki
7 *
8 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
9 * http://www.mediawiki.org/
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write to the Free Software Foundation, Inc.,
23 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
24 * http://www.gnu.org/copyleft/gpl.html
25 *
26 * @package MediaWiki
27 * @subpackage Parser
28 */
29
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
 *
 * Capture groups (the callbacks depend on this numbering):
 *   1 - name of a named entity reference        (&name;)
 *   2 - digits of a decimal reference           (&#123;)
 *   3 - digits of a lowercase-x hex reference   (&#x1f;)
 *   4 - digits of an uppercase-X hex reference  (&#X1F;)
 *   5 - a bare ampersand that starts none of the above
 *
 * The hex groups accept hexadecimal digits only. Anything else after
 * "&#x" is not a character reference, so only its leading "&" matches
 * (group 5) and the remainder is left alone as plain text.
 */
define( 'MW_CHAR_REFS_REGEX',
    '/&([A-Za-z0-9]+);
     |&\#([0-9]+);
     |&\#x([0-9A-Fa-f]+);
     |&\#X([0-9A-Fa-f]+);
     |(&)/x' );
40
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Deliberately lenient about what it accepts.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes.
 *
 * Capture groups:
 *   1 - attribute name
 *   2 - the whole "= value" part (absent for valueless attributes)
 *   3 - double-quoted value
 *   4 - single-quoted value
 *   5 - unquoted value
 *   6 - unquoted #rrggbb-style colour
 */
$space  = '[\x09\x0a\x0d\x20]';
$attrib = '[A-Za-z0-9]';
define( 'MW_ATTRIBS_REGEX',
    "/(?:^|{$space})({$attrib}+)
      ({$space}*={$space}*
        (?:
         # The attribute value: quoted or alone
          \"([^<\"]*)\"
         | '([^<']*)'
         |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
         |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                             # colors are specified like this.
                             # We'll be normalizing it.
        )
       )?(?={$space}|\$)/sx" );
61
/**
 * Named character entities defined in HTML 4.01, mapped from entity
 * name (case-sensitive) to Unicode code point.
 * http://www.w3.org/TR/html4/sgml/entities.html
 * @access private
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
    'Aacute' => 193,   'aacute' => 225,   'Acirc' => 194,    'acirc' => 226,
    'acute' => 180,    'AElig' => 198,    'aelig' => 230,    'Agrave' => 192,
    'agrave' => 224,   'alefsym' => 8501, 'Alpha' => 913,    'alpha' => 945,
    'amp' => 38,       'and' => 8743,     'ang' => 8736,     'Aring' => 197,
    'aring' => 229,    'asymp' => 8776,   'Atilde' => 195,   'atilde' => 227,
    'Auml' => 196,     'auml' => 228,     'bdquo' => 8222,   'Beta' => 914,
    'beta' => 946,     'brvbar' => 166,   'bull' => 8226,    'cap' => 8745,
    'Ccedil' => 199,   'ccedil' => 231,   'cedil' => 184,    'cent' => 162,
    'Chi' => 935,      'chi' => 967,      'circ' => 710,     'clubs' => 9827,
    'cong' => 8773,    'copy' => 169,     'crarr' => 8629,   'cup' => 8746,
    'curren' => 164,   'dagger' => 8224,  'Dagger' => 8225,  'darr' => 8595,
    'dArr' => 8659,    'deg' => 176,      'Delta' => 916,    'delta' => 948,
    'diams' => 9830,   'divide' => 247,   'Eacute' => 201,   'eacute' => 233,
    'Ecirc' => 202,    'ecirc' => 234,    'Egrave' => 200,   'egrave' => 232,
    'empty' => 8709,   'emsp' => 8195,    'ensp' => 8194,    'Epsilon' => 917,
    'epsilon' => 949,  'equiv' => 8801,   'Eta' => 919,      'eta' => 951,
    'ETH' => 208,      'eth' => 240,      'Euml' => 203,     'euml' => 235,
    'euro' => 8364,    'exist' => 8707,   'fnof' => 402,     'forall' => 8704,
    'frac12' => 189,   'frac14' => 188,   'frac34' => 190,   'frasl' => 8260,
    'Gamma' => 915,    'gamma' => 947,    'ge' => 8805,      'gt' => 62,
    'harr' => 8596,    'hArr' => 8660,    'hearts' => 9829,  'hellip' => 8230,
    'Iacute' => 205,   'iacute' => 237,   'Icirc' => 206,    'icirc' => 238,
    'iexcl' => 161,    'Igrave' => 204,   'igrave' => 236,   'image' => 8465,
    'infin' => 8734,   'int' => 8747,     'Iota' => 921,     'iota' => 953,
    'iquest' => 191,   'isin' => 8712,    'Iuml' => 207,     'iuml' => 239,
    'Kappa' => 922,    'kappa' => 954,    'Lambda' => 923,   'lambda' => 955,
    'lang' => 9001,    'laquo' => 171,    'larr' => 8592,    'lArr' => 8656,
    'lceil' => 8968,   'ldquo' => 8220,   'le' => 8804,      'lfloor' => 8970,
    'lowast' => 8727,  'loz' => 9674,     'lrm' => 8206,     'lsaquo' => 8249,
    'lsquo' => 8216,   'lt' => 60,        'macr' => 175,     'mdash' => 8212,
    'micro' => 181,    'middot' => 183,   'minus' => 8722,   'Mu' => 924,
    'mu' => 956,       'nabla' => 8711,   'nbsp' => 160,     'ndash' => 8211,
    'ne' => 8800,      'ni' => 8715,      'not' => 172,      'notin' => 8713,
    'nsub' => 8836,    'Ntilde' => 209,   'ntilde' => 241,   'Nu' => 925,
    'nu' => 957,       'Oacute' => 211,   'oacute' => 243,   'Ocirc' => 212,
    'ocirc' => 244,    'OElig' => 338,    'oelig' => 339,    'Ograve' => 210,
    'ograve' => 242,   'oline' => 8254,   'Omega' => 937,    'omega' => 969,
    'Omicron' => 927,  'omicron' => 959,  'oplus' => 8853,   'or' => 8744,
    'ordf' => 170,     'ordm' => 186,     'Oslash' => 216,   'oslash' => 248,
    'Otilde' => 213,   'otilde' => 245,   'otimes' => 8855,  'Ouml' => 214,
    'ouml' => 246,     'para' => 182,     'part' => 8706,    'permil' => 8240,
    'perp' => 8869,    'Phi' => 934,      'phi' => 966,      'Pi' => 928,
    'pi' => 960,       'piv' => 982,      'plusmn' => 177,   'pound' => 163,
    'prime' => 8242,   'Prime' => 8243,   'prod' => 8719,    'prop' => 8733,
    'Psi' => 936,      'psi' => 968,      'quot' => 34,      'radic' => 8730,
    'rang' => 9002,    'raquo' => 187,    'rarr' => 8594,    'rArr' => 8658,
    'rceil' => 8969,   'rdquo' => 8221,   'real' => 8476,    'reg' => 174,
    'rfloor' => 8971,  'Rho' => 929,      'rho' => 961,      'rlm' => 8207,
    'rsaquo' => 8250,  'rsquo' => 8217,   'sbquo' => 8218,   'Scaron' => 352,
    'scaron' => 353,   'sdot' => 8901,    'sect' => 167,     'shy' => 173,
    'Sigma' => 931,    'sigma' => 963,    'sigmaf' => 962,   'sim' => 8764,
    'spades' => 9824,  'sub' => 8834,     'sube' => 8838,    'sum' => 8721,
    'sup' => 8835,     'sup1' => 185,     'sup2' => 178,     'sup3' => 179,
    'supe' => 8839,    'szlig' => 223,    'Tau' => 932,      'tau' => 964,
    'there4' => 8756,  'Theta' => 920,    'theta' => 952,    'thetasym' => 977,
    'thinsp' => 8201,  'THORN' => 222,    'thorn' => 254,    'tilde' => 732,
    'times' => 215,    'trade' => 8482,   'Uacute' => 218,   'uacute' => 250,
    'uarr' => 8593,    'uArr' => 8657,    'Ucirc' => 219,    'ucirc' => 251,
    'Ugrave' => 217,   'ugrave' => 249,   'uml' => 168,      'upsih' => 978,
    'Upsilon' => 933,  'upsilon' => 965,  'Uuml' => 220,     'uuml' => 252,
    'weierp' => 8472,  'Xi' => 926,       'xi' => 958,       'Yacute' => 221,
    'yacute' => 253,   'yen' => 165,      'Yuml' => 376,     'yuml' => 255,
    'Zeta' => 918,     'zeta' => 950,     'zwj' => 8205,     'zwnj' => 8204 );
321
322 /** @package MediaWiki */
323 class Sanitizer {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each fragment that looks like a tag for a
 * permitted element is re-emitted with its attributes filtered through
 * Sanitizer::fixTagAttributes(); every other fragment has its '<' and
 * '>' escaped. When $wgUserHtml is false no element is permitted, so
 * all tags are escaped. When $wgUseTidy is false, tag nesting is also
 * tracked here and any elements left open are closed at the end.
 *
 * @access private
 * @param string $text
 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values;
 *                 called with the attribute text (by reference) and $args
 * @param array $args for the processing callback
 * @return string
 */
function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
    global $wgUseTidy, $wgUserHtml;

    if ( $wgUserHtml ) {
        $htmlpairs = array( # Tags that must be closed
            'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
            'strike', 'strong', 'tt', 'var', 'div', 'center',
            'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
            'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
        );
        $htmlsingle = array(
            'br', 'hr', 'li', 'dt', 'dd'
        );
        $htmlsingleonly = array( # Elements that cannot have close tags
            'br', 'hr'
        );
        $htmlnest = array( # Tags that can be nested--??
            'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
            'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
        );
        $tabletags = array( # Can only appear inside table
            'td', 'th', 'tr'
        );
    } else {
        $htmlpairs = array();
        $htmlsingle = array();
        # Defined here as well so every list exists in both configurations.
        $htmlsingleonly = array();
        $htmlnest = array();
        $tabletags = array();
    }

    $htmlsingle = array_merge( $tabletags, $htmlsingle );
    $htmlelements = array_merge( $htmlsingle, $htmlpairs );

    # Remove HTML comments
    $text = Sanitizer::removeHTMLcomments( $text );

    # Groups: 1 = '/' of a closing tag, 2 = element name, 3 = attributes,
    # 4 = '>' or '/>', 5 = text up to the next '<'.
    $tagPattern = '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/';

    $bits = explode( '<', $text );
    $text = array_shift( $bits );
    if ( !$wgUseTidy ) {
        $tagstack = array();
        $tablestack = array();
        foreach ( $bits as $x ) {
            if ( !preg_match( $tagPattern, $x, $regs ) ) {
                # Not tag-shaped at all: keep it as literal text.
                $text .= '&lt;' . str_replace( '>', '&gt;', $x );
                continue;
            }
            list( , $slash, $t, $params, $brace, $rest ) = $regs;
            $t = strtolower( $t );

            $badtag = 0;
            if ( in_array( $t, $htmlelements ) ) {
                # Check our stack
                if ( $slash ) {
                    # Closing a tag...
                    if ( in_array( $t, $htmlsingleonly ) ) {
                        $badtag = 1;
                    } elseif ( !in_array( $t, $htmlsingle ) &&
                        ( $ot = @array_pop( $tagstack ) ) != $t ) {
                        # Mismatched close: put back what was popped.
                        @array_push( $tagstack, $ot );
                        $badtag = 1;
                    } else {
                        if ( $t == 'table' ) {
                            # Resume the stack saved when this table opened.
                            $tagstack = array_pop( $tablestack );
                        }
                        $newparams = '';
                    }
                } else {
                    # Keep track for later
                    if ( in_array( $t, $tabletags ) &&
                        !in_array( 'table', $tagstack ) ) {
                        $badtag = 1;
                    } elseif ( in_array( $t, $tagstack ) &&
                        !in_array( $t, $htmlnest ) ) {
                        $badtag = 1;
                    } elseif ( in_array( $t, $htmlsingleonly ) ) {
                        # Hack to force empty tag for uncloseable elements
                        $brace = '/>';
                    } elseif ( !in_array( $t, $htmlsingle ) ) {
                        if ( $t == 'table' ) {
                            # Each table gets a fresh stack of its own.
                            array_push( $tablestack, $tagstack );
                            $tagstack = array();
                        }
                        array_push( $tagstack, $t );
                    }

                    # Replace any variables or template parameters with
                    # plaintext results.
                    if ( is_callable( $processCallback ) ) {
                        call_user_func_array( $processCallback, array( &$params, $args ) );
                    }

                    # Strip non-approved attributes from the tag
                    $newparams = Sanitizer::fixTagAttributes( $params, $t );
                }
                if ( !$badtag ) {
                    $rest = str_replace( '>', '&gt;', $rest );
                    $close = ( $brace == '/>' ) ? ' /' : '';
                    $text .= "<$slash$t$newparams$close>$rest";
                    continue;
                }
            }
            $text .= '&lt;' . str_replace( '>', '&gt;', $x );
        }
        # Close off any remaining tags
        while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
            $text .= "</$t>\n";
            if ( $t == 'table' ) {
                $tagstack = array_pop( $tablestack );
            }
        }
    } else {
        # this might be possible using tidy itself
        foreach ( $bits as $x ) {
            if ( !preg_match( $tagPattern, $x, $regs ) ) {
                $text .= '&lt;' . str_replace( '>', '&gt;', $x );
                continue;
            }
            list( , $slash, $t, $params, $brace, $rest ) = $regs;
            $t = strtolower( $t );
            if ( in_array( $t, $htmlelements ) ) {
                if ( is_callable( $processCallback ) ) {
                    call_user_func_array( $processCallback, array( &$params, $args ) );
                }
                $newparams = Sanitizer::fixTagAttributes( $params, $t );
                $rest = str_replace( '>', '&gt;', $rest );
                $text .= "<$slash$t$newparams$brace$rest";
            } else {
                $text .= '&lt;' . str_replace( '>', '&gt;', $x );
            }
        }
    }
    return $text;
}
463
/**
 * Strip every complete '<!-- ... -->' comment from the text.
 *
 * When a comment sits on a line of its own (only spaces between it and
 * the newline on either side), the surrounding spaces and one of the
 * two newlines are removed as well, so no blank line is left behind.
 * An unterminated comment stops processing and is left in place.
 *
 * @access private
 * @param string $text
 * @return string
 */
function removeHTMLcomments( $text ) {
    for ( ;; ) {
        $open = strpos( $text, '<!--' );
        if ( $open === false ) {
            break;
        }
        $close = strpos( $text, '-->', $open + 4 );
        if ( $close === false ) {
            # Unterminated comment; bail out
            break;
        }
        $close += 3;

        # Widen the span over spaces on both sides of the comment.
        $from = max( $open - 1, 0 );
        $span = $close - $from;
        while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
            $from--;
            $span++;
        }
        while ( substr( $text, $from + $span, 1 ) === ' ' ) {
            $span++;
        }

        $onOwnLine = substr( $text, $from, 1 ) === "\n"
            && substr( $text, $from + $span, 1 ) === "\n";
        if ( $onOwnLine ) {
            # Drop comment, padding and both newlines; put one newline back.
            $text = substr_replace( $text, "\n", $from, $span + 1 );
        } else {
            # Remove just the comment.
            $text = substr_replace( $text, '', $open, $close - $open );
        }
    }
    return $text;
}
509
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 *
 * @param string $text
 * @param string $element
 * @return string
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes( $text, $element ) {
    global $wgUrlProtocols;
    if ( trim( $text ) == '' ) {
        return '';
    }

    # Unquoted attribute
    # Since we quote this later, this can be anything distinguishable
    # from the end of the attribute
    if ( !preg_match_all( MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER ) ) {
        return '';
    }

    $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
    $attribs = array();
    foreach ( $pairs as $set ) {
        $attribute = strtolower( $set[1] );
        if ( !isset( $whitelist[$attribute] ) ) {
            continue;
        }

        $raw = Sanitizer::getTagAttributeCallback( $set );
        $value = Sanitizer::normalizeAttributeValue( $raw );

        # Strip javascript "expression" from stylesheets.
        # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
        if ( $attribute == 'style' ) {
            $stripped = Sanitizer::decodeCharReferences( $value );

            // Remove any comments; IE gets token splitting wrong
            $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
            $value = htmlspecialchars( $stripped );

            // ... and continue checks
            // Decode CSS hex escapes through a callback. The /e modifier
            // is not available on current PHP; there preg_replace() hands
            // back NULL, which would empty $stripped and let the check
            // below pass unsafe styles.
            $stripped = preg_replace_callback(
                '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
                array( 'Sanitizer', 'cssDecodeCallback' ),
                $stripped );
            $stripped = str_replace( '\\', '', $stripped );
            if ( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
                    $stripped ) ) {
                # haxx0r
                continue;
            }
        }

        # Templates and links may be expanded in later parsing,
        # creating invalid or dangerous output. Suppress this.
        $value = strtr( $value, array(
            '{'    => '&#123;',
            '['    => '&#91;',
            "''"   => '&#39;&#39;',
            'ISBN' => '&#73;SBN',
            'RFC'  => '&#82;FC',
            'PMID' => '&#80;MID',
        ) );

        # Stupid hack
        $value = preg_replace_callback(
            '/(' . $wgUrlProtocols . ')/',
            array( 'Sanitizer', 'armorLinksCallback' ),
            $value );

        // If this attribute was previously set, override it.
        // Output should only have one attribute of each name.
        $attribs[$attribute] = "$attribute=\"$value\"";
    }
    if ( empty( $attribs ) ) {
        return '';
    } else {
        return ' ' . implode( ' ', $attribs );
    }
}

/**
 * Regex replace callback that decodes one CSS hexadecimal escape
 * (a backslash followed by 1-6 hex digits) into its UTF-8 character.
 * @param array $matches match set; $matches[1] holds the hex digits
 * @return string
 * @access private
 */
function cssDecodeCallback( $matches ) {
    return codepointToUtf8( hexdec( $matches[1] ) );
}
605
/**
 * Regex replace callback that protects a matched link prefix from
 * further processing by rewriting each ':' in the first capture group
 * as the numeric character reference '&#58;'.
 * @param array $matches
 * @return string
 * @access private
 */
function armorLinksCallback( $matches ) {
    $armored = array( ':' => '&#58;' );
    return strtr( $matches[1], $armored );
}
615
/**
 * Parse a partial tag string into an associative array of attributes.
 * Names are forced to lowercase and character references in the values
 * are decoded to UTF-8 text. When a name occurs more than once, the
 * last occurrence wins.
 *
 * @param string $text
 * @return array
 */
function decodeTagAttributes( $text ) {
    $decoded = array();

    if ( trim( $text ) != ''
        && preg_match_all( MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER ) ) {
        foreach ( $pairs as $set ) {
            $name = strtolower( $set[1] );
            $rawValue = Sanitizer::getTagAttributeCallback( $set );
            $decoded[$name] = Sanitizer::decodeCharReferences( $rawValue );
        }
    }

    return $decoded;
}
646
/**
 * Pick the attribute value out of one MW_ATTRIBS_REGEX match set.
 *
 * The value groups are probed from the last to the first: 6 (illegal
 * unquoted #XXXXXX color), 5 (unquoted), 4 (single-quoted),
 * 3 (double-quoted). When no "= value" part was captured at all, the
 * attribute name is returned, since XHTML attributes must have a value.
 *
 * @param array $set
 * @return string
 * @access private
 */
function getTagAttributeCallback( $set ) {
    foreach ( array( 6, 5, 4, 3 ) as $group ) {
        if ( isset( $set[$group] ) ) {
            return $set[$group];
        }
    }

    if ( isset( $set[2] ) ) {
        # A "= value" part was captured but none of the value groups.
        wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
    } else {
        # 'Reduced' form: use the attribute name as its own value.
        return $set[1];
    }
}
676
/**
 * Prepare XML source-encoded text for use as an attribute value:
 * character references are normalized, each whitespace character (a
 * CR LF pair counts as one) becomes a single space, and double quotes
 * become '&quot;'.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 * but note that the result is an XML source fragment to be placed
 * into output, not the attribute's value.
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeAttributeValue( $text ) {
    $normalized = Sanitizer::normalizeCharReferences( $text );
    $spaced = preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', ' ', $normalized );
    return str_replace( '"', '&quot;', $spaced );
}
696
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * Every MW_CHAR_REFS_REGEX match is rewritten by
 * Sanitizer::normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeCharReferences( $text ) {
    $callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
    return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
/**
 * Replace callback for normalizeCharReferences(): normalize one match
 * of MW_CHAR_REFS_REGEX. A reference that cannot be normalized, or a
 * bare ampersand, is returned HTML-escaped.
 * @param array $matches match set from MW_CHAR_REFS_REGEX
 * @return string
 */
function normalizeCharReferencesCallback( $matches ) {
    $normalized = null;

    if ( $matches[1] != '' ) {
        # Named entity
        $normalized = Sanitizer::normalizeEntity( $matches[1] );
    } elseif ( $matches[2] != '' ) {
        # Decimal reference
        $normalized = Sanitizer::decCharReference( $matches[2] );
    } else {
        # Hex reference, written with either 'x' (group 3) or 'X' (group 4)
        $hex = ( $matches[3] != '' ) ? $matches[3] : $matches[4];
        if ( $hex != '' ) {
            $normalized = Sanitizer::hexCharReference( $hex );
        }
    }

    return is_null( $normalized )
        ? htmlspecialchars( $matches[0] )
        : $normalized;
}
738
/**
 * Return the named entity reference unchanged when the name is listed
 * in $wgHtmlEntities; otherwise return the pseudo-entity source with
 * its ampersand escaped (eg &amp;foo;).
 *
 * @param string $name
 * @return string
 */
function normalizeEntity( $name ) {
    global $wgHtmlEntities;
    $lead = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
    return $lead . $name . ';';
}
755
/**
 * Normalize a decimal character reference.
 * @param string $codepoint the digits of the reference
 * @return string|null '&#N;' form, or null when the code point fails
 *                     Sanitizer::validateCodepoint()
 */
function decCharReference( $codepoint ) {
    $point = intval( $codepoint );
    if ( !Sanitizer::validateCodepoint( $point ) ) {
        return null;
    }
    return sprintf( '&#%d;', $point );
}
764
/**
 * Normalize a hexadecimal character reference.
 *
 * Input containing anything other than hexadecimal digits is rejected.
 * hexdec() skips characters it does not recognise, which would
 * otherwise turn a malformed reference into a different, valid one.
 *
 * @param string $codepoint the digits of the reference (no '&#x' prefix)
 * @return string|null lowercase '&#xN;' form, or null when the digits are
 *                     malformed or the code point fails
 *                     Sanitizer::validateCodepoint()
 */
function hexCharReference( $codepoint ) {
    if ( !preg_match( '/^[0-9A-Fa-f]+$/', $codepoint ) ) {
        return null;
    }
    $point = hexdec( $codepoint );
    if ( Sanitizer::validateCodepoint( $point ) ) {
        return sprintf( '&#x%x;', $point );
    } else {
        return null;
    }
}
773
/**
 * Tell whether a Unicode code point is a valid character in XML:
 * tab, line feed, carriage return, U+0020-U+D7FF, U+E000-U+FFFD or
 * U+10000-U+10FFFF.
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
    # The three permitted control characters
    if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
        return true;
    }
    # Basic Multilingual Plane below the surrogate block
    if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
        return true;
    }
    # Above the surrogates, excluding U+FFFE and U+FFFF
    if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
        return true;
    }
    # Supplementary planes
    return ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
787
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string. Every MW_CHAR_REFS_REGEX
 * match is rewritten by Sanitizer::decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 * @access public
 */
function decodeCharReferences( $text ) {
    $callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
    return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
802
/**
 * Replace callback for decodeCharReferences(): decode one match of
 * MW_CHAR_REFS_REGEX. A bare ampersand is returned unchanged.
 * @param array $matches match set from MW_CHAR_REFS_REGEX
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
    if ( $matches[1] != '' ) {
        # Named entity
        return Sanitizer::decodeEntity( $matches[1] );
    }
    if ( $matches[2] != '' ) {
        # Decimal reference
        return Sanitizer::decodeChar( intval( $matches[2] ) );
    }

    # Hex reference, written with either 'x' (group 3) or 'X' (group 4)
    $hex = ( $matches[3] != '' ) ? $matches[3] : $matches[4];
    if ( $hex != '' ) {
        return Sanitizer::decodeChar( hexdec( $hex ) );
    }

    # Last case should be an ampersand by itself
    return $matches[0];
}
820
/**
 * Return the UTF-8 string for a code point when it passes
 * Sanitizer::validateCodepoint(), otherwise UTF8_REPLACEMENT
 * (U+FFFD REPLACEMENT CHARACTER).
 * @param int $codepoint
 * @return string
 * @access private
 */
function decodeChar( $codepoint ) {
    if ( !Sanitizer::validateCodepoint( $codepoint ) ) {
        return UTF8_REPLACEMENT;
    }
    return codepointToUtf8( $codepoint );
}
835
/**
 * Return the UTF-8 encoding of a named entity listed in
 * $wgHtmlEntities. For any other name, return the pseudo-entity
 * source unchanged (eg &foo;).
 *
 * @param string $name
 * @return string
 */
function decodeEntity( $name ) {
    global $wgHtmlEntities;
    if ( !isset( $wgHtmlEntities[$name] ) ) {
        return '&' . $name . ';';
    }
    return codepointToUtf8( $wgHtmlEntities[$name] );
}
852
/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name. The full table is built once, on first use, and kept
 * in a static variable; elements missing from it get an empty list.
 *
 * @param string $element
 * @return array
 */
function attributeWhitelist( $element ) {
    static $list;
    if ( !isset( $list ) ) {
        $list = Sanitizer::setupAttributeWhitelist();
    }
    if ( !isset( $list[$element] ) ) {
        return array();
    }
    return $list[$element];
}
869
/**
 * Build the table of permitted attributes per element.
 * @return array element name => list of attribute names
 */
function setupAttributeWhitelist() {
    $common     = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
    $block      = array_merge( $common, array( 'align' ) );
    $tablealign = array( 'align', 'char', 'charoff', 'valign' );
    $tablecell  = array(
        'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
        'nowrap',  # deprecated
        'width',   # deprecated
        'height',  # deprecated
        'bgcolor', # deprecated
    );

    # Attribute lists shared by more than one element below
    $edit     = array_merge( $common, array( 'cite', 'datetime' ) );
    $rowgroup = array_merge( $common, $tablealign );
    $colspec  = array_merge( $common, array( 'span', 'width' ), $tablealign );
    $cell     = array_merge( $common, $tablecell, $tablealign );

    # Numbers refer to sections in HTML 4.01 standard describing the element.
    # See: http://www.w3.org/TR/html4/
    # Elements named only in a comment have no entry.
    return array(
        # 7.5.4
        'div'        => $block,
        'center'     => $common, # deprecated
        'span'       => $block, # ??

        # 7.5.5
        'h1'         => $block,
        'h2'         => $block,
        'h3'         => $block,
        'h4'         => $block,
        'h5'         => $block,
        'h6'         => $block,

        # 7.5.6: address
        # 8.2.4: bdo

        # 9.2.1 (dfn, samp, kbd, abbr, acronym)
        'em'         => $common,
        'strong'     => $common,
        'cite'       => $common,
        'code'       => $common,
        'var'        => $common,

        # 9.2.2 (q)
        'blockquote' => array_merge( $common, array( 'cite' ) ),

        # 9.2.3
        'sub'        => $common,
        'sup'        => $common,

        # 9.3.1
        'p'          => $block,

        # 9.3.2
        'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

        # 9.3.4
        'pre'        => array_merge( $common, array( 'width' ) ),

        # 9.4
        'ins'        => $edit,
        'del'        => $edit,

        # 10.2
        'ul'         => array_merge( $common, array( 'type' ) ),
        'ol'         => array_merge( $common, array( 'type', 'start' ) ),
        'li'         => array_merge( $common, array( 'type', 'value' ) ),

        # 10.3
        'dl'         => $common,
        'dd'         => $common,
        'dt'         => $common,

        # 11.2.1 ('frame', 'rules' and 'border' each appear twice in this list)
        'table'      => array_merge( $common, array(
            'summary', 'width', 'border', 'frame',
            'rules', 'cellspacing', 'cellpadding',
            'align', 'bgcolor', 'frame', 'rules',
            'border',
        ) ),

        # 11.2.2
        'caption'    => array_merge( $common, array( 'align' ) ),

        # 11.2.3
        'thead'      => $rowgroup,
        'tfoot'      => $rowgroup,
        'tbody'      => $rowgroup,

        # 11.2.4
        'colgroup'   => $colspec,
        'col'        => $colspec,

        # 11.2.5
        'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

        # 11.2.6
        'td'         => $cell,
        'th'         => $cell,

        # 15.2.1
        'tt'         => $common,
        'b'          => $common,
        'i'          => $common,
        'big'        => $common,
        'small'      => $common,
        'strike'     => $common,
        's'          => $common,
        'u'          => $common,

        # 15.2.2 (basefont)
        'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),

        # 15.3
        'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

        # XHTML Ruby annotation text module, simple ruby only.
        # http://www.w3c.org/TR/ruby/
        # (rbc, rtc; 'rbspan' on rt)
        'ruby'       => $common,
        'rb'         => $common,
        'rt'         => $common,
        'rp'         => $common,
    );
}
1008
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded suitably for literal
 * inclusion in a double-quoted attribute value.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
    # Actual <tags>
    $withoutTags = preg_replace( '/<[^>]*>/', '', $text );

    # Normalize &entities and whitespace
    $normalized = Sanitizer::normalizeAttributeValue( $withoutTags );

    # Escape whatever markup characters are left over.
    return strtr( $normalized, array(
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    ) );
}
1033
1034 }
1035
1036 ?>

  ViewVC Help
Powered by ViewVC 1.1.2