/[drupal]/contributions/modules/liquid/mw_parser.inc
ViewVC logotype

Contents of /contributions/modules/liquid/mw_parser.inc

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.3 - (show annotations) (download) (as text)
Thu Feb 8 21:02:10 2007 UTC (2 years, 9 months ago) by sorenp
Branch: MAIN
CVS Tags: DRUPAL-5--0-1, HEAD
Branch point for: DRUPAL-5
Changes since 1.2: +3 -2 lines
File MIME type: text/x-php
The next development version of liquid. Updated for Drupal 5.1 (new files)
1 <?php
2
3 // $Id$
4
5 require_once( 'mw_sanitizer.inc' );
6
// Regex alternation of the URL schemes treated as external links.
// The slashes are pre-escaped because the value is spliced into '/'-delimited patterns.
$wgUrlProtocols = 'http:\/\/|https:\/\/|ftp:\/\/|irc:\/\/|gopher:\/\/|news:|mailto:';
// Constants needed for external link processing
// Schemes accepted for inline external images (see EXT_IMAGE_REGEX below)
define( 'HTTP_PROTOCOLS', 'http:\/\/|https:\/\/' );
// Everything except bracket, space, or control characters
define( 'EXT_LINK_URL_CLASS', '[^]<>"\\x00-\\x20\\x7F]' );
// Including space
define( 'EXT_LINK_TEXT_CLASS', '[^\]\\x00-\\x1F\\x7F]' );
// Characters allowed in the file-name part of an external image URL
define( 'EXT_IMAGE_FNAME_CLASS', '[A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]' );
// File extensions (matched case-insensitively in EXT_IMAGE_REGEX) that mark a URL as an image
define( 'EXT_IMAGE_EXTENSIONS', 'gif|png|jpg|jpeg' );
// Bracketed external link: group 1 = whole URL, group 2 = protocol, group 3 = link text
define( 'EXT_LINK_BRACKETED', '/\[(\b('.$wgUrlProtocols.')'.EXT_LINK_URL_CLASS.'+) *('.EXT_LINK_TEXT_CLASS.'*?)\]/S' );
// Whole-string match for an http(s) URL whose last path segment is an image file name
define( 'EXT_IMAGE_REGEX',
  '/^('.HTTP_PROTOCOLS.')'. # Protocol
  '('.EXT_LINK_URL_CLASS.'+)\\/'. # Hostname and path
  '('.EXT_IMAGE_FNAME_CLASS.'+)\\.((?i)'.EXT_IMAGE_EXTENSIONS.')$/S' # Filename
);
22
23
24 class mw_parser
25 {
26 // Cleared with clearState():
27 var $mDTopen, $mStripState = array();
28 var $mLastSection, $mInPre;
29 var $mUniqPrefix;
30
/**
 * Reset all per-parse members to their initial values and choose a new
 * random marker prefix for the upcoming run.
 */
function clearState() {
  // Fresh prefix so markers from an earlier parse can never collide.
  $this->mUniqPrefix = 'UNIQ' . mw_parser::getRandomString();

  // Block-level bookkeeping used by doBlockLevels() and friends.
  $this->mInPre = false;
  $this->mDTopen = false;
  $this->mLastSection = '';

  // Nothing has been stripped yet.
  $this->mStripState = array();
}
38
/**
 * Convert wiki markup to HTML.
 *
 * Pipeline: strip protected sections -> inline markup -> restore all but
 * nowiki -> typographic fix-ups -> block-level markup -> restore nowiki.
 *
 * @param string $text wiki markup
 * @return string the rendered text
 */
function parse($text) {
  $this->clearState();

  // strip() receives the member by reference and fills it in.
  $this->mStripState = NULL;
  $html = $this->strip($text, $this->mStripState);

  $html = $this->internalParse( $html );
  $html = $this->unstrip( $html, $this->mStripState );

  // Clean up special characters, only run once, next-to-last before doBlockLevels
  $patterns = array(
    // french spaces, last one Guillemet-left
    // only if there is something before the space
    '/(.) (?=\\?|:|;|!|\\302\\273)/',
    // french spaces, Guillemet-right
    '/(\\302\\253) /',
    '/<center *>(.*)<\\/center *>/i',
  );
  $replacements = array(
    '\\1&nbsp;\\2',
    '\\1&nbsp;',
    '<div class="center">\\1</div>',
  );
  $html = preg_replace( $patterns, $replacements, $html );

  // only once and last
  $html = $this->doBlockLevels( $html, true);

  // nowiki content goes back in last so nothing above can touch it.
  return $this->unstripNoWiki( $html, $this->mStripState );
}
68
69 /**
70 * Get a random string
71 *
72 * @access private
73 * @static
74 */
function getRandomString() {
  // Two independent 31-bit draws, each rendered as lowercase hex and glued together.
  $first = dechex(mt_rand(0, 0x7fffffff));
  $second = dechex(mt_rand(0, 0x7fffffff));
  return $first . $second;
}
78
79 /**
80 * Replaces all occurrences of <$tag>content</$tag> in the text
81 * with a random marker and returns the new text. the output parameter
82 * $content will be an associative array filled with data on the form
83 * $unique_marker => content.
84 *
85 * If $content is already set, the additional entries will be appended
86 * If $tag is set to STRIP_COMMENTS, the function will extract
87 * <!-- HTML comments -->
88 *
89 * @access private
90 * @static
91 */
function extractTagsAndParams($tag, $text, &$content, &$tags, &$params, $uniq_prefix = ''){
  // All markers generated by this call share one random stem.
  $rnd = $uniq_prefix . '-' . $tag . mw_parser::getRandomString();

  // Callers may hand in unset or empty accumulators; normalise them to arrays.
  if ( !$content ) {
    $content = array( );
  }
  if ( !$tags ) {
    $tags = array( );
  }
  if ( !$params ) {
    $params = array( );
  }

  // HTML comments have fixed delimiters; everything else is <tag ...> ... </tag>.
  $isComment = ( $tag == STRIP_COMMENTS );
  $start = $isComment ? '/<!--()/' : "/<$tag(\\s+[^>]*|\\s*)>/i";
  $end = $isComment ? '/-->/' : "/<\\/$tag\\s*>/i";

  $stripped = '';
  for ( $n = 1; '' != $text; $n++ ) {
    // Split off everything before the next opening tag.
    $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
    $stripped .= $p[0];
    if( count( $p ) < 3 ) {
      // No further opening tag.
      break;
    }
    list( , $attributes, $inside ) = $p;

    // Put a numbered marker where the element used to be.
    $marker = $rnd . sprintf('%08X', $n);
    $stripped .= $marker;

    $tags[$marker] = "<$tag$attributes>";
    $params[$marker] = Sanitizer::decodeTagAttributes( $attributes );

    $q = preg_split( $end, $inside, 2 );
    $content[$marker] = $q[0];
    if( count( $q ) < 2 ) {
      # No end tag -- let it run out to the end of the text.
      break;
    }
    // Keep scanning after the closing tag.
    $text = $q[1];
  }
  return $stripped;
}
142
143 /**
144 * Wrapper function for extractTagsAndParams
145 * for cases where $tags and $params isn't needed
146 * i.e. where tags will never have params, like <nowiki>
147 *
148 * @access private
149 * @static
150 */
function extractTags( $tag, $text, &$content, $uniq_prefix = '' ) {
  // The full extractor wants by-reference sinks for tags and params;
  // give it throw-away arrays since the caller does not care about them.
  $ignoredTags = array();
  $ignoredParams = array();

  $stripped = mw_parser::extractTagsAndParams(
    $tag, $text, $content, $ignoredTags, $ignoredParams, $uniq_prefix
  );
  return $stripped;
}
158
159 /**
160 * Strips and renders nowiki, pre, math, hiero
161 * If $render is set, performs necessary rendering operations on plugins
162 * Returns the text, and fills an array with data needed in unstrip()
163 * If the $state is already a valid strip state, it adds to the state
164 *
165 * @param bool $stripcomments when set, HTML comments <!-- like this -->
166 * will be stripped in addition to other tags. This is important
167 * for section editing, where these comments cause confusion when
168 * counting the sections in the wikisource
169 *
170 * @access private
171 */
function strip( $text, &$state, $stripcomments = false ) {
  global $wgRawHtml;

  // Every marker created during this pass starts with the parser's prefix.
  $prefix = $this->mUniqPrefix;

  $found = array(
    'html' => array(),
    'nowiki' => array(),
    'math' => array(),
    'pre' => array(),
    'comment' => array(),
  );

  # html -- only extracted when raw HTML is enabled; content is kept verbatim
  if( $wgRawHtml ) {
    $text = mw_parser::extractTags('html', $text, $found['html'], $prefix);
  }

  # nowiki -- stored with HTML tags escaped
  $text = mw_parser::extractTags('nowiki', $text, $found['nowiki'], $prefix);
  foreach( array_keys( $found['nowiki'] ) as $marker ) {
    $found['nowiki'][$marker] = wfEscapeHTMLTagsOnly( $found['nowiki'][$marker] );
  }

  # math -- stored already rendered
  $text = mw_parser::extractTags('math', $text, $found['math'], $prefix);
  foreach( array_keys( $found['math'] ) as $marker ) {
    $found['math'][$marker] = renderMath( $found['math'][$marker] );
  }

  # pre -- stored escaped and re-wrapped in a plain <pre>
  $text = mw_parser::extractTags('pre', $text, $found['pre'], $prefix);
  foreach( array_keys( $found['pre'] ) as $marker ) {
    $found['pre'][$marker] = '<pre>' . wfEscapeHTMLTagsOnly( $found['pre'][$marker] ) . '</pre>';
  }

  # Comments -- only on request; stored with their delimiters put back
  if($stripcomments) {
    $text = mw_parser::extractTags(STRIP_COMMENTS, $text, $found['comment'], $prefix);
    foreach( array_keys( $found['comment'] ) as $marker ) {
      $found['comment'][$marker] = '<!--'.$found['comment'][$marker].'-->';
    }
  }

  # Merge state with the pre-existing state, if there is one
  if ( $state ) {
    foreach( $found as $kind => $items ) {
      // Array union: entries already present in $state win.
      $state[$kind] = $state[$kind] + $items;
    }
  } else {
    $state = $found;
  }
  return $text;
}
241
242 /**
 * restores the math, pre and comment sections removed by strip()
 * (nowiki and html sections are deliberately skipped here)
244 *
245 * always call unstripNoWiki() after this one
246 * @access private
247 */
function unstrip( $text, &$state ) {
  # Must expand in reverse order, otherwise nested tags will be corrupted
  $sections = array_reverse( $state, true );
  foreach( $sections as $tag => $contentDict ) {
    // nowiki and html markers are not restored by this method.
    if( $tag == 'nowiki' || $tag == 'html' ) {
      continue;
    }
    $markers = array_reverse( $contentDict, true );
    foreach( $markers as $uniq => $content ) {
      $text = str_replace( $uniq, $content, $text );
    }
  }

  return $text;
}
260
261 /**
262 * always call this after unstrip() to preserve the order
263 *
264 * @access private
265 */
function unstripNoWiki( $text, &$state ) {
  // Put the content of <nowiki> sections back in place of their markers.
  //
  // $text  - text that may contain nowiki markers
  // $state - strip state as built by strip(); only $state['nowiki'] is read
  // Returns the text with every known nowiki marker replaced.
  //
  // A state without a 'nowiki' list (for example the empty array left by
  // clearState() before strip() has run) means there is nothing to restore.
  // The old end()/prev() loop did not cope with that case: it raised an
  // error on a missing list instead of returning the text unchanged.
  if ( !is_array( $state ) || !isset( $state['nowiki'] ) || !is_array( $state['nowiki'] ) ) {
    return $text;
  }

  # Must expand in reverse order, otherwise nested tags will be corrupted
  foreach ( array_reverse( $state['nowiki'], true ) as $marker => $content ) {
    $text = str_replace( (string)$marker, $content, $text );
  }

  return $text;
}
274
275 /**
276 * Helper function for parse() that transforms wiki markup into
277 * HTML. Only called for $mOutputType == OT_HTML.
278 *
279 * @access private
280 */
function internalParse( $text ) {
  // Transclusion-control markup: <noinclude> tags vanish (content stays),
  // <includeonly> sections vanish together with their content.
  $text = strtr( $text, array( '<noinclude>' => '', '</noinclude>' => '') );
  $text = preg_replace( '/<includeonly>.*?<\/includeonly>/s', '', $text );

  $text = Sanitizer::removeHTMLtags( $text, array( &$this, 'attributeStripCallback' ) );

  // Four or more dashes at the start of a line become a horizontal rule.
  $text = preg_replace( '/(^|\n)-----*/', '\\1<hr />', $text );

  // Inline passes; the order matters (internal links before external ones).
  $passes = array( 'doHeadings', 'doAllQuotes', 'replaceInternalLinks', 'replaceExternalLinks' );
  foreach ( $passes as $pass ) {
    $text = $this->$pass( $text );
  }

  # replaceInternalLinks may sometimes leave behind
  # absolute URLs, which have to be masked to hide them from replaceExternalLinks
  $text = str_replace($this->mUniqPrefix."NOPARSE", "", $text);

  $text = $this->doTableStuff( $text );

  // Swap interwiki transclusion placeholders for their stored data.
  return preg_replace_callback(
    '/<!--IW_TRANSCLUDE (\d+)-->/',
    array(&$this, 'scarySubstitution'),
    $text
  );
}
308
309 /**
310 * Callback from the Sanitizer for expanding items found in HTML attribute
311 * values, so they can be safely tested and escaped.
312 * @param string $text
313 * @param array $args
314 * @return string
315 * @access private
316 */
function attributeStripCallback( &$text, $args ) {
  // $text is by reference: the caller's variable is updated as well as returned.
  // $args is accepted for the callback signature but not used.
  return $text = $this->unstripForHTML( $text );
}
321
/**
 * Restore every stripped section in $text: first the ones handled by
 * unstrip(), then the nowiki ones.
 *
 * @param string $text
 * @return string
 */
function unstripForHTML( $text ) {
  $restored = $this->unstrip( $text, $this->mStripState );
  return $this->unstripNoWiki( $restored, $this->mStripState );
}
327
/**
 * Character-class body (no surrounding brackets) used when building the
 * link-title regex in replaceInternalLinks().
 *
 * @return string
 */
function legalChars() {
  return " %!\"$&'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF";
}
332
333 /**
334 * Parse headers and return html
335 *
336 * @access private
337 */
function doHeadings( $text ) {
  // Deepest level first, so '===x===' is not swallowed by the level-2 rule.
  $i = 6;
  while ( $i >= 1 ) {
    $h = str_repeat( '=', $i );
    $pattern = "/^{$h}(.+){$h}(\\s|$)/m";
    $replacement = "<h{$i}>\\1</h{$i}>\\2";
    $text = preg_replace( $pattern, $replacement, $text );
    $i--;
  }
  return $text;
}
346
347 /**
348 * Replace single quotes with HTML markup
349 * @access private
350 * @return string the altered text
351 */
function doAllQuotes( $text ) {
  // Quote markup never spans lines, so each line is converted independently
  // and the results are joined with the same separator.
  $converted = array();
  foreach ( explode( "\n", $text ) as $line ) {
    $converted[] = $this->doQuotes ( $line );
  }
  return implode( "\n", $converted );
}
361
362 /**
363 * Helper function for doAllQuotes()
364 * @access private
365 */
function doQuotes( $text ) {
  // Convert runs of apostrophes in ONE line into <i>/<b> markup.
  // Two apostrophes toggle italics, three toggle bold, five toggle both.
  //
  // The split keeps the delimiters, so even indexes of $arr hold plain text
  // and odd indexes hold the apostrophe runs.
  $arr = preg_split( "/(''+)/", $text, -1, PREG_SPLIT_DELIM_CAPTURE );
  if ( count( $arr ) == 1 )
    // No run of two or more apostrophes: nothing to do.
    return $text;
  else
  {
    # First, do some preliminary work. This may shift some apostrophes from
    # being mark-up to being text. It also counts the number of occurrences
    # of bold and italics mark-ups.
    $i = 0;
    $numbold = 0;
    $numitalics = 0;
    foreach ( $arr as $r ) {
      if ( ( $i % 2 ) == 1 ) {
        # If there are ever four apostrophes, assume the first is supposed to
        # be text, and the remaining three constitute mark-up for bold text.
        if ( strlen( $arr[$i] ) == 4 ) {
          $arr[$i-1] .= "'";
          $arr[$i] = "'''";
        } else if ( strlen( $arr[$i] ) > 5 ) {
          # More than five: the surplus becomes literal text and the
          # last five remain as bold+italic mark-up.
          $arr[$i-1] .= str_repeat( "'", strlen( $arr[$i] ) - 5 );
          $arr[$i] = "'''''";
        }
        # Count the number of occurrences of bold and italics mark-ups.
        # A run of five counts once for italics and once for bold.
        if ( strlen( $arr[$i] ) == 2 ) $numitalics++; else
        if ( strlen( $arr[$i] ) == 3 ) $numbold++; else
        if ( strlen( $arr[$i] ) == 5 ) { $numitalics++; $numbold++; }
      }
      $i++;
    }

    # If there is an odd number of both bold and italics, it is likely
    # that one of the bold ones was meant to be an apostrophe followed
    # by italics. Which one we cannot know for certain, but it is more
    # likely to be one that has a single-letter word before it.
    if ( ( $numbold % 2 == 1 ) && ( $numitalics % 2 == 1 ) )
    {
      $i = 0;
      $firstsingleletterword = -1;
      $firstmultiletterword = -1;
      $firstspace = -1;
      foreach ( $arr as $r )
      {
        if ( ( $i % 2 == 1 ) and ( strlen( $r ) == 3 ) )
        {
          # $x1 / $x2: last and second-to-last character of the text
          # immediately before this bold run.
          $x1 = substr ($arr[$i-1], -1);
          $x2 = substr ($arr[$i-1], -2, 1);
          if ($x1 == ' ') {
            if ($firstspace == -1) $firstspace = $i;
          } else if ($x2 == ' ') {
            if ($firstsingleletterword == -1) $firstsingleletterword = $i;
          } else {
            if ($firstmultiletterword == -1) $firstmultiletterword = $i;
          }
        }
        $i++;
      }

      # If there is a single-letter word, use it!
      if ($firstsingleletterword > -1)
      {
        $arr [ $firstsingleletterword ] = "''";
        $arr [ $firstsingleletterword-1 ] .= "'";
      }
      # If not, but there is a multi-letter word, use that one.
      else if ($firstmultiletterword > -1)
      {
        $arr [ $firstmultiletterword ] = "''";
        $arr [ $firstmultiletterword-1 ] .= "'";
      }
      # ... otherwise use the first one that has neither.
      # (notice that it is possible for all three to be -1 if, for example,
      # there is only one pentuple-apostrophe in the line)
      else if ($firstspace > -1)
      {
        $arr [ $firstspace ] = "''";
        $arr [ $firstspace-1 ] .= "'";
      }
    }


    # Now build the output with a small state machine. $state is one of:
    #   ''     nothing open          'i'   <i> open         'b'   <b> open
    #   'bi'   <b> then <i> open     'ib'  <i> then <b> open
    #   'both' five apostrophes seen with nothing open; the text is held in
    #          $buffer until the next run decides which tag goes outside.
    $output = '';
    $buffer = '';
    $state = '';
    $i = 0;
    foreach ($arr as $r)
    {
      if (($i % 2) == 0)
      {
        # Plain text: buffered while the nesting order is still undecided.
        if ($state == 'both')
          $buffer .= $r;
        else
          $output .= $r;
      }
      else
      {
        if (strlen ($r) == 2)
        {
          if ($state == 'i')
          { $output .= '</i>'; $state = ''; }
          else if ($state == 'bi')
          { $output .= '</i>'; $state = 'b'; }
          else if ($state == 'ib')
          { $output .= '</b></i><b>'; $state = 'b'; }
          else if ($state == 'both')
          { $output .= '<b><i>'.$buffer.'</i>'; $state = 'b'; }
          else # $state can be 'b' or ''
          { $output .= '<i>'; $state .= 'i'; }
        }
        else if (strlen ($r) == 3)
        {
          if ($state == 'b')
          { $output .= '</b>'; $state = ''; }
          else if ($state == 'bi')
          { $output .= '</i></b><i>'; $state = 'i'; }
          else if ($state == 'ib')
          { $output .= '</b>'; $state = 'i'; }
          else if ($state == 'both')
          { $output .= '<i><b>'.$buffer.'</b>'; $state = 'i'; }
          else # $state can be 'i' or ''
          { $output .= '<b>'; $state .= 'b'; }
        }
        else if (strlen ($r) == 5)
        {
          if ($state == 'b')
          { $output .= '</b><i>'; $state = 'i'; }
          else if ($state == 'i')
          { $output .= '</i><b>'; $state = 'b'; }
          else if ($state == 'bi')
          { $output .= '</i></b>'; $state = ''; }
          else if ($state == 'ib')
          { $output .= '</b></i>'; $state = ''; }
          else if ($state == 'both')
          { $output .= '<i><b>'.$buffer.'</b></i>'; $state = ''; }
          else # ($state == '')
          { $buffer = ''; $state = 'both'; }
        }
      }
      $i++;
    }
    # Now close all remaining tags. Notice that the order is important.
    if ($state == 'b' || $state == 'ib')
      $output .= '</b>';
    if ($state == 'i' || $state == 'bi' || $state == 'ib')
      $output .= '</i>';
    if ($state == 'bi')
      $output .= '</b>';
    if ($state == 'both')
      $output .= '<b><i>'.$buffer.'</i></b>';
    return $output;
  }
}
518
519 /**
520 * parse the wiki syntax used to render tables
521 *
522 * @access private
523 */
function doTableStuff ( $t ) {
  // Translate wiki table markup ({| ... |}, |-, |, !, |+) into HTML, line by line.
  //
  // $t - the text to process
  // Returns the text with table markup replaced; lines outside any table
  // are passed through untouched.
  //
  // One stack entry per currently open (possibly nested) table:
  $t = explode ( "\n" , $t ) ;
  $td = array () ; # Is a cell currently open?
  $ltd = array () ; # Tag name of that cell: 'td', 'th' or 'caption'
  $tr = array () ; # Is a row currently open?
  $ltr = array () ; # Attribute string waiting for the next <tr>
  $indent_level = 0; # Number of ':' in front of the most recent '{|'
  foreach ( $t AS $k => $x )
  {
    $x = trim ( $x ) ;
    $fc = substr ( $x , 0 , 1 ) ;
    if ( preg_match( '/^(:*)\{\|(.*)$/', $x, $matches ) ) {
      // Table start; leading colons indent it via nested <dl><dd>.
      $indent_level = strlen( $matches[1] );

      $attributes = $this->unstripForHTML( $matches[2] );

      $t[$k] = str_repeat( '<dl><dd>', $indent_level ) .
        '<table' . Sanitizer::fixTagAttributes ( $attributes, 'table' ) . '>' ;
      array_push ( $td , false ) ;
      array_push ( $ltd , '' ) ;
      array_push ( $tr , false ) ;
      array_push ( $ltr , '' ) ;
    }
    else if ( count ( $td ) == 0 ) { } // Don't do any of the following
    else if ( '|}' == substr ( $x , 0 , 2 ) ) {
      // Table end: close the open cell and row, then the table itself.
      $z = "</table>" . substr ( $x , 2);
      $l = array_pop ( $ltd ) ;
      if ( array_pop ( $tr ) ) $z = '</tr>' . $z ;
      if ( array_pop ( $td ) ) $z = '</'.$l.'>' . $z ;
      array_pop ( $ltr ) ;
      $t[$k] = $z . str_repeat( '</dd></dl>', $indent_level );
    }
    else if ( '|-' == substr ( $x , 0 , 2 ) ) { // Allows for |---------------
      // Row separator: whatever follows the dashes is the row's attributes.
      $x = substr ( $x , 1 ) ;
      while ( $x != '' && substr ( $x , 0 , 1 ) == '-' ) $x = substr ( $x , 1 ) ;
      $z = '' ;
      $l = array_pop ( $ltd ) ;
      if ( array_pop ( $tr ) ) $z = '</tr>' . $z ;
      if ( array_pop ( $td ) ) $z = '</'.$l.'>' . $z ;
      array_pop ( $ltr ) ;
      $t[$k] = $z ;
      array_push ( $tr , false ) ;
      array_push ( $td , false ) ;
      array_push ( $ltd , '' ) ;
      $attributes = $this->unstripForHTML( $x );
      array_push ( $ltr , Sanitizer::fixTagAttributes ( $attributes, 'tr' ) ) ;
    }
    else if ( '|' == $fc || '!' == $fc || '|+' == substr ( $x , 0 , 2 ) ) {
      // Data cell(s), header cell(s) or caption.
      if ( '|+' == substr ( $x , 0 , 2 ) ) {
        $fc = '+' ;
        $x = substr ( $x , 1 ) ;
      }
      $after = substr ( $x , 1 ) ;
      // Header rows may use '!!' as the cell separator.
      if ( $fc == '!' ) $after = str_replace ( '!!' , '||' , $after ) ;
      $after = explode ( '||' , $after ) ;
      $t[$k] = '' ;

      # Loop through each table cell
      foreach ( $after AS $theline )
      {
        $z = '' ;
        if ( $fc != '+' )
        {
          // Open a row first if none is open (captions live outside rows).
          $tra = array_pop ( $ltr ) ;
          if ( !array_pop ( $tr ) ) $z = '<tr'.$tra.">\n" ;
          array_push ( $tr , true ) ;
          array_push ( $ltr , '' ) ;
        }

        // Close the previous cell with the tag it was opened with.
        $l = array_pop ( $ltd ) ;
        if ( array_pop ( $td ) ) $z = '</'.$l.'>' . $z ;
        if ( $fc == '|' ) $l = 'td' ;
        else if ( $fc == '!' ) $l = 'th' ;
        else if ( $fc == '+' ) $l = 'caption' ;
        else $l = '' ;
        array_push ( $ltd , $l ) ;

        # Cell parameters
        $y = explode ( '|' , $theline , 2 ) ;
        # Note that a '|' inside an invalid link should not
        # be mistaken as delimiting cell parameters
        if ( strpos( $y[0], '[[' ) !== false ) {
          $y = array ($theline);
        }
        if ( count ( $y ) == 1 )
          $y = "{$z}<{$l}>{$y[0]}" ;
        else {
          $attributes = $this->unstripForHTML( $y[0] );
          $y = "{$z}<{$l}".Sanitizer::fixTagAttributes($attributes, $l).">{$y[1]}" ;
        }
        $t[$k] .= $y ;
        array_push ( $td , true ) ;
      }
    }
  }

  # Closing open td, tr && table
  while ( count ( $td ) > 0 )
  {
    // Use the recorded tag name: the open cell may be a <th> or <caption>,
    // not only a <td>.
    $l = array_pop ( $ltd ) ;
    if ( $l == '' ) $l = 'td' ;
    if ( array_pop ( $td ) ) $t[] = '</'.$l.'>' ;
    if ( array_pop ( $tr ) ) $t[] = '</tr>' ;
    $t[] = '</table>' ;
  }

  $t = implode ( "\n" , $t ) ;
  return $t ;
}
632
/**
 * preg_replace_callback handler for <!--IW_TRANSCLUDE n--> placeholders.
 *
 * @param array $matches $matches[0] is the whole placeholder comment,
 *                       $matches[1] the numeric id captured by the pattern
 * @return string the stored data for that id, or the placeholder itself
 *                when nothing is stored for it
 */
function scarySubstitution($matches) {
  // The id lives in capture group 1. Casting $matches[0] (the full
  // '<!--IW_TRANSCLUDE n-->' text) to int always yields 0.
  $id = isset( $matches[1] ) ? (int)$matches[1] : (int)$matches[0];

  if ( isset( $this->mIWTransData ) && is_array( $this->mIWTransData )
    && isset( $this->mIWTransData[$id] ) ) {
    return $this->mIWTransData[$id];
  }

  // No data for this id: leave the (invisible) comment in place instead of
  // reading an undefined index.
  return $matches[0];
}
636
637 /**
638 * Process [[ ]] wikilinks
639 *
640 * @access private
641 */
function replaceInternalLinks( $s ) {
  // Turn [[target]] and [[target|label]] into links to 'wiki/<target>'.
  //
  // $s - text to process
  // Returns the text with every well-formed internal link replaced by the
  // output of l(); malformed links and links whose target starts with a URL
  // protocol are put back unchanged.
  $wgUrlProtocols = 'http:\/\/|https:\/\/|ftp:\/\/|irc:\/\/|gopher:\/\/|news:|mailto:';
  static $tc = FALSE;
  if ( !$tc ) { $tc = $this->legalChars() . '#%'; }

  // Text re-emitted in front of a '[[' that turns out not to be a link.
  // This method never captures any prefix text, so it is always empty; it
  // has to be defined because the fall-through branches below read it.
  $prefix = '';

  //split the entire text string on occurences of [[
  $a = explode( '[[', ' ' . $s );
  //get the first element (all text up to first [[), and remove the space we added
  $s = array_shift( $a );
  $s = substr( $s, 1 );

  // Match a link having the form [[namespace:link|alternate]]trail
  static $e1 = FALSE;
  if ( !$e1 ) { $e1 = "/^([{$tc}]+)(?:\\|(.+?))?]](.*)\$/sD"; }
  // Loop for each link
  for ($k = 0; isset( $a[$k] ); $k++) {
    $line = $a[$k];
    if ( preg_match( $e1, $line, $m ) ) {// page with normal text or alt
      $text = $m[2];
      // If we get a ] at the beginning of $m[3] that means we have a link that's something like:
      // [[Image:Foo.jpg|[http://example.com desc]]] <- three ] in a row break the match;
      // the real problem is with the $e1 regex
      // See bug 1300.
      //
      // Still some problems for cases where the ] is meant to be outside punctuation,
      // and no image is in sight. See bug 2095.

      if( $text !== '' && preg_match( "/^\](.*)/s", $m[3], $n ) ) {
        $text .= ']'; // so that replaceExternalLinks($text) works later
        $m[3] = $n[1];
      }
      // fix up urlencoded title texts
      if(preg_match('/%/', $m[1] )) $m[1] = urldecode($m[1]);
      $trail = $m[3];
    } else { // Invalid form; output directly
      $s .= $prefix . '[[' . $line ;
      continue;
    }

    // Don't allow internal links to pages containing
    // PROTO: where PROTO is a valid URL protocol; these
    // should be external links.
    if (preg_match('/^(\b(?:'.$wgUrlProtocols.'))/', $m[1])) {
      $s .= $prefix . '[[' . $line ;
      continue;
    }

    $link = $m[1];

    // A leading ':' is not part of the target.
    $noforce = (substr($m[1], 0, 1) != ':');
    if (!$noforce) {
      // Strip off leading ':'
      $link = substr($link, 1);
    }

    // Without an explicit label the target doubles as the link text.
    $wasblank = ( '' == $text );
    if( $wasblank ) $text = $link;

    $wid = new WikiId($link);
    $s .= l($text, 'wiki/'.$wid->toURLString()) . $trail;

  }
  return $s;
}
714
715 /**
716 * Replace external links
717 *
718 * Note: this is all very hackish and the order of execution matters a lot.
719 * Make sure to run maintenance/parserTests.php if you change this code.
720 *
721 * @access private
722 */
function replaceExternalLinks( $text ) {
  // preg_split with captured delimiters yields: leading text, then groups of
  // four pieces per bracketed link -- url, protocol, link text, following text.
  $bits = preg_split( EXT_LINK_BRACKETED, $text, -1, PREG_SPLIT_DELIM_CAPTURE );

  $s = $this->replaceFreeExternalLinks( array_shift( $bits ) );

  for ( $i = 0; $i < count( $bits ); $i += 4 ) {
    $url = $bits[$i];
    // $bits[$i + 1] is the protocol on its own; it is not needed here.
    $label = $bits[$i + 2];
    $trail = $bits[$i + 3];

    // The characters '<' and '>' (which were escaped by
    // removeHTMLtags()) should not be included in
    // URLs, per RFC 2396.
    if (preg_match('/&(lt|gt);/', $url, $m2, PREG_OFFSET_CAPTURE)) {
      $cut = $m2[0][1];
      $label = substr($url, $cut) . ' ' . $label;
      $url = substr($url, 0, $cut);
    }

    // If the link text is an image URL, replace it with an <img> tag
    // This happened by accident in the original parser, but some people used it extensively
    $img = $this->maybeMakeExternalImage( $label );
    if ( $img !== false ) {
      $label = $img;
    }

    $dtrail = '';
    if ( $label == '' ) {
      // No link text, e.g. [http://domain.tld/some.link]
      $label = htmlspecialchars( $url );
    } else {
      // Have link text, e.g. [http://domain.tld/some.link text]s
      // Lowercase letters directly after the bracket are split off the trail.
      list( $dtrail, $trail ) = $this->splitTrail( $trail );
    }

    // Replace &amp; from obsolete syntax with &.
    $url = str_replace( '&amp;', '&', $url );

    // Process the trail (i.e. everything after this link up until start of the next link),
    // replacing any non-bracketed links
    $trail = $this->replaceFreeExternalLinks( $trail );

    $anchor = '<a href="'.url($url).'">'.$label.'</a>';
    $s .= $anchor . $dtrail . $trail;
  }
  return $s;
}
783
/**
 * Split the run of lowercase ASCII letters off the front of $trail.
 *
 * @param string $trail text following a link
 * @return array two elements: the leading letters ('' if none) and the rest
 */
function splitTrail( $trail ) {
  if ( '' == $trail ) {
    return array( '', $trail );
  }
  if ( !preg_match( '/^([a-z]+)(.*)$/sD', $trail, $m ) ) {
    // Does not start with a lowercase letter: nothing to split off.
    return array( '', $trail );
  }
  return array( $m[1], $m[2] );
}
795
796
797 /**
798 * Replace anything that looks like a URL with a link
799 * @access private
800 */
function replaceFreeExternalLinks( $text ) {
  $wgUrlProtocols = 'http:\/\/|https:\/\/|ftp:\/\/|irc:\/\/|gopher:\/\/|news:|mailto:';

  // Pieces alternate: leading text, then (protocol, following text) pairs.
  $bits = preg_split( '/(\b(?:'.$wgUrlProtocols.'))/S', $text, -1, PREG_SPLIT_DELIM_CAPTURE );
  $s = array_shift( $bits );

  for ( $i = 0; $i < count( $bits ); $i += 2 ) {
    $protocol = $bits[$i];
    $remainder = $bits[$i + 1];

    if ( !preg_match( '/^('.EXT_LINK_URL_CLASS.'+)(.*)$/s', $remainder, $m ) ) {
      // A bare protocol with nothing URL-like after it stays as text.
      $s .= $protocol . $remainder;
      continue;
    }

    // Found some characters after the protocol that look promising
    $url = $protocol . $m[1];
    $trail = $m[2];

    // The characters '<' and '>' (which were escaped by
    // removeHTMLtags()) should not be included in
    // URLs, per RFC 2396.
    if (preg_match('/&(lt|gt);/', $url, $m2, PREG_OFFSET_CAPTURE)) {
      $cut = $m2[0][1];
      $trail = substr($url, $cut) . $trail;
      $url = substr($url, 0, $cut);
    }

    // Move trailing punctuation to $trail
    $sep = ',;\.:!?';
    // If there is no left bracket, then consider right brackets fair game too
    if ( strpos( $url, '(' ) === false ) {
      $sep .= ')';
    }
    $numSepChars = strspn( strrev( $url ), $sep );
    if ( $numSepChars ) {
      $trail = substr( $url, -$numSepChars ) . $trail;
      $url = substr( $url, 0, -$numSepChars );
    }

    // Replace &amp; from obsolete syntax with &.
    $url = str_replace( '&amp;', '&', $url );

    // Is this an external image? If not, make an ordinary link.
    $rendered = $this->maybeMakeExternalImage( $url );
    if ( $rendered === false ) {
      $rendered = l($url,$url);
    }
    $s .= $rendered . $trail;
  }
  return $s;
}
855
856 /**
857 * make an image if it's allowed
858 * @access private
859 */
function maybeMakeExternalImage( $url ) {
  // Only URLs matching EXT_IMAGE_REGEX are rendered as images;
  // for everything else the caller gets false.
  if ( !preg_match( EXT_IMAGE_REGEX, $url ) ) {
    return false;
  }
  // Image found
  return theme('image', htmlspecialchars( $url ), '', '', '', false);
}
868
869 /**
870 * Make lists from lines starting with ':', '*', '#', etc.
871 *
872 * @access private
873 * @return string the lists rendered as HTML
874 */
function doBlockLevels( $text, $linestart ) {
  // Parsing through the text line by line. The main thing
  // happening here is handling of block-level elements p, pre,
  // and making lists from lines starting with * # : etc.
  //
  // $text      - text to process
  // $linestart - false when $text begins in the middle of a line; the
  //              first line is then copied to the output untouched
  // Returns the text with list, paragraph and pre markup added.
  //
  // String offsets use [] / substr(): the old $str{n} form is deprecated
  // in PHP 7.4 and no longer parses in PHP 8.
  $textLines = explode( "\n", $text );

  $lastPrefix = $output = '';
  $this->mDTopen = $inBlockElem = false;
  $prefixLength = 0;
  // false, or the paragraph tag(s) held back until a non-blank line arrives
  $paragraphStack = false;

  if ( !$linestart ) {
    $output .= array_shift( $textLines );
  }
  foreach ( $textLines as $oLine ) {
    $lastPrefixLength = strlen( $lastPrefix );
    $preCloseMatch = preg_match('/<\\/pre/i', $oLine );
    $preOpenMatch = preg_match('/<pre/i', $oLine );
    if ( !$this->mInPre ) {
      // Multiple prefixes may abut each other for nested lists.
      $prefixLength = strspn( $oLine, '*#:;' );
      $pref = substr( $oLine, 0, $prefixLength );

      // ';' and ':' belong to the same list type, so compare prefixes
      // with ';' normalised to ':'.
      $pref2 = str_replace( ';', ':', $pref );
      $t = substr( $oLine, $prefixLength );
      $this->mInPre = !empty($preOpenMatch);
    } else {
      // Don't interpret any other prefixes in preformatted text
      $prefixLength = 0;
      $pref = $pref2 = '';
      $t = $oLine;
    }

    // List generation
    if( $prefixLength && 0 == strcmp( $lastPrefix, $pref2 ) ) {
      // Same as the last item, so no need to deal with nesting or opening stuff
      $output .= $this->nextItem( substr( $pref, -1 ) );
      $paragraphStack = false;

      if ( substr( $pref, -1 ) == ';') {
        // The one nasty exception: definition lists work like this:
        // ; title : definition text
        // So we check for : in the remainder text to split up the
        // title and definition, without b0rking links.
        $term = $t2 = '';
        if ($this->findColonNoLinks($t, $term, $t2) !== false) {
          $t = $t2;
          $output .= $term . $this->nextItem( ':' );
        }
      }
    } elseif( $prefixLength || $lastPrefixLength ) {
      // Either open or close a level...
      $commonPrefixLength = $this->getCommon( $pref, $lastPrefix );
      $paragraphStack = false;

      // Close the levels of the previous line that this line does not share.
      while( $commonPrefixLength < $lastPrefixLength ) {
        $output .= $this->closeList( $lastPrefix[$lastPrefixLength-1] );
        --$lastPrefixLength;
      }
      if ( $prefixLength <= $commonPrefixLength && $commonPrefixLength > 0 ) {
        $output .= $this->nextItem( $pref[$commonPrefixLength-1] );
      }
      // Open the levels that are new on this line.
      while ( $prefixLength > $commonPrefixLength ) {
        $char = substr( $pref, $commonPrefixLength, 1 );
        $output .= $this->openList( $char );

        if ( ';' == $char ) {
          // FIXME: This is dupe of code above
          if ($this->findColonNoLinks($t, $term, $t2) !== false) {
            $t = $t2;
            $output .= $term . $this->nextItem( ':' );
          }
        }
        ++$commonPrefixLength;
      }
      $lastPrefix = $pref2;
    }
    if( 0 == $prefixLength ) {
      // No prefix (not in list)--go to paragraph mode
      // XXX: use a stack for nestable elements like span, table and div
      $openmatch = preg_match('/(<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|<p|<ul|<li|<\\/tr|<\\/td|<\\/th)/iS', $t );
      $closematch = preg_match('/(<\\/table|<\\/blockquote|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'.
        '<td|<th|<div|<\\/div|<hr|<\\/pre|<\\/p|'.$this->mUniqPrefix.'-pre|<\\/li|<\\/ul)/iS', $t );
      if ( $openmatch or $closematch ) {
        $paragraphStack = false;
        $output .= $this->closeParagraph();
        if ( $preOpenMatch and !$preCloseMatch ) {
          $this->mInPre = true;
        }
        if ( $closematch ) {
          $inBlockElem = false;
        } else {
          $inBlockElem = true;
        }
      } else if ( !$inBlockElem && !$this->mInPre ) {
        // substr() rather than an offset read: $t may be the empty string.
        if ( ' ' == substr( $t, 0, 1 ) and ( $this->mLastSection == 'pre' or trim($t) != '' ) ) {
          // pre
          if ($this->mLastSection != 'pre') {
            $paragraphStack = false;
            $output .= $this->closeParagraph().'<pre>';
            $this->mLastSection = 'pre';
          }
          $t = substr( $t, 1 );
        } else {
          // paragraph
          if ( '' == trim($t) ) {
            if ( $paragraphStack ) {
              $output .= $paragraphStack.'<br />';
              $paragraphStack = false;
              $this->mLastSection = 'p';
            } else {
              if ($this->mLastSection != 'p' ) {
                $output .= $this->closeParagraph();
                $this->mLastSection = '';
                $paragraphStack = '<p>';
              } else {
                $paragraphStack = '</p><p>';
              }
            }
          } else {
            if ( $paragraphStack ) {
              $output .= $paragraphStack;
              $paragraphStack = false;
              $this->mLastSection = 'p';
            } else if ($this->mLastSection != 'p') {
              $output .= $this->closeParagraph().'<p>';
              $this->mLastSection = 'p';
            }
          }
        }
      }
    }
    // somewhere above we forget to get out of pre block (bug 785)
    if($preCloseMatch && $this->mInPre) {
      $this->mInPre = false;
    }
    // A blank line whose paragraph tag is still held back emits nothing yet.
    if ($paragraphStack === false) {
      $output .= $t."\n";
    }
  }
  // Close whatever list levels the last line left open.
  while ( $prefixLength ) {
    $output .= $this->closeList( $pref2[$prefixLength-1] );
    --$prefixLength;
  }
  if ( '' != $this->mLastSection ) {
    $output .= '</' . $this->mLastSection . '>';
    $this->mLastSection = '';
  }

  return $output;
}
1028
1029 /**#@+
1030 * Used by doBlockLevels()
1031 * @access private
1032 */
/* private */ function closeParagraph() {
  // Produce the closing tag for the section that is currently open (if any),
  // then mark that no section -- and no pre block -- is open any more.
  $closing = ( '' != $this->mLastSection )
    ? '</' . $this->mLastSection . ">\n"
    : '';

  $this->mLastSection = '';
  $this->mInPre = false;

  return $closing;
}
1042 // getCommon() returns the length of the longest common substring
1043 // of both arguments, starting at the beginning of both.
1044 //
1045 /* private */ function getCommon( $st1, $st2 ) {
1046 $fl = strlen( $st1 );
1047 $shorter = strlen( $st2 );
1048