| 1 |
<?php
|
| 2 |
// $Id: unicode-conversion.php,v 1.5 2009/10/20 09:19:03 mikl Exp $
|
| 3 |
|
| 4 |
/**
 * Return the unicode conversion maps.
 *
 * Within each map, a longer ASCII sequence is listed before the shorter
 * sequences it contains (e.g. 'ffi' before 'ff' and 'fi', '---' before '--').
 *
 * @param string $type
 *   The map type we're looking for, one of 'ligature', 'punctuation',
 *   'arrow', 'nested' or 'all'.
 * @return array
 *   Array of conversions, keyed by the original string. For 'nested', the
 *   individual maps keyed by map type. An unknown type yields an empty array.
 */
function unicode_conversion_map($type = 'all') {
  $map = array(
    // See http://www.unicode.org/charts/PDF/UFB00.pdf
    // The ligatures are written as UTF-8 byte escapes on purpose: tools that
    // apply compatibility folding to this file would otherwise turn each
    // ligature back into its ASCII spelling and make the map a no-op.
    'ligature' => array(
      'ffi' => "\xEF\xAC\x83", // U+FB03 LATIN SMALL LIGATURE FFI
      'ffl' => "\xEF\xAC\x84", // U+FB04 LATIN SMALL LIGATURE FFL
      'ff' => "\xEF\xAC\x80", // U+FB00 LATIN SMALL LIGATURE FF
      'fi' => "\xEF\xAC\x81", // U+FB01 LATIN SMALL LIGATURE FI
      'fl' => "\xEF\xAC\x82", // U+FB02 LATIN SMALL LIGATURE FL
      'ij' => "\xC4\xB3", // U+0133 LATIN SMALL LIGATURE IJ
      'IJ' => "\xC4\xB2", // U+0132 LATIN CAPITAL LIGATURE IJ
      'st' => "\xEF\xAC\x86", // U+FB06 LATIN SMALL LIGATURE ST
      'ss' => "\xC3\x9F", // U+00DF LATIN SMALL LETTER SHARP S
    ),
    // See http://www.unicode.org/charts/PDF/U2000.pdf
    'punctuation' => array(
      '...' => '…',
      '..' => '‥',
      '. . .' => '…',
      '---' => '—',
      '--' => '–',
    ),
    // See http://www.unicode.org/charts/PDF/U2190.pdf
    'arrow' => array(
      '->>' => '↠',
      '<<-' => '↞',
      '->|' => '⇥',
      '|<-' => '⇤',
      '<->' => '↔',
      '->' => '→',
      '<-' => '←',
      '<=>' => '⇔',
      '=>' => '⇒',
      '<=' => '⇐',
    ),
  );

  if ($type === 'all') {
    // Flatten the individual maps into a single lookup table.
    return array_merge($map['ligature'], $map['arrow'], $map['punctuation']);
  }
  if ($type === 'nested') {
    return $map;
  }

  // A single named map; fall back to an empty array for unknown types so
  // that the documented array return type always holds.
  return isset($map[$type]) ? $map[$type] : array();
}
|
| 60 |
|
| 61 |
/**
 * Perform character conversion.
 *
 * The text is tokenized with _TokenizeHTML(). Tags are copied through
 * untouched, and so is any text that follows an opening tag matched by
 * SMARTYPANTS_TAGS_TO_SKIP until the matching closing tag is seen
 * (presumably <pre>, <code> and similar; the pattern is defined by
 * SmartyPants, outside this file).
 *
 * @param string $text
 *   Text to be parsed.
 * @param array $characters_to_convert
 *   Array of ASCII strings to convert. Each entry should be a key of the map
 *   returned by unicode_conversion_map(); entries without a mapping are
 *   ignored.
 * @return string
 *   The result of the conversion.
 */
function convert_characters($text, $characters_to_convert) {
  if (($characters_to_convert == NULL) || (count($characters_to_convert) < 1)) {
    // Nothing requested, nothing to do.
    return $text;
  }

  // Build the ASCII => unicode pairs for the requested strings only. Strings
  // that have no mapping are skipped; replacing them with an undefined value
  // would silently delete the matched text.
  $unicode_map = unicode_conversion_map();
  $replacements = array();
  foreach ($characters_to_convert as $ascii_string) {
    if (is_string($ascii_string) && isset($unicode_map[$ascii_string])) {
      $replacements[$ascii_string] = $unicode_map[$ascii_string];
    }
  }

  $result = '';
  // TRUE while we are between an opening and a closing "skip" tag.
  $in_skipped_tag = FALSE;
  foreach (_TokenizeHTML($text) as $token) {
    $value = $token[1];

    if ($token[0] == "tag") {
      // Never touch the markup itself.
      $result .= $value;
      // Get the tags to skip regex from SmartyPants. Capture group 1 holds
      // the '/' of a closing tag, which ends the protected region.
      if (preg_match(SMARTYPANTS_TAGS_TO_SKIP, $value, $matches)) {
        $in_skipped_tag = !(isset($matches[1]) && $matches[1] == '/');
      }
      continue;
    }

    if (!$in_skipped_tag) {
      $value = ProcessEscapes($value);
      if ($replacements) {
        // strtr() always tries the longest key first and never rescans its
        // own output, so the result does not depend on the order in which
        // the caller listed the strings ('--' before '---', '->' before
        // '<->', ...).
        $value = strtr($value, $replacements);
      }
    }
    $result .= $value;
  }

  return $result;
}
|
| 106 |
|
| 107 |
|
| 108 |
// _TokenizeHTML is shared between PHP SmartyPants and PHP Markdown.
// We're borrowing it for Typogrify.module, too.
// We only define it if it is not already defined.
if (!function_exists('_TokenizeHTML')) {
  /**
   * Split a string of HTML markup into tag and text tokens.
   *
   * Regular expression derived from the _tokenize() subroutine in
   * Brad Choate's MTRegex plugin.
   * <http://www.bradchoate.com/past/mtregex.php>
   *
   * @param string $str
   *   String containing HTML markup.
   * @return array
   *   An array of the tokens comprising the input string, in document order.
   *   Each token is either a tag (possibly with nested tags contained
   *   therein, such as <a href="<MTFoo>">), or a run of text between tags.
   *   Each element of the array is a two-element array; the first is either
   *   'tag' or 'text'; the second is the actual value. Empty runs of text
   *   are not reported.
   */
  function _TokenizeHTML($str) {
    // Comment.
    $match = '(?s:<!(?:--.*?--\s*)+>)|'
      // Processing instruction.
      . '(?s:<\?.*?\?>)|'
      // Regular tags.
      . '(?:<[/!$]?[-a-zA-Z0-9:]+\b(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*>)';

    // The single capture group makes preg_split() hand back the delimiters
    // (the tags) as well, so the parts alternate text, tag, text, tag, ...
    $parts = preg_split('{(' . $match . ')}', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

    $tokens = array();
    foreach ($parts as $position => $part) {
      if ($position % 2) {
        // Odd positions are the captured delimiters.
        $tokens[] = array('tag', $part);
      }
      elseif ($part !== '') {
        // Even positions are the text in between; an empty run (before a
        // leading tag, between adjacent tags, after a trailing tag) carries
        // no content and must not be reported as a tag.
        $tokens[] = array('text', $part);
      }
    }
    return $tokens;
  }
}
|
| 146 |
|