<?php
// $Id: transliteration.inc,v 1.8 2009/10/09 12:19:44 smk Exp $

/**
 * Transliterate UTF-8 text to ASCII.
 *
 * Based on Mediawiki's UtfNormal::quickIsNFCVerify().
 *
 * Control characters that are not allowed in XML are replaced with $unknown
 * first. Input that contains only ASCII bytes is then returned as is.
 * Otherwise the text is scanned byte by byte: ASCII bytes are copied, every
 * well-formed UTF-8 sequence is decoded to its code point and handed to
 * _transliteration_replace(), and every malformed sequence (stray or missing
 * tail bytes, the bytes 0xFE and 0xFF, overlong forms, 5 and 6 byte forms,
 * UTF-16 surrogates and code points above U+10FFFF) yields one $unknown.
 *
 * @param $string
 *   UTF-8 text input.
 * @param $unknown
 *   Replacement string for characters that do not have a suitable ASCII
 *   equivalent. It is always inserted literally.
 * @param $source_langcode
 *   Optional ISO 639 language code that denotes the language of the input.
 *   Used to apply language-specific variations and defaults to the current
 *   display language. If transliteration takes place during output (instead
 *   of creation) and the source language is not known at that time, it is
 *   recommended to set this argument to 'en' to produce consistent results
 *   for all enabled languages.
 * @return
 *   Transliterated text.
 */
function transliteration_process($string, $unknown = '?', $source_langcode = NULL) {
  // Screen out some characters that eg won't be allowed in XML. Splitting and
  // re-joining (instead of preg_replace()) keeps sequences such as '$0' or
  // '\1' inside $unknown from being interpreted as backreferences.
  $string = implode($unknown, preg_split('/[\x00-\x08\x0b\x0c\x0e-\x1f]/', $string));

  // ASCII is always valid NFC! If we're only ever given plain ASCII, we can
  // avoid the overhead of initializing the tables by skipping out early.
  if (!preg_match('/[\x80-\xff]/', $string)) {
    return $string;
  }

  // Number of tail bytes announced by each possible byte value. Bytes that
  // cannot start a multi-byte sequence (ASCII, tail bytes, 0xFE, 0xFF) map
  // to 0.
  static $tail_bytes;
  // Smallest code point that legitimately needs a sequence of the given
  // total length; anything below it is an overlong encoding.
  static $min_code_point = array(2 => 0x80, 3 => 0x800, 4 => 0x10000);

  if (!isset($tail_bytes)) {
    $tail_bytes = array();
    for ($n = 0; $n < 256; $n++) {
      if ($n < 0xc0) {
        $remaining = 0;
      }
      elseif ($n < 0xe0) {
        $remaining = 1;
      }
      elseif ($n < 0xf0) {
        $remaining = 2;
      }
      elseif ($n < 0xf8) {
        $remaining = 3;
      }
      elseif ($n < 0xfc) {
        $remaining = 4;
      }
      elseif ($n < 0xfe) {
        $remaining = 5;
      }
      else {
        $remaining = 0;
      }
      $tail_bytes[chr($n)] = $remaining;
    }
  }

  // Chop the text into pure-ASCII and non-ASCII areas; large ASCII parts can
  // be handled much more quickly. A non-ASCII area starts at a high byte and
  // runs until the next ASCII letter, so it is not chopped up for
  // punctuation.
  preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches);

  $result = '';
  foreach ($matches[0] as $str) {
    if ($str[0] < "\x80") {
      // ASCII chunk: nothing to transliterate, copy it in one go.
      $result .= $str;
      continue;
    }

    $length = strlen($str);
    for ($i = 0; $i < $length; $i++) {
      $c = $str[$i];
      $remaining = $tail_bytes[$c];

      if (!$remaining) {
        // Not a head byte. ASCII passes through untouched; a stray tail byte
        // (0x80-0xBF) or one of 0xFE/0xFF becomes one replacement.
        $result .= ($c < "\x80") ? $c : $unknown;
        continue;
      }

      // UTF-8 head byte: collect the announced number of tail bytes.
      $sequence = $c;
      while ($remaining > 0) {
        if ($i + 1 >= $length) {
          // Premature end of the chunk. Drop a replacement into the output
          // to represent the truncated sequence and move on to the next
          // chunk.
          $result .= $unknown;
          continue 3;
        }
        $c = $str[$i + 1];
        if ($c < "\x80" || $c >= "\xc0") {
          // Illegal tail byte; abandon the sequence. $i is left untouched so
          // that the enclosing for loop advances onto this byte and
          // reprocesses it; it may itself be ASCII or a sequence head.
          $result .= $unknown;
          continue 2;
        }
        $sequence .= $c;
        $i++;
        $remaining--;
      }

      $size = strlen($sequence);
      if ($size > 4) {
        // 5 and 6 byte forms can only encode overlong values or code points
        // beyond U+10FFFF, neither of which is valid UTF-8.
        $result .= $unknown;
        continue;
      }

      // Decode: strip the length marker bits from the head byte, then append
      // the six payload bits of every tail byte.
      $ord = ord($sequence[0]) & (0xff >> ($size + 1));
      for ($k = 1; $k < $size; $k++) {
        $ord = ($ord << 6) | (ord($sequence[$k]) & 0x3f);
      }

      if ($ord < $min_code_point[$size] || ($ord >= 0xd800 && $ord <= 0xdfff) || $ord > 0x10ffff) {
        // Overlong form, UTF-16 surrogate, or out of the Unicode range.
        $result .= $unknown;
      }
      else {
        $result .= _transliteration_replace($ord, $unknown, $source_langcode);
      }
    }
  }

  return $result;
}

/**
 * Load the transliteration database and replace a Unicode character.
 *
 * The database is split into banks of 256 code points. The bank for a code
 * point is read from data/xNN.php inside the transliteration module
 * directory, where NN is the code point shifted right by eight bits in
 * hexadecimal. After inclusion the file's $base array (low byte of the code
 * point => replacement) and, if present, its $variant array (language code
 * => overrides) are used. The resulting table is cached per bank and
 * language for the rest of the request.
 *
 * @param $ord
 *   A ordinal Unicode character code.
 * @param $unknown
 *   Replacement string for characters that do not have a suitable ASCII
 *   equivalent.
 * @param $langcode
 *   Optional ISO 639 language code that denotes the language of the input.
 *   Used to apply language-specific optimizations. Defaults to the current
 *   display language, or to 'en' if no display language is available.
 * @return
 *   ASCII replacement character.
 */
function _transliteration_replace($ord, $unknown = '?', $langcode = NULL) {
  static $map = array();

  if (!isset($langcode)) {
    global $language;
    // Without a display language fall back to 'en', which selects the base
    // table without language specific overrides.
    $langcode = isset($language->language) ? $language->language : 'en';
  }

  $bank = $ord >> 8;

  if (!isset($map[$bank][$langcode])) {
    // Start from empty tables so that a missing data file, or one that does
    // not define these variables, still caches a usable (empty) result.
    $base = array();
    $variant = array();

    $file = drupal_get_path('module', 'transliteration') . '/data/' . sprintf('x%02x', $bank) . '.php';
    if (file_exists($file)) {
      include $file;
    }
    if (!is_array($base)) {
      $base = array();
    }

    if ($langcode != 'en' && isset($variant[$langcode]) && is_array($variant[$langcode])) {
      // Merge in language specific mappings; they take precedence over the
      // base table.
      $map[$bank][$langcode] = $variant[$langcode] + $base;
    }
    else {
      $map[$bank][$langcode] = $base;
    }
  }

  // Position of the character within its bank.
  $ord = $ord & 255;

  return isset($map[$bank][$langcode][$ord]) ? $map[$bank][$langcode][$ord] : $unknown;
}