/[drupal]/contributions/modules/transliteration/transliteration.inc
ViewVC logotype

Contents of /contributions/modules/transliteration/transliteration.inc

Parent Directory | Revision Log | View Revision Graph


Revision 1.9 - (show annotations) (download) (as text)
Thu Oct 15 20:01:47 2009 UTC (5 weeks, 6 days ago) by smk
Branch: MAIN
CVS Tags: HEAD
Changes since 1.8: +1 -33 lines
File MIME type: text/x-php
Moved transliteration_clean_filename() to main module.
1 <?php
2 // $Id: transliteration.inc,v 1.8 2009/10/09 12:19:44 smk Exp $
3
/**
 * Transliterate UTF-8 text to ASCII.
 *
 * Based on Mediawiki's UtfNormal::quickIsNFCVerify().
 *
 * Malformed input is never handed to the lookup tables: stray tail bytes,
 * truncated sequences, the bytes 0xfe/0xff, overlong encodings (a code point
 * stored in more bytes than it needs), surrogates and values above U+10FFFF
 * are each replaced by $unknown.
 *
 * @param $string
 *   UTF-8 text input.
 * @param $unknown
 *   Replacement string for characters that do not have a suitable ASCII
 *   equivalent. It is always inserted verbatim.
 * @param $source_langcode
 *   Optional ISO 639 language code that denotes the language of the input.
 *   Used to apply language-specific variations and defaults to the current
 *   display language. If transliteration takes place during output (instead
 *   of creation) and the source language is not known at that time, it is
 *   recommended to set this argument to 'en' to produce consistent results
 *   for all enabled languages.
 * @return
 *   Transliterated text.
 */
function transliteration_process($string, $unknown = '?', $source_langcode = NULL) {
  // Screen out some characters that eg won't be allowed in XML. Splitting and
  // re-joining inserts $unknown literally; handing it to preg_replace() as the
  // replacement would expand "$1" or "\1" style references inside it.
  $string = implode($unknown, preg_split('/[\x00-\x08\x0b\x0c\x0e-\x1f]/', $string));

  // If we're only ever given plain ASCII, we can avoid the overhead of
  // building the lookup tables by skipping out early.
  if (!preg_match('/[\x80-\xff]/', $string)) {
    return $string;
  }

  static $tail_bytes;
  // Smallest code point that legitimately needs the given number of tail
  // bytes; anything below it is an overlong encoding.
  static $minimum = array(1 => 0x80, 2 => 0x800, 3 => 0x10000, 4 => 0x200000, 5 => 0x4000000);

  if (!isset($tail_bytes)) {
    // Each UTF-8 head byte is followed by a certain number of tail bytes.
    // Bytes that cannot start a multi-byte sequence map to 0.
    $tail_bytes = array();
    for ($n = 0; $n < 256; $n++) {
      if ($n < 0xc0) {
        $remaining = 0;
      }
      elseif ($n < 0xe0) {
        $remaining = 1;
      }
      elseif ($n < 0xf0) {
        $remaining = 2;
      }
      elseif ($n < 0xf8) {
        $remaining = 3;
      }
      elseif ($n < 0xfc) {
        $remaining = 4;
      }
      elseif ($n < 0xfe) {
        $remaining = 5;
      }
      else {
        $remaining = 0;
      }
      $tail_bytes[chr($n)] = $remaining;
    }
  }

  // Chop the text into pure-ASCII and non-ASCII areas; large ASCII parts can
  // be handled much more quickly. Don't chop up Unicode areas for punctuation,
  // though, that wastes energy.
  preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches);

  $result = '';
  foreach ($matches[0] as $str) {
    if ($str[0] < "\x80") {
      // ASCII chunk: nothing to transliterate, copy it through.
      $result .= $str;
      continue;
    }

    // The chunk has to be examined byte by byte to make sure it consists of
    // valid UTF-8 sequences.
    $head = '';
    // $len is the number of unread bytes plus one and is pre-decremented
    // before every read; $i is the index of the byte read last.
    $len = strlen($str) + 1;

    for ($i = -1; --$len; ) {
      $c = $str[++$i];
      if ($remaining = $tail_bytes[$c]) {
        // UTF-8 head byte: collect the announced number of tail bytes.
        $sequence = $head = $c;
        do {
          if (--$len && ($c = $str[++$i]) >= "\x80" && $c < "\xc0") {
            // Legal tail byte.
            $sequence .= $c;
          }
          elseif ($len == 0) {
            // Premature end of chunk: emit one replacement for the
            // truncated sequence and move on to the next chunk.
            $result .= $unknown;
            break 2;
          }
          else {
            // Illegal tail byte; abandon the sequence.
            $result .= $unknown;
            // Back up and reprocess this byte; it may itself be a legal
            // ASCII byte or sequence head.
            --$i;
            ++$len;
            continue 2;
          }
        } while (--$remaining);

        // Decode: the head byte carries (6 - tails) payload bits, every tail
        // byte carries six.
        $tails = strlen($sequence) - 1;
        $ord = ord($head) & (0xff >> ($tails + 2));
        for ($k = 1; $k <= $tails; $k++) {
          $ord = ($ord << 6) | (ord($sequence[$k]) & 0x3f);
        }

        if ($ord < $minimum[$tails] || $ord > 0x10ffff || ($ord >= 0xd800 && $ord <= 0xdfff)) {
          // Overlong form, surrogate or beyond the Unicode range. Such a
          // sequence is invalid and must not be looked up, otherwise e.g.
          // "\xc0\xaf" would be accepted as an alias for "/".
          $result .= $unknown;
        }
        else {
          $result .= _transliteration_replace($ord, $unknown, $source_langcode);
        }
        $head = '';
      }
      elseif ($c < "\x80") {
        // ASCII byte.
        $result .= $c;
        $head = '';
      }
      elseif ($c < "\xc0") {
        // Tail byte without a preceding head byte.
        if ($head == '') {
          $result .= $unknown;
        }
      }
      else {
        // 0xfe and 0xff never occur in UTF-8.
        $result .= $unknown;
        $head = '';
      }
    }
  }
  return $result;
}
164
/**
 * Look up the ASCII replacement for a single Unicode code point.
 *
 * Replacement tables are stored in one data file per block of 256 code
 * points. A table is read on first use and kept for the rest of the request,
 * separately for every language code.
 *
 * @param $ord
 *   An ordinal Unicode character code.
 * @param $unknown
 *   Replacement string for characters that do not have a suitable ASCII
 *   equivalent.
 * @param $langcode
 *   Optional ISO 639 language code that denotes the language of the input.
 *   Used to apply language-specific optimizations. Defaults to the current
 *   display language.
 * @return
 *   ASCII replacement character.
 */
function _transliteration_replace($ord, $unknown = '?', $langcode = NULL) {
  static $tables = array();

  if (is_null($langcode)) {
    // Fall back to the language of the current page.
    global $language;
    $langcode = $language->language;
  }

  // High bits select the data file, the low eight bits the entry inside it.
  $page = $ord >> 8;
  $slot = $ord & 255;

  if (!isset($tables[$page][$langcode])) {
    $table = array();
    $path = drupal_get_path('module', 'transliteration') . '/data/' . sprintf('x%02x', $page) . '.php';
    if (file_exists($path)) {
      // The data file defines $base and, optionally, $variant.
      include $path;
      $table = $base;
      if ($langcode != 'en' && isset($variant[$langcode])) {
        // Language specific mappings take precedence over the base table.
        $table = $variant[$langcode] + $base;
      }
    }
    // A missing file is remembered as an empty table so it is probed once.
    $tables[$page][$langcode] = $table;
  }

  if (isset($tables[$page][$langcode][$slot])) {
    return $tables[$page][$langcode][$slot];
  }
  return $unknown;
}
211

  ViewVC Help
Powered by ViewVC 1.1.2