| 1 |
<?php |
<?php |
| 2 |
// $Id: transliteration.inc,v 1.6 2009/08/25 14:44:25 smk Exp $ |
// $Id: transliteration.inc,v 1.7 2009/09/05 22:24:26 smk Exp $ |
| 3 |
|
|
| 4 |
/** |
/** |
| 5 |
* Sanitize a file name. |
* Sanitize a file name. |
| 8 |
* |
* |
| 9 |
* @param $filename |
* @param $filename |
| 10 |
* A file name. |
* A file name. |
| 11 |
* @param $langcode |
* @param $source_langcode |
| 12 |
* Optional ISO 639 language code that denotes the language of the input. |
* Optional ISO 639 language code that denotes the language of the input. |
| 13 |
* Used to apply language-specific optimizations. Defaults to the current |
* Used to apply language-specific variations and defaults to the current |
| 14 |
* display language. |
* display language. If transliteration takes place during output (instead |
| 15 |
|
* of creation) and the source language is not known at that time, it is |
| 16 |
|
* recommended to set this argument to 'en' to produce consistent results |
| 17 |
|
* for all enabled languages. |
| 18 |
* @return |
* @return |
| 19 |
* Cleaned file name. |
* Cleaned file name. |
| 20 |
*/ |
*/ |
| 21 |
function transliteration_clean_filename($filename, $langcode = NULL) { |
function transliteration_clean_filename($filename, $source_langcode = NULL) { |
| 22 |
// Trim any leading/trailing dots. |
// Trim any leading/trailing dots. |
| 23 |
$filename = trim($filename, '.'); |
$filename = trim($filename, '.'); |
| 24 |
// Transliterate to ASCII. |
// Transliterate to ASCII. |
| 25 |
$filename = transliteration_process($filename, '', $langcode); |
$filename = transliteration_process($filename, '', $source_langcode); |
| 26 |
// Replace whitespace. |
// Replace whitespace. |
| 27 |
$filename = str_replace(' ', '_', $filename); |
$filename = str_replace(' ', '_', $filename); |
| 28 |
// Remove any remaining non-safe characters. |
// Remove any remaining non-safe characters. |
| 43 |
* @param $unknown |
* @param $unknown |
| 44 |
* Replacement string for characters that do not have a suitable ASCII |
* Replacement string for characters that do not have a suitable ASCII |
| 45 |
* equivalent. |
* equivalent. |
| 46 |
* @param $langcode |
* @param $source_langcode |
| 47 |
* Optional ISO 639 language code that denotes the language of the input. |
* Optional ISO 639 language code that denotes the language of the input. |
| 48 |
* Used to apply language-specific optimizations. Defaults to the current |
* Used to apply language-specific variations and defaults to the current |
| 49 |
* display language. |
* display language. If transliteration takes place during output (instead |
| 50 |
|
* of creation) and the source language is not known at that time, it is |
| 51 |
|
* recommended to set this argument to 'en' to produce consistent results |
| 52 |
|
* for all enabled languages. |
| 53 |
* @return |
* @return |
| 54 |
* Transliterated text. |
* Transliterated text. |
| 55 |
*/ |
*/ |
| 56 |
function transliteration_process($string, $unknown = '?', $langcode = NULL) { |
function transliteration_process($string, $unknown = '?', $source_langcode = NULL) { |
| 57 |
// Screen out some characters that, e.g., won't be allowed in XML. |
// Screen out some characters that, e.g., won't be allowed in XML. |
| 58 |
$string = preg_replace('/[\x00-\x08\x0b\x0c\x0e-\x1f]/', $unknown, $string); |
$string = preg_replace('/[\x00-\x08\x0b\x0c\x0e-\x1f]/', $unknown, $string); |
| 59 |
|
|
| 158 |
if ($n <= 0xdf) { |
if ($n <= 0xdf) { |
| 159 |
$ord = ($n - 192) * 64 + (ord($sequence[1]) - 128); |
$ord = ($n - 192) * 64 + (ord($sequence[1]) - 128); |
| 160 |
} |
} |
| 161 |
else if ($n <= 0xef) { |
elseif ($n <= 0xef) { |
| 162 |
$ord = ($n - 224) * 4096 + (ord($sequence[1]) - 128) * 64 + (ord($sequence[2]) - 128); |
$ord = ($n - 224) * 4096 + (ord($sequence[1]) - 128) * 64 + (ord($sequence[2]) - 128); |
| 163 |
} |
} |
| 164 |
else if ($n <= 0xf7) { |
elseif ($n <= 0xf7) { |
| 165 |
$ord = ($n - 240) * 262144 + (ord($sequence[1]) - 128) * 4096 + (ord($sequence[2]) - 128) * 64 + (ord($sequence[3]) - 128); |
$ord = ($n - 240) * 262144 + (ord($sequence[1]) - 128) * 4096 + (ord($sequence[2]) - 128) * 64 + (ord($sequence[3]) - 128); |
| 166 |
} |
} |
| 167 |
else if ($n <= 0xfb) { |
elseif ($n <= 0xfb) { |
| 168 |
$ord = ($n - 248) * 16777216 + (ord($sequence[1]) - 128) * 262144 + (ord($sequence[2]) - 128) * 4096 + (ord($sequence[3]) - 128) * 64 + (ord($sequence[4]) - 128); |
$ord = ($n - 248) * 16777216 + (ord($sequence[1]) - 128) * 262144 + (ord($sequence[2]) - 128) * 4096 + (ord($sequence[3]) - 128) * 64 + (ord($sequence[4]) - 128); |
| 169 |
} |
} |
| 170 |
else if ($n <= 0xfd) { |
elseif ($n <= 0xfd) { |
| 171 |
$ord = ($n - 252) * 1073741824 + (ord($sequence[1]) - 128) * 16777216 + (ord($sequence[2]) - 128) * 262144 + (ord($sequence[3]) - 128) * 4096 + (ord($sequence[4]) - 128) * 64 + (ord($sequence[5]) - 128); |
$ord = ($n - 252) * 1073741824 + (ord($sequence[1]) - 128) * 16777216 + (ord($sequence[2]) - 128) * 262144 + (ord($sequence[3]) - 128) * 4096 + (ord($sequence[4]) - 128) * 64 + (ord($sequence[5]) - 128); |
| 172 |
} |
} |
| 173 |
$result .= _transliteration_replace($ord, $unknown, $langcode); |
$result .= _transliteration_replace($ord, $unknown, $source_langcode); |
| 174 |
$head = ''; |
$head = ''; |
| 175 |
} |
} |
| 176 |
elseif ($c < "\x80") { |
elseif ($c < "\x80") { |