| 1 |
<?php |
<?php |
| 2 |
// $Id: transliteration.inc,v 1.2.2.2 2009/06/07 19:05:50 smk Exp $ |
// $Id: transliteration.inc,v 1.2.2.3 2009/09/05 22:25:01 smk Exp $ |
| 3 |
|
|
| 4 |
/** |
/** |
| 5 |
* Sanitize a file name. |
* Sanitize a file name. |
| 6 |
* |
* |
| 7 |
* Transliterates the file name and removes all problematic characters. |
* Transliterates the file name and removes invalid characters. |
| 8 |
* |
* |
| 9 |
* @param string $filename |
* @param $filename |
| 10 |
* A file name. |
* A file name. |
| 11 |
* @param string $langcode |
* @param $source_langcode |
| 12 |
* Optional ISO 639 language code used to import language specific |
* Optional ISO 639 language code that denotes the language of the input. |
| 13 |
* replacements. Defaults to the current display language. |
* Used to apply language-specific variations and defaults to the current |
| 14 |
* |
* display language. If transliteration takes place during output (instead |
| 15 |
* @return string |
* of creation) and the source language is not known at that time, it is |
| 16 |
|
* recommended to set this argument to 'en' to produce consistent results |
| 17 |
|
* for all enabled languages. |
| 18 |
|
* @return |
| 19 |
* Cleaned file name. |
* Cleaned file name. |
| 20 |
*/ |
*/ |
| 21 |
function transliteration_clean_filename($filename, $langcode = NULL) { |
function transliteration_clean_filename($filename, $source_langcode = NULL) { |
| 22 |
// Trim any leading/trailing dots. |
// Trim any leading/trailing dots. |
| 23 |
$filename = trim($filename, '.'); |
$filename = trim($filename, '.'); |
| 24 |
// Transliterate to ASCII. |
// Transliterate to ASCII. |
| 25 |
$filename = transliteration_process($filename, '', $langcode); |
$filename = transliteration_process($filename, '', $source_langcode); |
| 26 |
// Replace whitespace. |
// Replace whitespace. |
| 27 |
$filename = str_replace(' ', '_', $filename); |
$filename = str_replace(' ', '_', $filename); |
| 28 |
// Remove any remaining non-safe characters. |
// Remove any remaining non-safe characters. |
| 34 |
} |
} |
| 35 |
|
|
| 36 |
/** |
/** |
| 37 |
* Transliterate UTF-8 input to plain ASCII. |
* Transliterate UTF-8 text to ASCII. |
| 38 |
* |
* |
| 39 |
* Based on Mediawiki's UtfNormal::quickIsNFCVerify(). |
* Based on Mediawiki's UtfNormal::quickIsNFCVerify(). |
| 40 |
* |
* |
| 41 |
* @param string $string |
* @param $string |
| 42 |
* UTF-8 text input. |
* UTF-8 text input. |
| 43 |
* @param string $unknown |
* @param $unknown |
| 44 |
* Replacement for unknown characters and illegal UTF-8 sequences. |
* Replacement string for characters that do not have a suitable ASCII |
| 45 |
* @param string $langcode |
* equivalent. |
| 46 |
* Optional ISO 639 language code used to import language specific |
* @param $source_langcode |
| 47 |
* replacements. Defaults to the current display language. |
* Optional ISO 639 language code that denotes the language of the input. |
| 48 |
* |
* Used to apply language-specific variations and defaults to the current |
| 49 |
* @return string |
* display language. If transliteration takes place during output (instead |
| 50 |
* Plain ASCII output. |
* of creation) and the source language is not known at that time, it is |
| 51 |
* @see transliteration_get() |
* recommended to set this argument to 'en' to produce consistent results |
| 52 |
|
* for all enabled languages. |
| 53 |
|
* @return |
| 54 |
|
* Transliterated text. |
| 55 |
*/ |
*/ |
| 56 |
function transliteration_process($string, $unknown = '?', $langcode = NULL) { |
function transliteration_process($string, $unknown = '?', $source_langcode = NULL) { |
| 57 |
// Screen out some characters that, e.g., won't be allowed in XML. |
// Screen out some characters that, e.g., won't be allowed in XML. |
| 58 |
$string = preg_replace('/[\x00-\x08\x0b\x0c\x0e-\x1f]/', $unknown, $string); |
$string = preg_replace('/[\x00-\x08\x0b\x0c\x0e-\x1f]/', $unknown, $string); |
| 59 |
|
|
| 170 |
else if ($n <= 0xfd) { |
else if ($n <= 0xfd) { |
| 171 |
$ord = ($n - 252) * 1073741824 + (ord($sequence{1}) - 128) * 16777216 + (ord($sequence{2}) - 128) * 262144 + (ord($sequence{3}) - 128) * 4096 + (ord($sequence{4}) - 128) * 64 + (ord($sequence{5}) - 128); |
$ord = ($n - 252) * 1073741824 + (ord($sequence{1}) - 128) * 16777216 + (ord($sequence{2}) - 128) * 262144 + (ord($sequence{3}) - 128) * 4096 + (ord($sequence{4}) - 128) * 64 + (ord($sequence{5}) - 128); |
| 172 |
} |
} |
| 173 |
$result .= _transliteration_replace($ord, $unknown, $langcode); |
$result .= _transliteration_replace($ord, $unknown, $source_langcode); |
| 174 |
$head = ''; |
$head = ''; |
| 175 |
} |
} |
| 176 |
elseif ($c < "\x80") { |
elseif ($c < "\x80") { |
| 195 |
} |
} |
| 196 |
|
|
| 197 |
/** |
/** |
| 198 |
* Lookup and replace a character from the transliteration database. |
* Load the transliteration database and replace a Unicode character. |
|
* |
|
|
* @param integer $ord |
|
|
* A Unicode ordinal character code.
|
|
* @param string $unknown |
|
|
* Replacement for unknown characters. |
|
|
* @param string $langcode |
|
|
* Optional ISO 639 language code used to import language specific |
|
|
* replacements. Defaults to the current display language. |
|
| 199 |
* |
* |
| 200 |
* @return string |
* @param $ord |
| 201 |
* Plain ASCII replacement character. |
* An ordinal Unicode character code. |
| 202 |
* @see transliteration_get() |
* @param $unknown |
| 203 |
|
* Replacement string for characters that do not have a suitable ASCII |
| 204 |
|
* equivalent. |
| 205 |
|
* @param $langcode |
| 206 |
|
* Optional ISO 639 language code that denotes the language of the input. |
| 207 |
|
* Used to apply language-specific optimizations. Defaults to the current |
| 208 |
|
* display language. |
| 209 |
|
* @return |
| 210 |
|
* ASCII replacement character. |
| 211 |
*/ |
*/ |
| 212 |
function _transliteration_replace($ord, $unknown = '?', $langcode = NULL) { |
function _transliteration_replace($ord, $unknown = '?', $langcode = NULL) { |
| 213 |
if (!isset($langcode)) { |
if (!isset($langcode)) { |