| 1 |
<?php
|
| 2 |
// $Id$
|
| 3 |
/*
|
| 4 |
* Spanish stemmer algorithm adaptation for Drupal
|
| 5 |
* Based on the porterstemmer Drupal module
|
| 6 |
*
|
| 7 |
* Author:
|
| 8 |
* Gonzalo González Rodríguez gonzalo.koeln 2008-12-29
|
| 9 |
* Algorithm:
|
| 10 |
* Paolo Ragone (c) 2007 (http://stemmer-es.sourceforge.net/), based on Dr Martin Porter http://snowball.tartarus.org/
|
| 11 |
*
|
| 12 |
*/
|
| 13 |
|
| 14 |
define('SPANISHSTEMMER_CHARS','/([^a-zA-ZáéíóúüñÁÉÍÓÚÜÑ]+)/');
|
| 15 |
|
| 16 |
/**
 * Implementation of hook_search_preprocess().
 *
 * Splits the text into words and separators, stems every word and glues the
 * pieces back together so that the separators are preserved untouched.
 *
 * The text is taken by value: it is never modified in place, and a
 * by-reference parameter cannot receive the plain value that
 * call_user_func_array()-style hook invocation passes.
 *
 * @param string $text Text to preprocess
 * @return string The text with every word replaced by its stem
 */
function spanishstemmer_search_preprocess($text) {
  // With PREG_SPLIT_DELIM_CAPTURE the result alternates word, separator,
  // word, ... so words always sit on the even indexes.
  $pieces = preg_split(SPANISHSTEMMER_CHARS, $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  if ($pieces === false) {
    // preg_split() reports a PCRE error with FALSE; hand the text back
    // unchanged rather than feeding FALSE to foreach/implode.
    return $text;
  }

  foreach ($pieces as $index => $piece) {
    if ($index % 2 == 0) {
      $pieces[$index] = spanishstemmer_stem($piece);
    }
  }

  return implode('', $pieces);
}
|
| 35 |
|
| 36 |
/**
 * Implementation of hook_help().
 *
 * Supplies the one-line module description for the
 * 'admin/modules#description' section; nothing is returned for any other
 * section.
 *
 * @param string $section Help section being requested
 * @return string|null Translated description, or NULL for other sections
 */
function spanishstemmer_help($section = 'admin/help#search') {
  // Loose comparison on purpose: it is what the switch statement used.
  if ($section == 'admin/modules#description') {
    return t('Implements the Porter-stemmer algorithm to improve searching in Spanish.');
  }
}
|
| 45 |
|
| 46 |
/**
 * Tells whether a single character is a lowercase Spanish vowel.
 *
 * Plain vowels, acute-accented vowels and ü count as vowels. The argument
 * has to be the complete character: one byte of a multi-byte letter will
 * not match.
 *
 * @param string $c Character to test
 * @return bool TRUE if $c is a vowel
 */
function _spanishstemmer_is_vowel($c) {
  $vowels = array('a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'ü');
  // Non-strict search, i.e. the same "==" comparison against each vowel.
  return in_array($c, $vowels);
}
|
| 53 |
|
| 54 |
/**
 * Look for the position of the next vowel in the word.
 *
 * The word is read as UTF-8: each character is handed to
 * _spanishstemmer_is_vowel() as a whole, so accented vowels (two bytes
 * each) are recognised. Positions are byte offsets, matching the
 * strlen()/substr() arithmetic used by callers.
 *
 * @param string $word Word to scan (assumed to be UTF-8)
 * @param int $start Byte offset where the scan begins
 * @return int Byte offset of the first byte of the next vowel, or
 *   strlen($word) if there is none
 */
function _spanishstemmer_getNextVowelPos($word, $start = 0) {
  $len = strlen($word);
  for ($i = $start; $i < $len; $i++) {
    $byte = ord($word[$i]);
    if (($byte & 0xC0) == 0x80) {
      // Continuation byte: it belongs to a character that started earlier
      // (e.g. when $start points into the middle of a character).
      continue;
    }
    // Width of the UTF-8 sequence announced by its lead byte.
    if ($byte >= 0xF0) {
      $width = 4;
    }
    elseif ($byte >= 0xE0) {
      $width = 3;
    }
    elseif ($byte >= 0xC0) {
      $width = 2;
    }
    else {
      $width = 1;
    }
    if (_spanishstemmer_is_vowel(substr($word, $i, $width))) {
      return $i;
    }
  }
  return $len;
}
|
| 63 |
|
| 64 |
/**
 * Look for the position of the next consonant in the word.
 *
 * "Consonant" means any character that _spanishstemmer_is_vowel() rejects.
 * The word is read as UTF-8 so that the two bytes of an accented vowel are
 * tested together instead of being reported as consonants. Positions are
 * byte offsets, matching the strlen()/substr() arithmetic used by callers.
 *
 * @param string $word Word to scan (assumed to be UTF-8)
 * @param int $start Byte offset where the scan begins
 * @return int Byte offset of the first byte of the next non-vowel, or
 *   strlen($word) if there is none
 */
function _spanishstemmer_getNextConsonantPos($word, $start = 0) {
  $len = strlen($word);
  for ($i = $start; $i < $len; $i++) {
    $byte = ord($word[$i]);
    if (($byte & 0xC0) == 0x80) {
      // Continuation byte: it belongs to a character that started earlier
      // (e.g. when $start points into the middle of a character).
      continue;
    }
    // Width of the UTF-8 sequence announced by its lead byte.
    if ($byte >= 0xF0) {
      $width = 4;
    }
    elseif ($byte >= 0xE0) {
      $width = 3;
    }
    elseif ($byte >= 0xC0) {
      $width = 2;
    }
    else {
      $width = 1;
    }
    if (!_spanishstemmer_is_vowel(substr($word, $i, $width))) {
      return $i;
    }
  }
  return $len;
}
|
| 73 |
|
| 74 |
/**
 * Tells whether a word ends in the given suffix.
 *
 * The comparison is strict and byte-exact. A loose "==" would compare
 * numeric-looking strings by value, so '100' would "end in" '1e2'.
 *
 * @param string $word Word to inspect (FALSE/NULL are treated as '')
 * @param string $suffix Ending to look for
 * @return bool TRUE if $word ends in $suffix
 */
function _spanishstemmer_endsin($word, $suffix) {
  $word = (string) $word;
  $suffix = (string) $suffix;
  $suffix_len = strlen($suffix);

  if (strlen($word) < $suffix_len) {
    return false;
  }
  if ($suffix_len == 0) {
    // substr($word, -0) would return the whole word. As before, an empty
    // suffix only matches an empty word.
    return $word === '';
  }
  return substr($word, -$suffix_len) === $suffix;
}
|
| 81 |
|
| 82 |
/**
 * Finds which of several suffixes a word ends in.
 *
 * When more than one candidate matches, the longest one wins regardless of
 * its position in the list. A first-match search let short endings shadow
 * longer ones listed after them ('an' before 'aban', 'as' before 'abas',
 * 'amos' before 'ábamos').
 *
 * @param string $word Word to inspect (FALSE/NULL are treated as '')
 * @param array $suffixes Candidate endings
 * @return string The longest matching ending, or '' if none matches
 */
function _spanishstemmer_endsinArr($word, $suffixes) {
  $word = (string) $word;
  $word_len = strlen($word);
  $longest = '';
  $longest_len = 0;

  foreach ($suffixes as $suff) {
    $suff_len = strlen($suff);
    // Only candidates longer than the current best are worth checking;
    // this also skips empty candidates.
    if ($suff_len > $longest_len && $suff_len <= $word_len
        && substr($word, -$suff_len) === $suff) {
      $longest = $suff;
      $longest_len = $suff_len;
    }
  }

  return $longest;
}
|
| 91 |
|
| 92 |
/**
 * Strips the acute accent from lowercase vowels.
 *
 * Only á, é, í, ó and ú are replaced; every other character (ü, ñ and
 * uppercase letters included) is left as it is.
 *
 * @param string $word Text to clean
 * @return string Text with the accented vowels replaced by plain ones
 */
function _spanishstemmer_removeAccent($word) {
  $plain_for = array(
    'á' => 'a',
    'é' => 'e',
    'í' => 'i',
    'ó' => 'o',
    'ú' => 'u',
  );
  return str_replace(array_keys($plain_for), array_values($plain_for), $word);
}
|
| 98 |
|
| 99 |
/**
 * Returns the longest entry of $suffixes that $text ends with.
 *
 * The result does not depend on the order of the list, so a short ending
 * such as 'an' can never shadow a longer one such as 'aban'.
 *
 * @param string $text Text to inspect (FALSE/NULL are treated as '')
 * @param array $suffixes Candidate endings
 * @return string The longest matching ending, or '' if none matches
 */
function _spanishstemmer_longestSuffix($text, $suffixes) {
  $text = (string) $text;
  $text_len = strlen($text);
  $best = '';
  $best_len = 0;
  foreach ($suffixes as $candidate) {
    $candidate_len = strlen($candidate);
    if ($candidate_len > $best_len && $candidate_len <= $text_len
        && substr($text, -$candidate_len) === $candidate) {
      $best = $candidate;
      $best_len = $candidate_len;
    }
  }
  return $best;
}

/**
 * Computes where the R1, R2 and RV regions of a lowercase word begin.
 *
 * The regions are located character by character (the word is read as
 * UTF-8, so an accented vowel is one vowel and not two unknown bytes) and
 * then converted to byte offsets, because the stemmer slices the word with
 * substr()/strlen().
 *
 * - R1 starts after the first non-vowel that follows a vowel.
 * - R2 is found by applying the same rule again from the start of R1.
 * - RV: if the second letter is a consonant, it starts after the next
 *   vowel; if the first two letters are vowels, after the next consonant;
 *   otherwise after the third letter.
 * Any region that cannot be found starts at the end of the word (empty).
 *
 * @param string $word Lowercase word
 * @return array array($r1, $r2, $rv), each a byte offset into $word
 */
function _spanishstemmer_regions($word) {
  $chars = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);
  if ($chars === false) {
    // Not valid UTF-8: fall back to treating every byte as a character.
    $chars = preg_split('//', $word, -1, PREG_SPLIT_NO_EMPTY);
  }
  $count = count($chars);

  // $offsets[$i] is the byte offset of character $i; $offsets[$count] is
  // the byte length of the whole word.
  $offsets = array(0);
  $is_vowel = array();
  foreach ($chars as $i => $char) {
    $offsets[$i + 1] = $offsets[$i] + strlen($char);
    $is_vowel[$i] = _spanishstemmer_is_vowel($char);
  }

  $r1 = $r2 = $rv = $count;

  for ($i = 0; $i < $count - 1; $i++) {
    if ($is_vowel[$i] && !$is_vowel[$i + 1]) {
      $r1 = $i + 2;
      break;
    }
  }

  for ($i = $r1; $i < $count - 1; $i++) {
    if ($is_vowel[$i] && !$is_vowel[$i + 1]) {
      $r2 = $i + 2;
      break;
    }
  }

  if ($count > 3) {
    if ($is_vowel[1] && !$is_vowel[0]) {
      // Consonant-vowel start: RV begins after the third letter.
      $rv = 3;
    }
    else {
      // Second letter is a consonant: look for the next vowel.
      // First two letters are vowels: look for the next consonant.
      $want_vowel = !$is_vowel[1];
      for ($i = 2; $i < $count; $i++) {
        if ($is_vowel[$i] == $want_vowel) {
          $rv = $i + 1;
          break;
        }
      }
    }
  }

  return array($offsets[$r1], $offsets[$r2], $offsets[$rv]);
}

/**
 * Stems a Spanish word.
 *
 * The word is lowercased and then run through these steps:
 *  - Step 0 removes an attached pronoun that follows a gerund or infinitive
 *    ending located in RV.
 *  - Step 1 removes or replaces a standard suffix located in R2 (R1 for
 *    'amente').
 *  - Step 2a (only if step 1 changed nothing) removes a y-verb ending in RV
 *    that is preceded by 'u'.
 *  - Step 2b (only if step 2a removed nothing) removes the longest other
 *    verb ending in RV.
 *  - Step 3 removes a residual vowel ending in RV.
 * Finally the acute accents are stripped.
 *
 * Words of at most two bytes are returned untouched (not even lowercased).
 *
 * @param string $word Word to stem
 * @return string Stemmed word
 */
function spanishstemmer_stem($word)
{
  // Byte length on purpose: this keeps the historical cut-off, so a short
  // accented word (three bytes) still goes on to have its accent removed.
  if (strlen($word) <= 2) return $word;

  // Use drupal_strtolower to assure proper functionality with accents
  $word = drupal_strtolower($word);

  list($r1, $r2, $rv) = _spanishstemmer_regions($word);

  // Step 0: Attached pronoun
  $pronouns = array('me', 'se', 'sela', 'selo', 'selas', 'selos', 'la', 'le', 'lo', 'las', 'les', 'los', 'nos');
  $pronoun = _spanishstemmer_longestSuffix($word, $pronouns);
  if ($pronoun != '') {
    $without_pronoun = (string) substr($word, 0, -strlen($pronoun));
    // What is left of RV once the pronoun is taken away.
    $rv_before = (string) substr($without_pronoun, $rv);

    if (_spanishstemmer_longestSuffix($rv_before, array('éndo', 'ándo', 'ár', 'ér', 'ír')) != '') {
      $word = _spanishstemmer_removeAccent($without_pronoun);
      // Dropping accents shortens the word in bytes, so the byte offsets
      // of the regions have to be worked out again.
      list($r1, $r2, $rv) = _spanishstemmer_regions($word);
    }
    elseif (_spanishstemmer_longestSuffix($rv_before, array('ando', 'iendo', 'ar', 'er', 'ir')) != ''
      // 'yendo' only counts when it is preceded by 'u'. It has to be
      // looked for before the pronoun, not at the very end of the word.
      || (_spanishstemmer_endsin($rv_before, 'yendo') && substr($without_pronoun, -6, 1) == 'u')) {
      $word = $without_pronoun;
    }
  }

  // Step 1: Standard suffix removal. The rules are tried in order and the
  // first one that matches is applied: array(region, suffixes, replacement).
  $r1_txt = (string) substr($word, $r1);
  $r2_txt = (string) substr($word, $r2);
  $step1_rules = array(
    array($r2_txt, array('anza', 'anzas', 'ico', 'ica', 'icos', 'icas', 'ismo', 'ismos', 'able', 'ables', 'ible', 'ibles', 'ista', 'istas', 'oso', 'osa', 'osos', 'osas', 'amiento', 'amientos', 'imiento', 'imientos'), ''),
    array($r2_txt, array('icadora', 'icador', 'icación', 'icadoras', 'icadores', 'icaciones', 'icante', 'icantes', 'icancia', 'icancias', 'adora', 'ador', 'ación', 'adoras', 'adores', 'aciones', 'ante', 'antes', 'ancia', 'ancias'), ''),
    array($r2_txt, array('logía', 'logías'), 'log'),
    array($r2_txt, array('ución', 'uciones'), 'u'),
    array($r2_txt, array('encia', 'encias'), 'ente'),
    array($r2_txt, array('ativamente', 'ivamente', 'osamente', 'icamente', 'adamente'), ''),
    array($r1_txt, array('amente'), ''),
    array($r2_txt, array('antemente', 'ablemente', 'iblemente', 'mente'), ''),
    array($r2_txt, array('abilidad', 'abilidades', 'icidad', 'icidades', 'ividad', 'ividades', 'idad', 'idades'), ''),
    array($r2_txt, array('ativa', 'ativo', 'ativas', 'ativos', 'iva', 'ivo', 'ivas', 'ivos'), ''),
  );
  $step1_done = false;
  foreach ($step1_rules as $rule) {
    list($region_txt, $suffixes, $replacement) = $rule;
    $suf = _spanishstemmer_longestSuffix($region_txt, $suffixes);
    if ($suf != '') {
      $word = substr($word, 0, -strlen($suf)) . $replacement;
      $step1_done = true;
      break;
    }
  }

  if (!$step1_done) {
    // Do step 2a if no ending was removed by step 1.
    $rv_txt = (string) substr($word, $rv);
    $suf = _spanishstemmer_longestSuffix($rv_txt, array('ya', 'ye', 'yan', 'yen', 'yeron', 'yendo', 'yo', 'yó', 'yas', 'yes', 'yais', 'yamos'));
    if ($suf != '' && substr($word, -strlen($suf) - 1, 1) == 'u') {
      $word = substr($word, 0, -strlen($suf));
    }
    else {
      // Do Step 2b if step 2a was done, but failed to remove a suffix.
      // Both groups are searched together so that the longest ending wins
      // ('ases' over 'es', 'aremos' over 'emos', 'aban' over 'an').
      $e_endings = array('en', 'es', 'éis', 'emos');
      $other_endings = array('arían', 'arías', 'arán', 'arás', 'aríais', 'aría', 'aréis', 'aríamos', 'aremos', 'ará', 'aré', 'erían', 'erías', 'erán', 'erás', 'eríais', 'ería', 'eréis', 'eríamos', 'eremos', 'erá', 'eré', 'irían', 'irías', 'irán', 'irás', 'iríais', 'iría', 'iréis', 'iríamos', 'iremos', 'irá', 'iré', 'aba', 'ada', 'ida', 'ía', 'ara', 'iera', 'ad', 'ed', 'id', 'ase', 'iese', 'aste', 'iste', 'an', 'aban', 'ían', 'aran', 'ieran', 'asen', 'iesen', 'aron', 'ieron', 'ado', 'ido', 'ando', 'iendo', 'ió', 'ar', 'er', 'ir', 'as', 'abas', 'adas', 'idas', 'ías', 'aras', 'ieras', 'ases', 'ieses', 'ís', 'áis', 'abais', 'íais', 'arais', 'ierais', 'aseis', 'ieseis', 'asteis', 'isteis', 'ados', 'idos', 'amos', 'ábamos', 'íamos', 'imos', 'áramos', 'iéramos', 'iésemos', 'ásemos');
      $suf = _spanishstemmer_longestSuffix($rv_txt, array_merge($e_endings, $other_endings));
      if ($suf != '') {
        $word = substr($word, 0, -strlen($suf));
        // After an e-ending, a preceding 'gu' loses its 'u'.
        if (in_array($suf, $e_endings) && _spanishstemmer_endsin($word, 'gu')) {
          $word = substr($word, 0, -1);
        }
      }
    }
  }

  // Always do step 3.
  $rv_txt = (string) substr($word, $rv);
  $suf = _spanishstemmer_longestSuffix($rv_txt, array('os', 'a', 'o', 'á', 'í', 'ó'));
  if ($suf != '') {
    $word = substr($word, 0, -strlen($suf));
  }
  else {
    $suf = _spanishstemmer_longestSuffix($rv_txt, array('e', 'é'));
    if ($suf != '') {
      // Cut by the byte length of the ending: 'é' is two bytes in UTF-8.
      $word = substr($word, 0, -strlen($suf));
      $rv_txt = (string) substr($word, $rv);
      if (_spanishstemmer_endsin($rv_txt, 'u') && _spanishstemmer_endsin($word, 'gu')) {
        $word = substr($word, 0, -1);
      }
    }
  }

  return _spanishstemmer_removeAccent($word);
}
|
| 245 |
|
| 246 |
|
| 247 |
?>
|