/[drupal]/contributions/modules/spanishstemmer/spanishstemmer.module
ViewVC logotype

Contents of /contributions/modules/spanishstemmer/spanishstemmer.module

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (show annotations) (download) (as text)
Mon Jan 5 15:01:06 2009 UTC (10 months, 3 weeks ago) by gonzalokoeln
Branch: MAIN
CVS Tags: DRUPAL-6--1-0, HEAD
Branch point for: DRUPAL-6--1
File MIME type: text/x-php
Initial commit of a module that ports a stemming algorithm for the Spanish language to improve search within a site.
1 <?php
2 // $Id$
3 /*
4 * Spanish stemmer algorithm adaptation for Drupal
5 * Based on the porterstemmer Drupal module
6 *
7 * Author:
8 * Gonzalo González Rodríguez gonzalo.koeln 2008-12-29
9 * Algorithm:
10 * Paolo Ragone (c) 2007 (http://stemmer-es.sourceforge.net/), based on Dr Martin Porter http://snowball.tartarus.org/
11 *
12 */
13
// Matches every run of characters that cannot be part of a Spanish word.
// The "u" modifier makes PCRE treat subject and pattern as UTF-8, so the
// accented letters in the class are whole characters and not loose bytes.
define('SPANISHSTEMMER_CHARS', '/([^a-zA-ZáéíóúüñÁÉÍÓÚÜÑ]+)/u');

/**
 * Implementation of hook_search_preprocess().
 *
 * Splits the text into words and the "noise" between them, stems every word
 * with spanishstemmer_stem() and glues everything back together in the
 * original order.
 *
 * The parameter is taken by value: this function never writes to it, and a
 * by-reference parameter cannot be satisfied when the hook is invoked with a
 * plain value through call_user_func_array().
 *
 * @param string $text
 *   Text to preprocess.
 *
 * @return string
 *   The text with every word replaced by its stem, or the unmodified text if
 *   it could not be split (for example because it is not valid UTF-8).
 */
function spanishstemmer_search_preprocess($text) {
  // With PREG_SPLIT_DELIM_CAPTURE the result alternates word, noise, word,
  // ... so the words always sit at the even indexes.
  $pieces = preg_split(SPANISHSTEMMER_CHARS, $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  if ($pieces === FALSE) {
    // Splitting failed; hand the text back untouched instead of emitting
    // warnings and returning an empty string.
    return $text;
  }

  foreach ($pieces as $index => $piece) {
    if ($index % 2 == 0) {
      $pieces[$index] = spanishstemmer_stem($piece);
    }
  }

  // Put it all back together.
  return implode('', $pieces);
}
35
/**
 * Implementation of hook_help().
 *
 * @param string $section
 *   Identifier of the help section being requested.
 *
 * @return string|null
 *   The translated module description when $section is
 *   'admin/modules#description'; nothing for any other section.
 */
function spanishstemmer_help($section = 'admin/help#search') {
  if ($section == 'admin/modules#description') {
    return t('Implements the Porter-stemmer algorithm to improve searching in Spanish.');
  }
}
45
/**
 * Tells whether the argument is a lowercase Spanish vowel.
 *
 * Plain vowels, acute-accented vowels and "ü" are recognised. The argument
 * is compared loosely (==) against each of them as a whole string.
 *
 * @param string $c
 *   The character to test.
 *
 * @return bool
 *   TRUE if $c equals one of the vowels, FALSE otherwise.
 */
function _spanishstemmer_is_vowel($c) {
  static $vowels = array('a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'ü');
  // Non-strict lookup on purpose: same semantics as a chain of == tests.
  return in_array($c, $vowels);
}
53
/**
 * Finds the position of the next vowel in a word.
 *
 * The word is scanned one byte at a time, beginning at $start, and each byte
 * is tested with _spanishstemmer_is_vowel().
 *
 * @param string $word
 *   The word to scan.
 * @param int $start
 *   Byte offset at which scanning begins.
 *
 * @return int
 *   Offset of the first vowel found at or after $start, or the byte length
 *   of $word when there is none.
 */
function _spanishstemmer_getNextVowelPos($word, $start = 0) {
  $length = strlen($word);
  $pos = $start;
  while ($pos < $length && !_spanishstemmer_is_vowel($word[$pos])) {
    $pos++;
  }
  return ($pos < $length) ? $pos : $length;
}
63
/**
 * Finds the position of the next non-vowel in a word.
 *
 * The word is scanned one byte at a time, beginning at $start; anything that
 * _spanishstemmer_is_vowel() rejects counts as a consonant.
 *
 * @param string $word
 *   The word to scan.
 * @param int $start
 *   Byte offset at which scanning begins.
 *
 * @return int
 *   Offset of the first non-vowel found at or after $start, or the byte
 *   length of $word when there is none.
 */
function _spanishstemmer_getNextConsonantPos($word, $start = 0) {
  $length = strlen($word);
  $pos = $start;
  while ($pos < $length && _spanishstemmer_is_vowel($word[$pos])) {
    $pos++;
  }
  return ($pos < $length) ? $pos : $length;
}
73
/**
 * Checks whether a word ends with the given suffix.
 *
 * @param string $word
 *   The word to inspect.
 * @param string $suffix
 *   The ending to look for.
 *
 * @return bool
 *   TRUE when the last strlen($suffix) bytes of $word compare equal (==) to
 *   $suffix, FALSE otherwise or when $word is shorter than $suffix.
 */
function _spanishstemmer_endsin($word, $suffix) {
  $suffix_length = strlen($suffix);
  return strlen($word) >= $suffix_length
    && substr($word, -$suffix_length) == $suffix;
}
81
/**
 * Finds the longest suffix from a list that a word ends with.
 *
 * Every candidate is examined and the longest match wins, regardless of the
 * order of the list. Returning the first match in list order would let a
 * short ending such as "an" shadow a longer one such as "aban" that merely
 * appears later in the list.
 *
 * @param string $word
 *   The word (or region of a word) to inspect.
 * @param array $suffixes
 *   Candidate endings.
 *
 * @return string
 *   The longest candidate that $word ends with, or '' if none matches.
 */
function _spanishstemmer_endsinArr($word, $suffixes) {
  $word = (string) $word;
  $word_length = strlen($word);
  $best = '';

  foreach ($suffixes as $candidate) {
    $candidate = (string) $candidate;
    $candidate_length = strlen($candidate);
    // Skip candidates that cannot improve on the current best match or that
    // are longer than the word itself (this also skips empty candidates).
    if ($candidate_length <= strlen($best) || $candidate_length > $word_length) {
      continue;
    }
    if (substr($word, -$candidate_length) === $candidate) {
      $best = $candidate;
    }
  }

  return $best;
}
91
/**
 * Replaces acute-accented lowercase vowels by their plain counterparts.
 *
 * Only á, é, í, ó and ú are converted; every other character, including
 * "ü" and "ñ", is left as it is.
 *
 * @param string $word
 *   The word to convert.
 *
 * @return string
 *   The word without acute accents.
 */
function _spanishstemmer_removeAccent($word) {
  static $plain = array(
    'á' => 'a',
    'é' => 'e',
    'í' => 'i',
    'ó' => 'o',
    'ú' => 'u',
  );
  return strtr($word, $plain);
}
98
/**
 * Computes the start of the R1, R2 and RV regions of a word.
 *
 * The word is examined character by character (not byte by byte) so that
 * accented vowels, which take more than one byte in UTF-8, are recognised
 * as vowels. The results are converted back to byte offsets because the
 * caller cuts the word with substr().
 *
 * @param string $word
 *   Lowercase word.
 *
 * @return array
 *   array($r1, $r2, $rv): byte offsets into $word. A region that does not
 *   exist starts at the end of the word, i.e. at strlen($word).
 */
function _spanishstemmer_regions($word) {
  // Split into whole UTF-8 characters; if the word is not valid UTF-8 fall
  // back to treating every byte as one character.
  $chars = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);
  if ($chars === FALSE) {
    $chars = preg_split('//', $word, -1, PREG_SPLIT_NO_EMPTY);
  }
  $count = count($chars);

  // $offset[$i] is the byte offset at which character $i starts and
  // $offset[$count] is the byte length of the word.
  $offset = array();
  $vowel = array();
  $bytes = 0;
  for ($i = 0; $i < $count; $i++) {
    $offset[$i] = $bytes;
    $bytes += strlen($chars[$i]);
    $vowel[$i] = _spanishstemmer_is_vowel($chars[$i]);
  }
  $offset[$count] = $bytes;

  $r1 = $r2 = $rv = $count;

  // R1 is the region after the first non-vowel following a vowel, or is the
  // null region at the end of the word if there is no such non-vowel.
  for ($i = 0; $i < $count - 1; $i++) {
    if ($vowel[$i] && !$vowel[$i + 1]) {
      $r1 = $i + 2;
      break;
    }
  }

  // R2 is the region after the first non-vowel following a vowel in R1, or
  // is the null region at the end of the word if there is no such non-vowel.
  for ($i = $r1; $i < $count - 1; $i++) {
    if ($vowel[$i] && !$vowel[$i + 1]) {
      $r2 = $i + 2;
      break;
    }
  }

  if ($count > 3) {
    if (!$vowel[1]) {
      // If the second letter is a consonant, RV is the region after the
      // next following vowel.
      for ($i = 2; $i < $count && !$vowel[$i]; $i++);
      $rv = min($i + 1, $count);
    }
    elseif ($vowel[0]) {
      // If the first two letters are vowels, RV is the region after the
      // next consonant.
      for ($i = 2; $i < $count && $vowel[$i]; $i++);
      $rv = min($i + 1, $count);
    }
    else {
      // Otherwise (consonant-vowel case) RV is the region after the third
      // letter.
      $rv = 3;
    }
  }

  return array($offset[$r1], $offset[$r2], $offset[$rv]);
}

/**
 * Stems a word.
 *
 * The word is lowercased and then run through these steps:
 * - Step 0 removes an attached pronoun that follows a gerund or infinitive
 *   ending lying in RV.
 * - Step 1 removes (or replaces) standard suffixes lying in R2 or R1.
 * - Step 2a / 2b remove verb endings lying in RV; they only run when step 1
 *   removed nothing, and 2b only runs when 2a removed nothing.
 * - Step 3 removes a residual final vowel (or "os") lying in RV.
 * Finally every remaining acute accent is removed.
 *
 * @param string $word Word to stem
 * @return string Stemmed word. Words of one or two bytes are returned
 *   exactly as given.
 */
function spanishstemmer_stem($word)
{
  if (strlen($word) <= 2) {
    return $word;
  }

  // Use drupal_strtolower to assure proper functionality with accents.
  $word = drupal_strtolower($word);

  // Region starts are byte offsets; they stay valid for the whole run
  // because the word is only ever shortened from its end.
  list($r1, $r2, $rv) = _spanishstemmer_regions($word);
  $r1_txt = (string) substr($word, $r1);
  $r2_txt = (string) substr($word, $r2);
  $rv_txt = (string) substr($word, $rv);

  // Step 0: Attached pronoun.
  $pronoun_suf = array('me', 'se', 'sela', 'selo', 'selas', 'selos', 'la', 'le', 'lo', 'las', 'les', 'los', 'nos');
  $pronoun_suf_pre1 = array('éndo', 'ándo', 'ár', 'ér', 'ír');
  $pronoun_suf_pre2 = array('ando', 'iendo', 'ar', 'er', 'ir');
  $suf = _spanishstemmer_endsinArr($word, $pronoun_suf);
  if ($suf != '') {
    // The word and its RV region with the pronoun taken off.
    $base = (string) substr($word, 0, -strlen($suf));
    $base_rv = (string) substr($rv_txt, 0, -strlen($suf));

    if (_spanishstemmer_endsinArr($base_rv, $pronoun_suf_pre1) != '') {
      // Accented gerund/infinitive: drop the pronoun and the accent.
      $word = _spanishstemmer_removeAccent($base);
    }
    elseif (_spanishstemmer_endsinArr($base_rv, $pronoun_suf_pre2) != ''
      // "yendo" preceded by "u". The test has to look at what is left
      // after the pronoun; the full word still ends in the pronoun.
      || (_spanishstemmer_endsin($base_rv, 'yendo') && substr($base, -6, 1) == 'u')) {
      $word = $base;
    }
  }

  $r1_txt = (string) substr($word, $r1);
  $r2_txt = (string) substr($word, $r2);
  $rv_txt = (string) substr($word, $rv);

  // Step 1: Standard suffix removal. Each rule is
  // array(region text, suffixes, replacement); the first rule that matches
  // is applied and the rest are skipped.
  $step1_rules = array(
    array($r2_txt, array('anza', 'anzas', 'ico', 'ica', 'icos', 'icas', 'ismo', 'ismos', 'able', 'ables', 'ible', 'ibles', 'ista', 'istas', 'oso', 'osa', 'osos', 'osas', 'amiento', 'amientos', 'imiento', 'imientos'), ''),
    array($r2_txt, array('icadora', 'icador', 'icación', 'icadoras', 'icadores', 'icaciones', 'icante', 'icantes', 'icancia', 'icancias', 'adora', 'ador', 'ación', 'adoras', 'adores', 'aciones', 'ante', 'antes', 'ancia', 'ancias'), ''),
    array($r2_txt, array('logía', 'logías'), 'log'),
    array($r2_txt, array('ución', 'uciones'), 'u'),
    array($r2_txt, array('encia', 'encias'), 'ente'),
    array($r2_txt, array('ativamente', 'ivamente', 'osamente', 'icamente', 'adamente'), ''),
    array($r1_txt, array('amente'), ''),
    array($r2_txt, array('antemente', 'ablemente', 'iblemente', 'mente'), ''),
    array($r2_txt, array('abilidad', 'abilidades', 'icidad', 'icidades', 'ividad', 'ividades', 'idad', 'idades'), ''),
    array($r2_txt, array('ativa', 'ativo', 'ativas', 'ativos', 'iva', 'ivo', 'ivas', 'ivos'), ''),
  );
  $step1_done = FALSE;
  foreach ($step1_rules as $rule) {
    $suf = _spanishstemmer_endsinArr($rule[0], $rule[1]);
    if ($suf != '') {
      $word = substr($word, 0, -strlen($suf)) . $rule[2];
      $step1_done = TRUE;
      break;
    }
  }

  if (!$step1_done) {
    // The word is unchanged since the regions were last cut, so $rv_txt is
    // still current for steps 2a and 2b.

    // Do step 2a if no ending was removed by step 1.
    $step2a_done = FALSE;
    $suf = _spanishstemmer_endsinArr($rv_txt, array('ya', 'ye', 'yan', 'yen', 'yeron', 'yendo', 'yo', 'yó', 'yas', 'yes', 'yais', 'yamos'));
    if ($suf != '' && substr($word, -strlen($suf) - 1, 1) == 'u') {
      $word = substr($word, 0, -strlen($suf));
      $step2a_done = TRUE;
    }

    // Do step 2b if step 2a was done, but failed to remove a suffix.
    if (!$step2a_done) {
      $suf = _spanishstemmer_endsinArr($rv_txt, array('en', 'es', 'éis', 'emos'));
      if ($suf != '') {
        $word = substr($word, 0, -strlen($suf));
        // If the ending was preceded by "gu", drop the "u" as well.
        if (_spanishstemmer_endsin($word, 'gu')) {
          $word = substr($word, 0, -1);
        }
      }
      else {
        $suf = _spanishstemmer_endsinArr($rv_txt, array('arían', 'arías', 'arán', 'arás', 'aríais', 'aría', 'aréis', 'aríamos', 'aremos', 'ará', 'aré', 'erían', 'erías', 'erán', 'erás', 'eríais', 'ería', 'eréis', 'eríamos', 'eremos', 'erá', 'eré', 'irían', 'irías', 'irán', 'irás', 'iríais', 'iría', 'iréis', 'iríamos', 'iremos', 'irá', 'iré', 'aba', 'ada', 'ida', 'ía', 'ara', 'iera', 'ad', 'ed', 'id', 'ase', 'iese', 'aste', 'iste', 'an', 'aban', 'ían', 'aran', 'ieran', 'asen', 'iesen', 'aron', 'ieron', 'ado', 'ido', 'ando', 'iendo', 'ió', 'ar', 'er', 'ir', 'as', 'abas', 'adas', 'idas', 'ías', 'aras', 'ieras', 'ases', 'ieses', 'ís', 'áis', 'abais', 'íais', 'arais', 'ierais', 'aseis', 'ieseis', 'asteis', 'isteis', 'ados', 'idos', 'amos', 'ábamos', 'íamos', 'imos', 'áramos', 'iéramos', 'iésemos', 'ásemos'));
        if ($suf != '') {
          $word = substr($word, 0, -strlen($suf));
        }
      }
    }
  }

  // Always do step 3.
  $rv_txt = (string) substr($word, $rv);

  $suf = _spanishstemmer_endsinArr($rv_txt, array('os', 'a', 'o', 'á', 'í', 'ó'));
  if ($suf != '') {
    $word = substr($word, 0, -strlen($suf));
  }
  else {
    $suf = _spanishstemmer_endsinArr($rv_txt, array('e', 'é'));
    if ($suf != '') {
      // Cut by the byte length of the matched ending: "é" is longer than
      // one byte in UTF-8.
      $word = substr($word, 0, -strlen($suf));
      $rv_txt = (string) substr($word, $rv);
      // If preceded by "gu" with the "u" in RV, drop the "u" as well.
      if (_spanishstemmer_endsin($rv_txt, 'u') && _spanishstemmer_endsin($word, 'gu')) {
        $word = substr($word, 0, -1);
      }
    }
  }

  return _spanishstemmer_removeAccent($word);
}
245
246
247 ?>

  ViewVC Help
Powered by ViewVC 1.1.2