Stripping CVS keywords
[project/porterstemmer.git] / porterstemmer.module
CommitLineData
c1960e34
SW
1<?php
2/*
3 * The implementation of the Porter Stemmer is free software,
4 * and is 2005 Richard Heyes (http://www.phpguru.org/). Certain elements
5 * were borrowed from the (broken) implementation by Jon Abernathy.
6 *
7 * It was modified by Steven Wittens for PHP4 compatibility and Drupal integration.
8 */
9
/**
 * Implementation of hook_search_preprocess().
 *
 * Removes apostrophes from the text, cuts it into alternating runs of ASCII
 * letters and non-letters, stems every letter run and glues the pieces back
 * together in their original order.
 *
 * @param string $text Text to preprocess (the argument itself is not modified)
 * @return string The text with each letter run replaced by its stem
 */
function porterstemmer_search_preprocess(&$text) {
  // With PREG_SPLIT_DELIM_CAPTURE the result alternates word, separator,
  // word, ... so the (possibly empty) words sit at the even indexes.
  $pieces = preg_split('/([^a-zA-Z]+)/', str_replace("'", '', $text), -1, PREG_SPLIT_DELIM_CAPTURE);

  foreach ($pieces as $index => $piece) {
    if ($index % 2 == 0) {
      $pieces[$index] = Stem($piece);
    }
  }

  // Separators were left untouched, so joining restores the original layout.
  return implode('', $pieces);
}
29
/**
 * Implementation of hook_help().
 *
 * @param string $section Help section being requested
 * @return string|null The module description for the module listing section,
 *                     nothing for any other section
 */
function porterstemmer_help($section = 'admin/help#search') {
  if ($section == 'admin/modules#description') {
    return t('Implements the Porter-Stemmer algorithm to improve English searching.');
  }
}
39
/**
 * Regex fragment matching exactly one consonant.
 *
 * The fragment is a non-capturing group so that callers can append a
 * quantifier to it. It only knows lowercase ASCII letters.
 */
define(
  'regex_consonant',
  '(?:'
  . '[bcdfghjklmnpqrstvwxz]' // letters that are always consonants
  . '|(?<=[aeiou])y'         // "y" directly after a vowel
  . '|^y'                    // "y" at the very start of the subject
  . ')'
);
44
45
/**
 * Regex fragment matching exactly one vowel.
 *
 * The fragment is a non-capturing group so that callers can append a
 * quantifier to it. It only knows lowercase ASCII letters.
 */
define(
  'regex_vowel',
  '(?:'
  . '[aeiou]'         // letters that are always vowels
  . '|(?<![aeiou])y'  // "y" that is not directly preceded by a vowel
  . ')'
);
50
51
/**
 * Stems a single word by running it through the Porter steps in order.
 *
 * Words of one or two characters are returned untouched.
 *
 * @param string $word Word to stem
 * @return string Stemmed word
 */
function stem($word)
{
  if (strlen($word) <= 2) {
    return $word;
  }

  // Innermost call runs first: 1a/1b, 1c, 2, 3, 4 and finally 5.
  return step5(step4(step3(step2(step1c(step1ab($word))))));
}
73
74
/**
 * Step 1, parts a and b: plural endings, then "eed" / "ed" / "ing".
 *
 * @param string $word Word to stem
 * @return string The word after step 1a and 1b
 */
function step1ab($word)
{
  // Part a: the first ending that is present ends the search. 'ss' => 'ss'
  // is a deliberate no-op that keeps the final 's' rule away from "...ss".
  if (substr($word, -1) == 's') {
    $plurals = array('sses' => 'ss', 'ies' => 'i', 'ss' => 'ss', 's' => '');
    foreach ($plurals as $ending => $substitute) {
      if (replace($word, $ending, $substitute)) {
        break;
      }
    }
  }

  // Part b, first rule: a word ending in "eed" is handled here (shortened to
  // "ee" only when the measure allows it) and never reaches the ed/ing rules.
  if (substr($word, -2, 1) == 'e' && replace($word, 'eed', 'ee', 0)) {
    return $word;
  }

  $v = regex_vowel;

  // "ing" / "ed" are only stripped when what precedes them contains a vowel.
  $stripped = (preg_match("#$v+#", substr($word, 0, -3)) && replace($word, 'ing', ''))
    || (preg_match("#$v+#", substr($word, 0, -2)) && replace($word, 'ed', ''));
  if (!$stripped) {
    return $word;
  }

  // Tidy up the stem: restore a dropped "e" after at/bl/iz ...
  if (replace($word, 'at', 'ate') || replace($word, 'bl', 'ble') || replace($word, 'iz', 'ize')) {
    return $word;
  }

  // ... otherwise undouble a final consonant (except l, s, z) or add an "e"
  // to a short consonant-vowel-consonant stem.
  $undouble = doubleConsonant($word)
    && substr($word, -2) != 'll'
    && substr($word, -2) != 'ss'
    && substr($word, -2) != 'zz';

  if ($undouble) {
    $word = substr($word, 0, -1);
  } elseif (m($word) == 1 && cvc($word)) {
    $word .= 'e';
  }

  return $word;
}
119
120
/**
 * Step 1c: turns a final "y" into "i" when the rest of the word has a vowel.
 *
 * @param string $word Word to stem
 * @return string The word after step 1c
 */
function step1c($word)
{
  $vowel = regex_vowel;

  if (substr($word, -1) != 'y') {
    return $word;
  }

  // Only the part in front of the final "y" is searched for a vowel.
  $front = substr($word, 0, -1);
  if (preg_match('#' . $vowel . '+#', $front)) {
    replace($word, 'y', 'i');
  }

  return $word;
}
136
137
/**
 * Step 2: maps double suffixes onto single ones (e.g. "ational" to "ate").
 *
 * Rules are grouped by the second-to-last letter of the word. Within a group
 * the first suffix that is present ends the search, whether or not the
 * measure condition allowed the replacement.
 *
 * @param string $word Word to stem
 * @return string The word after step 2
 */
function step2($word)
{
  $rules = array(
    'a' => array('ational' => 'ate', 'tional' => 'tion'),
    'c' => array('enci' => 'ence', 'anci' => 'ance'),
    'e' => array('izer' => 'ize'),
    'g' => array('logi' => 'log'),
    'l' => array('entli' => 'ent', 'ousli' => 'ous', 'alli' => 'al', 'bli' => 'ble', 'eli' => 'e'),
    'o' => array('ization' => 'ize', 'ation' => 'ate', 'ator' => 'ate'),
    's' => array('iveness' => 'ive', 'fulness' => 'ful', 'ousness' => 'ous', 'alism' => 'al'),
    't' => array('biliti' => 'ble', 'aliti' => 'al', 'iviti' => 'ive'),
  );

  $penultimate = substr($word, -2, 1);
  if (isset($rules[$penultimate])) {
    foreach ($rules[$penultimate] as $suffix => $substitute) {
      // The stem in front of the suffix needs a measure greater than 0.
      if (replace($word, $suffix, $substitute, 0)) {
        break;
      }
    }
  }

  return $word;
}
194
195
/**
 * Step 3: shortens or removes a further set of suffixes ("icate", "ness" ...).
 *
 * Rules are grouped by the second-to-last letter of the word. Within a group
 * the first suffix that is present ends the search, whether or not the
 * measure condition allowed the replacement.
 *
 * @param string $word Word to stem
 * @return string The word after step 3
 */
function step3($word)
{
  $rules = array(
    'a' => array('ical' => 'ic'),
    's' => array('ness' => ''),
    't' => array('icate' => 'ic', 'iciti' => 'ic'),
    'u' => array('ful' => ''),
    'v' => array('ative' => ''),
    'z' => array('alize' => 'al'),
  );

  $penultimate = substr($word, -2, 1);
  if (isset($rules[$penultimate])) {
    foreach ($rules[$penultimate] as $suffix => $substitute) {
      // The stem in front of the suffix needs a measure greater than 0.
      if (replace($word, $suffix, $substitute, 0)) {
        break;
      }
    }
  }

  return $word;
}
232
233
/**
 * Step 4: removes residual suffixes when the remaining stem has a measure
 * greater than 1.
 *
 * Candidate suffixes are grouped by the second-to-last letter of the word.
 * Within a group the first suffix that is present ends the search, whether
 * or not the measure condition allowed the removal.
 *
 * @param string $word Word to stem
 * @return string The word after step 4
 */
function step4($word)
{
  $suffixes = array(
    'a' => array('al'),
    'c' => array('ance', 'ence'),
    'e' => array('er'),
    'i' => array('ic'),
    'l' => array('able', 'ible'),
    'n' => array('ant', 'ement', 'ment', 'ent'),
    's' => array('ism'),
    't' => array('ate', 'iti'),
    'u' => array('ous'),
    'v' => array('ive'),
    'z' => array('ize'),
  );

  $penultimate = substr($word, -2, 1);

  if ($penultimate == 'o') {
    // "ion" is only a candidate directly after "s" or "t"; every other word
    // with "o" in this position is tried against "ou" instead.
    $ending = substr($word, -4);
    $candidates = ($ending == 'tion' || $ending == 'sion') ? array('ion') : array('ou');
  } elseif (isset($suffixes[$penultimate])) {
    $candidates = $suffixes[$penultimate];
  } else {
    $candidates = array();
  }

  foreach ($candidates as $suffix) {
    if (replace($word, $suffix, '', 1)) {
      break;
    }
  }

  return $word;
}
303
304
/**
 * Step 5: drops a final "e" and undoubles a final "ll" on long enough words.
 *
 * @param string $word Word to stem
 * @return string The word after step 5
 */
function step5($word)
{
  // Part a: the final "e" goes when the rest of the word has a measure above
  // 1, or a measure of exactly 1 and no consonant-vowel-consonant ending.
  if (substr($word, -1) == 'e') {
    $rest = substr($word, 0, -1);
    $measure = m($rest);

    if ($measure > 1 || ($measure == 1 && !cvc($rest))) {
      replace($word, 'e', '');
    }
  }

  // Part b: "ll" becomes "l" when the whole word has a measure above 1.
  $undouble = m($word) > 1 && doubleConsonant($word) && substr($word, -1) == 'l';
  if ($undouble) {
    $word = substr($word, 0, -1);
  }

  return $word;
}
332
333
/**
 * Swaps an ending of a string for another one, in place.
 *
 * When $m is given, the swap only happens if the part of the string in front
 * of the ending has an m() count strictly greater than $m. The return value
 * reports whether the ending was present, not whether it was swapped.
 *
 * @param  string $str   String to check; modified in place on replacement
 * @param  string $check Ending to look for
 * @param  string $repl  Replacement for the ending
 * @param  int    $m     Optional m() count the remaining stem has to exceed
 * @return bool          True when $str ended with $check, even if the
 *                       measure condition prevented the replacement
 */
function replace(&$str, $check, $repl, $m = null)
{
  $offset = -strlen($check);

  if (substr($str, $offset) != $check) {
    return false;
  }

  $stem = substr($str, 0, $offset);
  if ($m === null || m($stem) > $m) {
    $str = $stem . $repl;
  }

  return true;
}
361
362
/**
 * Computes the "measure" of a string: the number of vowel-run/consonant-run
 * pairs it contains. With c a consonant run, v a vowel run and <..> marking
 * an optional part:
 *
 *   <c><v>        gives 0
 *   <c>vc<v>      gives 1
 *   <c>vcvc<v>    gives 2
 *   <c>vcvcvc<v>  gives 3
 *
 * @param  string $str The string to measure
 * @return int         The m count
 */
function m($str)
{
  $consonant = regex_consonant;
  $vowel = regex_vowel;

  // Strip the optional leading <c> first, then the optional trailing <v>;
  // whatever is left consists purely of "vc" pairs.
  $core = preg_replace('#^' . $consonant . '+#', '', $str);
  $core = preg_replace('#' . $vowel . '+$#', '', $core);

  preg_match_all('#(' . $vowel . '+' . $consonant . '+)#', $core, $pairs);

  return count($pairs[1]);
}
390
391
/**
 * Tells whether a string ends in a doubled consonant, i.e. two consonants
 * next to each other that are the same letter.
 *
 * @param  string $str String to check
 * @return bool        True when the last two characters are the same consonant
 */
function doubleConsonant($str)
{
  $c = regex_consonant;

  if (!preg_match('#' . $c . '{2}$#', $str, $matches)) {
    return false;
  }

  // Square brackets instead of the curly-brace string offsets used before:
  // the {} form is deprecated in PHP 7.4 and a parse error in PHP 8.
  return $matches[0][0] === $matches[0][1];
}
405
406
/**
 * Checks for an ending consonant-vowel-consonant sequence in which the
 * second consonant is not w, x or y.
 *
 * @param  string $str String to check
 * @return bool        True when the string ends in such a sequence
 */
function cvc($str)
{
  $c = regex_consonant;
  $v = regex_vowel;

  if (!preg_match('#(' . $c . $v . $c . ')$#', $str, $matches)) {
    return false;
  }

  $sequence = $matches[1];
  if (strlen($sequence) != 3) {
    return false;
  }

  // Square brackets instead of the curly-brace string offsets used before:
  // the {} form is deprecated in PHP 7.4 and a parse error in PHP 8.
  return !in_array($sequence[2], array('w', 'x', 'y'), true);
}
424
425
426?>