/[drupal]/contributions/modules/porterstemmer/porterstemmer.module
ViewVC logotype

Contents of /contributions/modules/porterstemmer/porterstemmer.module

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.2 - (show annotations) (download) (as text)
Tue Jul 14 23:38:47 2009 UTC (4 months, 2 weeks ago) by jhodgdon
Branch: MAIN
CVS Tags: DRUPAL-6--2-0, HEAD
Branch point for: DRUPAL-6--2
Changes since 1.1: +584 -336 lines
File MIME type: text/x-php
#511930 #437094 #219335 by jhodgdon: Upgrade to the Porter 2 Stemmer algorithm; minimum word size 3 characters. Also updated install instructions and Readme, updated for coding standards, and added SimpleTest tests.
1 <?php
2 // $Id$
3
4 /**
5 * @file
6 * This is an implementation of the Porter 2 Stemming algorithm from
7 * http://snowball.tartarus.org/algorithms/english/stemmer.html
8 * by Jennifer Hodgdon of Poplar ProductivityWare, www.poplarware.com
9 */
10
/**
 * Implementation of hook_search_preprocess().
 *
 * Lower-cases $text, splits it into words and delimiters, runs every word
 * through porterstemmer_stem(), and returns the re-assembled string.
 *
 * Note that $text is taken by reference and is overwritten with its
 * lower-cased, apostrophe-normalized form.
 */
function porterstemmer_search_preprocess(&$text) {
  // Normalize case and turn typographic apostrophes into plain ones.
  $text = drupal_strtolower(str_replace('’', "'", $text));

  // Break the text apart, keeping the delimiters so that the original
  // spacing/punctuation can be restored afterwards.
  $pieces = preg_split('/(' . PORTERSTEMMER_BOUNDARY . '+)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  if (count($pieces) == 0) {
    return $text;
  }

  // Pieces alternate between words and delimiters; work out which kind
  // comes first and flip the flag on every step.
  $stem_this = !preg_match('/' . PORTERSTEMMER_BOUNDARY . '/', $pieces[0]);
  foreach (array_keys($pieces) as $index) {
    if ($stem_this) {
      $pieces[$index] = porterstemmer_stem($pieces[$index]);
    }
    $stem_this = !$stem_this;
  }

  // Glue words and delimiters back together.
  return implode('', $pieces);
}
41
/**
 * Implementation of hook_help().
 *
 * Returns the translated module description for the
 * 'admin/modules#description' section and nothing for any other section.
 */
function porterstemmer_help($section = 'admin/help#search') {
  // Loose comparison on purpose: it mirrors what a switch statement does.
  if ($section == 'admin/modules#description') {
    return t('Implements the Porter-Stemmer version 2 algorithm to improve English searching.');
  }
}
51
/**
 * Character class matching a vowel (y counts as a vowel; Y does not).
 */
define('PORTERSTEMMER_VOWEL', '[aeiouy]');

/**
 * Character class matching anything that is not a vowel.
 */
define('PORTERSTEMMER_NOT_VOWEL', '[^aeiouy]');

/**
 * Character class matching anything that is neither a vowel nor one of
 * w, x, Y. Used when testing for short syllables.
 */
define('PORTERSTEMMER_NOT_VOWEL_WXY', '[^aeiouywxY]');

/**
 * Alternation matching the double-consonant endings recognized by the
 * stemmer.
 */
define('PORTERSTEMMER_DOUBLE', '(bb|dd|ff|gg|mm|nn|pp|rr|tt)');

/**
 * Character class of letters that may precede "li" for the "li" ending to
 * be removed (a valid li-ending).
 */
define('PORTERSTEMMER_LI_END', '[cdeghkmnrt]');

/**
 * Regular expression matching a word boundary: one or more characters that
 * are neither ASCII letters nor apostrophes.
 */
define('PORTERSTEMMER_BOUNDARY', "[^a-zA-Z']+");

/**
 * Minimum number of characters in a "word", for Drupal purposes.
 *
 * The Porter 2 algorithm itself only leaves words of two letters or fewer
 * alone, but Drupal Search doesn't like search terms shorter than 3
 * characters, so stems are never allowed to become shorter than 3.
 */
define('PORTERSTEMMER_MIN_CHARS', 3);
91
/**
 * Stems a single word with the Porter 2 (English Snowball) algorithm.
 *
 * @param $word
 *   Word to stem.
 * @return
 *   Stemmed word.
 */
function porterstemmer_stem($word) {
  // Region offsets; porterstemmer_prestemming() receives them by reference.
  $r1 = 0; // start position of the R1 region in the word
  $r2 = 0; // start position of the R2 region in the word

  // Every stage returns TRUE when stemming must stop and FALSE when the
  // next stage should run, modifying $word by reference along the way.
  // The && chain short-circuits at the first stage that says "stop".
  if (!porterstemmer_prestemming($word, $r1, $r2)
    && !porterstemmer_exception1($word)
    && !porterstemmer_step0($word)
    && !porterstemmer_step1a($word)
    && !porterstemmer_exception2($word)
    && !porterstemmer_step1b($word, $r1)
    && !porterstemmer_step1c($word)
    && !porterstemmer_step2($word, $r1)
    && !porterstemmer_step3($word, $r1, $r2)
    && !porterstemmer_step4($word, $r2)) {
    porterstemmer_step5($word, $r1, $r2);
  }

  // Post-processing always runs, whichever stage ended the chain.
  porterstemmer_poststemming($word);
  return $word;
}
122
/**
 * Checks whether a word is too short to continue stemming.
 *
 * @param $word
 *   Word to check.
 * @return
 *   TRUE if the word has fewer than PORTERSTEMMER_MIN_CHARS characters,
 *   FALSE otherwise. (Previously the "long enough" case fell off the end of
 *   the function and returned NULL; callers only test truthiness, so
 *   returning a real boolean is backward-compatible.)
 */
function porterstemmer_too_short($word) {
  return drupal_strlen($word) < PORTERSTEMMER_MIN_CHARS;
}
131
/**
 * Commits the result of a stemming step and computes the step's return value.
 *
 * @param $word
 *   Current word; overwritten with $tmp only when $tmp is long enough.
 * @param $tmp
 *   Candidate replacement produced by the step.
 * @return
 *   FALSE if $tmp was accepted (stemming continues), TRUE if $tmp has fewer
 *   than PORTERSTEMMER_MIN_CHARS characters (no replacement, stemming stops).
 */
function porterstemmer_step_ending(&$word, $tmp) {
  $long_enough = drupal_strlen($tmp) >= PORTERSTEMMER_MIN_CHARS;
  if ($long_enough) {
    $word = $tmp;
  }
  return !$long_enough;
}
147
/**
 * Replaces one word ending with another, if tests pass.
 *
 * The return value is TRUE if the ending is present at the end of the
 * word, and FALSE if the ending is not present. The found ending is
 * replaced with the given replacement only if the additional regular
 * expression (if given) matches and the word is at least the given length.
 *
 * @param $word
 *   Word to perform search/replace on; modified in place on replacement.
 * @param $oldend
 *   Ending to check for. Treated as a literal string, not as a regular
 *   expression.
 * @param $newend
 *   Replacement ending.
 * @param $didit
 *   Set to TRUE in the case that a replacement is done; left alone
 *   otherwise.
 * @param $other
 *   Extra regular expression (with delimiters); must match the word to
 *   allow ending replacement.
 * @param $minlen
 *   Minimum word length required to allow ending replacement. For
 *   instance, to see if a particular ending is in the R1 region,
 *   pass in $r1 + length of ending as the minimum word length.
 * @return
 *   TRUE if ending was at the end of the word, FALSE if not.
 */
function porterstemmer_suffix(&$word, $oldend, $newend, &$didit, $other = NULL, $minlen = 1) {
  // Quote the ending so that any regular expression metacharacter in it is
  // matched literally instead of being interpreted as pattern syntax.
  $end_regexp = '/' . preg_quote($oldend, '/') . '$/';
  if (!preg_match($end_regexp, $word)) {
    // Ending isn't even there.
    return FALSE;
  }

  // From here on the ending is present, so the return value is TRUE whether
  // or not the replacement is actually performed.

  // Does the word match the extra regular expression?
  if ($other && !preg_match($other, $word)) {
    return TRUE;
  }

  // Is the word long enough?
  if (drupal_strlen($word) < $minlen) {
    return TRUE;
  }

  // All tests passed: replace the ending and report that we did.
  $word = preg_replace($end_regexp, $newend, $word);
  $didit = TRUE;
  return TRUE;
}
200
/**
 * Checks to see if a word is considered "short" in Porter Stemmer 2.
 *
 * A word is "short" when region R1 does not exist and the word ends in a
 * short syllable. A short syllable is either a vowel at the start of the
 * word followed by a non-vowel, or a non-vowel followed by a vowel followed
 * by a non-vowel other than w, x, Y.
 *
 * @param $word
 *   Word to check.
 * @param $r1
 *   Start position of R1 region in word.
 * @return
 *   TRUE if the word is short, FALSE if not.
 */
function porterstemmer_short_word($word, $r1) {
  // A word that extends past the R1 start has an R1 region: not short.
  if (drupal_strlen($word) > $r1) {
    return FALSE;
  }

  // Whole word is vowel + non-vowel.
  $vowel_consonant = '/^' . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL . '$/';
  // Word ends in non-vowel + vowel + non-vowel (not w, x, Y).
  $consonant_vowel_consonant = '/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL .
    PORTERSTEMMER_NOT_VOWEL_WXY . '$/';

  return preg_match($vowel_consonant, $word)
    || preg_match($consonant_vowel_consonant, $word);
}
237
/**
 * Pre-processes a word for the Porter Stemmer 2 algorithm.
 *
 * Rejects too-short words, strips leading apostrophes, marks every y that
 * starts the word or follows a vowel as Y (so that it is not treated as a
 * vowel), and finally computes the start positions of the R1 and R2
 * regions.
 *
 * @param $word
 *   Word to stem, modified in place if successful.
 * @param $r1
 *   Returns the start position of the "R1" region in the word.
 * @param $r2
 *   Returns the start position of the "R2" region in the word.
 * @return
 *   TRUE if it is time to stop stemming, FALSE to continue. When TRUE is
 *   returned, none of the parameters have been modified.
 */
function porterstemmer_prestemming(&$word, &$r1, &$r2) {
  if (porterstemmer_too_short($word)) {
    return TRUE;
  }

  // Strip leading apostrophes, then re-check the length.
  $candidate = preg_replace("/^'+/", '', $word);
  if (porterstemmer_too_short($candidate)) {
    return TRUE;
  }

  // Initial y is treated as a consonant.
  $candidate = preg_replace('/^y/', 'Y', $candidate);

  // So is a y directly after a vowel. Replace one occurrence per pass and
  // repeat until a pass replaces nothing, so runs like "yyyy" are handled.
  $after_vowel = '/(' . PORTERSTEMMER_VOWEL . ')y/';
  do {
    $candidate = preg_replace($after_vowel, '$1Y', $candidate, 1, $replaced);
  } while ($replaced);

  // The y/Y substitution does not change the word length.
  $word = $candidate;

  // R1 is the region after the first non-vowel following a vowel; R2 is
  // the same thing computed inside R1. Both default to the end of the word.
  $length = drupal_strlen($word);
  $r1 = $length;
  $r2 = $length;
  $found = array();
  $region_start = '/^' . PORTERSTEMMER_NOT_VOWEL . '*' .
    PORTERSTEMMER_VOWEL . '+(' . PORTERSTEMMER_NOT_VOWEL . ')/';

  // Exceptions to R1: if the word begins with 'gener', 'commun', or
  // 'arsen', R1 is the remainder of the word.
  if (preg_match('/^(gener|commun|arsen)/', $word, $found)) {
    $r1 = drupal_strlen($found[1]);
  }
  elseif (preg_match($region_start, $word, $found, PREG_OFFSET_CAPTURE)) {
    // Offset of the captured non-vowel, plus one to step past it.
    $r1 = $found[1][1] + 1;
  }

  // Apply the same definition to the text of R1 to locate R2.
  $r1_text = drupal_substr($word, $r1);
  if ($r1_text && preg_match($region_start, $r1_text, $found, PREG_OFFSET_CAPTURE)) {
    $r2 = $r1 + $found[1][1] + 1;
  }

  return FALSE;
}
305
/**
 * Undoes the y -> Y marking done during pre-processing.
 *
 * @param $word
 *   Word to clean up; every upper-case Y is turned back into y, in place.
 */
function porterstemmer_poststemming(&$word) {
  $restored = str_replace(array('Y'), array('y'), $word);
  $word = $restored;
}
312
/**
 * Step 0 of the algorithm: remove possessive endings ('s', 's, ').
 *
 * @param $word
 *   Word to stem, modified in place if successful.
 * @return
 *   TRUE if it is time to stop stemming, FALSE to continue.
 */
function porterstemmer_step0(&$word) {
  $stem = $word;
  $changed = FALSE;

  // Try the endings longest first and stop at the first one that is present.
  foreach (array("'s'", "'s", "'") as $ending) {
    if (porterstemmer_suffix($stem, $ending, '', $changed)) {
      break;
    }
  }

  // Refuse a result that has become too short; the word is left untouched.
  if (porterstemmer_too_short($stem)) {
    return TRUE;
  }

  $word = $stem;
  return FALSE;
}
335
336
/**
 * Step 1a of algorithm: plurals, etc.
 *
 * Handles the endings sses, ies, ied, ss, us and s.
 *
 * @param $word
 *   Word to stem, modified in place if successful.
 * @return
 *   TRUE if it is time to stop stemming, FALSE to continue.
 */
function porterstemmer_step1a(&$word) {
  $stem = $word;
  $changed = FALSE;

  // sses -> ss
  $handled = porterstemmer_suffix($stem, 'sses', 'ss', $changed);

  // ies/ied: replaced by "ie" when exactly one letter precedes the ending,
  // and by "i" otherwise. The first call only replaces in the one-letter
  // case; if it found the ending but did not replace, the second call does
  // the general replacement.
  foreach (array('ies', 'ied') as $ending) {
    if ($handled) {
      break;
    }
    if (porterstemmer_suffix($stem, $ending, 'ie', $changed, '/^.' . $ending . '$/')) {
      if (!$changed) {
        porterstemmer_suffix($stem, $ending, 'i', $changed);
      }
      $handled = TRUE;
    }
  }

  if (!$handled) {
    // ss and us are left as they are, but finding them prevents the plain
    // "s" rule below from running.
    if (!porterstemmer_suffix($stem, 'ss', 'ss', $changed)
      && !porterstemmer_suffix($stem, 'us', 'us', $changed)) {
      // Only delete s at end of word if there is at least one vowel that
      // is not immediately before the s.
      porterstemmer_suffix($stem, 's', '', $changed,
        '/' . PORTERSTEMMER_VOWEL . '.+s$/');
    }
  }

  if (porterstemmer_too_short($stem)) {
    return TRUE;
  }

  $word = $stem;
  return FALSE;
}
385
386
/**
 * Step 1b of algorithm: eed, eedly, ed, edly, ing, ingly
 *
 * @param $word
 *   Word to stem, modified in place if successful.
 * @param $r1
 *   Position of start of R1 region in word.
 * @return
 *   TRUE if it is time to stop stemming, FALSE to continue.
 */
function porterstemmer_step1b(&$word, $r1) {
  $tmp = $word;
  $didit = FALSE;

  // Replace these endings if in R1 region. If the ending is present but
  // outside R1, nothing is replaced but the step is still done.
  $done = (porterstemmer_suffix($tmp, 'eedly', 'ee', $didit, NULL, $r1 + 5) ||
    porterstemmer_suffix($tmp, 'eed', 'ee', $didit, NULL, $r1 + 3));

  // Only the deletions below trigger the post-processing, so forget
  // whether an eed/eedly replacement happened.
  $didit = FALSE;

  // Delete these endings if there's a vowel before the ending.
  if (!$done) {
    foreach (array('edly', 'ed', 'ingly', 'ing') as $ending) {
      if (porterstemmer_suffix($tmp, $ending, '', $didit,
        '/' . PORTERSTEMMER_VOWEL . '.*' . $ending . '$/')) {
        break;
      }
    }
  }

  // If we did one of these deletions, post-process...
  if ($didit) {
    // The parentheses and || matter here: "OR" binds looser than "=", so
    // "$done = a OR b OR c" would only ever store the result of a.
    $done = (porterstemmer_suffix($tmp, 'at', 'ate', $didit) ||
      porterstemmer_suffix($tmp, 'bl', 'ble', $didit) ||
      porterstemmer_suffix($tmp, 'iz', 'ize', $didit));

    if (!$done && preg_match('/' . PORTERSTEMMER_DOUBLE . '$/', $tmp)) {
      // Drop last letter if it's a double-letter ending.
      $tmp = drupal_substr($tmp, 0, -1);
      $done = TRUE;
    }

    if (!$done && porterstemmer_short_word($tmp, $r1)) {
      // Short words get an e appended.
      $tmp = $tmp . 'e';
    }
  }

  return porterstemmer_step_ending($word, $tmp);
}
436
/**
 * Step 1c of algorithm: y suffixes
 *
 * Replaces a final y or Y by i when the letter before it is a non-vowel
 * and that non-vowel is not the first letter of the word.
 *
 * @param $word
 *   Word to stem, modified in place if successful.
 * @return
 *   TRUE if it is time to stop stemming, FALSE to continue.
 */
function porterstemmer_step1c(&$word) {
  $stem = $word;
  $changed = FALSE;

  // Any character, then a non-vowel, then the final y/Y.
  $guard = '/.' . PORTERSTEMMER_NOT_VOWEL . '[Yy]$/';

  if (!porterstemmer_suffix($stem, 'Y', 'i', $changed, $guard)) {
    porterstemmer_suffix($stem, 'y', 'i', $changed, $guard);
  }

  return porterstemmer_step_ending($word, $stem);
}
458
/**
 * Step 2 of algorithm: misc endings in region R1.
 *
 * @param $word
 *   Word to stem, modified in place if successful.
 * @param $r1
 *   Position of start of R1 region in word.
 * @return
 *   TRUE if it is time to stop stemming, FALSE to continue.
 */
function porterstemmer_step2(&$word, $r1) {
  $stem = $word;
  $changed = FALSE;

  // Each rule: ending, replacement, optional extra expression that must
  // match. Rules are ordered so that longer endings are tried first.
  $rules = array(
    array('ational', 'ate', NULL),
    array('fulness', 'ful', NULL),
    array('iveness', 'ive', NULL),
    array('ization', 'ize', NULL),
    array('ousness', 'ous', NULL),
    array('biliti', 'ble', NULL),
    array('lessli', 'less', NULL),
    array('tional', 'tion', NULL),
    array('aliti', 'al', NULL),
    array('ation', 'ate', NULL),
    array('alism', 'al', NULL),
    array('entli', 'ent', NULL),
    array('fulli', 'ful', NULL),
    array('iviti', 'ive', NULL),
    array('ousli', 'ous', NULL),
    array('abli', 'able', NULL),
    array('alli', 'al', NULL),
    array('ator', 'ate', NULL),
    array('anci', 'ance', NULL),
    array('enci', 'ence', NULL),
    array('izer', 'ize', NULL),
    array('bli', 'ble', NULL),
    // ogi is only replaced if preceded by l
    array('ogi', 'og', '/logi$/'),
    // li is only replaced if preceded by a valid li-ending
    array('li', '', '/' . PORTERSTEMMER_LI_END . 'li$/'),
  );

  // Stop at the first ending that is present. It is replaced only when it
  // lies in R1, i.e. when the word is at least $r1 + length of ending long.
  foreach ($rules as $rule) {
    list($ending, $replacement, $guard) = $rule;
    if (porterstemmer_suffix($stem, $ending, $replacement, $changed, $guard, $r1 + strlen($ending))) {
      break;
    }
  }

  return porterstemmer_step_ending($word, $stem);
}
505
/**
 * Step 3 of algorithm: misc endings in region R1 (R2 for "ative").
 *
 * @param $word
 *   Word to stem, modified in place if successful.
 * @param $r1
 *   Position of start of R1 region in word.
 * @param $r2
 *   Position of start of R2 region in word.
 * @return
 *   TRUE if it is time to stop stemming, FALSE to continue.
 */
function porterstemmer_step3(&$word, $r1, $r2) {
  $stem = $word;
  $changed = FALSE;

  // Each rule: ending, replacement, start of the region the ending must
  // lie in for the replacement to happen.
  $rules = array(
    array('ational', 'ate', $r1),
    array('tional', 'tion', $r1),
    array('alize', 'al', $r1),
    array('ative', '', $r2),
    array('icate', 'ic', $r1),
    array('iciti', 'ic', $r1),
    array('ical', 'ic', $r1),
    array('ness', '', $r1),
    array('ful', '', $r1),
  );

  // Stop at the first ending that is present, replaced or not.
  foreach ($rules as $rule) {
    list($ending, $replacement, $region) = $rule;
    if (porterstemmer_suffix($stem, $ending, $replacement, $changed, NULL, $region + strlen($ending))) {
      break;
    }
  }

  return porterstemmer_step_ending($word, $stem);
}
534
/**
 * Step 4 of algorithm: misc endings in region R2.
 *
 * @param $word
 *   Word to stem, modified in place if successful.
 * @param $r2
 *   Position of start of R2 region in word.
 * @return
 *   TRUE if it is time to stop stemming, FALSE to continue.
 */
function porterstemmer_step4(&$word, $r2) {
  $stem = $word;
  $changed = FALSE;

  // Ending => optional extra expression that must match before the ending
  // is deleted. Order matters: longer endings come before their own tails
  // (ement, ment, ent).
  $endings = array(
    'ement' => NULL,
    'able' => NULL,
    'ance' => NULL,
    'ence' => NULL,
    'ible' => NULL,
    'ment' => NULL,
    'ant' => NULL,
    'ate' => NULL,
    'ent' => NULL,
    // ion is only deleted if preceded by s or t
    'ion' => '/[st]ion$/',
    'ism' => NULL,
    'iti' => NULL,
    'ive' => NULL,
    'ize' => NULL,
    'ous' => NULL,
    'al' => NULL,
    'er' => NULL,
    'ic' => NULL,
  );

  // Stop at the first ending that is present. It is deleted only when it
  // lies in R2, i.e. when the word is at least $r2 + length of ending long.
  foreach ($endings as $ending => $guard) {
    if (porterstemmer_suffix($stem, $ending, '', $changed, $guard, $r2 + strlen($ending))) {
      break;
    }
  }

  return porterstemmer_step_ending($word, $stem);
}
570
/**
 * Step 5 of algorithm: e, l endings in region R1/R2.
 *
 * @param $word
 *   Word to stem, modified in place if successful.
 * @param $r1
 *   Position of start of R1 region in word.
 * @param $r2
 *   Position of start of R2 region in word.
 * @return
 *   TRUE if it is time to stop stemming, FALSE to continue.
 */
function porterstemmer_step5(&$word, $r1, $r2) {
  $stem = $word;
  $changed = FALSE;

  // Delete l at end of word if in R2 and preceded by another l. A word
  // ending in "ll" is never considered for the e rule below.
  $ends_in_ll = porterstemmer_suffix($stem, 'll', 'l', $changed, NULL, $r2 + 1);

  if (!$ends_in_ll && preg_match('/e$/', $stem)) {
    $length = drupal_strlen($stem);

    // The final e is deleted when it lies in R2 ...
    $drop_e = $length > $r2;

    // ... or when it lies in R1 and is not preceded by a short syllable.
    if (!$drop_e && $length > $r1) {
      $short_at_start = '/^' . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL .
        'e$/';
      $short_at_end = '/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL .
        PORTERSTEMMER_NOT_VOWEL_WXY . 'e$/';
      $drop_e = !preg_match($short_at_start, $stem)
        && !preg_match($short_at_end, $stem);
    }

    if ($drop_e) {
      $stem = drupal_substr($stem, 0, -1);
    }
  }

  return porterstemmer_step_ending($word, $stem);
}
606
/**
 * Checks the first exception list of the Porter Stemmer.
 *
 * Words in the list are replaced by a fixed stem (which for some entries
 * is the word itself) and stemming ends.
 *
 * @param $word
 *   Word to stem, modified in place if successful.
 * @return
 *   TRUE if it is time to stop stemming, FALSE to continue.
 */
function porterstemmer_exception1(&$word) {
  // Word => final stem. Don't add anything to this list that is shorter
  // than the minimum allowed length!
  $fixed_stems = array(
    // Special cases with their own stem.
    'skis' => 'ski',
    'skies' => 'sky',
    'dying' => 'die',
    'lying' => 'lie',
    'tying' => 'tie',
    'idly' => 'idl',
    'gently' => 'gentl',
    'ugly' => 'ugli',
    'early' => 'earli',
    'only' => 'onli',
    'singly' => 'singl',
    // Words that stay exactly as they are.
    'sky' => 'sky',
    'news' => 'news',
    'howe' => 'howe',
    'atlas' => 'atlas',
    'cosmos' => 'cosmos',
    'bias' => 'bias',
    'andes' => 'andes',
  );

  // Not an exception: carry on with the regular steps.
  if (!isset($fixed_stems[$word])) {
    return FALSE;
  }

  // Exception found: use its stem and stop.
  $word = $fixed_stems[$word];
  return TRUE;
}
647
/**
 * Checks exceptions for Porter Stemmer after Step 1a.
 *
 * Words in this list are left exactly as they are and stemming ends.
 *
 * @param $word
 *   Word to check; never modified by this function.
 * @return
 *   TRUE if it is time to stop stemming, FALSE to continue.
 */
function porterstemmer_exception2(&$word) {
  // Keys are the invariant words; the values only exist so that isset()
  // can be used for the lookup.
  $invariant = array(
    'inning' => 1,
    'outing' => 1,
    'canning' => 1,
    'herring' => 1,
    'earring' => 1,
    'proceed' => 1,
    'exceed' => 1,
    'succeed' => 1,
  );

  return isset($invariant[$word]);
}

  ViewVC Help
Powered by ViewVC 1.1.2