| 1 |
<?php |
<?php |
| 2 |
// $Id: porterstemmer.module,v 1.2 2009/07/14 23:38:47 jhodgdon Exp $ |
// $Id: porterstemmer.module,v 1.2.2.1 2009/08/26 19:22:49 jhodgdon Exp $ |
| 3 |
|
|
| 4 |
/** |
/** |
| 5 |
* @file |
* @file |
| 50 |
} |
} |
| 51 |
|
|
| 52 |
/** |
/**
 * Implementation of hook_sbp_excerpt_match().
 *
 * Searches $text, starting at $offset, for a word whose stem is the same as
 * the stem of $key, so that the word actually present in the text can be
 * highlighted in an excerpt.
 *
 * All positions and lengths in this function are byte-based (strlen() and
 * substr(), not the drupal_* multibyte versions), because that is what
 * preg_match() takes as its offset and reports with PREG_OFFSET_CAPTURE.
 *
 * @param $key
 *   The keyword to look for.
 * @param $text
 *   The text to search.
 * @param $offset
 *   Byte offset in $text at which to start searching.
 * @param $boundary
 *   Regular expression fragment (without delimiters) that matches a word
 *   boundary. It is inserted into the patterns unescaped.
 *
 * @return
 *   FALSE if no word with a matching stem is found. Otherwise an array with
 *   elements 'where' (byte position in $text where the word starts) and
 *   'keyword' (the word as it appears in $text).
 */
function porterstemmer_sbp_excerpt_match($key, $text, $offset, $boundary) {
  // Reduce the keyword to the root we will look for in the text.
  $key = _porterstemmer_sbp_excerpt_root($key);

  // An empty root would match at every boundary and tell us nothing.
  if ((string) $key === '') {
    return FALSE;
  }

  // The root is literal text, so escape it before building the pattern. The
  // named group gives us the position of the root itself, so a boundary
  // expression that consumes characters does not leak into the result.
  $start_pattern = '/' . $boundary . '(?P<porterstemmer_root>' .
    preg_quote($key, '/') . ')/iu';
  $end_pattern = '/' . $boundary . '/iu';
  $length = strlen($text);

  // Try each place where the root appears at the start of a word, until one
  // of them turns out to be a word with the same stem as the keyword.
  while ($offset <= $length) {
    $match = array();
    if (!preg_match($start_pattern, $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
      // No (more) candidates for our modified key.
      return FALSE;
    }

    $pos = $match['porterstemmer_root'][1];
    // Use the length of the text that matched, not of $key: with /iu the
    // two can differ in byte length.
    $after = $pos + strlen($match['porterstemmer_root'][0]);

    // Find the end of the word we actually matched, so it can be highlighted.
    $newmatch = array();
    if (preg_match($end_pattern, $text, $newmatch, PREG_OFFSET_CAPTURE, $after)) {
      $keyfound = substr($text, $pos, $newmatch[0][1] - $pos);
    }
    else {
      // No boundary after the candidate: the word runs to the end of $text.
      $keyfound = substr($text, $pos);
    }

    // Make sure it is a real match for our key, not just a word that
    // happens to begin with the same letters.
    if (_porterstemmer_sbp_excerpt_root($keyfound) === $key) {
      return array('where' => $pos, 'keyword' => $keyfound);
    }

    // Not a real match; resume the search just past this candidate. The
    // root is not empty, so $after is always greater than $offset here.
    $offset = $after;
  }

  // If we get here, none of the potential matches worked out.
  return FALSE;
}

/**
 * Stems a word and trims the result for use in excerpt matching.
 *
 * In many cases, the root word is a substring of the full word, but not
 * all. The cases where it is not, the root ends in e, i, or y, and if this
 * last letter is removed, the root is a substring of the full word. So
 * this removes one of these letters from the end of the root.
 *
 * @param $word
 *   Word to reduce.
 *
 * @return
 *   The stem of $word from porterstemmer_stem(), after the i/e/y suffix
 *   replacements have been attempted on it.
 */
function _porterstemmer_sbp_excerpt_root($word) {
  $root = porterstemmer_stem($word);

  // At most one of the replacements is applied: the OR chain stops at the
  // first call that reports success.
  $didit = FALSE;
  porterstemmer_suffix($root, 'i', '', $didit, NULL, 2) OR
    porterstemmer_suffix($root, 'e', '', $didit, NULL, 2) OR
    porterstemmer_suffix($root, 'y', '', $didit, NULL, 2);

  return $root;
}
| 100 |
|
|
| 101 |
|
/** |
| 102 |
* Regular expression defining a vowel for Porter Stemmer purposes. |
* Regular expression defining a vowel for Porter Stemmer purposes. |
| 103 |
*/ |
*/ |
| 104 |
define('PORTERSTEMMER_VOWEL', '[aeiouy]'); |
define('PORTERSTEMMER_VOWEL', '[aeiouy]'); |
| 232 |
// ending isn't even there |
// ending isn't even there |
| 233 |
return FALSE; |
return FALSE; |
| 234 |
} |
} |
| 235 |
|
|
| 236 |
// Does word match other regular expression? |
// Does word match other regular expression? |
| 237 |
if ($other && !preg_match($other, $word)) { |
if ($other && !preg_match($other, $word)) { |
| 238 |
// no match, so just return without replacing |
// no match, so just return without replacing |
| 280 |
} |
} |
| 281 |
|
|
| 282 |
// Does it end in the other type of short syllable? |
// Does it end in the other type of short syllable? |
| 283 |
if (preg_match('/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL . |
if (preg_match('/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL . |
| 284 |
PORTERSTEMMER_NOT_VOWEL_WXY . '$/', $word)) { |
PORTERSTEMMER_NOT_VOWEL_WXY . '$/', $word)) { |
| 285 |
return TRUE; |
return TRUE; |
| 286 |
} |
} |
| 323 |
$count = 1; |
$count = 1; |
| 324 |
while ( $count ) { |
while ( $count ) { |
| 325 |
// Do this replacement one by one, to avoid unlikely yyyy issues |
// Do this replacement one by one, to avoid unlikely yyyy issues |
| 326 |
$tmp = preg_replace('/(' . PORTERSTEMMER_VOWEL . ')y/', '$1Y', |
$tmp = preg_replace('/(' . PORTERSTEMMER_VOWEL . ')y/', '$1Y', |
| 327 |
$tmp, 1, $count); |
$tmp, 1, $count); |
| 328 |
} |
} |
| 329 |
|
|
| 337 |
$r1 = $max; |
$r1 = $max; |
| 338 |
$r2 = $max; |
$r2 = $max; |
| 339 |
$matches = array(); |
$matches = array(); |
| 340 |
$rdef = '/^' . PORTERSTEMMER_NOT_VOWEL . '*' . |
$rdef = '/^' . PORTERSTEMMER_NOT_VOWEL . '*' . |
| 341 |
PORTERSTEMMER_VOWEL . '+(' . PORTERSTEMMER_NOT_VOWEL . ')/'; |
PORTERSTEMMER_VOWEL . '+(' . PORTERSTEMMER_NOT_VOWEL . ')/'; |
| 342 |
|
|
| 343 |
// Exceptions to R1: If word begins with 'gener', 'commun', or 'arsen', |
// Exceptions to R1: If word begins with 'gener', 'commun', or 'arsen', |
| 419 |
porterstemmer_suffix($tmp, 'us', 'us', $didit) OR |
porterstemmer_suffix($tmp, 'us', 'us', $didit) OR |
| 420 |
// only delete s at end of word if there is at least one vowel that |
// only delete s at end of word if there is at least one vowel that |
| 421 |
// is not immediately before the s |
// is not immediately before the s |
| 422 |
porterstemmer_suffix($tmp, 's', '', $didit, |
porterstemmer_suffix($tmp, 's', '', $didit, |
| 423 |
'/' . PORTERSTEMMER_VOWEL . '.+s$/'); |
'/' . PORTERSTEMMER_VOWEL . '.+s$/'); |
| 424 |
} |
} |
| 425 |
|
|
| 463 |
$done = porterstemmer_suffix($tmp, 'at', 'ate', $didit) OR |
$done = porterstemmer_suffix($tmp, 'at', 'ate', $didit) OR |
| 464 |
porterstemmer_suffix($tmp, 'bl', 'ble', $didit) OR |
porterstemmer_suffix($tmp, 'bl', 'ble', $didit) OR |
| 465 |
porterstemmer_suffix($tmp, 'iz', 'ize', $didit); |
porterstemmer_suffix($tmp, 'iz', 'ize', $didit); |
| 466 |
if (!$done && |
if (!$done && |
| 467 |
preg_match('/' . PORTERSTEMMER_DOUBLE . '$/', $tmp)) { |
preg_match('/' . PORTERSTEMMER_DOUBLE . '$/', $tmp)) { |
| 468 |
// drop last letter if it's a double-letter ending |
// drop last letter if it's a double-letter ending |
| 469 |
$tmp = drupal_substr($tmp, 0, -1); |
$tmp = drupal_substr($tmp, 0, -1); |
| 537 |
porterstemmer_suffix($tmp, 'izer', 'ize', $didit, NULL, $r1 + 4) OR |
porterstemmer_suffix($tmp, 'izer', 'ize', $didit, NULL, $r1 + 4) OR |
| 538 |
porterstemmer_suffix($tmp, 'bli', 'ble', $didit, NULL, $r1 + 3) OR |
porterstemmer_suffix($tmp, 'bli', 'ble', $didit, NULL, $r1 + 3) OR |
| 539 |
// ogi is only replaced if preceded by l |
// ogi is only replaced if preceded by l |
| 540 |
porterstemmer_suffix($tmp, 'ogi', 'og', $didit, |
porterstemmer_suffix($tmp, 'ogi', 'og', $didit, |
| 541 |
'/logi$/', $r1 + 3) OR |
'/logi$/', $r1 + 3) OR |
| 542 |
// li is only replaced if preceded by a valid li-ending |
// li is only replaced if preceded by a valid li-ending |
| 543 |
porterstemmer_suffix($tmp, 'li', '', $didit, |
porterstemmer_suffix($tmp, 'li', '', $didit, |
| 544 |
'/' . PORTERSTEMMER_LI_END . 'li$/', $r1 + 2); |
'/' . PORTERSTEMMER_LI_END . 'li$/', $r1 + 2); |
| 545 |
|
|
| 546 |
return porterstemmer_step_ending($word, $tmp); |
return porterstemmer_step_ending($word, $tmp); |
| 635 |
// a short syllable |
// a short syllable |
| 636 |
$len = drupal_strlen( $tmp ); |
$len = drupal_strlen( $tmp ); |
| 637 |
if ( !$done && preg_match( '/e$/', $tmp ) && |
if ( !$done && preg_match( '/e$/', $tmp ) && |
| 638 |
( $len > $r2 || |
( $len > $r2 || |
| 639 |
( $len > $r1 && |
( $len > $r1 && |
| 640 |
!preg_match( '/^' . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL . |
!preg_match( '/^' . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL . |
| 641 |
'e$/', $tmp ) && |
'e$/', $tmp ) && |
| 642 |
!preg_match( '/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL . |
!preg_match( '/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL . |
| 643 |
PORTERSTEMMER_NOT_VOWEL_WXY . 'e$/', $tmp )))) { |
PORTERSTEMMER_NOT_VOWEL_WXY . 'e$/', $tmp )))) { |
| 644 |
$tmp = drupal_substr( $tmp, 0, -1 ); |
$tmp = drupal_substr( $tmp, 0, -1 ); |
| 645 |
} |
} |