| 1 |
<?php
|
| 2 |
// $Id: swedishstemmer.module,v 1.2 2008/11/25 05:33:09 frjo Exp $
|
| 3 |
|
| 4 |
/**
 * @file
 * Improve Swedish language searching by simplifying related words to their root (verbs, plurals, ...).
 * Algorithm based on http://snowball.tartarus.org/algorithms/swedish/stemmer.html.
 */
|
| 9 |
|
| 10 |
/**
 * Implementation of hook_search_preprocess.
 *
 * Splits the text into words and separators, stems every word and glues the
 * pieces back together so that all separators are preserved verbatim.
 *
 * @param $text
 *   The text to preprocess.
 *
 * @return
 *   The text with every word replaced by its stem. If the text cannot be
 *   split (for example because it is not valid UTF-8) it is returned
 *   unchanged.
 */
function swedishstemmer_search_preprocess($text) {
  // Split words from noise. Apostrophes are part of the word character
  // class, so they stay inside the words handed to the stemmer. Because the
  // delimiter is captured, the result alternates word, noise, word, ...
  $words = preg_split('/([^a-zA-ZéåäöÅÄÖ\']+)/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

  // With the "u" modifier preg_split() returns FALSE on malformed UTF-8;
  // leave such text alone instead of feeding FALSE to foreach/implode.
  if ($words === false) {
    return $text;
  }

  // Words sit at the even indexes, the captured separators at the odd ones.
  foreach ($words as $k => $word) {
    if ($k % 2 === 0) {
      $words[$k] = swedishstemmer_stem($word);
    }
  }

  // Put it all back together.
  return implode('', $words);
}
|
| 29 |
|
| 30 |
/**
 * Stem a Swedish word.
 *
 * Lowercases the word, locates region R1 and runs the three suffix removal
 * steps on it.
 *
 * @param $word
 *   The word to stem.
 *
 * @return
 *   The stemmed, lowercased word.
 */
function swedishstemmer_stem($word) {
  // Lowercase
  $word = drupal_strtolower($word);

  /* R1 is the region after the first non-vowel following a vowel, or is the
     null region at the end of the word if there is no such non-vowel.
     $r1 holds the byte offset where R1 starts; 0 means "no R1". */
  $r1 = 0;
  if (preg_match('/[aeiouyäåö][^aeiouyäåö]/u', $word, $matches, PREG_OFFSET_CAPTURE)) {
    // Offsets are in bytes and ä, å, ö take two bytes in UTF-8, so add the
    // byte length of the matched pair instead of a fixed 2.
    $r1 = $matches[0][1] + strlen($matches[0][0]);

    // The Swedish stemmer requires at least three letters before R1.
    if (preg_match('/^.{3}/us', $word, $prefix)) {
      $r1 = max($r1, strlen($prefix[0]));
    }
    else {
      // Fewer than three letters in the word: R1 is empty.
      $r1 = strlen($word);
    }
  }

  // Steps 1-3: suffix removal
  $word = swedishstemmer_step1($word, $r1);
  $word = swedishstemmer_step2($word, $r1);
  $word = swedishstemmer_step3($word, $r1);

  return $word;
}
|
| 50 |
|
| 51 |
/**
 * Step 1 of the Swedish stemmer: remove the main inflectional ending.
 *
 * Finds the longest listed suffix that lies entirely inside R1 and deletes
 * it. If none is found and the word ends in "s" preceded by a valid
 * s-ending, that "s" is deleted instead. At most one action is performed.
 *
 * @param $word
 *   The lowercased word.
 * @param $r1
 *   Byte offset in $word where region R1 starts. A value that is not
 *   greater than zero means R1 is empty.
 *
 * @return
 *   The word with at most one step 1 ending removed.
 */
function swedishstemmer_step1($word, $r1) {
  // Nothing outside R1 may be removed, so an empty R1 leaves the word as is.
  // The cast covers PHP versions where substr() returns FALSE at the end.
  $region = $r1 > 0 ? (string) substr($word, $r1) : '';
  if ($region === '') {
    return $word;
  }

  // Suffixes ordered longest first so the first hit is the longest match.
  // All of them are plain ASCII, which makes byte-wise comparison safe.
  $suffixes = array(
    'heterna',
    'hetens',
    'anden', 'heten', 'heter', 'arnas', 'ernas', 'ornas', 'andes', 'arens', 'andet',
    'arna', 'erna', 'orna', 'ande', 'arne', 'aste', 'aren', 'ades', 'erns',
    'ade', 'are', 'ern', 'ens', 'het', 'ast',
    'ad', 'en', 'ar', 'er', 'or', 'as', 'es', 'at',
    'a', 'e',
  );

  $region_length = strlen($region);
  foreach ($suffixes as $suffix) {
    $length = strlen($suffix);
    // The suffix must fit inside R1, not merely end the word.
    if ($length <= $region_length && substr($region, -$length) === $suffix) {
      return substr($word, 0, -$length);
    }
  }

  // Delete 's' if preceded by a valid s-ending. R1 is not empty here, so the
  // final 's' is inside R1; the preceding letter does not have to be.
  if (preg_match('/[bcdfghjklmnoprtvy]s$/', $word)) {
    return substr($word, 0, -1);
  }

  return $word;
}
|
| 63 |
|
| 64 |
/**
 * Step 2 of the Swedish stemmer: undouble selected consonant endings.
 *
 * If R1 ends in one of dd, gd, nn, dt, gt, kt or tt, the last letter of the
 * word is deleted.
 *
 * @param $word
 *   The lowercased word.
 * @param $r1
 *   Byte offset in $word where region R1 starts. A value that is not
 *   greater than zero means R1 is empty.
 *
 * @return
 *   The word, shortened by one letter when one of the endings is found in R1.
 */
function swedishstemmer_step2($word, $r1) {
  // Match against R1 only, so an ending that starts before R1 is ignored.
  // The cast covers PHP versions where substr() returns FALSE at the end.
  $region = $r1 > 0 ? (string) substr($word, $r1) : '';

  if (preg_match('/(dd|gd|nn|dt|gt|kt|tt)$/', $region)) {
    return substr($word, 0, -1);
  }

  return $word;
}
|
| 73 |
|
| 74 |
/**
 * Step 3 of the Swedish stemmer: remove derivational endings.
 *
 * Looks for the longest of lig, ig, els, löst and fullt inside R1 and
 * performs a single action: lig, ig and els are deleted, while löst and
 * fullt lose their final "t".
 *
 * @param $word
 *   The lowercased word.
 * @param $r1
 *   Byte offset in $word where region R1 starts. A value that is not
 *   greater than zero means R1 is empty.
 *
 * @return
 *   The word with at most one step 3 ending handled.
 */
function swedishstemmer_step3($word, $r1) {
  // Only endings that lie entirely inside R1 count.
  // The cast covers PHP versions where substr() returns FALSE at the end.
  $region = $r1 > 0 ? (string) substr($word, $r1) : '';
  if ($region === '') {
    return $word;
  }

  // "lig" is listed before "ig" and starts further left, so the longer
  // ending wins whenever both would match.
  if (preg_match('/(lig|ig|els)$/', $region, $match)) {
    return substr($word, 0, -strlen($match[1]));
  }

  // löst -> lös and fullt -> full: both just drop the trailing "t".
  if (preg_match('/(löst|fullt)$/', $region)) {
    return substr($word, 0, -1);
  }

  return $word;
}
|