/[drupal]/contributions/sandbox/frjo/swedishstemmer6/swedishstemmer.module
ViewVC logotype

Contents of /contributions/sandbox/frjo/swedishstemmer6/swedishstemmer.module

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.2 - (show annotations) (download) (as text)
Tue Nov 25 05:33:09 2008 UTC (12 months ago) by frjo
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +6 -2 lines
File MIME type: text/x-php
Added @file to swedishstemmer.module.
1 <?php
2 // $Id: swedishstemmer.module,v 1.1 2008/10/15 19:23:54 frjo Exp $
3
4 /**
5 * @file
6 * Improve Swedish language searching by simplifying related words to their root (verbs, plurals, ...).
7 * Algorithm based on http://snowball.tartarus.org/algorithms/swedish/stemmer.html.
8 */
9
/**
 * Implementation of hook_search_preprocess().
 *
 * Splits the text into words and the noise between them, replaces every word
 * by its Swedish stem and joins everything back together.
 *
 * @param $text
 *   The text to preprocess.
 *
 * @return
 *   The text with every word replaced by its stem. The noise between words is
 *   left untouched. If the text cannot be split (for example because it is
 *   not valid UTF-8) it is returned unchanged.
 */
function swedishstemmer_search_preprocess($text) {
  // Split words from noise. Apostrophes are part of the word character class,
  // so they stay inside the word that is handed to the stemmer. The capturing
  // group together with PREG_SPLIT_DELIM_CAPTURE keeps the noise in the
  // result so the text can be rebuilt afterwards. Both cases of every
  // accented letter are accepted.
  $words = preg_split('/([^a-zA-ZéÉåäöÅÄÖ\']+)/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

  // preg_split() returns FALSE on failure, e.g. when the /u modifier meets
  // invalid UTF-8. Hand the text back untouched instead of losing it.
  if ($words === FALSE) {
    return $text;
  }

  // Even positions hold words, odd positions hold the captured noise.
  foreach ($words as $k => $word) {
    if ($k % 2 == 0) {
      $words[$k] = swedishstemmer_stem($word);
    }
  }

  // Put it all back together.
  return implode('', $words);
}
29
/**
 * Stem a Swedish word.
 *
 * Lower-cases the word, locates the R1 region and then runs the three suffix
 * removal steps on it.
 *
 * @param $word
 *   The word to stem.
 *
 * @return
 *   The lower-cased, stemmed word.
 */
function swedishstemmer_stem($word) {
  // Lowercase
  $word = drupal_strtolower($word);

  /* R1 is the region after the first non-vowel following a vowel, or is the
     null region at the end of the word if there is no such non-vowel.
     It is kept as a byte offset into $word; 0 stands for the null region, so
     the variable is always defined and the steps can test it safely. */
  $r1 = 0;
  if (preg_match('/[aeiouyäåö][^aeiouyäåö]/u', $word, $matches, PREG_OFFSET_CAPTURE)) {
    // PREG_OFFSET_CAPTURE reports byte offsets and the matched pair may
    // contain multi-byte letters, so advance by its real byte length rather
    // than by a fixed 2.
    $r1 = $matches[0][1] + strlen($matches[0][0]);

    // R1 is adjusted so that the region before it contains at least 3
    // letters. Words with fewer than 3 letters need no adjustment: any match
    // in them already places R1 at the end of the word.
    if (preg_match('/^.{3}/us', $word, $head)) {
      $r1 = max($r1, strlen($head[0]));
    }
  }

  // Steps 1-3: suffix removal
  $word = swedishstemmer_step1($word, $r1);
  $word = swedishstemmer_step2($word, $r1);
  $word = swedishstemmer_step3($word, $r1);

  return $word;
}
50
/**
 * Step 1 of the Swedish stemmer.
 *
 * Search for the longest among the step 1 suffixes in R1 and perform the
 * action indicated: the listed suffixes are deleted, a final 's' is deleted
 * if it is preceded by a valid s-ending. At most one action is performed.
 *
 * @param $word
 *   The lower-cased word.
 * @param $r1
 *   Byte offset in $word at which the R1 region starts. An empty value
 *   (0, NULL) means R1 is the null region and nothing is removed.
 *
 * @return
 *   The word after step 1.
 */
function swedishstemmer_step1($word, $r1) {
  // Without an R1 region no suffix can lie inside it.
  if (!$r1) {
    return $word;
  }

  // Ordered from longest to shortest so that the first hit is the longest
  // suffix that fits into R1. All suffixes are plain ASCII, so byte lengths
  // and the byte offset $r1 can be compared directly.
  $suffixes = array(
    'heterna',
    'hetens',
    'arnas', 'ernas', 'ornas', 'andes', 'arens', 'heten', 'anden', 'andet', 'heter',
    'arna', 'erna', 'orna', 'ande', 'arne', 'aste', 'aren', 'ades', 'erns',
    'ade', 'are', 'ern', 'ens', 'het', 'ast',
    'ad', 'en', 'ar', 'er', 'or', 'as', 'es', 'at',
    'a', 'e',
  );

  $length = strlen($word);
  foreach ($suffixes as $suffix) {
    $start = $length - strlen($suffix);
    // The whole suffix has to lie inside R1.
    if ($start >= $r1 && substr($word, $start) === $suffix) {
      return substr($word, 0, $start);
    }
  }

  // Delete 's' if it lies in R1 and is preceded by a valid s-ending. This is
  // the shortest candidate, so it is only tried when nothing longer matched.
  if ($length - 1 >= $r1 && preg_match('/[bcdfghjklmnoprtvy]s$/D', $word)) {
    return substr($word, 0, -1);
  }

  return $word;
}
63
/**
 * Step 2 of the Swedish stemmer.
 *
 * Search for one of the suffixes dd, gd, nn, dt, gt, kt, tt in R1, and if
 * found delete the last letter.
 *
 * @param $word
 *   The lower-cased word.
 * @param $r1
 *   Byte offset in $word at which the R1 region starts. An empty value
 *   (0, NULL) means R1 is the null region and nothing is removed.
 *
 * @return
 *   The word after step 2.
 */
function swedishstemmer_step2($word, $r1) {
  // Without an R1 region no suffix can lie inside it.
  if (!$r1) {
    return $word;
  }

  // Every candidate is two ASCII letters, so the suffix starts two bytes
  // before the end of the word and must not start before R1.
  $start = strlen($word) - 2;
  $doubles = array('dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt');
  if ($start >= $r1 && in_array(substr($word, $start), $doubles, TRUE)) {
    return substr($word, 0, -1);
  }

  return $word;
}
73
/**
 * Step 3 of the Swedish stemmer.
 *
 * Search for the longest among the step 3 suffixes in R1 and perform the
 * action indicated: 'lig', 'ig' and 'els' are deleted, 'löst' is replaced
 * with 'lös' and 'fullt' with 'full'. At most one action is performed.
 *
 * @param $word
 *   The lower-cased word.
 * @param $r1
 *   Byte offset in $word at which the R1 region starts. An empty value
 *   (0, NULL) means R1 is the null region and nothing is changed.
 *
 * @return
 *   The word after step 3.
 */
function swedishstemmer_step3($word, $r1) {
  // Without an R1 region no suffix can lie inside it.
  if (!$r1) {
    return $word;
  }

  // Suffix => replacement. 'lig' is listed before 'ig' so that the longer of
  // the two wins when both fit into R1.
  $actions = array(
    'fullt' => 'full',
    'löst' => 'lös',
    'lig' => '',
    'els' => '',
    'ig' => '',
  );

  // $r1 is a byte offset, so suffix positions are measured in bytes as well
  // ('ö' occupies two bytes in UTF-8).
  $length = strlen($word);
  foreach ($actions as $suffix => $replacement) {
    $start = $length - strlen($suffix);
    // The whole suffix has to lie inside R1.
    if ($start >= $r1 && substr($word, $start) === $suffix) {
      return substr($word, 0, $start) . $replacement;
    }
  }

  return $word;
}

  ViewVC Help
Powered by ViewVC 1.1.2