/[drupal]/contributions/sandbox/frjo/swedishstemmer5/swedishstemmer.module
ViewVC logotype

Contents of /contributions/sandbox/frjo/swedishstemmer5/swedishstemmer.module

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.1 - (show annotations) (download) (as text)
Mon Jun 9 09:35:08 2008 UTC (17 months, 2 weeks ago) by frjo
Branch: MAIN
CVS Tags: HEAD
File MIME type: text/x-php
Swedish stemmer module for Drupal 5.
1 <?php
2 // $Id: site_map.module,v 1.30 2007/03/21 08:14:34 frjo Exp $
3
4 // Algorithm based on http://snowball.tartarus.org/algorithms/swedish/stemmer.html.
5
/**
 * Implementation of hook_search_preprocess().
 *
 * Splits the text into words and the separators between them, replaces every
 * word with its stem and joins the pieces back together in their original
 * order.
 *
 * @param $text
 *   The text to preprocess. It is taken by reference but never modified here.
 *
 * @return
 *   The text with every word stemmed. If the text cannot be split (for
 *   example because it is not valid UTF-8) it is returned unchanged.
 */
function swedishstemmer_search_preprocess(&$text) {
  // A word is a run of letters and apostrophes (apostrophes stay part of the
  // word). Because the separators are captured as well, the result alternates
  // word, separator, word, ... starting with a (possibly empty) word.
  $words = preg_split('/([^a-zA-ZéÉåäöÅÄÖ\']+)/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

  // preg_split() returns false on failure, e.g. when the /u modifier meets
  // malformed UTF-8. Hand the text back untouched rather than losing it.
  if ($words === false) {
    return $text;
  }

  // Even indexes hold words, odd indexes hold separators.
  foreach ($words as $k => $word) {
    if ($k % 2 == 0 && $word !== '') {
      $words[$k] = swedishstemmer_stem($word);
    }
  }

  // Put it all back together.
  return implode('', $words);
}
25
/**
 * Stem a Swedish word.
 *
 * Lowercases the word, locates the region R1 and runs the three suffix
 * removal steps on it.
 *
 * @param $word
 *   The word to stem.
 *
 * @return
 *   The stemmed, lowercased word.
 */
function swedishstemmer_stem($word) {
  // Lowercase
  $word = drupal_strtolower($word);

  /* R1 is the region after the first non-vowel following a vowel, or is the
     null region at the end of the word if there is no such non-vowel. It is
     tracked as a byte offset into $word; 0 means "R1 is empty", which makes
     the step functions leave the word alone. */
  $r1 = 0;
  if (preg_match('/[aeiouyäåö][^aeiouyäåö]/u', $word, $matches, PREG_OFFSET_CAPTURE)) {
    // PREG_OFFSET_CAPTURE reports byte offsets, so add the byte length of the
    // matched pair: letters such as ä, å and ö take two bytes in UTF-8.
    $r1 = $matches[0][1] + strlen($matches[0][0]);

    // R1 is adjusted so that the region before it contains at least three
    // letters. Measure the first three letters in bytes; a shorter word has
    // no room for R1 at all.
    $min = preg_match('/^.{3}/us', $word, $head) ? strlen($head[0]) : strlen($word);
    if ($r1 < $min) {
      $r1 = $min;
    }

    // Nothing left after the offset: R1 is the null region.
    if ($r1 >= strlen($word)) {
      $r1 = 0;
    }
  }

  // Steps 1-3: suffix removal
  $word = swedishstemmer_step1($word, $r1);
  $word = swedishstemmer_step2($word, $r1);
  $word = swedishstemmer_step3($word, $r1);

  return $word;
}
46
/**
 * Step 1: main suffix removal.
 *
 * Searches R1 for the longest of the main suffixes and deletes it. A bare "s"
 * is one of the candidates, but it is deleted only when the letter in front
 * of it is a valid s-ending (that letter does not have to be inside R1). At
 * most one suffix is removed.
 *
 * @param $word
 *   The lowercased word.
 * @param $r1
 *   Byte offset at which R1 starts, or an empty value if the word has no R1.
 *
 * @return
 *   The word with the suffix removed, or the unchanged word.
 */
function swedishstemmer_step1($word, $r1) {
  // No R1, or R1 is the null region at the end of the word: nothing to do.
  if (!$r1 || $r1 >= strlen($word)) {
    return $word;
  }

  $suffixes = array(
    'a', 'arna', 'erna', 'heterna', 'orna', 'ad', 'e', 'ade', 'ande', 'arne',
    'are', 'aste', 'en', 'anden', 'aren', 'heten', 'ern', 'ar', 'er', 'heter',
    'or', 'as', 'arnas', 'ernas', 'ornas', 'es', 'ades', 'andes', 'ens',
    'arens', 'hetens', 'erns', 'at', 'andet', 'het', 'ast', 's',
  );

  // Only look inside R1. The pattern is anchored at the end, so the leftmost
  // match the regex engine finds is automatically the longest suffix.
  $pattern = '/(?:' . implode('|', $suffixes) . ')$/';
  if (!preg_match($pattern, substr($word, $r1), $matches)) {
    return $word;
  }
  $suffix = $matches[0];

  if ($suffix === 's') {
    // Delete 's' only if preceded by a valid s-ending.
    $before = substr($word, -2, 1);
    if (strpos('bcdfghjklmnoprtvy', $before) === false) {
      return $word;
    }
  }

  return substr($word, 0, -strlen($suffix));
}
59
/**
 * Step 2: consonant pair reduction.
 *
 * If the word ends in one of the pairs dd, gd, nn, dt, gt, kt or tt and the
 * whole pair lies inside R1, the last letter is deleted.
 *
 * @param $word
 *   The lowercased word.
 * @param $r1
 *   Byte offset at which R1 starts, or an empty value if the word has no R1.
 *
 * @return
 *   The word with its last letter removed, or the unchanged word.
 */
function swedishstemmer_step2($word, $r1) {
  // The pair occupies the last two bytes, so it is inside R1 only when R1
  // starts at or before the first letter of the pair.
  if ($r1 && strlen($word) - 2 >= $r1 && preg_match('/(?:dd|gd|nn|dt|gt|kt|tt)$/', $word)) {
    return substr($word, 0, -1);
  }

  return $word;
}
69
/**
 * Step 3: other suffixes.
 *
 * Searches R1 for the longest of lig, ig, els, löst and fullt. The first
 * three are deleted; löst is replaced by lös and fullt by full. At most one
 * suffix is handled.
 *
 * @param $word
 *   The lowercased word.
 * @param $r1
 *   Byte offset at which R1 starts, or an empty value if the word has no R1.
 *
 * @return
 *   The word with the suffix removed or shortened, or the unchanged word.
 */
function swedishstemmer_step3($word, $r1) {
  // No R1, or R1 is the null region at the end of the word: nothing to do.
  if (!$r1 || $r1 >= strlen($word)) {
    return $word;
  }

  // Only look inside R1. The pattern is anchored at the end, so the leftmost
  // match is the longest suffix (lig wins over ig when the l is inside R1).
  if (!preg_match('/(?:fullt|löst|lig|els|ig)$/', substr($word, $r1), $matches)) {
    return $word;
  }
  $suffix = $matches[0];

  // löst -> lös and fullt -> full: both simply lose their final t.
  if ($suffix === 'löst' || $suffix === 'fullt') {
    return substr($word, 0, -1);
  }

  return substr($word, 0, -strlen($suffix));
}

  ViewVC Help
Powered by ViewVC 1.1.2