/[drupal]/contributions/modules/dutchstemmer/dutchstemmer.module
ViewVC logotype

Contents of /contributions/modules/dutchstemmer/dutchstemmer.module

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.4 - (show annotations) (download) (as text)
Thu Feb 26 14:14:10 2009 UTC (8 months, 4 weeks ago) by clemenstolboom
Branch: MAIN
CVS Tags: HEAD
Branch point for: DRUPAL-6--1
Changes since 1.3: +6 -5 lines
File MIME type: text/x-php
#73881 : Bert Boerland : some small typos
1 <?php
2 // $Id: dutchstemmer.module,v 1.3 2009/02/26 13:53:10 clemenstolboom Exp $
3
/**
 * @file
 * Implements a Dutch stemming algorithm to improve Dutch searching.
 */
7
8 /*
9 Algorithm based on http://www.snowball.tartarus.org/algorithms/dutch/stemmer.html
10
11 Improvements:
12 - Convert s/f to z/v when removing double vowel in last syllable.
13 - Include more consonants in undoubling operation.
14 - Correctly remove apostrophe-s (e.g. "pagina's").
15 - Correctly strip accented suffixes (e.g. industriële)
16 */
17
/**
 * Implementation of hook_search_preprocess().
 *
 * Splits the text into words and the separators between them, stems every
 * word with dutchstemmer_stem() and glues the pieces back together so the
 * separators are left exactly as they were.
 *
 * @param $text
 *   The text to preprocess.
 *
 * @return
 *   The text with each word replaced by its stem. If the text cannot be
 *   split (preg_split() fails, e.g. on malformed UTF-8 because of the /u
 *   modifier), the text is returned unchanged.
 */
function dutchstemmer_search_preprocess(&$text) {
  // Split words from noise. Apostrophes are part of the word character class,
  // so they stay attached to the word that is handed to the stemmer.
  $parts = preg_split('/([^a-zA-Zäëïöüáéíóúè\']+)/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

  // preg_split() returns FALSE on failure; iterating and imploding FALSE
  // would raise errors, so fall back to the unmodified text.
  if ($parts === FALSE) {
    return $text;
  }

  // With PREG_SPLIT_DELIM_CAPTURE and a single capture group the result
  // alternates word, separator, word, ...: words sit at the even indexes.
  $total = count($parts);
  for ($i = 0; $i < $total; $i += 2) {
    $parts[$i] = dutchstemmer_stem($parts[$i]);
  }

  // Put it all back together.
  return implode('', $parts);
}
37
/**
 * Implementation of hook_help().
 *
 * Supplies the module description shown for the
 * 'admin/modules#description' section; nothing is returned for any other
 * section.
 */
function dutchstemmer_help($section, $arg) {
  // Loose comparison on purpose: it mirrors how a switch statement compares.
  if ($section == 'admin/modules#description') {
    $placeholders = array('@l' => 'http://en.wikipedia.org/wiki/Stemmer');
    return t('Implements a Dutch stemming algorithm to improve Dutch searching. See <a href="@l">Stemmer</a>', $placeholders);
  }
}
48
/**
 * Stem a Dutch word.
 *
 * The word is lowercased, early accented suffixes are rewritten (step 0),
 * the accents listed below are stripped, consonant-like y/i are marked in
 * upper case, the R1/R2 regions are located and steps 1-4 are applied.
 *
 * @param $word
 *   The word to stem.
 *
 * @return
 *   The stemmed word, with the temporary Y/I markers lowercased again.
 */
function dutchstemmer_stem($word) {
  global $_dutchstemmer_step2;

  // Reset the shared flag that dutchstemmer_step3() consults before it
  // removes a -bar suffix.
  $_dutchstemmer_step2 = FALSE;

  // Lowercase.
  $word = drupal_strtolower($word);

  // Step 0: early (accented) suffix removal. This helper only takes the word.
  $word = dutchstemmer_step0($word);

  // Remove accents.
  $word = str_replace(array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'),
                      array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'),
                      $word);

  // Put initial y, y after a vowel, and i between vowels into upper case
  // (treat as consonants).
  $word = preg_replace(array('/^y|(?<=[aeiouyè])y/u', '/(?<=[aeiouyè])i(?=[aeiouyè])/u'),
                       array('Y', 'I'),
                       $word);

  // Region offsets are byte offsets into $word. 0 stands for "no such
  // region": the step functions test the offsets for truthiness.
  $r1 = 0;
  $r2 = 0;
  $vowel_then_consonant = '/[aeiouyè][^aeiouyè]/u';

  /* R1 is the region after the first non-vowel following a vowel, or is the
     null region at the end of the word if there is no such non-vowel. */
  if (preg_match($vowel_then_consonant, $word, $matches, PREG_OFFSET_CAPTURE)) {
    // Advance by the byte length of the matched pair rather than a fixed 2:
    // PREG_OFFSET_CAPTURE reports bytes and 'è' occupies two of them.
    $r1 = $matches[0][1] + strlen($matches[0][0]);

    /* R2 is the region after the first non-vowel following a vowel in R1, or
       is the null region at the end of the word if there is no such
       non-vowel. */
    if (preg_match($vowel_then_consonant, $word, $matches, PREG_OFFSET_CAPTURE, $r1)) {
      $r2 = $matches[0][1] + strlen($matches[0][0]);
    }
  }

  // Steps 1-4: suffix removal.
  $word = dutchstemmer_step1($word, $r1, $r2);
  $word = dutchstemmer_step2($word, $r1, $r2);
  $word = dutchstemmer_step3($word, $r1, $r2);
  $word = dutchstemmer_step4($word, $r1, $r2);

  // Undo the temporary consonant markers.
  $word = str_replace(array('Y', 'I'), array('y', 'i'), $word);

  return $word;
}
95
/**
 * Remove one letter from a doubled consonant at the end of a word.
 *
 * @param $word
 *   The word to inspect.
 *
 * @return
 *   The word without its last character when it ends in one of the listed
 *   doubled consonants, otherwise the word unchanged.
 */
function dutchstemmer_undouble($word) {
  if (preg_match('/(bb|dd|gg|kk|ll|mm|nn|pp|rr|ss|tt|zz)$/u', $word)) {
    return substr($word, 0, -1);
  }
  return $word;
}
99
/**
 * Step 0: rewrite accented suffixes.
 *
 * A trailing -ieel, -iële or -ieën becomes -ie; after that a trailing -eën
 * becomes -e.
 *
 * @param $word
 *   The lowercased word, accents still in place.
 *
 * @return
 *   The word with the accented suffix rewritten, if it had one.
 */
function dutchstemmer_step0($word) {
  $word = preg_replace('/(ieel|iële|ieën)$/u', 'ie', $word);
  return preg_replace('/eën$/u', 'e', $word);
}
104
/**
 * Step 1: handle the suffixes -heden, -en(e) and -s(e).
 *
 * The suffixes are tried in that order and the first one found at or after
 * byte offset $r1 decides the result.
 *
 * @param $word
 *   The word being stemmed.
 * @param $r1
 *   Byte offset where region R1 starts; a falsy value disables this step.
 * @param $r2
 *   Byte offset where region R2 starts. Not used by this step.
 *
 * @return
 *   The word after suffix handling.
 */
function dutchstemmer_step1($word, $r1, $r2) {
  // Without an R1 region there is nothing to search in.
  if (!$r1) {
    return $word;
  }

  // -heden is rewritten to -heid.
  if (preg_match('/heden$/u', $word, $hit, 0, $r1)) {
    return preg_replace('/heden$/u', 'heid', $word);
  }

  // -en / -ene after a non-vowel: drop it and undouble the new ending.
  // NOTE(review): "m" is already a non-vowel, so the "gem" alternative in the
  // lookbehind never changes the outcome - confirm what was intended.
  if (preg_match('/(?<=[^aeiouyè]|gem)ene?$/u', $word, $hit, 0, $r1)) {
    $stripped = preg_replace('/ene?$/u', '', $word);
    return dutchstemmer_undouble($stripped);
  }

  // -s / -se after anything but j or a vowel: drop it, together with any
  // apostrophes that end up at the end of the word (e.g. "pagina's").
  if (preg_match('/(?<=[^jaeiouyè])se?$/u', $word, $hit, 0, $r1)) {
    $stripped = preg_replace('/se?$/u', '', $word);
    return rtrim($stripped, "'");
  }

  return $word;
}
124
/**
 * Step 2: delete a trailing "e".
 *
 * The "e" is removed when it lies at or after byte offset $r1 and is
 * preceded by a non-vowel; the resulting ending is then undoubled. When a
 * removal happens the global $_dutchstemmer_step2 flag is set to TRUE so that
 * dutchstemmer_step3() can see it when deciding whether to strip -bar.
 *
 * @param $word
 *   The word being stemmed.
 * @param $r1
 *   Byte offset where region R1 starts; a falsy value disables this step.
 * @param $r2
 *   Byte offset where region R2 starts. Not used by this step.
 *
 * @return
 *   The word, without the trailing "e" if it was removed.
 */
function dutchstemmer_step2($word, $r1, $r2) {
  // Without this declaration the assignment below only touches a local
  // variable and the flag read in step 3 can never become TRUE.
  global $_dutchstemmer_step2;

  if ($r1 && preg_match('/(?<=[^aeiouyè])e$/u', $word, $matches, 0, $r1)) {
    $_dutchstemmer_step2 = TRUE;
    return dutchstemmer_undouble(preg_replace('/e$/u', '', $word));
  }
  return $word;
}
138
/**
 * Step 3: remove -heid (3a) and the derivational suffixes (3b).
 *
 * Both parts only act on suffixes found at or after byte offset $r2.
 *
 * @param $word
 *   The word being stemmed.
 * @param $r1
 *   Byte offset where region R1 starts.
 * @param $r2
 *   Byte offset where region R2 starts; a falsy value disables this step.
 *
 * @return
 *   The word after suffix removal.
 */
function dutchstemmer_step3($word, $r1, $r2) {
  global $_dutchstemmer_step2;

  // Everything below needs an R2 region.
  if (!$r2) {
    return $word;
  }

  // Step 3a: delete heid if in R2 and not preceded by c. An "en" that is then
  // left at the end (searched from $r1) is removed too and the ending is
  // undoubled.
  if (preg_match('/(?<!c)heid$/u', $word, $hit, 0, $r2)) {
    $word = preg_replace('/heid$/u', '', $word);
    if (preg_match('/en$/u', $word, $hit, 0, $r1)) {
      $word = dutchstemmer_undouble(preg_replace('/en$/u', '', $word));
    }
  }

  // Step 3b: the first suffix found in R2, tried in this order, wins.

  // -baar
  if (preg_match('/baar$/u', $word, $hit, 0, $r2)) {
    return preg_replace('/baar$/u', '', $word);
  }

  // -lijk: remove it and run step 2 again on what is left.
  if (preg_match('/lijk$/u', $word, $hit, 0, $r2)) {
    $remainder = preg_replace('/lijk$/u', '', $word);
    return dutchstemmer_step2($remainder, $r1, $r2);
  }

  // -end / -ing, followed by an -ig that is not preceded by e.
  if (preg_match('/(end|ing)$/u', $word, $hit, 0, $r2)) {
    $word = preg_replace('/(end|ing)$/u', '', $word);
    if (preg_match('/(?<!e)ig$/u', $word, $hit, 0, $r2)) {
      $word = preg_replace('/ig$/u', '', $word);
    }
    return $word;
  }

  // -ig, unless preceded by e.
  if (preg_match('/(?<!e)ig$/u', $word, $hit, 0, $r2)) {
    return preg_replace('/ig$/u', '', $word);
  }

  // -bar, only when the global step 2 flag is set.
  if ($_dutchstemmer_step2 && preg_match('/bar$/u', $word, $hit, 0, $r2)) {
    return preg_replace('/bar$/u', '', $word);
  }

  return $word;
}
184
/**
 * Step 4: undouble a vowel in the last syllable.
 *
 * If the word ends CVD, where C is a non-vowel, D is a non-vowel other than
 * I, and V is double a, e, o or u, one of the vowels is removed from V. A
 * final s or f is turned into z or v at the same time
 * (for example, maan -> man, brood -> brod, baas -> baz).
 *
 * @param $word
 *   The word being stemmed.
 * @param $r1
 *   Byte offset where region R1 starts. Not used by this step.
 * @param $r2
 *   Byte offset where region R2 starts. Not used by this step.
 *
 * @return
 *   The word with the double vowel shortened, or unchanged if the pattern
 *   does not apply.
 */
function dutchstemmer_step4($word, $r1, $r2) {
  if (!preg_match('/[^aeiouyè](aa|ee|oo|uu)[^Iaeiouyè]$/u', $word)) {
    return $word;
  }

  // Everything up to and including the first of the two vowels.
  $head = drupal_substr($word, 0, -2);
  // The closing consonant, softened when it is s or f.
  $tail = str_replace(array('s', 'f'), array('z', 'v'), drupal_substr($word, -1));

  return $head . $tail;
}
195

  ViewVC Help
Powered by ViewVC 1.1.2