| 1 |
<?php
|
| 2 |
// $Id: dutchstemmer.module,v 1.3 2009/02/26 13:53:10 clemenstolboom Exp $
|
| 3 |
|
| 4 |
/**
 * @file
 * Dutch stemmer: reduces Dutch words to their stems to improve Dutch searching.
 */
|
| 7 |
|
| 8 |
/*
Algorithm based on http://www.snowball.tartarus.org/algorithms/dutch/stemmer.html

Improvements:
- Convert s/f to z/v when removing double vowel in last syllable.
- Include more consonants in undoubling operation.
- Correctly remove apostrophe-s (e.g. "pagina's").
- Correctly strip accented suffixes (e.g. "industriële").
*/
|
| 17 |
|
| 18 |
/**
 * Implementation of hook_search_preprocess().
 *
 * Splits the text into alternating word and non-word runs, replaces every
 * word with its Dutch stem and joins the pieces back together, so the
 * separators between words are preserved verbatim.
 *
 * @param $text
 *   The text to preprocess. Taken by reference, but not modified here.
 *
 * @return
 *   The text with each word replaced by its stem, or the text unchanged when
 *   it cannot be split (for example because it is not valid UTF-8).
 */
function dutchstemmer_search_preprocess(&$text) {
  // A word is a run of (accented) letters and apostrophes. Everything else is
  // captured as a delimiter so it can be restored afterwards.
  $parts = preg_split('/([^a-zA-Zäëïöüáéíóúè\']+)/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

  // preg_split() returns FALSE on failure, e.g. malformed UTF-8 with the /u
  // modifier. Leave such text alone instead of feeding FALSE to implode().
  if ($parts === FALSE) {
    return $text;
  }

  // With PREG_SPLIT_DELIM_CAPTURE the even indexes hold the words and the odd
  // indexes hold the captured delimiters.
  foreach ($parts as $index => $part) {
    if ($index % 2 == 0) {
      $parts[$index] = dutchstemmer_stem($part);
    }
  }

  // Put it all back together.
  return implode('', $parts);
}
|
| 37 |
|
| 38 |
/**
 * Implementation of hook_help().
 *
 * Returns the translated module description for the
 * 'admin/modules#description' section and nothing for any other section.
 */
function dutchstemmer_help($section, $arg) {
  // Loose comparison on purpose: it mirrors the semantics of a switch/case.
  if ($section == 'admin/modules#description') {
    $message = 'Implements a Dutch stemming algorithm to improve Dutch searching. See <a href="@l">Stemmer</a>';
    $replacements = array('@l' => 'http://en.wikipedia.org/wiki/Stemmer');
    return t($message, $replacements);
  }
}
|
| 48 |
|
| 49 |
/**
 * Stem a Dutch word.
 *
 * Lowercases the word, strips accented suffixes and accents, marks the y/i
 * letters that act as consonants, determines the R1 and R2 regions and then
 * runs suffix-removal steps 1-4.
 *
 * @param $word
 *   The word to stem.
 *
 * @return
 *   The stemmed word, in lower case.
 */
function dutchstemmer_stem($word) {
  global $_dutchstemmer_step2;

  // Reset the flag that step 3 consults for its -bar rule.
  $_dutchstemmer_step2 = FALSE;

  // Lowercase
  $word = drupal_strtolower($word);

  // Step 0: early (accented) suffix removal. This has to run before the
  // accents are stripped below.
  $word = dutchstemmer_step0($word);

  // Remove accents
  $word = str_replace(array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'),
                      array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'),
                      $word);

  // Put initial y, y after a vowel, and i between vowels into upper case (treat as consonants).
  $word = preg_replace(array('/^y|(?<=[aeiouyè])y/u', '/(?<=[aeiouyè])i(?=[aeiouyè])/u'),
                       array('Y', 'I'),
                       $word);

  // A vowel immediately followed by a non-vowel marks the start of a region.
  $vowel_then_consonant = '/[aeiouyè][^aeiouyè]/u';

  // Regions are byte offsets into $word, as expected by the offset argument
  // of preg_match(). 0 means "no region"; the step functions skip their work
  // for a region of 0.
  $r1 = 0;
  $r2 = 0;

  /* R1 is the region after the first non-vowel following a vowel, or is the
     null region at the end of the word if there is no such non-vowel. */
  if (preg_match($vowel_then_consonant, $word, $matches, PREG_OFFSET_CAPTURE)) {
    // Advance by the byte length of the match rather than a fixed 2: the
    // vowel 'è' is not stripped above and occupies two bytes in UTF-8.
    $r1 = $matches[0][1] + strlen($matches[0][0]);

    /* R2 is the region after the first non-vowel following a vowel in R1, or is
       the null region at the end of the word if there is no such non-vowel. */
    if (preg_match($vowel_then_consonant, $word, $matches, PREG_OFFSET_CAPTURE, $r1)) {
      $r2 = $matches[0][1] + strlen($matches[0][0]);
    }
  }

  // Steps 1-4: suffix removal
  $word = dutchstemmer_step1($word, $r1, $r2);
  $word = dutchstemmer_step2($word, $r1, $r2);
  $word = dutchstemmer_step3($word, $r1, $r2);
  $word = dutchstemmer_step4($word, $r1, $r2);

  // Turn the consonant markers back into ordinary letters.
  $word = str_replace(array('Y', 'I'), array('y', 'i'), $word);

  return $word;
}
|
| 95 |
|
| 96 |
/**
 * Undouble the ending of a word.
 *
 * @param $word
 *   The word to inspect.
 *
 * @return
 *   The word without its last letter when it ends in one of the listed
 *   doubled consonants; otherwise the word unchanged.
 */
function dutchstemmer_undouble($word) {
  if (preg_match('/(bb|dd|gg|kk|ll|mm|nn|pp|rr|ss|tt|zz)$/u', $word)) {
    // All listed consonants are single-byte, so a byte-wise cut is safe.
    return substr($word, 0, -1);
  }
  return $word;
}
|
| 99 |
|
| 100 |
/**
 * Step 0: rewrite accented suffixes.
 *
 * A trailing -ieel, -iële or -ieën becomes -ie; after that, a trailing -eën
 * becomes -e.
 *
 * @param $word
 *   The lowercased word, with its accents still in place.
 *
 * @return
 *   The word with the accented suffix rewritten.
 */
function dutchstemmer_step0($word) {
  $word = preg_replace('/(ieel|iële|ieën)$/u', 'ie', $word);
  return preg_replace('/eën$/u', 'e', $word);
}
|
| 104 |
|
| 105 |
/**
 * Step 1: handle the suffixes -heden, -en(e) and -s(e).
 *
 * The first rule whose suffix lies in R1 is applied and the result is
 * returned immediately.
 *
 * @param $word
 *   The word to process.
 * @param $r1
 *   Byte offset at which region R1 starts; a falsy value skips this step.
 * @param $r2
 *   Byte offset at which region R2 starts. Not used by this step.
 *
 * @return
 *   The word after step 1.
 */
function dutchstemmer_step1($word, $r1, $r2) {
  if (!$r1) {
    return $word;
  }

  // Passing $r1 as the offset means the suffix has to start inside R1; the
  // lookbehinds can still inspect the letters in front of it.

  // -heden: replace with -heid.
  if (preg_match('/heden$/u', $word, $matches, 0, $r1)) {
    return preg_replace('/heden$/u', 'heid', $word);
  }

  // -en(e): delete when preceded by a valid en-ending, i.e. a non-vowel that
  // is not part of "gem", then undouble the ending.
  if (preg_match('/(?<=[^aeiouyè])(?<!gem)ene?$/u', $word, $matches, 0, $r1)) {
    return dutchstemmer_undouble(preg_replace('/ene?$/u', '', $word));
  }

  // -s(e): delete when preceded by a valid s-ending (a non-vowel other than
  // j). A remaining trailing apostrophe is dropped too (e.g. "pagina's").
  if (preg_match('/(?<=[^jaeiouyè])se?$/u', $word, $matches, 0, $r1)) {
    return rtrim(preg_replace('/se?$/u', '', $word), "'");
  }

  return $word;
}
|
| 124 |
|
| 125 |
/**
 * Step 2: delete a final -e.
 *
 * Deletes the suffix e if it is in R1 and preceded by a non-vowel, and then
 * undoubles the ending. When an e is removed, the global flag
 * $_dutchstemmer_step2 is set to TRUE so that step 3 can apply its -bar rule.
 *
 * @param $word
 *   The word to process.
 * @param $r1
 *   Byte offset at which region R1 starts; a falsy value skips this step.
 * @param $r2
 *   Byte offset at which region R2 starts. Not used by this step.
 *
 * @return
 *   The word after step 2.
 */
function dutchstemmer_step2($word, $r1, $r2) {
  // Without this declaration the flag below would be a local variable and
  // step 3 would never see it.
  global $_dutchstemmer_step2;

  if ($r1 && preg_match('/(?<=[^aeiouyè])e$/u', $word, $matches, 0, $r1)) {
    $_dutchstemmer_step2 = TRUE;
    return dutchstemmer_undouble(preg_replace('/e$/u', '', $word));
  }

  return $word;
}
|
| 138 |
|
| 139 |
/**
 * Step 3: handle -heid (3a) and the d-suffixes (3b).
 *
 * @param $word
 *   The word to process.
 * @param $r1
 *   Byte offset at which region R1 starts.
 * @param $r2
 *   Byte offset at which region R2 starts; a falsy value skips this step.
 *
 * @return
 *   The word after step 3.
 */
function dutchstemmer_step3($word, $r1, $r2) {
  global $_dutchstemmer_step2;

  // Every rule in this step requires its suffix to be in R2.
  if (!$r2) {
    return $word;
  }

  // Step 3a: heid
  // Delete heid if in R2 and not preceded by c, and treat a preceding en as
  // in step 1: delete it if in R1 and preceded by a valid en-ending (a
  // non-vowel that is not part of "gem"), then undouble the ending. Without
  // the en-ending check "gemeenheid" would lose the "en" that "gemeen" keeps.
  if (preg_match('/(?<!c)heid$/u', $word, $matches, 0, $r2)) {
    $word = preg_replace('/heid$/u', '', $word);
    if (preg_match('/(?<=[^aeiouyè])(?<!gem)en$/u', $word, $matches, 0, $r1)) {
      $word = dutchstemmer_undouble(preg_replace('/en$/u', '', $word));
    }
  }

  // Step 3b: d-suffixes
  // Apply the first matching rule of the chain below.
  // -baar: delete.
  if (preg_match('/baar$/u', $word, $matches, 0, $r2)) {
    $word = preg_replace('/baar$/u', '', $word);
  }
  // -lijk: delete, then repeat step 2.
  elseif (preg_match('/lijk$/u', $word, $matches, 0, $r2)) {
    $word = dutchstemmer_step2(preg_replace('/lijk$/u', '', $word), $r1, $r2);
  }
  // -end / -ing: delete; then delete a preceding ig if it is in R2 and not
  // preceded by e, otherwise undouble the ending (so that e.g. "bestemming"
  // and "bestemmen" end up with the same stem).
  elseif (preg_match('/(end|ing)$/u', $word, $matches, 0, $r2)) {
    $word = preg_replace('/(end|ing)$/u', '', $word);
    if (preg_match('/(?<!e)ig$/u', $word, $matches, 0, $r2)) {
      $word = preg_replace('/ig$/u', '', $word);
    }
    else {
      $word = dutchstemmer_undouble($word);
    }
  }
  // -ig: delete if not preceded by e.
  elseif (preg_match('/(?<!e)ig$/u', $word, $matches, 0, $r2)) {
    $word = preg_replace('/ig$/u', '', $word);
  }
  // -bar: delete only when the global flag says step 2 removed an e.
  elseif ($_dutchstemmer_step2 && preg_match('/bar$/u', $word, $matches, 0, $r2)) {
    $word = preg_replace('/bar$/u', '', $word);
  }

  return $word;
}
|
| 184 |
|
| 185 |
/**
 * Step 4: undouble vowel.
 *
 * If the word ends CVD, where C is a non-vowel, D is a non-vowel other than
 * I, and V is double a, e, o or u, one of the vowels is removed from V (for
 * example, maan -> man, brood -> brod). A final s or f is additionally
 * turned into z or v.
 *
 * @param $word
 *   The word to process.
 * @param $r1
 *   Byte offset at which region R1 starts. Not used by this step.
 * @param $r2
 *   Byte offset at which region R2 starts. Not used by this step.
 *
 * @return
 *   The word after step 4.
 */
function dutchstemmer_step4($word, $r1, $r2) {
  if (!preg_match('/[^aeiouyè](aa|ee|oo|uu)[^Iaeiouyè]$/u', $word)) {
    return $word;
  }

  // Everything up to and including the first of the two vowels.
  $head = drupal_substr($word, 0, -2);
  // The closing consonant, with s/f converted to z/v.
  $tail = str_replace(array('s', 'f'), array('z', 'v'), drupal_substr($word, -1));

  return $head . $tail;
}
|
| 195 |
|