/[drupal]/contributions/modules/porterstemmer/porterstemmer.test
ViewVC logotype

Contents of /contributions/modules/porterstemmer/porterstemmer.test

Parent Directory | Revision Log | View Revision Graph


Revision 1.1 - (show annotations) (download) (as text)
Tue Jul 14 23:38:47 2009 UTC (4 months, 2 weeks ago) by jhodgdon
Branch: MAIN
CVS Tags: DRUPAL-6--2-0, HEAD
Branch point for: DRUPAL-6--2
File MIME type: text/x-php
#511930 #437094 #219335 by jhodgdon: Upgrade to the Porter 2 Stemmer algorithm; minimum word size 3 characters. Also updated install instructions and Readme, updated for coding standards, and added SimpleTest tests.
1 <?php
2 // $Id$
3
4 /**
5 * @file
6 * Tests for the Porter Stemmer module.
7 * By Jennifer Hodgdon of Poplar ProductivityWare, www.poplarware.com
8 * Based on sample words from
9 * http://snowball.tartarus.org/algorithms/english/stemmer.html
10 * which are stored in a separate file (testwords.txt).
11 */
12
13 /**
14 * Unit tests for Porter Stemmer - Stemming output part 1.
15 */
class PorterStemmerOutput1UnitTest extends DrupalWebTestCase {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    return array(
      'name' => t('Stemming output 1'),
      'description' => t('Test that the stemming function returns the correct stemmed words'),
      'group' => t('Porter Stemmer'),
    );
  }

  /**
   * Tests the first 2000 words in the file for stemming accuracy.
   */
  function testStemmingUnitTest() {
    $this->_run_porterstemmer_stem_test(0, 2000);
  }

  /**
   * Runs a unit test for a portion of words in the test file.
   *
   * The output of the porterstemmer_stem() function is tested against
   * known input/output pairs from
   * http://snowball.tartarus.org/algorithms/english/stemmer.html
   *
   * Each usable line of testwords.txt holds an input word followed by its
   * expected stem, separated by whitespace. Lines with fewer than two words
   * are ignored.
   *
   * Note that words whose input or stemmed version is less than 3
   * characters are always skipped, and do not count towards either
   * $skipto or $runto.
   *
   * @param $skipto
   *   Number of eligible word pairs to pass over before testing starts;
   *   equivalently, the 0-based index of the first eligible pair to test.
   * @param $runto
   *   Maximum number of eligible word pairs to test.
   */
  function _run_porterstemmer_stem_test($skipto = 0, $runto = 2000) {
    // Open word file.
    $file = drupal_get_path('module', 'porterstemmer') . '/testwords.txt';

    // Warnings are suppressed on purpose: a failed open is reported through
    // the assertion below instead.
    $handle = @fopen($file, "r");
    $this->assertTrue($handle !== FALSE, "Open file containing words to test", 'Startup');
    if ($handle === FALSE) {
      // No point doing rest of test...
      return;
    }

    $ran = 0;
    $skipped = 0;

    while (!feof($handle) && $ran < $runto) {
      // Read a line of the file, and split into words.
      $line = fgets($handle, 4096);
      if ($line === FALSE) {
        // Nothing more can be read (end of file or read error).
        break;
      }
      $words = preg_split("/\s+/", $line, -1, PREG_SPLIT_NO_EMPTY);
      if (count($words) < 2) {
        continue;
      }

      // Make sure the words are long enough.
      $in = $words[0];
      $right = $words[1];
      if (drupal_strlen($in) < 3 || drupal_strlen($right) < 3) {
        // Unknown consequences if words are too short, don't test.
        continue;
      }

      // Pass over exactly $skipto eligible pairs. The counter is compared
      // before it is incremented, so that the first tested pair is the one
      // at 0-based index $skipto and consecutive 2000-pair slices neither
      // overlap nor start one pair early.
      if ($skipped < $skipto) {
        $skipped++;
        continue;
      }

      // Stem the word.
      $stem = porterstemmer_stem($in);

      // Test correctness.
      $this->assertEqual($right, $stem, "Stemming $in results in $right (was $stem)", t('Stemming test'));
      $ran++;
    }
    fclose($handle);
  }
}
97
98 /**
99 * Unit tests for Porter Stemmer - Stemming output part 2.
100 */
class PorterStemmerOutput2UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 2');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 2000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 2000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
118
119 /**
120 * Unit tests for Porter Stemmer - Stemming output part 3.
121 */
class PorterStemmerOutput3UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 3');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 4000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 4000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
139
140 /**
141 * Unit tests for Porter Stemmer - Stemming output part 4.
142 */
class PorterStemmerOutput4UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 4');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 6000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 6000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
160
161 /**
162 * Unit tests for Porter Stemmer - Stemming output part 5.
163 */
class PorterStemmerOutput5UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 5');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 8000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 8000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
181
182 /**
183 * Unit tests for Porter Stemmer - Stemming output part 6.
184 */
class PorterStemmerOutput6UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 6');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 10000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 10000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
202
203 /**
204 * Unit tests for Porter Stemmer - Stemming output part 7.
205 */
class PorterStemmerOutput7UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 7');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 12000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 12000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
223
224 /**
225 * Unit tests for Porter Stemmer - Stemming output part 8.
226 */
class PorterStemmerOutput8UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 8');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 14000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 14000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
244
245 /**
246 * Unit tests for Porter Stemmer - Stemming output part 9.
247 */
class PorterStemmerOutput9UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 9');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 16000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 16000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
265
266 /**
267 * Unit tests for Porter Stemmer - Stemming output part 10.
268 */
class PorterStemmerOutput10UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 10');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 18000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 18000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
286
287 /**
288 * Unit tests for Porter Stemmer - Stemming output part 11.
289 */
class PorterStemmerOutput11UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 11');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 20000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 20000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
307
308 /**
309 * Unit tests for Porter Stemmer - Stemming output part 12.
310 */
class PorterStemmerOutput12UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 12');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 22000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 22000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
328
329 /**
330 * Unit tests for Porter Stemmer - Stemming output part 13.
331 */
class PorterStemmerOutput13UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 13');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 24000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 24000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
349
350 /**
351 * Unit tests for Porter Stemmer - Stemming output part 14.
352 */
class PorterStemmerOutput14UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 14');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 26000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 26000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
370
371 /**
372 * Unit tests for Porter Stemmer - Stemming output part 15.
373 */
class PorterStemmerOutput15UnitTest extends PorterStemmerOutput1UnitTest {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    $info = array();
    $info['name'] = t('Stemming output 15');
    $info['description'] = t('Test that the stemming function returns the correct stemmed words');
    $info['group'] = t('Porter Stemmer');
    return $info;
  }

  /**
   * Checks stemming accuracy for a further 2000-word slice of the word file.
   *
   * Calls the inherited _run_porterstemmer_stem_test() helper with a start
   * offset of 28000 and a count of 2000.
   */
  public function testStemmingUnitTest() {
    $start = 28000;
    $count = 2000;
    $this->_run_porterstemmer_stem_test($start, $count);
  }
}
391
392 /**
393 * Unit tests for Porter Stemmer - Stemming internals.
394 */
class PorterStemmerInternalsUnitTest extends DrupalWebTestCase {

  /**
   * Returns the name, description and group labels for this test case.
   */
  public static function getInfo() {
    return array(
      'name' => t('Stemming internals'),
      'description' => t('Test that various algorithm steps and internal functions are working correctly'),
      'group' => t('Porter Stemmer'),
    );
  }

  /**
   * Verify that short words are not stemmed, and longer ones are.
   */
  function testStemLengthUnitTest() {
    // Input word => expected stem. The entries of 3 letters or fewer are
    // expected to come back unchanged; the longer ones should be stemmed.
    $words = array(
      'a' => 'a',
      'at' => 'at',
      'say' => 'say',
      'fished' => 'fish',
      'saying' => 'say',
    );

    foreach ($words as $in => $out) {
      $stem = porterstemmer_stem($in);
      $this->assertEqual($out, $stem, "Stemming length test for $in gives $out (was $stem)", t('Stemming length'));
    }
  }

  /**
   * Tests the function that determines if a word is "short".
   *
   * Uses examples from the algorithm web page, as well as several variations
   * on the word "administ...".
   */
  function testShortWord() {
    // Each case is array(word, second argument, expected result).
    // NOTE(review): the second argument is presumably the start position of
    // the R1 region for the word -- confirm against porterstemmer_short_word().
    $cases = array(
      array('administered', 2, FALSE),
      array('administer', 2, FALSE),
      array('admin', 2, FALSE),
      array('bed', 3, TRUE),
      array('shed', 4, TRUE),
      array('shred', 5, TRUE),
      array('bead', 4, FALSE),
      array('beds', 3, FALSE),
      array('bake', 3, FALSE),
      array('bak', 3, TRUE),
    );

    foreach ($cases as $case) {
      list($word, $r1, $is_short) = $case;
      $result = porterstemmer_short_word($word, $r1);

      // The 'Stemmer steps' group belongs to the assertion, not to the
      // porterstemmer_short_word() call, so that these results are grouped
      // with the other step tests in this class.
      if ($is_short) {
        $this->assertTrue($result, "$word should be a short word", 'Stemmer steps');
      }
      else {
        $this->assertFalse($result, "$word should not be a short word", 'Stemmer steps');
      }
    }
  }

  /**
   * Test internal steps on the word "administered".
   */
  function testAdministered() {
    $r1 = 0;
    $r2 = 0;
    $word = 'administered';

    // The assertions below expect this call to have updated $r1 and $r2
    // (presumably they are taken by reference).
    porterstemmer_prestemming($word, $r1, $r2);

    // Test calculation of R1 and R2.
    $this->assertEqual($r1, 2, "R1 for administered should be 2, was $r1", 'Stemmer steps');
    $this->assertEqual($r2, 5, "R2 for administered should be 5, was $r2", 'Stemmer steps');

    // Test step 1b of the algorithm; $word is expected to be changed by
    // the call.
    porterstemmer_step1b($word, $r1);
    $this->assertEqual($word, 'administer', "Step1b should be administer, was $word", 'Stemmer steps');

    // Test step 4 of the algorithm.
    porterstemmer_step4($word, $r2);
    $this->assertEqual($word, 'administ', "Step4 should be administ, was $word", 'Stemmer steps');
  }

  /**
   * Test internal steps on the word "baked".
   */
  function testBaked() {
    $r1 = 0;
    $r2 = 0;
    $word = 'baked';

    porterstemmer_prestemming($word, $r1, $r2);

    // Test calculation of R1 and R2.
    $this->assertEqual($r1, 3, "R1 for baked should be 3, was $r1", 'Stemmer steps');
    $this->assertEqual($r2, 5, "R2 for baked should be 5, was $r2", 'Stemmer steps');

    // Test step 1b of the algorithm.
    porterstemmer_step1b($word, $r1);
    $this->assertEqual($word, 'bake', "Step1b should be bake, was $word", 'Stemmer steps');

    // Test step 5 of the algorithm; the word is expected to stay "bake".
    porterstemmer_step5($word, $r1, $r2);
    $this->assertEqual($word, 'bake', "Step5 should be bake, was $word", 'Stemmer steps');
  }

  /**
   * Test internal steps on the word "geology".
   */
  function testGeology() {
    $r1 = 0;
    $r2 = 0;
    $word = 'geology';

    porterstemmer_prestemming($word, $r1, $r2);

    // Test calculation of R1 and R2.
    $this->assertEqual($r1, 4, "R1 for geology should be 4, was $r1", 'Stemmer steps');
    $this->assertEqual($r2, 6, "R2 for geology should be 6, was $r2", 'Stemmer steps');

    // Test step 1c of the algorithm.
    porterstemmer_step1c($word);
    $this->assertEqual($word, 'geologi', "Step1c should be geologi, was $word", 'Stemmer steps');

    // Test step 2 of the algorithm.
    porterstemmer_step2($word, $r1);
    $this->assertEqual($word, 'geolog', "Step2 should be geolog, was $word", 'Stemmer steps');
  }

}

  ViewVC Help
Powered by ViewVC 1.1.2