/[drupal]/contributions/modules/porterstemmer/porterstemmer.module
ViewVC logotype

Diff of /contributions/modules/porterstemmer/porterstemmer.module

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph | View Patch Patch

revision 1.2.2.1, Wed Aug 26 19:22:49 2009 UTC revision 1.2.2.2, Thu Sep 10 14:50:28 2009 UTC
# Line 1  Line 1 
1  <?php  <?php
2  // $Id: porterstemmer.module,v 1.2 2009/07/14 23:38:47 jhodgdon Exp $  // $Id: porterstemmer.module,v 1.2.2.1 2009/08/26 19:22:49 jhodgdon Exp $
3    
4  /**  /**
5   * @file   * @file
# Line 50  function porterstemmer_help($section = ' Line 50  function porterstemmer_help($section = '
50  }  }
51    
/**
 * Implementation of hook_sbp_excerpt_match().
 *
 * Looks for the first word in $text, at or after $offset, whose stem matches
 * the stem of $key, so that the whole word (not just the stem) can be
 * highlighted.
 *
 * @param $key
 *   The keyword to look for. It is stemmed before matching.
 * @param $text
 *   The text to search in.
 * @param $offset
 *   Offset in $text at which to start searching. It is handed to
 *   preg_match() as-is, so it is a byte offset.
 * @param $boundary
 *   Regular expression fragment (without delimiters) that matches a word
 *   boundary. It is used both in front of the key and to find the end of the
 *   matched word.
 *
 * @return
 *   FALSE if no word matching the key was found. Otherwise, an array with
 *   keys 'where' (byte offset of the match in $text) and 'keyword' (the text
 *   that was matched).
 */
function porterstemmer_sbp_excerpt_match($key, $text, $offset, $boundary) {
  // Stem the keyword down to its root form.
  $key = porterstemmer_stem($key);

  // In many cases, the root word is a substring of the full word, but not
  // all. The cases where it is not, the root ends in e, i, or y, and if this
  // last letter is removed, the root is a substring of the full word.
  // So remove these letters at the end of the root. The OR chain
  // short-circuits, so at most one letter is removed.
  $didit = FALSE;
  porterstemmer_suffix($key, 'i', '', $didit, NULL, 2) OR
    porterstemmer_suffix($key, 'e', '', $didit, NULL, 2) OR
    porterstemmer_suffix($key, 'y', '', $didit, NULL, 2);

  // The key is literal text, not a pattern, so quote it before building the
  // regular expression. NOTE(review): this assumes callers pass an unquoted
  // keyword -- confirm against the module invoking this hook.
  $start_pattern = '/' . $boundary . preg_quote($key, '/') . '/iu';
  $end_pattern = '/' . $boundary . '/iu';

  // Do not use drupal_strlen()/drupal_substr() below: PREG_OFFSET_CAPTURE
  // reports byte offsets, so the plain PHP byte-based functions are needed.
  $key_length = strlen($key);

  // Walk through each word that starts with the modified key, until one of
  // them turns out to stem to the same root as the key.
  $match = array();
  $newmatch = array();
  while (preg_match($start_pattern, $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
    $pos = $match[0][1];

    // Find the end of the word we actually matched, so that the whole word
    // can be checked and highlighted.
    if (!preg_match($end_pattern, $text, $newmatch, PREG_OFFSET_CAPTURE, $pos + $key_length)) {
      // No boundary after this candidate, so the word cannot be delimited.
      break;
    }
    $end = $newmatch[0][1];

    $keyfound = substr($text, $pos, $end - $pos);
    $foundstem = porterstemmer_stem($keyfound);
    porterstemmer_suffix($foundstem, 'i', '', $didit, NULL, 2) OR
      porterstemmer_suffix($foundstem, 'e', '', $didit, NULL, 2) OR
      porterstemmer_suffix($foundstem, 'y', '', $didit, NULL, 2);

    if ($foundstem == $key) {
      return array('where' => $pos, 'keyword' => $keyfound);
    }

    // This candidate only shared a prefix with the key (for example another
    // word that happens to start with the same letters). Continue after it,
    // but bail out if that would not move forward, to avoid looping forever.
    if ($end <= $offset) {
      break;
    }
    $offset = $end;
  }

  // If we get here, none of the potential matches worked out.
  return FALSE;
}
100    
101    /**
102   * Regular expression defining a vowel for Porter Stemmer purposes.   * Regular expression defining a vowel for Porter Stemmer purposes.
103   */   */
104  define('PORTERSTEMMER_VOWEL', '[aeiouy]');  define('PORTERSTEMMER_VOWEL', '[aeiouy]');
# Line 183  function porterstemmer_suffix(&$word, $o Line 232  function porterstemmer_suffix(&$word, $o
232      // ending isn't even there      // ending isn't even there
233      return FALSE;      return FALSE;
234    }    }
235    
236    // Does word match other regular expression?    // Does word match other regular expression?
237    if ($other && !preg_match($other, $word)) {    if ($other && !preg_match($other, $word)) {
238      // no match, so just return without replacing      // no match, so just return without replacing
# Line 231  function porterstemmer_short_word( $word Line 280  function porterstemmer_short_word( $word
280    }    }
281    
282    // Does it end in the other type of short syllable?    // Does it end in the other type of short syllable?
283    if (preg_match('/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL .    if (preg_match('/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL .
284        PORTERSTEMMER_NOT_VOWEL_WXY . '$/', $word)) {        PORTERSTEMMER_NOT_VOWEL_WXY . '$/', $word)) {
285      return TRUE;      return TRUE;
286    }    }
# Line 274  function porterstemmer_prestemming(&$wor Line 323  function porterstemmer_prestemming(&$wor
323    $count = 1;    $count = 1;
324    while ( $count ) {    while ( $count ) {
325      // Do this replacement one by one, to avoid unlikely yyyy issues      // Do this replacement one by one, to avoid unlikely yyyy issues
326      $tmp = preg_replace('/(' . PORTERSTEMMER_VOWEL . ')y/', '$1Y',      $tmp = preg_replace('/(' . PORTERSTEMMER_VOWEL . ')y/', '$1Y',
327        $tmp, 1, $count);        $tmp, 1, $count);
328    }    }
329    
# Line 288  function porterstemmer_prestemming(&$wor Line 337  function porterstemmer_prestemming(&$wor
337    $r1 = $max;    $r1 = $max;
338    $r2 = $max;    $r2 = $max;
339    $matches = array();    $matches = array();
340    $rdef = '/^' . PORTERSTEMMER_NOT_VOWEL . '*' .    $rdef = '/^' . PORTERSTEMMER_NOT_VOWEL . '*' .
341      PORTERSTEMMER_VOWEL . '+(' . PORTERSTEMMER_NOT_VOWEL . ')/';      PORTERSTEMMER_VOWEL . '+(' . PORTERSTEMMER_NOT_VOWEL . ')/';
342    
343    // Exceptions to R1: If word begins with 'gener', 'commun', or 'arsen',    // Exceptions to R1: If word begins with 'gener', 'commun', or 'arsen',
# Line 370  function porterstemmer_step1a(&$word) { Line 419  function porterstemmer_step1a(&$word) {
419      porterstemmer_suffix($tmp, 'us', 'us', $didit) OR      porterstemmer_suffix($tmp, 'us', 'us', $didit) OR
420      // only delete s at end of word if there is at least one vowel that      // only delete s at end of word if there is at least one vowel that
421      // is not immediately before the s      // is not immediately before the s
422      porterstemmer_suffix($tmp, 's', '', $didit,      porterstemmer_suffix($tmp, 's', '', $didit,
423        '/' . PORTERSTEMMER_VOWEL . '.+s$/');        '/' . PORTERSTEMMER_VOWEL . '.+s$/');
424    }    }
425    
# Line 414  function porterstemmer_step1b(&$word, $r Line 463  function porterstemmer_step1b(&$word, $r
463      $done = porterstemmer_suffix($tmp, 'at', 'ate', $didit) OR      $done = porterstemmer_suffix($tmp, 'at', 'ate', $didit) OR
464        porterstemmer_suffix($tmp, 'bl', 'ble', $didit) OR        porterstemmer_suffix($tmp, 'bl', 'ble', $didit) OR
465        porterstemmer_suffix($tmp, 'iz', 'ize', $didit);        porterstemmer_suffix($tmp, 'iz', 'ize', $didit);
466      if (!$done &&      if (!$done &&
467        preg_match('/' . PORTERSTEMMER_DOUBLE . '$/', $tmp)) {        preg_match('/' . PORTERSTEMMER_DOUBLE . '$/', $tmp)) {
468        // drop last letter if it's a double-letter ending        // drop last letter if it's a double-letter ending
469        $tmp = drupal_substr($tmp, 0, -1);        $tmp = drupal_substr($tmp, 0, -1);
# Line 488  function porterstemmer_step2(&$word, $r1 Line 537  function porterstemmer_step2(&$word, $r1
537      porterstemmer_suffix($tmp, 'izer', 'ize', $didit, NULL, $r1 + 4) OR      porterstemmer_suffix($tmp, 'izer', 'ize', $didit, NULL, $r1 + 4) OR
538      porterstemmer_suffix($tmp, 'bli', 'ble', $didit, NULL, $r1 + 3) OR      porterstemmer_suffix($tmp, 'bli', 'ble', $didit, NULL, $r1 + 3) OR
539      // ogi is only replaced if preceded by l      // ogi is only replaced if preceded by l
540      porterstemmer_suffix($tmp, 'ogi', 'og', $didit,      porterstemmer_suffix($tmp, 'ogi', 'og', $didit,
541        '/logi$/', $r1 + 3) OR        '/logi$/', $r1 + 3) OR
542      // li is only replaced if preceded by a valid li-ending      // li is only replaced if preceded by a valid li-ending
543      porterstemmer_suffix($tmp, 'li', '', $didit,      porterstemmer_suffix($tmp, 'li', '', $didit,
544        '/' . PORTERSTEMMER_LI_END . 'li$/', $r1 + 2);        '/' . PORTERSTEMMER_LI_END . 'li$/', $r1 + 2);
545    
546    return porterstemmer_step_ending($word, $tmp);    return porterstemmer_step_ending($word, $tmp);
# Line 586  function porterstemmer_step5(&$word, $r1 Line 635  function porterstemmer_step5(&$word, $r1
635    // a short syllable    // a short syllable
636    $len = drupal_strlen( $tmp );    $len = drupal_strlen( $tmp );
637    if ( !$done && preg_match( '/e$/', $tmp ) &&    if ( !$done && preg_match( '/e$/', $tmp ) &&
638      ( $len > $r2 ||      ( $len > $r2 ||
639        ( $len > $r1 &&        ( $len > $r1 &&
640          !preg_match( '/^' . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL .          !preg_match( '/^' . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL .
641            'e$/', $tmp ) &&            'e$/', $tmp ) &&
642          !preg_match( '/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL .          !preg_match( '/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL .
643            PORTERSTEMMER_NOT_VOWEL_WXY . 'e$/', $tmp )))) {            PORTERSTEMMER_NOT_VOWEL_WXY . 'e$/', $tmp )))) {
644      $tmp = drupal_substr( $tmp, 0, -1 );      $tmp = drupal_substr( $tmp, 0, -1 );
645    }    }

Legend:
Removed from v.1.2.2.1  
changed lines
  Added in v.1.2.2.2

  ViewVC Help
Powered by ViewVC 1.1.2