/[drupal]/contributions/modules/csplitter/csplitter.module
ViewVC logotype

Diff of /contributions/modules/csplitter/csplitter.module

Parent Directory | Revision Log | View Revision Graph | View Patch

revision 1.6, Mon Mar 26 14:56:52 2007 UTC | revision 1.6.2.1, Sat Apr 19 12:48:02 2008 UTC
# Line 1  Line 1 
1  <?php  <?php
2  // $Id: csplitter.module,v 1.3 2006/04/29 05:18:43 zealy Exp $  // $Id: csplitter.module,v 1.5 2007/03/26 14:34:52 zealy Exp $
3  // mailto: i.zealy AT gmail dot com  // mailto: i.zealy AT gmail dot com
4    
5  /**  /**
# Line 19  function csplitter_help($section) { Line 19  function csplitter_help($section) {
19  /**  /**
20   * Implementation of hook_menu   * Implementation of hook_menu
21   */   */
22  function csplitter_menu($may_cache) {  function csplitter_menu() {
23    $items = array();    $items = array();
24    
25    if ($may_cache) {      $items['csplitter'] = array(
   
     $items[] = array('path' => 'csplitter',  
26        'title' => t('chinese splitter'),        'title' => t('chinese splitter'),
27        'callback' => 'csplitter_view',        'page callback' => 'csplitter_view',
28        'access' => user_access('search content'),        'access arguments' => array('search content'),
29        'type' => MENU_CALLBACK);        'type' => MENU_CALLBACK);
30    
31      $items[] = array('path' => 'admin/settings/csplitter',      $items['admin/settings/csplitter'] = array(
32        'title' => t('Chinese splitter'),        'title' => t('Chinese splitter'),
33        'description' => t('Configure relevance settings for splitter options'),        'description' => t('Configure relevance settings for splitter options'),
34        'callback' => 'drupal_get_form',        'page callback' => 'drupal_get_form',
35        'callback arguments' => array('csplitter_admin_settings'),        'page arguments' => array('csplitter_admin_settings'),
36        'access' => user_access('administer search'),        'access arguments' => array('administer search'),
37        'type' => MENU_NORMAL_ITEM);        'type' => MENU_NORMAL_ITEM);
38    }  
39    return $items;    return $items;
40  }  }
41    
# Line 49  function csplitter_admin_settings() { Line 47  function csplitter_admin_settings() {
47    
48    $form['weight'] = array(    $form['weight'] = array(
49      '#type' => 'markup',      '#type' => 'markup',
50      '#value' => t('Optional. You can test your configuration '.l('here','csplitter').'.'),      '#value' => 'Optional. You can test your configuration '.l('here','csplitter').'.',
51    );    );
52    
53    $form['Config Algorithm'] = array('#type' => 'fieldset',    $form['Config Algorithm'] = array('#type' => 'fieldset',
# Line 64  function csplitter_admin_settings() { Line 62  function csplitter_admin_settings() {
62      return system_settings_form($form);      return system_settings_form($form);
63    }    }
64    
65    $algors = array(1 => t('Forward maximum matching'), 2 => t('Conversely maximum mathing'));    $algors = array(1 => t('Forward maximum matching'), 2 => t('Conversely maximum matching'), 3 => t('Forward minimum matching'), 4 => t('Conversely minimum matching'));
66    $form['Config Algorithm']['csplitter_algorithm'] = array('#type' => 'radios',    $form['Config Algorithm']['csplitter_algorithm'] = array('#type' => 'radios',
67      '#title' => t('Default Algorithm'),      '#title' => t('Default Algorithm'),
68      '#default_value' => variable_get('csplitter_algorithm', '1'),      '#default_value' => variable_get('csplitter_algorithm', '3'),
69      '#options' => $algors,      '#options' => $algors,
70      '#description' => t('The default algorithm for word splitting.'),      '#description' => t('The default algorithm for word splitting. Minimum algorithm is much faster, but it will less precise.'),
71      );
72    
73      $form['Search Word Length'] = array('#type' => 'fieldset',
74        '#title' => t('Search Word Length'),
75        '#collapsible' => TRUE,
76        '#description' => t('Search word length will visibly effect performance, 4 is default.'),
77      );
78      $form['Search Word Length']['csplitter_word_len'] = array('#type' => 'radios',
79        '#title' => t('Select Word Length'),
80        '#default_value' => variable_get('csplitter_word_len', 2),
81        '#options' => array(2,3,4,5,6,7),
82        '#description' => t("Longer word will take more time to process. If you don't know its meaning, keep it to default"),
83    );    );
84    
85    $form['Define Cache'] = array('#type' => 'fieldset',    $form['Define Cache'] = array('#type' => 'fieldset',
86      '#title' => t('Define Cache'),      '#title' => t('Define Cache'),
87      '#collapsible' => TRUE,      '#collapsible' => TRUE,
88      '#description' => t('Config using cache for dictionary or not. You can test performance <a href="/csplitter">here</a>.'),      '#description' => t('Config using cache for dictionary or not. You can test performance <a href="/csplitter">here</a>.'),
89    );    );
90    $cache_method = array('none' => t('No caching'), 'disk' => t('Caching to disk'), 'database' => t('Caching to database'));    $cache_method = array(
91        'none' => t('No caching'),
92        'disk' => t('Caching to disk'),
93        'database' => t('Caching to database'),
94        'bplus' => t('Using B+ Indexed Dictionary'));
95    $form['Define Cache']['csplitter_cache'] = array('#type' => 'radios',    $form['Define Cache']['csplitter_cache'] = array('#type' => 'radios',
96      '#title' => t('Define Dictionary Caching'),      '#title' => t('Define Dictionary Caching'),
97      '#default_value' => variable_get('csplitter_cache', 'none'),      '#default_value' => variable_get('csplitter_cache', 'bplus'),
98      '#options' => $cache_method,      '#options' => $cache_method,
99      '#description' => t('In most server system, disk caching is fastest. If you use database caching, you should sure your database can accept very large sql sentence(max_allowed_packet for mysql should be > 5M possible). '),      '#description' => t('In most server system, B+ dictionary is fastest, disk is second. The methods except B+ will take you large memory. If you use database caching, you should sure your database can accept very large sql sentence(max_allowed_packet for mysql should be > 5M possible). '),
100    );    );
101    
102    return system_settings_form($form);    return system_settings_form($form);
# Line 89  function csplitter_admin_settings() { Line 104  function csplitter_admin_settings() {
104    
105  /* Menu callback; lists all nodes posted on a given date.  /* Menu callback; lists all nodes posted on a given date.
106   */   */
107  function csplitter_view() {  function csplitter_view($test = NULL) {
108    global $user;    global $user;
109    $test = variable_get('csplitter_test', NULL);    //$test = variable_get('csplitter_test', NULL);
   if ($test){  
     $output = '';  
     $test = trim($test);  
     $function = '_csplitter_splite_'. variable_get('csplitter_algorithm', '1');  
   
     if (mb_strlen($test)>0){  
       $cache_method = variable_get('csplitter_cache', 'none');  
       timer_start('csplitter');  
       $outstr = call_user_func($function, $test);//_csplitter_splite_1($test);  
       drupal_set_message($function.', cache:'.$cache_method.'. Processing used '.timer_read('csplitter')."ms");  
     }  
     variable_set('csplitter_test', NULL);  
     $output .= "<h4>input: ".$test."</h4>";  
     $output .= "<h4>output: ".$outstr."</h4>";  
   }  
110    
111    //return theme('page', $output.csplitter_form());    //return theme('page', $output.csplitter_form());
112    return $output.drupal_get_form('csplitter_form');    return $output.drupal_get_form('csplitter_form', $test);
113  }  }
114    
115  function csplitter_form(){  function csplitter_form($form_state, $test = NULL){
116    
117    $form['test'] = array(    $form['test'] = array(
118      '#type' => 'textarea',      '#type' => 'textarea',
# Line 128  function csplitter_form(){ Line 128  function csplitter_form(){
128    return $form;    return $form;
129  }  }
130    
131  function csplitter_form_submit($form_id, $form_values){  function csplitter_form_submit($form, $form_state){
132    if (mb_strlen($form_values['test'])<=1000){    if (mb_strlen($form_state['values']['test']) > 10000){
133      variable_set('csplitter_test', $form_values['test']);      drupal_set_message('Csplitter: Test String is too long. ( >10000 characters )', 'error');
134    }    }
135    else{    else{
136      drupal_set_message('Csplitter: Test String is too long. ( >1000 characters )', 'error');      $test = $form_state['values']['test'];
137        $output = '';
138        $test = trim($test);
139        $function = '_csplitter_splite_'. variable_get('csplitter_algorithm', '3');
140    
141        if (mb_strlen($test)>0){
142          $cache_method = variable_get('csplitter_cache', 'bplus');
143          timer_start('csplitter');
144          $outstr = call_user_func($function, $test);//_csplitter_splite_1($test);
145          drupal_set_message($function.', cache:'.$cache_method.'. Processing used '.timer_read('csplitter')."ms");
146        }
147        //$output .= "<h4>input: ".$test."</h4>";
148        //$output .= "<h4>output: ".$outstr."</h4>";
149        drupal_set_message(t('Output:').'<br />'.$outstr);
150    }    }
151  }  }
152    
# Line 150  function _csplitter_readkeys($filepath){ Line 163  function _csplitter_readkeys($filepath){
163    return $keys;    return $keys;
164  }  }
165    
166  function _csplitter_splite_1($str){  function _csplitter_splite_3($str){
167      return _csplitter_splite_1($str, true);
168    }
169    
170    function _csplitter_splite_1($str, $minmatch = false){
171    // Search common pattern is ignored    // Search common pattern is ignored
172    // char value is setted by search.module    // char value is setted by search.module
173    //drupal_set_message("$str|".ord($str[0])." ".ord($str[mb_strlen($str)-1]));    //drupal_set_message("$str|".ord($str[0])." ".ord($str[mb_strlen($str)-1]));
# Line 158  function _csplitter_splite_1($str){ Line 175  function _csplitter_splite_1($str){
175      return $str;      return $str;
176    }    }
177    
178      $word_len =  variable_get('csplitter_word_len', 2) + 2;
179      // init dictionary keys and cache array to disk
180      $cache_method = variable_get('csplitter_cache', 'bplus');
181    
182    $filepath = drupal_get_path('module', 'csplitter')."/dict.txt";    $filepath = drupal_get_path('module', 'csplitter')."/dict.txt";
183    $fdicpath = drupal_get_path('module', 'csplitter')."/filter.txt";    // for big B+ dictionary, we use smallest filter keys
184      if ($cache_method == 'bplus')
185        $fdicpath = drupal_get_path('module', 'csplitter')."/bfilter.txt";
186      else
187        $fdicpath = drupal_get_path('module', 'csplitter')."/filter.txt";
188    $arraypath = drupal_get_path('module', 'csplitter')."/cs_keys.obj";    $arraypath = drupal_get_path('module', 'csplitter')."/cs_keys.obj";
189    $fobjpath = drupal_get_path('module', 'csplitter')."/filter_keys.obj";    $fobjpath = drupal_get_path('module', 'csplitter')."/filter_keys.obj";
190    
191    $cs_keys = array();    $cs_keys = array();
192    $cf_keys = array();    $cf_keys = array();
193    
   // init dictionary keys and cache array to disk  
   $cache_method = variable_get('csplitter_cache', 'none');  
194    switch ($cache_method){    switch ($cache_method){
195      case 'none':      case 'none':
196          $cs_keys = _csplitter_readkeys($filepath);          $cs_keys = _csplitter_readkeys($filepath);
197          $cf_keys = _csplitter_readkeys($fdicpath);          $cf_keys = _csplitter_readkeys($fdicpath);
198        break;        break;
199        case 'bplus':
200            $cf_keys = _csplitter_readkeys($fdicpath);
201          break;
202      case 'disk':      case 'disk':
203          if (!file_exists($arraypath)){          if (!file_exists($arraypath)){
204            $cs_keys = _csplitter_readkeys($filepath);            $cs_keys = _csplitter_readkeys($filepath);
# Line 231  function _csplitter_splite_1($str){ Line 257  function _csplitter_splite_1($str){
257    //forward max match    //forward max match
258    $outstr = "";    $outstr = "";
259    //drupal_set_message(mb_strlen($str)."|$str|");    //drupal_set_message(mb_strlen($str)."|$str|");
260    
261      if ($cache_method == 'bplus'){
262        $handle = fopen(drupal_get_path('module', 'csplitter').'/btree.dat', 'rb');
263    
264        if ($handle == false){
265            drupal_set_message("CSplitter: open b+ file failed!");
266            return "";
267        }
268      }
269    
270    for ($i=0; $i<mb_strlen($str);){    for ($i=0; $i<mb_strlen($str);){
271      $found = 1;      $found = 1;
272      for ($len = 1; $len< 8 && $i - $len>= 0; $len++){      $from = 1;
273        $to = $word_len + 1;
274        $step = 1;
275        for ($len = $from; $len< $to; $len += $step){
276          if ($minmatch && $found != 1) break;
277    
278        $word = mb_substr($str, $i, $len, "UTF-8");        $word = mb_substr($str, $i, $len, "UTF-8");
279          //drupal_set_message("$i, $len,$word");
280    
281        if (ord($word[0])<176){ // not chinese        if (ord($word[0])<176){ // not chinese
282          break;          break;
283        }        }
284    
285          $lastchar = mb_substr($word, $len-1, 1, "UTF-8");
286          if (array_key_exists($lastchar, $cf_keys))
287            break;
288    
289        //now process chinese        //now process chinese
290        if ($len ==1 ||array_key_exists($word, $cs_keys)){  
291          $found = $len;        if ($cache_method == 'bplus'){
292          continue;          if ($len ==1 || search_in_file($word, $handle)){
293              $found = $len;
294              continue;
295            }
296          }
297          else{
298            if ($len ==1 ||array_key_exists($word, $cs_keys)){
299              $found = $len;
300              continue;
301            }
302            //else{
303            //  $len--;
304            //  break;
305            //}
306        }        }
       //else{  
       //  $len--;  
       //  break;  
       //}  
307      }      }
308      $len = $found;      $word = mb_substr($str, $i, $found, "UTF-8");
     $word = mb_substr($str, $i, $len, "UTF-8");  
309      //drupal_set_message(mb_strlen($str).": $i - $found :".$word);      //drupal_set_message(mb_strlen($str).": $i - $found :".$word);
310    
311      if ($len >1) // || ($len == 1 && ord($word[0])>=176))      if ($found >1) // || ($found == 1 && ord($word[0])>=176))
312        $outstr .= " ".$word." ";        $outstr .= " ".$word." ";
313      else{      else {
314        if (ord($word[0])>=176){        if (ord($word[0])>=176){
315          if (array_key_exists($word, $cf_keys))          if (array_key_exists($word, $cf_keys))
316            $outstr .= " ".$word." ";            $outstr .= " ".$word." ";
317          else{          else{
318            $nextchar = mb_substr($str, $i+1, 1, "UTF-8");            $lastchar = mb_substr($str, $i-1, 1, "UTF-8");
319            //Need Test: if (mb_strlen($nextchar) > 0 && ord($nextchar[0])<176){            $nextchar = mb_substr($str, $i+$found, 1, "UTF-8");
320            if (ord($nextchar[0])<176){            if (ord($lastchar[0])<176){
321              $outstr .= $word. " ";              //drupal_set_message($lastchar.":".$word.":".$nextchar);
322                $word = " ".$word;
323            }            }
324            else{            if (ord($nextchar[0])<176){
325              $outstr .= "".$word."";              //drupal_set_message($lastchar.":".$word.":".$nextchar);
326                $word = $word." ";
327            }            }
328              $outstr .= $word;
329          }          }
330        }        }
331        else{        else{
332            $outstr .= $word;            $outstr .= $word;
333        }        }
334      }      }
335        $i += $found;
     $i += $len;  
336    }    }
337    
338      if ($cache_method == 'bplus'){
339            fclose($handle);
340      }
341    //drupal_set_message($outstr);    //drupal_set_message($outstr);
342    return $outstr;    return $outstr;
343  }  }
344    
345  function _csplitter_splite_2($str){  function _csplitter_splite_4($str){
346       return _csplitter_splite_2($str, true);
347    }
348    
349    function _csplitter_splite_2($str, $minmatch = fasle){
350      $word_len =  variable_get('csplitter_word_len', 2) + 2;
351      // init dictionary keys and cache array to disk
352      $cache_method = variable_get('csplitter_cache', 'bplus');
353    
354    $filepath = drupal_get_path('module', 'csplitter')."/dict.txt";    $filepath = drupal_get_path('module', 'csplitter')."/dict.txt";
355    $fdicpath = drupal_get_path('module', 'csplitter')."/filter.txt";    if ($cache_method == 'bplus')
356        $fdicpath = drupal_get_path('module', 'csplitter')."/bfilter.txt";
357      else
358        $fdicpath = drupal_get_path('module', 'csplitter')."/filter.txt";
359    $arraypath = drupal_get_path('module', 'csplitter')."/cs_keys.obj";    $arraypath = drupal_get_path('module', 'csplitter')."/cs_keys.obj";
360    $fobjpath = drupal_get_path('module', 'csplitter')."/filter_keys.obj";    $fobjpath = drupal_get_path('module', 'csplitter')."/filter_keys.obj";
361    
362    $cs_keys = array();    $cs_keys = array();
363    $cf_keys = array();    $cf_keys = array();
364    
   // init dictionary keys and cache array to disk  
   $cache_method = variable_get('csplitter_cache', 'none');  
365    switch ($cache_method){    switch ($cache_method){
366      case 'none':      case 'none':
367          $cs_keys = _csplitter_readkeys($filepath);          $cs_keys = _csplitter_readkeys($filepath);
368          $cf_keys = _csplitter_readkeys($fdicpath);          $cf_keys = _csplitter_readkeys($fdicpath);
369        break;        break;
370        case 'bplus':
371            $cf_keys = _csplitter_readkeys($fdicpath);
372          break;
373      case 'disk':      case 'disk':
374          if (!file_exists($arraypath)){          if (!file_exists($arraypath)){
375            $cs_keys = _csplitter_readkeys($filepath);            $cs_keys = _csplitter_readkeys($filepath);
# Line 354  function _csplitter_splite_2($str){ Line 427  function _csplitter_splite_2($str){
427    
428    //converse max match    //converse max match
429    $outstr = "";    $outstr = "";
430    
431      if ($cache_method == 'bplus'){
432        $handle = fopen(drupal_get_path('module', 'csplitter').'/btree.dat', 'rb');
433    
434        if ($handle == false){
435            drupal_set_message("CSplitter: open b+ file failed!");
436            return "";
437        }
438      }
439    
440    for ($i=mb_strlen($str); $i>0;){    for ($i=mb_strlen($str); $i>0;){
441      $found = 1;      $found = 1;
442      for ($len = 1; $len< 8 && $i - $len>= 0; $len++){      for ($len = 1; $len< ($word_len+1); $len++){
443          if ($minmatch && $found != 1) break;
444        $word = mb_substr($str, $i-$len, $len, "UTF-8");        $word = mb_substr($str, $i-$len, $len, "UTF-8");
445    
446          if (ord($word[0])<176){ // not chinese
447            break;
448          }
449    
450          $firstchar = mb_substr($word, 0, 1, "UTF-8");
451          if (array_key_exists($firstchar, $cf_keys))
452            break;
453    
454        //now process chinese        //now process chinese
455        if ($len ==1 ||array_key_exists($word, $cs_keys)){  
456          $found = $len;        if ($cache_method == 'bplus'){
457          continue;          if ($len ==1 || search_in_file($word, $handle)){
458              $found = $len;
459              continue;
460            }
461          }
462          else{
463            if ($len ==1 ||array_key_exists($word, $cs_keys)){
464              $found = $len;
465              continue;
466            }
467        }        }
468        //else{        //else{
469        //  $len--;        //  $len--;
470        //  break;        //  break;
471        //}        //}
   
       if (ord($word[0])<176){ // not chinese  
         break;  
       }  
472      }      }
473      $len = $found;      $len = $found;
474    
# Line 397  function _csplitter_splite_2($str){ Line 494  function _csplitter_splite_2($str){
494          $outstr = $word. " " .$outstr;          $outstr = $word. " " .$outstr;
495        else {        else {
496          $outstr = $word . $outstr;          $outstr = $word . $outstr;
         //drupal_set_message($word);  
497        }        }
498      }      }
499    
500      $i -= $len;      $i -= $len;
501    }    }
502    
503      if ($cache_method == 'bplus'){
504            fclose($handle);
505      }
506    
507    return $outstr;    return $outstr;
508  }  }
509    
510  function csplitter_search_preprocess($text) {  function csplitter_search_preprocess($text) {
511    $function = '_csplitter_splite_'. variable_get('csplitter_algorithm', '1');    $function = '_csplitter_splite_'. variable_get('csplitter_algorithm', '3');
512    return call_user_func($function, $text);//_csplitter_splite_1($text);    return call_user_func($function, $text);//_csplitter_splite_1($text);
513  }  }
514    
515    //Advanced B+ Search
516    function search_in_file($SearchKey, $handle){
517    
518            //Read First Record
519            fseek($handle, 160);
520            $NumItemBin = fread($handle, 4);
521            $NumNodesBin= fread($handle, 4);
522            $RootBin = fread($handle, 4);
523    
524            $NumItemArray = unpack('l',$NumItemBin);
525            $NumNodesArray = unpack('l',$NumNodesBin);
526            $RootArray = unpack('l',$RootBin);
527    
528            $found = false;
529            $NodeSize = 208;
530            $CurrentRoot = $RootArray[1];
531    
532            while(($CurrentRoot != NULL && $CurrentRoot != -1) && (!$found)){
533    
534                fseek($handle, $CurrentRoot * $NodeSize);
535                $CurNodeCount = fread($handle, 4);
536                $CurNodeCount = unpack('l', $CurNodeCount);
537                $CurNodeCount = $CurNodeCount[1];
538    
539                for ($i=0; $i<11; $i++){
540                    $StrBin[$i] = fread($handle, 14);
541                    $StrArray = explode(chr(0x0), $StrBin[$i]);
542                    $StrBin[$i] = trim($StrArray[0]);
543                    //fread($handle, 38);
544                }
545                //Fix aligned bytes
546                fread($handle, 2);
547                for ($i=0; $i<12; $i++){
548                    $BranchArray = unpack('l', fread($handle, 4));
549                    $Branch[$i] = $BranchArray[1];
550                }
551    
552                if (strcmp($SearchKey, $StrBin[0]) < 0)
553                    $Location = -1;
554                else{
555                    $Location = $CurNodeCount -1;
556                    while ((strcmp($SearchKey, $StrBin[$Location])<0) && ($Location > 0)){
557                            $Location--;
558                    }
559    
560                    if (strcmp($SearchKey, $StrBin[$Location]) == 0){
561                            $found = true;
562                    }
563                }
564    
565                if ($found == false)
566                  $CurrentRoot = $Branch[$Location +1];
567            }
568    
569    //      drupal_set_message($SearchKey.':'.$found);
570            return $found;
571    }
572    
573    
574  ?>  ?>

Legend:
Removed from v.1.6  
changed lines
  Added in v.1.6.2.1

  ViewVC Help
Powered by ViewVC 1.1.2