/[drupal]/contributions/modules/csplitter/csplitter.module
ViewVC logotype

Contents of /contributions/modules/csplitter/csplitter.module

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.6 - (show annotations) (download) (as text)
Mon Mar 26 14:56:52 2007 UTC (2 years, 8 months ago) by zealy
Branch: MAIN
CVS Tags: DRUPAL-5--1-0, HEAD
Branch point for: DRUPAL-6--1
Changes since 1.5: +1 -1 lines
File MIME type: text/x-php
add t() translation support for admin setting page
1 <?php
2 // $Id: csplitter.module,v 1.3 2006/04/29 05:18:43 zealy Exp $
3 // mailto: i.zealy AT gmail dot com
4
/**
 * Implementation of hook_help().
 *
 * Only the "admin/modules#description" section is answered; any other
 * section yields an empty string.
 *
 * @param $section
 *   The help section identifier being requested.
 *
 * @return
 *   The translated module description, or "" for every other section.
 */
function csplitter_help($section) {
  // Loose comparison on purpose: it mirrors the semantics of a switch/case.
  if ($section == "admin/modules#description") {
    return t("Provides an chinese splitter for search");
  }

  return "";
}
18
/**
 * Implementation of hook_menu().
 *
 * Registers two cacheable items: the splitter test page at "csplitter" and
 * the settings form at "admin/settings/csplitter".
 *
 * @param $may_cache
 *   When false, nothing is registered and an empty array is returned.
 *
 * @return
 *   A list of menu item definitions.
 */
function csplitter_menu($may_cache) {
  if (!$may_cache) {
    return array();
  }

  // Test page: lets a user type a sentence and see how it is split.
  $test_page = array(
    'path' => 'csplitter',
    'title' => t('chinese splitter'),
    'callback' => 'csplitter_view',
    'access' => user_access('search content'),
    'type' => MENU_CALLBACK,
  );

  // Settings page: rendered through drupal_get_form().
  $settings_page = array(
    'path' => 'admin/settings/csplitter',
    'title' => t('Chinese splitter'),
    'description' => t('Configure relevance settings for splitter options'),
    'callback' => 'drupal_get_form',
    'callback arguments' => array('csplitter_admin_settings'),
    'access' => user_access('administer search'),
    'type' => MENU_NORMAL_ITEM,
  );

  return array($test_page, $settings_page);
}
43
/**
 * Form builder; the module settings form.
 *
 * Offers the choice of splitting algorithm ("csplitter_algorithm") and of
 * dictionary caching strategy ("csplitter_cache"). When PHP has no mbstring
 * support only an error notice is shown, because the splitter functions rely
 * on the mb_* string functions.
 *
 * @return
 *   The form array wrapped by system_settings_form().
 */
function csplitter_admin_settings() {
  $form = array();

  // Link target of the test page. url() is used instead of a hard-coded
  // "/csplitter" so the link also works when Drupal lives in a subdirectory.
  $test_url = url('csplitter');

  // The link is injected through a placeholder: t() must receive a constant
  // string, otherwise the sentence can never match a translation.
  $form['weight'] = array(
    '#type' => 'markup',
    '#value' => t('Optional. You can test your configuration <a href="@url">here</a>.', array('@url' => $test_url)),
  );

  $form['Config Algorithm'] = array(
    '#type' => 'fieldset',
    '#title' => t('Config Arithmetic'),
    '#collapsible' => TRUE,
    '#description' => t('Choose which algorithm to use. If you change algorithm, you should re-index search database.'),
  );

  if (!function_exists('mb_strlen')) {
    // Without mbstring none of the options below can work; show only the
    // warning.
    $form['Config Algorithm']['Error Message'] = array(
      '#type' => 'markup',
      '#value' => '<span style="color: red">'. t('Your PHP have no mbstring support. Csplitter require it to run correctly.') .'</span>',
    );
    return system_settings_form($form);
  }

  // Option keys are the suffix of the _csplitter_splite_N() function to call.
  $algors = array(
    1 => t('Forward maximum matching'),
    2 => t('Conversely maximum mathing'),
  );
  $form['Config Algorithm']['csplitter_algorithm'] = array(
    '#type' => 'radios',
    '#title' => t('Default Algorithm'),
    '#default_value' => variable_get('csplitter_algorithm', '1'),
    '#options' => $algors,
    '#description' => t('The default algorithm for word splitting.'),
  );

  $form['Define Cache'] = array(
    '#type' => 'fieldset',
    '#title' => t('Define Cache'),
    '#collapsible' => TRUE,
    '#description' => t('Config using cache for dictionary or not. You can test performance <a href="@url">here</a>.', array('@url' => $test_url)),
  );

  $cache_method = array(
    'none' => t('No caching'),
    'disk' => t('Caching to disk'),
    'database' => t('Caching to database'),
  );
  $form['Define Cache']['csplitter_cache'] = array(
    '#type' => 'radios',
    '#title' => t('Define Dictionary Caching'),
    '#default_value' => variable_get('csplitter_cache', 'none'),
    '#options' => $cache_method,
    '#description' => t('In most server system, disk caching is fastest. If you use database caching, you should sure your database can accept very large sql sentence(max_allowed_packet for mysql should be > 5M possible). '),
  );

  return system_settings_form($form);
}
89
/**
 * Menu callback; the splitter test page.
 *
 * If a test sentence is pending in the "csplitter_test" variable (stored by
 * csplitter_form_submit()), it is run through the configured algorithm, the
 * elapsed time is reported as a message, and the pending value is cleared.
 * The test form is always appended.
 *
 * @return
 *   HTML of the optional result followed by the rendered test form.
 */
function csplitter_view() {
  $output = '';

  $test = variable_get('csplitter_test', NULL);
  if ($test) {
    $test = trim($test);
    // Stays empty when the submitted text was whitespace only.
    $outstr = '';

    if ($test !== '') {
      $function = '_csplitter_splite_'. variable_get('csplitter_algorithm', '1');
      $cache_method = variable_get('csplitter_cache', 'none');
      timer_start('csplitter');
      $outstr = call_user_func($function, $test);
      drupal_set_message($function.', cache:'.$cache_method.'. Processing used '.timer_read('csplitter')."ms");
    }

    // The pending sentence is consumed exactly once.
    variable_set('csplitter_test', NULL);

    // Both strings originate from user input and are stored site-wide, so
    // they must be escaped before being embedded in the page.
    $output .= "<h4>input: ". check_plain($test) ."</h4>";
    $output .= "<h4>output: ". check_plain($outstr) ."</h4>";
  }

  return $output . drupal_get_form('csplitter_form');
}
114
/**
 * Form builder; the splitter test form.
 *
 * @return
 *   A form array with a "test" textarea and a "submit" button.
 */
function csplitter_form() {
  return array(
    'test' => array(
      '#type' => 'textarea',
      '#title' => t('Input'),
      '#description' => t('Input test sentences(Chinese or other) here.'),
    ),
    'submit' => array(
      '#type' => 'submit',
      '#value' => t('Test'),
    ),
  );
}
130
/**
 * Submit handler for csplitter_form().
 *
 * Stores the submitted sentence in the "csplitter_test" variable, where
 * csplitter_view() picks it up. Sentences longer than 1000 characters are
 * rejected with an error message and nothing is stored.
 *
 * @param $form_id
 *   The form identifier (unused).
 * @param $form_values
 *   The submitted values; only the 'test' entry is read.
 */
function csplitter_form_submit($form_id, $form_values) {
  // drupal_strlen() counts characters and does not require the mbstring
  // extension, unlike a bare mb_strlen() call.
  if (drupal_strlen($form_values['test']) <= 1000) {
    variable_set('csplitter_test', $form_values['test']);
  }
  else {
    drupal_set_message(t('Csplitter: Test String is too long. ( >1000 characters )'), 'error');
  }
}
139
/**
 * Reads a dictionary file into a lookup array.
 *
 * Every non-empty line, with surrounding whitespace trimmed, becomes a key
 * of the result; all values are 0 because only key existence is tested by
 * the callers.
 *
 * @param $filepath
 *   Path of a text file holding one entry per line.
 *
 * @return
 *   An array keyed by dictionary entry. Always an array: it is empty when
 *   the file cannot be opened or holds no entries.
 */
function _csplitter_readkeys($filepath) {
  $keys = array();

  $handle = fopen($filepath, 'r');
  if (!$handle) {
    return $keys;
  }

  // Testing the fgets() result (rather than feof() beforehand) avoids
  // processing the FALSE that is returned at end of file.
  while (($line = fgets($handle)) !== FALSE) {
    $entry = trim($line);
    // Blank lines would otherwise register the empty string as a word.
    if ($entry !== '') {
      $keys[$entry] = 0;
    }
  }
  fclose($handle);

  return $keys;
}
152
/**
 * Tells whether a piece of text starts with a byte treated as Chinese.
 *
 * The splitters classify text by its first byte only: a value of 176 or
 * more counts as Chinese, anything lower (or an empty string) does not.
 *
 * @param $text
 *   A character or string; may be empty.
 *
 * @return
 *   TRUE when the first byte is >= 176, FALSE otherwise.
 */
function _csplitter_is_chinese($text) {
  return is_string($text) && $text !== '' && ord($text[0]) >= 176;
}

/**
 * Loads one dictionary through the on-disk serialized cache.
 *
 * @param $text_path
 *   Path of the plain-text dictionary.
 * @param $object_path
 *   Path of the serialized copy of that dictionary.
 *
 * @return
 *   The dictionary as returned by _csplitter_readkeys(), or the unserialized
 *   cached copy when one exists and is usable.
 */
function _csplitter_load_keys_from_disk($text_path, $object_path) {
  if (file_exists($object_path)) {
    $raw = file_get_contents($object_path);
    if ($raw !== FALSE && $raw !== '') {
      $keys = unserialize($raw);
      if (is_array($keys)) {
        return $keys;
      }
    }
    // An unreadable or corrupt cache file falls through and is rebuilt.
  }

  $keys = _csplitter_readkeys($text_path);
  // Only a successfully read dictionary is written, so a temporary read
  // failure is not frozen into the cache file.
  if (is_array($keys) && $keys) {
    $handle = fopen($object_path, 'w');
    if ($handle) {
      fwrite($handle, serialize($keys));
      fclose($handle);
    }
  }
  return $keys;
}

/**
 * Loads the word dictionary and the filter dictionary.
 *
 * The strategy is selected by the "csplitter_cache" variable: 'none' reads
 * the text files, 'disk' keeps serialized copies next to them, 'database'
 * keeps serialized copies in the Drupal cache for one day. Any other value
 * yields two empty dictionaries. The result is remembered for the rest of
 * the request so repeated splitter calls do not reload the dictionaries.
 *
 * @return
 *   array($cs_keys, $cf_keys): the word dictionary (dict.txt) and the filter
 *   dictionary (filter.txt), each keyed by entry.
 */
function _csplitter_load_dictionaries() {
  static $loaded = array();

  $cache_method = variable_get('csplitter_cache', 'none');
  if (isset($loaded[$cache_method])) {
    return $loaded[$cache_method];
  }

  $module_path = drupal_get_path('module', 'csplitter');
  // cache id => array(text dictionary, serialized disk copy)
  $sources = array(
    'csplitter:cs' => array($module_path ."/dict.txt", $module_path ."/cs_keys.obj"),
    'csplitter:cf' => array($module_path ."/filter.txt", $module_path ."/filter_keys.obj"),
  );

  $dictionaries = array();
  foreach ($sources as $cid => $paths) {
    list($text_path, $object_path) = $paths;
    $keys = array();

    switch ($cache_method) {
      case 'none':
        $keys = _csplitter_readkeys($text_path);
        break;

      case 'disk':
        $keys = _csplitter_load_keys_from_disk($text_path, $object_path);
        break;

      case 'database':
        $cache = cache_get($cid, 'cache');
        if ($cache) {
          $keys = unserialize($cache->data);
        }
        else {
          $keys = _csplitter_readkeys($text_path);
          // Drupal 5 argument order: cache id, table, data, expiry.
          cache_set($cid, 'cache', serialize($keys), time() + 86400);
        }
        break;
    }

    // Guarantee an array so array_key_exists() below never sees NULL/FALSE.
    $dictionaries[] = is_array($keys) ? $keys : array();
  }

  $loaded[$cache_method] = $dictionaries;
  return $dictionaries;
}

/**
 * Splits text with forward maximum matching.
 *
 * Scanning left to right, the longest dictionary word (2 to 7 characters)
 * starting at the current Chinese character is emitted surrounded by
 * spaces. A lone Chinese character is surrounded by spaces when it is in the
 * filter dictionary; otherwise it is emitted as is, followed by a space only
 * when the next character is not Chinese (or the text ends). Non-Chinese
 * characters are copied unchanged.
 *
 * Text whose first byte is 239 is returned untouched. NOTE(review): the
 * original comment attributes such text to a pattern produced by
 * search.module -- confirm before changing.
 *
 * Candidate words are bounded by the end of the string, so a dictionary word
 * can match at any offset, including offset 0. Because this changes
 * the produced segmentation, re-indexing the search database is advisable.
 *
 * @param $str
 *   UTF-8 text to split.
 *
 * @return
 *   The text with spaces inserted around recognised words.
 */
function _csplitter_splite_1($str) {
  $str = (string) $str;
  if ($str === '') {
    return '';
  }
  if (ord($str[0]) == 239) {
    return $str;
  }

  list($cs_keys, $cf_keys) = _csplitter_load_dictionaries();

  $total = mb_strlen($str, "UTF-8");
  $outstr = "";
  $i = 0;
  while ($i < $total) {
    $first = mb_substr($str, $i, 1, "UTF-8");
    $chinese = _csplitter_is_chinese($first);

    // Longest match wins: every length is tried and the last hit is kept.
    $len = 1;
    if ($chinese) {
      for ($try = 2; $try < 8 && $i + $try <= $total; $try++) {
        if (array_key_exists(mb_substr($str, $i, $try, "UTF-8"), $cs_keys)) {
          $len = $try;
        }
      }
    }

    if ($len > 1) {
      $outstr .= " ". mb_substr($str, $i, $len, "UTF-8") ." ";
    }
    elseif (!$chinese) {
      $outstr .= $first;
    }
    elseif (array_key_exists($first, $cf_keys)) {
      $outstr .= " ". $first ." ";
    }
    else {
      // Unknown Chinese characters are kept glued to each other; a space is
      // added only in front of non-Chinese text or at the end of the input.
      $next = mb_substr($str, $i + 1, 1, "UTF-8");
      $outstr .= _csplitter_is_chinese($next) ? $first : $first ." ";
    }

    $i += $len;
  }

  return $outstr;
}

/**
 * Splits text with reverse (right-to-left) maximum matching.
 *
 * Scanning from the end, the longest dictionary word (2 to 7 characters)
 * ending at the current position is emitted surrounded by spaces. Single
 * characters are joined to the output according to whether they, the
 * character before them, and the output built so far start with a Chinese
 * byte; single Chinese characters found in the filter dictionary are
 * surrounded by spaces.
 *
 * The first character of the text is treated as having no predecessor
 * (asking mb_substr() for offset -1 would return the last character of the
 * string instead).
 *
 * @param $str
 *   UTF-8 text to split.
 *
 * @return
 *   The text with spaces inserted around recognised words.
 */
function _csplitter_splite_2($str) {
  $str = (string) $str;
  if ($str === '') {
    return '';
  }

  list($cs_keys, $cf_keys) = _csplitter_load_dictionaries();

  $outstr = "";
  $i = mb_strlen($str, "UTF-8");
  while ($i > 0) {
    // Grow the candidate to the left; stop once a non-dictionary candidate
    // starts with a non-Chinese character.
    $len = 1;
    for ($try = 2; $try < 8 && $i - $try >= 0; $try++) {
      $candidate = mb_substr($str, $i - $try, $try, "UTF-8");
      if (array_key_exists($candidate, $cs_keys)) {
        $len = $try;
        continue;
      }
      if (!_csplitter_is_chinese($candidate)) {
        break;
      }
    }

    $word = mb_substr($str, $i - $len, $len, "UTF-8");

    if ($len > 1) {
      $outstr = " ". $word ." ". $outstr;
    }
    else {
      // Character immediately before $word; empty at the start of the text.
      $before = $i - $len - 1;
      $previous = ($before >= 0) ? mb_substr($str, $before, 1, "UTF-8") : '';

      if (_csplitter_is_chinese($previous)) {
        if (!_csplitter_is_chinese($word)) {
          $outstr = " ". $word . $outstr;
        }
        elseif (array_key_exists($word, $cf_keys)) {
          $outstr = " ". $word ." ". $outstr;
        }
        else {
          $outstr = $word . $outstr;
        }
      }
      elseif (_csplitter_is_chinese($outstr)) {
        $outstr = $word ." ". $outstr;
      }
      else {
        $outstr = $word . $outstr;
      }
    }

    $i -= $len;
  }

  return $outstr;
}
409
/**
 * Implementation of hook_search_preprocess().
 *
 * Runs the text through the algorithm selected by the "csplitter_algorithm"
 * variable, i.e. the function _csplitter_splite_<value>().
 *
 * @param $text
 *   The text handed over for preprocessing.
 *
 * @return
 *   The split text. The text is returned unchanged when the mbstring
 *   extension is missing or the configured algorithm does not exist, so
 *   searching keeps working instead of failing.
 */
function csplitter_search_preprocess($text) {
  $function = '_csplitter_splite_'. variable_get('csplitter_algorithm', '1');

  // The splitters depend on mb_* functions; the settings page warns about a
  // missing mbstring extension, and this guard keeps search usable then.
  if (!function_exists('mb_substr') || !function_exists($function)) {
    return $text;
  }

  return call_user_func($function, $text);
}
414
415 ?>

  ViewVC Help
Powered by ViewVC 1.1.2