/[drupal]/contributions/modules/apachesolr/apachesolr.index.inc
ViewVC logotype

Contents of /contributions/modules/apachesolr/apachesolr.index.inc

Parent Directory Parent Directory | Revision Log Revision Log | View Revision Graph Revision Graph


Revision 1.3 - (show annotations) (download) (as text)
Mon Jun 29 23:23:07 2009 UTC (4 months, 4 weeks ago) by pwolanin
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +2 -2 lines
File MIME type: text/x-php
sync with DRUPAL-6--1
1 <?php
2 // $Id: apachesolr.index.inc,v 1.1.2.5 2009/05/19 19:58:23 pwolanin Exp $
3
4 /**
5 * @file
6 * Functions used when indexing content to Apache Solr.
7 */
8
/**
 * Build the Solr document for one node and append it to a list.
 *
 * Nothing is appended when apachesolr_node_to_document() returns an empty
 * (falsy) value.
 *
 * @param $documents
 *   Array of documents, passed by reference; the new document is pushed
 *   onto its end.
 * @param $nid
 *   The ID of the node to convert.
 * @param $namespace
 *   Handed on unchanged to apachesolr_node_to_document().
 */
function apachesolr_add_node_document(&$documents, $nid, $namespace) {
  $built = apachesolr_node_to_document($nid, $namespace);
  if (!$built) {
    return;
  }
  $documents[] = $built;
}
17
/**
 * Replace control characters that cause Jetty/Solr to fail with spaces.
 *
 * Tab (x09), line feed (x0A) and carriage return (x0D) are kept; every other
 * byte below x20 is replaced by a single space.
 *
 * @param $text
 *   The string to clean.
 *
 * @return
 *   The string with the offending control characters replaced.
 */
function apachesolr_strip_ctl_chars($text) {
  // See: http://w3.org/International/questions/qa-forms-utf-8.html
  // Printable utf-8 does not include any of these chars below x7F
  $control_chars = '@[\x00-\x08\x0B\x0C\x0E-\x1F]@';
  $cleaned = preg_replace($control_chars, ' ', $text);
  return $cleaned;
}
26
/**
 * Strip html tags and also control characters that cause Jetty/Solr to fail.
 *
 * @param $text
 *   The raw text to clean.
 *
 * @return
 *   The result of running $text through apachesolr_strip_ctl_chars() and
 *   then _apachesolr_strip_decode().
 */
function apachesolr_clean_text($text) {
  // Reuse the shared helper instead of repeating its regular expression, so
  // the set of stripped control characters is defined in one place only.
  return _apachesolr_strip_decode(apachesolr_strip_ctl_chars($text));
}
33
/**
 * Helper function - strip tags from text and normalise its entities.
 *
 * The text is passed through filter_xss() with an empty list of allowed
 * tags, all HTML entities are decoded, and finally &, < and > are escaped
 * again. Quote characters are left literal in the result.
 *
 * @param $text
 *   The text to process. Control characters are not handled here.
 *
 * @return
 *   The processed text.
 */
function _apachesolr_strip_decode($text) {
  // Add spaces before stripping tags to avoid running words together.
  $text = filter_xss(str_replace(array('<', '>'), array(' <', '> '), $text), array());
  // Decode every entity, quotes included. Decoding with ENT_NOQUOTES would
  // leave &quot; and &#039; in place, and the escaping step below would then
  // turn them into "&amp;quot;" and "&amp;#039;".
  $decoded = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
  // Make safe any &, < or > characters; quotes are not escaped.
  return htmlspecialchars($decoded, ENT_NOQUOTES, 'UTF-8');
}
40
/**
 * Given a node ID, return a document representing that node.
 *
 * @param $nid
 *   The ID of the node to load and convert.
 * @param $namespace
 *   Passed to every hook_apachesolr_node_exclude() implementation.
 *
 * @return
 *   An Apache_Solr_Document for the node, or FALSE when the node could not
 *   be loaded or a module asked for it to be excluded from the index.
 */
function apachesolr_node_to_document($nid, $namespace) {
  // Passing reset = TRUE avoids static caching of all nodes that get indexed.
  $node = node_load($nid, NULL, TRUE);
  if (empty($node)) {
    return FALSE;
  }

  // Let any module exclude this node from the index. Every implementation is
  // still invoked, even after one of them has requested exclusion.
  $excluded = FALSE;
  foreach (module_implements('apachesolr_node_exclude') as $module) {
    if (module_invoke($module, 'apachesolr_node_exclude', $node, $namespace)) {
      $excluded = TRUE;
    }
  }
  if ($excluded) {
    return FALSE;
  }

  // Build the node body.
  $node->build_mode = NODE_BUILD_SEARCH_INDEX;
  $node = node_build_content($node, FALSE, FALSE);
  $node->body = drupal_render($node->content);
  $node->title = apachesolr_clean_text($node->title);

  // Capture the rendered body before the nodeapi hooks below get the node.
  $text = $node->body;

  // Fetch extra data normally not visible, including comments.
  $extra = node_invoke_nodeapi($node, 'update index');
  $text = apachesolr_strip_ctl_chars($text . "\n\n" . implode(' ', $extra));

  $document = new Apache_Solr_Document();
  $document->id = apachesolr_document_id($node->nid);
  $document->site = url(NULL, array('absolute' => TRUE));
  $document->hash = apachesolr_site_hash();
  // These node properties are copied over under the same name.
  $copied = array('nid', 'uid', 'title', 'status', 'sticky', 'promote', 'moderate', 'tnid', 'translate');
  foreach ($copied as $property) {
    $document->$property = $node->$property;
  }
  if (!empty($node->language)) {
    $document->language = $node->language;
  }
  $document->body = _apachesolr_strip_decode($text);
  $document->type = $node->type;
  $document->type_name = apachesolr_strip_ctl_chars(node_get_types('name', $node));
  $document->created = apachesolr_date_iso($node->created);
  $document->changed = apachesolr_date_iso($node->changed);

  // Use whichever is later: the last comment or the last change to the node.
  $last_change = $node->changed;
  if (isset($node->last_comment_timestamp) && $node->last_comment_timestamp > $node->changed) {
    $last_change = $node->last_comment_timestamp;
  }
  $document->last_comment_or_change = apachesolr_date_iso($last_change);

  if (isset($node->comment_count)) {
    $document->comment_count = $node->comment_count;
  }
  else {
    $document->comment_count = 0;
  }
  $document->name = apachesolr_strip_ctl_chars($node->name);

  $path = 'node/' . $node->nid;
  $document->url = url($path, array('absolute' => TRUE));
  $document->path = $path;
  // Path aliases can have important information about the content.
  // Add them to the index as well.
  if (function_exists('drupal_get_path_alias')) {
    // Look first for a language specific alias; an empty language is passed
    // when the node has none.
    $language = '';
    if (!empty($node->language)) {
      $language = $node->language;
    }
    $alias = drupal_get_path_alias($path, $language);
    if ($alias && $alias != $path) {
      $document->path_alias = apachesolr_strip_ctl_chars($alias);
    }
  }

  // Walk the list of CCK fields and index those present on this node.
  foreach (apachesolr_cck_fields() as $key => $cck_info) {
    if (!isset($node->$key)) {
      continue;
    }
    // Use the field's callback when one is given and exists; otherwise take
    // the raw field values from the node.
    $callback = $cck_info['callback'];
    $field = ($callback && function_exists($callback)) ? $callback($node, $key) : $node->$key;
    $index_key = apachesolr_index_key($cck_info);
    foreach ($field as $value) {
      // Don't index NULLs or empty strings.
      if (!isset($value['value']) || !strlen($value['value'])) {
        continue;
      }
      // We can use 'value' rather than 'safe' since we strip tags and later
      // check_plain().
      $clean = apachesolr_clean_text($value['value']);
      if ($cck_info['multiple']) {
        $document->setMultiValue($index_key, $clean);
      }
      else {
        $document->$index_key = $clean;
      }
    }
  }

  // Index book module data.
  if (!empty($node->book['bid'])) {
    // Hard-coded - must change if apachesolr_index_key() changes.
    $document->is_book_bid = (int) $node->book['bid'];
  }
  apachesolr_add_tags_to_document($document, $text);
  apachesolr_add_taxonomy_to_document($document, $node);

  // Let modules add to the document - TODO convert to drupal_alter().
  foreach (module_implements('apachesolr_update_index') as $module) {
    $hook = $module .'_apachesolr_update_index';
    $hook($document, $node);
  }

  return $document;
}
159
/**
 * Extract taxonomy from $node and add to dynamic fields.
 *
 * Does nothing unless $node->taxonomy is set and is an array.
 *
 * @param $document
 *   The document to add the taxonomy fields to.
 * @param $node
 *   The node whose $node->taxonomy terms are indexed.
 */
function apachesolr_add_taxonomy_to_document(&$document, $node) {
  if (!isset($node->taxonomy) || !is_array($node->taxonomy)) {
    return;
  }

  foreach ($node->taxonomy as $term) {
    // Double indexing of tids lets us do efficient searches (on tid)
    // and do accurate per-vocabulary faceting.

    // By including the ancestors to a term in the index we make
    // sure that searches for general categories match specific
    // categories, e.g. Fruit -> apple, a search for fruit will find
    // content categorized with apple.
    foreach (taxonomy_get_parents_all($term->tid) as $ancestor) {
      $tid = $ancestor->tid;
      $vid = $ancestor->vid;

      $document->setMultiValue('tid', $tid);
      $document->setMultiValue('im_vid_'. $vid, $tid);
      $name = apachesolr_clean_text($ancestor->name);
      $document->setMultiValue('vid', $vid);

      // All names of one vocabulary are appended to a single text field.
      $names_field = 'ts_vid_'. $vid .'_names';
      $document->$names_field .= ' '. $name;

      // We index each name as a string for cross-site faceting
      // using the vocab name rather than vid in field construction.
      $document->setMultiValue('sm_vid_'. apachesolr_vocab_name($vid), $name);
    }
  }
}
187
/**
 * Helper function - return a safe (PHP identifier) vocabulary name.
 *
 * Every character of the vocabulary name other than a-z, A-Z, 0-9, '_' and
 * bytes x7f-xff is replaced by '_'. When nothing but underscores is left
 * (or no name was found), '_<vid>_' is used instead. Results are cached in
 * a static variable for the rest of the request.
 *
 * @param $vid
 *   The vocabulary ID.
 *
 * @return
 *   The sanitized vocabulary name.
 */
function apachesolr_vocab_name($vid) {
  static $names = array();

  if (!isset($names[$vid])) {
    $vocab_name = db_result(db_query('SELECT v.name FROM {vocabulary} v WHERE v.vid = %d', $vid));
    $safe_name = preg_replace('/[^a-zA-Z0-9_\x7f-\xff]/', '_', $vocab_name);
    // Fallback for names ending up all as '_'. Compare with '' explicitly:
    // a loose truth test would also reject a usable name such as "0".
    if (rtrim($safe_name, '_') === '') {
      $safe_name = '_' . $vid . '_';
    }
    $names[$vid] = $safe_name;
  }
  return $names[$vid];
}
205
/**
 * Extract HTML tag contents from $text and add to boost fields.
 *
 * $text must be stripped of control characters before hand.
 *
 * The 'apachesolr_tags_to_index' variable maps tag names to the document
 * field that collects their contents. Tags are matched case-insensitively.
 *
 * @param $document
 *   The document whose boost fields are appended to.
 * @param $text
 *   The HTML text to scan.
 */
function apachesolr_add_tags_to_document(&$document, $text) {
  $tags_to_index = variable_get('apachesolr_tags_to_index', array(
    'h1' => 'tags_h1',
    'h2' => 'tags_h2_h3',
    'h3' => 'tags_h2_h3',
    'h4' => 'tags_h4_h5_h6',
    'h5' => 'tags_h4_h5_h6',
    'h6' => 'tags_h4_h5_h6',
    'u' => 'tags_inline',
    'b' => 'tags_inline',
    'i' => 'tags_inline',
    'strong' => 'tags_inline',
    'em' => 'tags_inline',
    'a' => 'tags_a'
  ));

  // Strip off all ignored tags.
  $text = strip_tags($text, '<'. implode('><', array_keys($tags_to_index)) .'>');

  preg_match_all('@<('. implode('|', array_keys($tags_to_index)) .')[^>]*>(.*)</\1>@Ui', $text, $matches);

  // The pattern above is case-insensitive, so a captured tag name may be
  // "H1" or "A". Look tags up by their lower-case name.
  $fields_by_tag = array_change_key_case($tags_to_index, CASE_LOWER);
  foreach ($matches[1] as $key => $tag) {
    $tag = strtolower($tag);
    if (!isset($fields_by_tag[$tag])) {
      continue;
    }
    // We don't want to index links auto-generated by the url filter.
    if ($tag != 'a' || !preg_match('@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9]+@', $matches[2][$key])) {
      $field = $fields_by_tag[$tag];
      // Start from an empty string so the append never reads an unset field.
      if (!isset($document->$field)) {
        $document->$field = '';
      }
      $document->$field .= ' '. $matches[2][$key];
    }
  }
}
238

  ViewVC Help
Powered by ViewVC 1.1.2