<?php
// $Id: apachesolr.index.inc,v 1.1.2.5 2009/05/19 19:58:23 pwolanin Exp $

/**
 * @file
 * Functions used when indexing content to Apache Solr.
 */

/**
 * Append the Solr document built for a node to a list of documents.
 *
 * @param $documents
 *   Array of documents, passed by reference. A new entry is appended only
 *   when apachesolr_node_to_document() returns a non-empty value.
 * @param $nid
 *   ID of the node to convert.
 * @param $namespace
 *   Passed through unchanged to apachesolr_node_to_document().
 */
function apachesolr_add_node_document(&$documents, $nid, $namespace) {
  $document = apachesolr_node_to_document($nid, $namespace);
  if (!$document) {
    // Nothing to index for this node.
    return;
  }
  $documents[] = $document;
}
|
| 17 |
|
| 18 |
/**
 * Strip control characters that cause Jetty/Solr to fail.
 *
 * @param $text
 *   The string to clean.
 *
 * @return
 *   $text with every byte in the ranges 0x00-0x08, 0x0B, 0x0C and 0x0E-0x1F
 *   replaced by a single space. Tab (0x09), line feed (0x0A) and carriage
 *   return (0x0D) are left untouched.
 */
function apachesolr_strip_ctl_chars($text) {
  // See: http://w3.org/International/questions/qa-forms-utf-8.html
  // Printable utf-8 does not include any of these chars below x7F
  $control_chars = '@[\x00-\x08\x0B\x0C\x0E-\x1F]@';
  $replacement = ' ';
  return preg_replace($control_chars, $replacement, $text);
}
|
| 26 |
|
| 27 |
/**
 * Strip html tags and also control characters that cause Jetty/Solr to fail.
 *
 * Control characters are replaced first, by apachesolr_strip_ctl_chars(), so
 * that the exact set of characters removed is defined in one place only. The
 * result is then passed through _apachesolr_strip_decode().
 *
 * @param $text
 *   The raw string to clean.
 *
 * @return
 *   The value returned by _apachesolr_strip_decode() for the text with its
 *   control characters replaced by spaces.
 */
function apachesolr_clean_text($text) {
  return _apachesolr_strip_decode(apachesolr_strip_ctl_chars($text));
}
|
| 33 |
|
| 34 |
/**
 * Helper function - strip markup from text and normalize its entities.
 *
 * @param $text
 *   The string to process. Control characters are not handled here.
 *
 * @return
 *   The text after filter_xss() was applied with an empty list of allowed
 *   tags, with entities decoded and then re-encoded by htmlspecialchars()
 *   using ENT_NOQUOTES (quotes are left as literal characters).
 */
function _apachesolr_strip_decode($text) {
  // Add spaces before stripping tags to avoid running words together.
  $spaced = str_replace(array('<', '>'), array(' <', '> '), $text);
  $stripped = filter_xss($spaced, array());
  // Decode entities and then make safe any < or > characters.
  $decoded = html_entity_decode($stripped, ENT_NOQUOTES, 'UTF-8');
  return htmlspecialchars($decoded, ENT_NOQUOTES, 'UTF-8');
}
|
| 40 |
|
| 41 |
/**
 * Given a node ID, return a document representing that node.
 *
 * @param $nid
 *   ID of the node to load and convert.
 * @param $namespace
 *   Passed as an extra argument to hook_apachesolr_node_exclude()
 *   implementations.
 *
 * @return
 *   An Apache_Solr_Document populated from the node, or FALSE when the node
 *   could not be loaded or at least one module asked for it to be excluded.
 */
function apachesolr_node_to_document($nid, $namespace) {
  // Set reset = TRUE to avoid static caching of all nodes that get indexed.
  $node = node_load($nid, NULL, TRUE);
  if (empty($node)) {
    return FALSE;
  }

  // Let any module exclude this node from the index. Every implementation is
  // invoked, even after one of them has already requested exclusion.
  $excluded = FALSE;
  foreach (module_implements('apachesolr_node_exclude') as $module) {
    if (module_invoke($module, 'apachesolr_node_exclude', $node, $namespace)) {
      $excluded = TRUE;
    }
  }
  if ($excluded) {
    return FALSE;
  }

  // Build the node body.
  $node->build_mode = NODE_BUILD_SEARCH_INDEX;
  $node = node_build_content($node, FALSE, FALSE);
  $node->body = drupal_render($node->content);
  $node->title = apachesolr_clean_text($node->title);

  // Read the rendered body before invoking nodeapi, preserving the original
  // statement order.
  $text = $node->body;

  // Fetch extra data normally not visible, including comments.
  $extra = node_invoke_nodeapi($node, 'update index');
  $text = apachesolr_strip_ctl_chars($text . "\n\n" . implode(' ', $extra));

  $document = new Apache_Solr_Document();
  $document->id = apachesolr_document_id($node->nid);
  $document->site = url(NULL, array('absolute' => TRUE));
  $document->hash = apachesolr_site_hash();
  $document->nid = $node->nid;
  $document->uid = $node->uid;
  $document->title = $node->title;
  $document->status = $node->status;
  $document->sticky = $node->sticky;
  $document->promote = $node->promote;
  $document->moderate = $node->moderate;
  $document->tnid = $node->tnid;
  $document->translate = $node->translate;
  if (!empty($node->language)) {
    $document->language = $node->language;
  }
  $document->body = _apachesolr_strip_decode($text);
  $document->type = $node->type;
  $document->type_name = apachesolr_strip_ctl_chars(node_get_types('name', $node));
  $document->created = apachesolr_date_iso($node->created);
  $document->changed = apachesolr_date_iso($node->changed);

  // Use whichever is more recent: the last comment or the last node change.
  $last_change = $node->changed;
  if (isset($node->last_comment_timestamp) && $node->last_comment_timestamp > $node->changed) {
    $last_change = $node->last_comment_timestamp;
  }
  $document->last_comment_or_change = apachesolr_date_iso($last_change);

  $comment_count = 0;
  if (isset($node->comment_count)) {
    $comment_count = $node->comment_count;
  }
  $document->comment_count = $comment_count;
  $document->name = apachesolr_strip_ctl_chars($node->name);

  $node_path = 'node/' . $node->nid;
  $document->url = url($node_path, array('absolute' => TRUE));
  $document->path = $node_path;
  // Path aliases can have important information about the content.
  // Add them to the index as well.
  if (function_exists('drupal_get_path_alias')) {
    // Add any path alias to the index, looking first for language specific
    // aliases but using language neutral aliases otherwise.
    $langcode = empty($node->language) ? '' : $node->language;
    $alias = drupal_get_path_alias($node_path, $langcode);
    if ($alias && $alias != $node_path) {
      $document->path_alias = apachesolr_strip_ctl_chars($alias);
    }
  }

  // Get CCK fields list
  foreach (apachesolr_cck_fields() as $key => $cck_info) {
    if (!isset($node->$key)) {
      continue;
    }
    // Got a CCK field. See if it is to be indexed.
    $callback = $cck_info['callback'];
    $items = ($callback && function_exists($callback)) ? $callback($node, $key) : $node->$key;
    $index_key = apachesolr_index_key($cck_info);
    foreach ($items as $item) {
      // Don't index NULLs or empty strings
      // We can use 'value' rather than 'safe' since we strip tags and later check_plain().
      if (!isset($item['value']) || !strlen($item['value'])) {
        continue;
      }
      $clean_value = apachesolr_clean_text($item['value']);
      if ($cck_info['multiple']) {
        $document->setMultiValue($index_key, $clean_value);
      }
      else {
        $document->$index_key = $clean_value;
      }
    }
  }

  // Index book module data.
  if (!empty($node->book['bid'])) {
    // Hard-coded - must change if apachesolr_index_key() changes.
    $document->is_book_bid = (int) $node->book['bid'];
  }
  apachesolr_add_tags_to_document($document, $text);
  apachesolr_add_taxonomy_to_document($document, $node);

  // Let modules add to the document - TODO convert to drupal_alter().
  foreach (module_implements('apachesolr_update_index') as $module) {
    $hook = $module .'_apachesolr_update_index';
    $hook($document, $node);
  }

  return $document;
}
|
| 159 |
|
| 160 |
/**
 * Extract taxonomy from $node and add to dynamic fields.
 *
 * Does nothing when $node->taxonomy is missing or is not an array.
 *
 * @param $document
 *   The Solr document to add fields to, passed by reference.
 * @param $node
 *   The node whose $node->taxonomy terms are indexed.
 */
function apachesolr_add_taxonomy_to_document(&$document, $node) {
  if (!isset($node->taxonomy) || !is_array($node->taxonomy)) {
    return;
  }

  foreach ($node->taxonomy as $term) {
    // Double indexing of tids lets us do efficient searches (on tid)
    // and do accurate per-vocabulary faceting.

    // By including the ancestors to a term in the index we make
    // sure that searches for general categories match specific
    // categories, e.g. Fruit -> apple, a search for fruit will find
    // content categorized with apple.
    foreach (taxonomy_get_parents_all($term->tid) as $ancestor) {
      $tid = $ancestor->tid;
      $vid = $ancestor->vid;

      $document->setMultiValue('tid', $tid);
      $document->setMultiValue('im_vid_'. $vid, $tid);
      $name = apachesolr_clean_text($ancestor->name);
      $document->setMultiValue('vid', $vid);

      // Space-separated list of term names for this vocabulary.
      $names_field = 'ts_vid_'. $vid .'_names';
      $document->$names_field .= ' '. $name;

      // We index each name as a string for cross-site faceting
      // using the vocab name rather than vid in field construction.
      $document->setMultiValue('sm_vid_'. apachesolr_vocab_name($vid), $name);
    }
  }
}
|
| 187 |
|
| 188 |
/**
 * Helper function - return a safe (PHP identifier) vocabulary name.
 *
 * The vocabulary name is read from the {vocabulary} table and every byte
 * other than a-z, A-Z, 0-9, '_' and 0x7F-0xFF is replaced by '_'. Results
 * are cached in a static variable for the rest of the request.
 *
 * @param $vid
 *   The vocabulary ID.
 *
 * @return
 *   The sanitized vocabulary name, or '_' . $vid . '_' when the sanitized
 *   name is empty or consists only of underscores (which includes the case
 *   where no name was found for $vid).
 */
function apachesolr_vocab_name($vid) {
  static $names = array();

  if (!isset($names[$vid])) {
    $vocab_name = db_result(db_query('SELECT v.name FROM {vocabulary} v WHERE v.vid = %d', $vid));
    $safe_name = preg_replace('/[^a-zA-Z0-9_\x7f-\xff]/', '_', (string) $vocab_name);
    // Fallback for names ending up all as '_'. Compare against the empty
    // string explicitly: a plain truthiness test would also reject a valid
    // name such as "0", because PHP treats the string "0" as FALSE.
    if (rtrim($safe_name, '_') === '') {
      $safe_name = '_' . $vid . '_';
    }
    $names[$vid] = $safe_name;
  }
  return $names[$vid];
}
|
| 205 |
|
| 206 |
/**
 * Extract HTML tag contents from $text and add to boost fields.
 *
 * $text must be stripped of control characters before hand.
 *
 * The tag-to-field map comes from the 'apachesolr_tags_to_index' variable.
 * Tags are matched case-insensitively, so "<H1>" is indexed into the same
 * field as "<h1>".
 *
 * @param $document
 *   The Solr document to add fields to, passed by reference. The contents of
 *   each matched tag are appended, preceded by a space, to the mapped field.
 * @param $text
 *   The HTML text to scan.
 */
function apachesolr_add_tags_to_document(&$document, $text) {
  $tags_to_index = variable_get('apachesolr_tags_to_index', array(
    'h1' => 'tags_h1',
    'h2' => 'tags_h2_h3',
    'h3' => 'tags_h2_h3',
    'h4' => 'tags_h4_h5_h6',
    'h5' => 'tags_h4_h5_h6',
    'h6' => 'tags_h4_h5_h6',
    'u' => 'tags_inline',
    'b' => 'tags_inline',
    'i' => 'tags_inline',
    'strong' => 'tags_inline',
    'em' => 'tags_inline',
    'a' => 'tags_a'
  ));

  // Strip off all ignored tags.
  $text = strip_tags($text, '<'. implode('><', array_keys($tags_to_index)) .'>');

  preg_match_all('@<('. implode('|', array_keys($tags_to_index)) .')[^>]*>(.*)</\1>@Ui', $text, $matches);
  foreach ($matches[1] as $key => $tag) {
    // The pattern is case-insensitive, so the captured tag name can be upper
    // or mixed case ("H1", "A"). Prefer a key in the exact captured case and
    // fall back to the lower-case key, so upper-case markup is not lost.
    $lower_tag = strtolower($tag);
    if (isset($tags_to_index[$tag])) {
      $field = $tags_to_index[$tag];
    }
    elseif (isset($tags_to_index[$lower_tag])) {
      $field = $tags_to_index[$lower_tag];
    }
    else {
      // No field is mapped for this tag; nothing to index.
      continue;
    }

    // We don't want to index links auto-generated by the url filter.
    if ($lower_tag != 'a' || !preg_match('@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9]+@', $matches[2][$key])) {
      $document->{$field} .= ' '. $matches[2][$key];
    }
  }
}
|
| 238 |
|