2 // $Id: search.module 144 2007-03-28 07:52:20Z thierry $
6 * Enables site-wide keyword searching.
10 * Matches Unicode character classes to exclude from the search index.
12 * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
14 * The index only contains the following character classes:
15 * Lu Letter, Uppercase
16 * Ll Letter, Lowercase
17 * Lt Letter, Titlecase
19 * Nd Number, Decimal Digit
22 define('PREG_CLASS_SEARCH_EXCLUDE',
23 '\x{0}-\x{2f}\x{3a}-\x{40}\x{5b}-\x{60}\x{7b}-\x{bf}\x{d7}\x{f7}\x{2b0}-'.
24 '\x{385}\x{387}\x{3f6}\x{482}-\x{489}\x{559}-\x{55f}\x{589}-\x{5c7}\x{5f3}-'.
25 '\x{61f}\x{640}\x{64b}-\x{65e}\x{66a}-\x{66d}\x{670}\x{6d4}\x{6d6}-\x{6ed}'.
26 '\x{6fd}\x{6fe}\x{700}-\x{70f}\x{711}\x{730}-\x{74a}\x{7a6}-\x{7b0}\x{901}-'.
27 '\x{903}\x{93c}\x{93e}-\x{94d}\x{951}-\x{954}\x{962}-\x{965}\x{970}\x{981}-'.
28 '\x{983}\x{9bc}\x{9be}-\x{9cd}\x{9d7}\x{9e2}\x{9e3}\x{9f2}-\x{a03}\x{a3c}-'.
29 '\x{a4d}\x{a70}\x{a71}\x{a81}-\x{a83}\x{abc}\x{abe}-\x{acd}\x{ae2}\x{ae3}'.
30 '\x{af1}-\x{b03}\x{b3c}\x{b3e}-\x{b57}\x{b70}\x{b82}\x{bbe}-\x{bd7}\x{bf0}-'.
31 '\x{c03}\x{c3e}-\x{c56}\x{c82}\x{c83}\x{cbc}\x{cbe}-\x{cd6}\x{d02}\x{d03}'.
32 '\x{d3e}-\x{d57}\x{d82}\x{d83}\x{dca}-\x{df4}\x{e31}\x{e34}-\x{e3f}\x{e46}-'.
33 '\x{e4f}\x{e5a}\x{e5b}\x{eb1}\x{eb4}-\x{ebc}\x{ec6}-\x{ecd}\x{f01}-\x{f1f}'.
34 '\x{f2a}-\x{f3f}\x{f71}-\x{f87}\x{f90}-\x{fd1}\x{102c}-\x{1039}\x{104a}-'.
35 '\x{104f}\x{1056}-\x{1059}\x{10fb}\x{10fc}\x{135f}-\x{137c}\x{1390}-\x{1399}'.
36 '\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16f0}\x{1712}-\x{1714}'.
37 '\x{1732}-\x{1736}\x{1752}\x{1753}\x{1772}\x{1773}\x{17b4}-\x{17db}\x{17dd}'.
38 '\x{17f0}-\x{180e}\x{1843}\x{18a9}\x{1920}-\x{1945}\x{19b0}-\x{19c0}\x{19c8}'.
39 '\x{19c9}\x{19de}-\x{19ff}\x{1a17}-\x{1a1f}\x{1d2c}-\x{1d61}\x{1d78}\x{1d9b}-'.
40 '\x{1dc3}\x{1fbd}\x{1fbf}-\x{1fc1}\x{1fcd}-\x{1fcf}\x{1fdd}-\x{1fdf}\x{1fed}-'.
41 '\x{1fef}\x{1ffd}-\x{2070}\x{2074}-\x{207e}\x{2080}-\x{2101}\x{2103}-\x{2106}'.
42 '\x{2108}\x{2109}\x{2114}\x{2116}-\x{2118}\x{211e}-\x{2123}\x{2125}\x{2127}'.
43 '\x{2129}\x{212e}\x{2132}\x{213a}\x{213b}\x{2140}-\x{2144}\x{214a}-\x{2b13}'.
44 '\x{2ce5}-\x{2cff}\x{2d6f}\x{2e00}-\x{3005}\x{3007}-\x{303b}\x{303d}-\x{303f}'.
45 '\x{3099}-\x{309e}\x{30a0}\x{30fb}-\x{30fe}\x{3190}-\x{319f}\x{31c0}-\x{31cf}'.
46 '\x{3200}-\x{33ff}\x{4dc0}-\x{4dff}\x{a015}\x{a490}-\x{a716}\x{a802}\x{a806}'.
47 '\x{a80b}\x{a823}-\x{a82b}\x{d800}-\x{f8ff}\x{fb1e}\x{fb29}\x{fd3e}\x{fd3f}'.
48 '\x{fdfc}-\x{fe6b}\x{feff}-\x{ff0f}\x{ff1a}-\x{ff20}\x{ff3b}-\x{ff40}\x{ff5b}-'.
49 '\x{ff65}\x{ff70}\x{ff9e}\x{ff9f}\x{ffe0}-\x{fffd}');
52 * Matches all 'N' Unicode character classes (numbers)
54 define('PREG_CLASS_NUMBERS',
55 '\x{30}-\x{39}\x{b2}\x{b3}\x{b9}\x{bc}-\x{be}\x{660}-\x{669}\x{6f0}-\x{6f9}'.
56 '\x{966}-\x{96f}\x{9e6}-\x{9ef}\x{9f4}-\x{9f9}\x{a66}-\x{a6f}\x{ae6}-\x{aef}'.
57 '\x{b66}-\x{b6f}\x{be7}-\x{bf2}\x{c66}-\x{c6f}\x{ce6}-\x{cef}\x{d66}-\x{d6f}'.
58 '\x{e50}-\x{e59}\x{ed0}-\x{ed9}\x{f20}-\x{f33}\x{1040}-\x{1049}\x{1369}-'.
59 '\x{137c}\x{16ee}-\x{16f0}\x{17e0}-\x{17e9}\x{17f0}-\x{17f9}\x{1810}-\x{1819}'.
60 '\x{1946}-\x{194f}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}\x{2153}-\x{2183}'.
61 '\x{2460}-\x{249b}\x{24ea}-\x{24ff}\x{2776}-\x{2793}\x{3007}\x{3021}-\x{3029}'.
62 '\x{3038}-\x{303a}\x{3192}-\x{3195}\x{3220}-\x{3229}\x{3251}-\x{325f}\x{3280}-'.
63 '\x{3289}\x{32b1}-\x{32bf}\x{ff10}-\x{ff19}');
66 * Matches all 'P' Unicode character classes (punctuation)
68 define('PREG_CLASS_PUNCTUATION',
69 '\x{21}-\x{23}\x{25}-\x{2a}\x{2c}-\x{2f}\x{3a}\x{3b}\x{3f}\x{40}\x{5b}-\x{5d}'.
70 '\x{5f}\x{7b}\x{7d}\x{a1}\x{ab}\x{b7}\x{bb}\x{bf}\x{37e}\x{387}\x{55a}-\x{55f}'.
71 '\x{589}\x{58a}\x{5be}\x{5c0}\x{5c3}\x{5f3}\x{5f4}\x{60c}\x{60d}\x{61b}\x{61f}'.
72 '\x{66a}-\x{66d}\x{6d4}\x{700}-\x{70d}\x{964}\x{965}\x{970}\x{df4}\x{e4f}'.
73 '\x{e5a}\x{e5b}\x{f04}-\x{f12}\x{f3a}-\x{f3d}\x{f85}\x{104a}-\x{104f}\x{10fb}'.
74 '\x{1361}-\x{1368}\x{166d}\x{166e}\x{169b}\x{169c}\x{16eb}-\x{16ed}\x{1735}'.
75 '\x{1736}\x{17d4}-\x{17d6}\x{17d8}-\x{17da}\x{1800}-\x{180a}\x{1944}\x{1945}'.
76 '\x{2010}-\x{2027}\x{2030}-\x{2043}\x{2045}-\x{2051}\x{2053}\x{2054}\x{2057}'.
77 '\x{207d}\x{207e}\x{208d}\x{208e}\x{2329}\x{232a}\x{23b4}-\x{23b6}\x{2768}-'.
78 '\x{2775}\x{27e6}-\x{27eb}\x{2983}-\x{2998}\x{29d8}-\x{29db}\x{29fc}\x{29fd}'.
79 '\x{3001}-\x{3003}\x{3008}-\x{3011}\x{3014}-\x{301f}\x{3030}\x{303d}\x{30a0}'.
80 '\x{30fb}\x{fd3e}\x{fd3f}\x{fe30}-\x{fe52}\x{fe54}-\x{fe61}\x{fe63}\x{fe68}'.
81 '\x{fe6a}\x{fe6b}\x{ff01}-\x{ff03}\x{ff05}-\x{ff0a}\x{ff0c}-\x{ff0f}\x{ff1a}'.
82 '\x{ff1b}\x{ff1f}\x{ff20}\x{ff3b}-\x{ff3d}\x{ff3f}\x{ff5b}\x{ff5d}\x{ff5f}-'.
86 * Matches all CJK characters that are candidates for auto-splitting
87 * (Chinese, Japanese, Korean).
88 * Contains kana and BMP ideographs.
90 define('PREG_CLASS_CJK', '\x{3041}-\x{30ff}\x{31f0}-\x{31ff}\x{3400}-\x{4db5}'.
91 '\x{4e00}-\x{9fbb}\x{f900}-\x{fad9}');
94 * Implementation of hook_help().
96 function search_help($section) {
98 case 'admin/help#search':
99 $output = '<p>'. t('The search module adds the ability to search for content by keywords. Search is often the only practical way to find content on a large site. Search is useful for finding users and posts by searching on keywords.') .'</p>';
100 $output .= '<p>'. t('The search engine works by maintaining an index of the words in your site\'s content. It indexes the posts and users. You can adjust the settings to tweak the indexing behaviour. Note that the search requires cron to be set up correctly. The index percentage sets the maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.') .'</p>';
101 $output .= t('<p>You can</p>
103 <li>read about how your site uses cron in the <a href="%admin-help-system">administer >> help >> system</a>.</li>
104 <li>run your <a href="%file-cron">cron.php</a>.</li>
105 <li>read about <a href="%external-http-drupal-org-node-23714">configuring cron jobs</a>.</li>
106 <li><a href="%admin-settings-search">administer >> settings >> search</a>.</li></ul>
107 ', array('%admin-help-system' => url('admin/help/system'), '%file-cron' => 'cron.php', '%external-http-drupal-org-node-23714' => 'http://drupal.org/node/23714', '%admin-settings-search' => url('admin/settings/search')));
108 $output .= '<p>'. t('For more information please read the configuration and customization handbook <a href="%search">Search page</a>.', array('%search' => 'http://drupal.org/handbook/modules/search/')) .'</p>';
110 case 'admin/modules#description':
111 return t('Enables site-wide keyword searching.');
112 case 'admin/settings/search':
114 <p>The search engine works by maintaining an index of the words in your site\'s content. You can adjust the settings below to tweak the indexing behaviour. Note that the search requires cron to be set up correctly.</p>
116 case 'search#noresults':
118 <li>Check if your spelling is correct.</li>
119 <li>Remove quotes around phrases to match each word individually: <em>"blue smurf"</em> will match less than <em>blue smurf</em>.</li>
120 <li>Consider loosening your query with <em>OR</em>: <em>blue smurf</em> will match less than <em>blue OR smurf</em>.</li>
126 * Implementation of hook_perm().
128 function search_perm() {
129 return array('search content', 'administer search');
133 * Implementation of hook_block().
135 function search_block($op = 'list', $delta = 0) {
137 $blocks[0]['info'] = t('Search form');
140 else if ($op == 'view' && user_access('search content')) {
141 $block['content'] = search_box('search_block_form');
142 $block['subject'] = t('Search');
148 * Implementation of hook_menu().
150 function search_menu($may_cache) {
154 $items[] = array('path' => 'search', 'title' => t('search'),
155 'callback' => 'search_view',
156 'access' => user_access('search content'),
157 'type' => MENU_SUGGESTED_ITEM);
158 $items[] = array('path' => 'admin/settings/search/wipe', 'title' => t('Clear index'),
159 'callback' => 'search_wipe_confirm',
160 'access' => user_access('administer search'),
161 'type' => MENU_CALLBACK);
163 else if (arg(0) == 'search') {
164 // To remember the user's search keywords when switching across tabs,
165 // we dynamically add the keywords to the search tabs' paths.
166 $keys = search_get_keys();
167 $keys = strlen($keys) ? '/'. $keys : '';
168 foreach (module_list() as $name) {
169 if (module_hook($name, 'search') && $title = module_invoke($name, 'search', 'name')) {
170 $items[] = array('path' => 'search/'. $name . $keys, 'title' => $title,
171 'callback' => 'search_view',
172 'access' => user_access('search content'),
173 'type' => MENU_LOCAL_TASK);
182 * Implementation of hook_validate().
184 function search_settings_form_validate($form_id, &$form) {
185 if ($_POST['op'] == t('Re-index site')) {
186 drupal_goto('admin/settings/search/wipe');
188 // If these settings change, the index needs to be rebuilt.
189 if ((variable_get('minimum_word_size', 3) != $form['minimum_word_size']) ||
190 (variable_get('overlap_cjk', true) != $form['overlap_cjk'])) {
191 drupal_set_message(t('The index will be rebuilt.'));
197 * Menu callback; displays the search module settings page.
199 function search_settings() {
200 // Collect some stats
203 foreach (module_list() as $module) {
204 if (module_hook($module, 'search')) {
205 $status = module_invoke($module, 'search', 'status');
206 $remaining += $status['remaining'];
207 $total += $status['total'];
210 $count = format_plural($remaining, 'There is 1 item left to index.', 'There are %count items left to index.');
211 $percentage = ((int)min(100, 100 * ($total - $remaining) / max(1, $total))) . '%';
212 $status = '<p><strong>'. t('%percentage of the site has been indexed.', array('%percentage' => $percentage)) .' '. $count .'</strong></p>';
213 $form['status'] = array('#type' => 'fieldset', '#title' => t('Indexing status'));
214 $form['status']['status'] = array('#type' => 'markup', '#value' => $status);
215 $form['status']['wipe'] = array('#type' => 'submit', '#value' => t('Re-index site'));
217 $items = drupal_map_assoc(array(10, 20, 50, 100, 200, 500));
219 // Indexing throttle:
220 $form['indexing_throttle'] = array('#type' => 'fieldset', '#title' => t('Indexing throttle'));
221 $form['indexing_throttle']['search_cron_limit'] = array('#type' => 'select', '#title' => t('Items to index per cron run'), '#default_value' => variable_get('search_cron_limit', 100), '#options' => $items, '#description' => t('The maximum amount of items that will be indexed in one cron run. Set this number lower if your cron is timing out or if PHP is running out of memory.'));
222 // Indexing settings:
223 $form['indexing_settings'] = array('#type' => 'fieldset', '#title' => t('Indexing settings'));
224 $form['indexing_settings']['info'] = array('#type' => 'markup', '#value' => '<em>'. t('<p>Changing the settings below will cause the site index to be rebuilt. The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed.</p><p>The default settings should be appropriate for the majority of sites.</p>') .'</em>');
225 $form['indexing_settings']['minimum_word_size'] = array('#type' => 'textfield', '#title' => t('Minimum word length to index'), '#default_value' => variable_get('minimum_word_size', 3), '#size' => 5, '#maxlength' => 3, '#description' => t('The number of characters a word has to be to be indexed. A lower setting means better search result ranking, but also a larger database. Each search query must contain at least one keyword that is this size (or longer).'));
226 $form['indexing_settings']['overlap_cjk'] = array('#type' => 'checkbox', '#title' => t('Simple CJK handling'), '#default_value' => variable_get('overlap_cjk', true), '#description' => t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Turn this off if you want to use an external preprocessor for this instead. Does not affect other languages.'));
228 // Per module settings
229 $form = array_merge($form, module_invoke_all('search', 'admin'));
234 * Menu callback: confirm wiping of the index.
236 function search_wipe_confirm() {
237 return confirm_form('search_wipe_confirm', $form, t('Are you sure you want to re-index the site?'),
238 'admin/settings/search', t(' The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed. This action cannot be undone.'), t('Re-index site'), t('Cancel'));
242 * Handler for wipe confirmation
244 function search_wipe_confirm_submit($form_id, &$form) {
245 if ($form['confirm']) {
247 drupal_set_message(t('The index will be rebuilt.'));
248 return 'admin/settings/search';
253 * Wipes a part of or the entire search index.
256 * (optional) The SID of the item to wipe. If specified, $type must be passed
259 * (optional) The type of item to wipe.
261 function search_wipe($sid = NULL, $type = NULL, $reindex = FALSE) {
262 if ($type == NULL && $sid == NULL) {
263 module_invoke_all('search', 'reset');
266 db_query("DELETE FROM {search_dataset} WHERE sid = %d AND type = '%s'", $sid, $type);
267 db_query("DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'", $sid, $type);
268 // When re-indexing, keep link references
269 db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'". ($reindex ? " AND fromsid = 0" : ''), $sid, $type);
274 * Marks a word as dirty (or retrieves the list of dirty words). This is used
275 * during indexing (cron). Words which are dirty have outdated total counts in
276 * the search_total table, and need to be recounted.
278 function search_dirty($word = null) {
279 static $dirty = array();
280 if ($word !== null) {
281 $dirty[$word] = true;
289 * Implementation of hook_cron().
291 * Fires hook_update_index() in all modules and cleans up dirty words (see
294 function search_cron() {
295 // We register a shutdown function to ensure that search_total is always up
297 register_shutdown_function('search_update_totals');
300 foreach (module_list() as $module) {
301 module_invoke($module, 'update_index');
306 * This function is called on shutdown to ensure that search_total is always
307 * up to date (even if cron times out or otherwise fails).
309 function search_update_totals() {
310 // Update word IDF (Inverse Document Frequency) counts for new/changed words
311 foreach (search_dirty() as $word => $dummy) {
313 $total = db_result(db_query("SELECT SUM(score) FROM {search_index} WHERE word = '%s'", $word));
314 // Apply Zipf's law to equalize the probability distribution
315 $total = log10(1 + 1/(max(1, $total)));
316 db_query("UPDATE {search_total} SET count = %f WHERE word = '%s'", $total, $word);
317 if (!db_affected_rows()) {
318 db_query("INSERT INTO {search_total} (word, count) VALUES ('%s', %f)", $word, $total);
321 // Find words that were deleted from search_index, but are still in
322 // search_total. We use a LEFT JOIN between the two tables and keep only the
323 // rows which fail to join.
324 $result = db_query("SELECT t.word AS realword, i.word FROM {search_total} t LEFT JOIN {search_index} i ON t.word = i.word WHERE i.word IS NULL");
325 while ($word = db_fetch_object($result)) {
326 db_query("DELETE FROM {search_total} WHERE word = '%s'", $word->realword);
331 * Simplifies a string according to indexing rules.
333 function search_simplify($text) {
334 // Decode entities to UTF-8
335 $text = decode_entities($text);
338 $text = drupal_strtolower($text);
340 // Call an external processor for word handling.
341 search_preprocess($text);
343 // Simple CJK handling
344 if (variable_get('overlap_cjk', true)) {
345 $text = preg_replace_callback('/['. PREG_CLASS_CJK .']+/u', 'search_expand_cjk', $text);
348 // To improve searching for numerical data such as dates, IP addresses
349 // or version numbers, we consider a group of numerical characters
350 // separated only by punctuation characters to be one piece.
351 // This also means that searching for e.g. '20/03/1984' also returns
352 // results with '20-03-1984' in them.
353 // Readable regexp: ([number]+)[punctuation]+(?=[number])
354 $text = preg_replace('/(['. PREG_CLASS_NUMBERS .']+)['. PREG_CLASS_PUNCTUATION .']+(?=['. PREG_CLASS_NUMBERS .'])/u', '\1', $text);
356 // The dot, underscore and dash are simply removed. This allows meaningful
357 // search behaviour with acronyms and URLs.
358 $text = preg_replace('/[._-]+/', '', $text);
360 // With the exception of the rules above, we consider all punctuation,
361 // marks, spacers, etc, to be a word boundary.
362 $text = preg_replace('/['. PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text);
368 * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
369 * sequences of characters ('minimum_word_size' long).
371 function search_expand_cjk($matches) {
372 $min = variable_get('minimum_word_size', 3);
374 $l = drupal_strlen($str);
375 // Passthrough short words
377 return ' '. $str .' ';
380 // FIFO queue of characters
383 for ($i = 0; $i < $l; ++$i) {
384 // Grab next character
385 $current = drupal_substr($str, 0, 1);
386 $str = substr($str, strlen($current));
388 if ($i >= $min - 1) {
389 $tokens .= implode('', $chars) .' ';
397 * Splits a string into tokens for indexing.
399 function search_index_split($text) {
401 static $lastsplit = null;
403 if ($last == $text) {
407 $text = search_simplify($text);
408 $words = explode(' ', $text);
409 array_walk($words, '_search_index_truncate');
411 // Save last keyword result
419 * Helper function for array_walk in search_index_split.
421 function _search_index_truncate(&$text) {
422 $text = truncate_utf8($text, 50);
426 * Invokes hook_search_preprocess() in modules.
428 function search_preprocess(&$text) {
429 foreach (module_implements('search_preprocess') as $module) {
430 $text = module_invoke($module, 'search_preprocess', $text);
435 * Update the full-text search index for a particular item.
438 * A number identifying this particular item (e.g. node id).
441 * A string defining this type of item (e.g. 'node')
444 * The content of this item. Must be a piece of HTML text.
448 function search_index($sid, $type, $text) {
449 $minimum_word_size = variable_get('minimum_word_size', 3);
453 $node_regexp = '@href=[\'"]?(?:'. preg_quote($base_url, '@') .'/|'. preg_quote(base_path(), '@') .')(?:\?q=)?/?((?![a-z]+:)[^\'">]+)[\'">]@i';
455 // Multipliers for scores of words inside certain HTML tags.
456 // Note: 'a' must be included for link ranking to work.
457 $tags = array('h1' => 25,
470 // Strip off all ignored tags to speed up processing, but insert space before/after
471 // them to keep word boundaries.
472 $text = str_replace(array('<', '>'), array(' <', '> '), $text);
473 $text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');
475 // Split HTML tags from plain text.
476 $split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
477 // Note: PHP ensures the array consists of alternating delimiters and literals
478 // and begins and ends with a literal (inserting $null as required).
480 $tag = false; // Odd/even counter. Tag or no tag.
481 $link = false; // State variable for link analyser
482 $score = 1; // Starting score per word
483 $accum = ' '; // Accumulator for cleaned up data
484 $tagstack = array(); // Stack with open tags
485 $tagwords = 0; // Counter for consecutive words
486 $focus = 1; // Focus state
488 $results = array(0 => array()); // Accumulator for words for index
490 foreach ($split as $value) {
492 // Increase or decrease score per word based on tag
493 list($tagname) = explode(' ', $value, 2);
494 $tagname = drupal_strtolower($tagname);
495 // Closing or opening tag?
496 if ($tagname[0] == '/') {
497 $tagname = substr($tagname, 1);
498 // If we encounter unexpected tags, reset score to avoid incorrect boosting.
499 if (!count($tagstack) || $tagstack[0] != $tagname) {
504 // Remove from tag stack and decrement score
505 $score = max(1, $score - $tags[array_shift($tagstack)]);
507 if ($tagname == 'a') {
512 if ($tagstack[0] == $tagname) {
513 // None of the tags we look for make sense when nested identically.
514 // If they are, it's probably broken HTML.
519 // Add to open tag stack and increment score
520 array_unshift($tagstack, $tagname);
521 $score += $tags[$tagname];
523 if ($tagname == 'a') {
524 // Check if link points to a node on this site
525 if (preg_match($node_regexp, $value, $match)) {
526 $path = drupal_get_normal_path($match[1]);
527 if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
528 $linknid = $match[1];
530 // Note: ignore links to uncachable nodes to avoid redirect bugs.
531 $node = db_fetch_object(db_query('SELECT n.title, n.nid, n.vid, r.format FROM {node} n INNER JOIN {node_revisions} r ON n.vid = r.vid WHERE n.nid = %d', $linknid));
532 if (filter_format_allowcache($node->format)) {
534 $linktitle = $node->title;
541 // A tag change occurred, reset counter.
545 // Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
548 // Check to see if the node link text is its URL. If so, we use the target node title instead.
549 if (preg_match('!^https?://!i', $value)) {
553 $words = search_index_split($value);
554 foreach ($words as $word) {
555 // Add word to accumulator
556 $accum .= $word .' ';
557 $num = is_numeric($word);
559 if ($num || drupal_strlen($word) >= $minimum_word_size) {
562 $word = (int)ltrim($word, '-0');
566 if (!isset($results[$linknid])) {
567 $results[$linknid] = array();
569 $results[$linknid][$word] += $score * $focus;
572 $results[0][$word] += $score * $focus;
573 // Focus is a decaying value in terms of the amount of unique words up to this point.
574 // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
575 $focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015));
579 // Too many words inside a single tag probably mean a tag was accidentally left open.
580 if (count($tagstack) && $tagwords >= 15) {
590 search_wipe($sid, $type, TRUE);
592 // Insert cleaned up data into dataset
593 db_query("INSERT INTO {search_dataset} (sid, type, data) VALUES (%d, '%s', '%s')", $sid, $type, $accum);
595 // Insert results into search index
596 foreach ($results[0] as $word => $score) {
597 db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %f)", $word, $sid, $type, $score);
602 // Now insert links to nodes
603 foreach ($results as $nid => $words) {
604 foreach ($words as $word => $score) {
605 db_query("INSERT INTO {search_index} (word, sid, type, fromsid, fromtype, score) VALUES ('%s', %d, '%s', %d, '%s', %f)", $word, $nid, 'node', $sid, $type, $score);
612 * Extract a module-specific search option from a search query. e.g. 'type:book'
614 function search_query_extract($keys, $option) {
615 if (preg_match('/(^| )'. $option .':([^ ]*)( |$)/i', $keys, $matches)) {
621 * Return a query with the given module-specific search option inserted in.
624 function search_query_insert($keys, $option, $value = '') {
625 if (search_query_extract($keys, $option)) {
626 $keys = trim(preg_replace('/(^| )'. $option .':[^ ]*/i', '', $keys));
629 $keys .= ' '. $option .':'. $value;
635 * Parse a search query into SQL conditions.
637 * We build a query that matches the dataset bodies.
639 function search_parse_query($text) {
640 $keys = array('positive' => array(), 'negative' => array());
642 // Tokenize query string
643 preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' '. $text, $matches, PREG_SET_ORDER);
645 if (count($matches) < 1) {
651 foreach ($matches as $match) {
653 // Strip off phrase quotes
654 if ($match[2]{0} == '"') {
655 $match[2] = substr($match[2], 1, -1);
658 // Simplify keyword according to indexing rules and external preprocessors
659 $words = search_simplify($match[2]);
660 // Re-explode in case simplification added more words, except when matching a phrase
661 $words = $phrase ? array($words) : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);
663 if ($match[1] == '-') {
664 $keys['negative'] = array_merge($keys['negative'], $words);
666 // OR operator: instead of a single keyword, we store an array of all
668 elseif ($match[2] == 'OR' && count($keys['positive'])) {
669 $last = array_pop($keys['positive']);
670 // Starting a new OR?
671 if (!is_array($last)) {
672 $last = array($last);
674 $keys['positive'][] = $last;
681 // Add to last element (which is an array)
682 $keys['positive'][count($keys['positive']) - 1] = array_merge($keys['positive'][count($keys['positive']) - 1], $words);
685 $keys['positive'] = array_merge($keys['positive'], $words);
691 // Convert keywords into SQL statements.
694 $arguments = array();
695 $arguments2 = array();
698 foreach ($keys['positive'] as $key) {
699 // Group of ORed terms
700 if (is_array($key) && count($key)) {
703 foreach ($key as $or) {
704 list($q, $count) = _search_parse_query($or, $arguments2);
711 if (count($queryor)) {
712 $query[] = '('. implode(' OR ', $queryor) .')';
713 // A group of OR keywords only needs to match once
714 $matches += ($any > 0);
719 list($q, $count) = _search_parse_query($key, $arguments2);
723 // Each AND keyword needs to match at least once
729 foreach ($keys['negative'] as $key) {
730 list($q) = _search_parse_query($key, $arguments2, true);
736 $query = implode(' AND ', $query);
738 // Build word-index conditions for the first pass
739 $query2 = substr(str_repeat("i.word = '%s' OR ", count($arguments2)), 0, -4);
741 return array($query, $arguments, $query2, $arguments2, $matches);
745 * Helper function for search_parse_query().
747 function _search_parse_query(&$word, &$scores, $not = false) {
749 // Determine the scorewords of this word/phrase
751 $split = explode(' ', $word);
752 foreach ($split as $s) {
753 $num = is_numeric($s);
754 if ($num || drupal_strlen($s) >= variable_get('minimum_word_size', 3)) {
755 $s = $num ? ((int)ltrim($s, '-0')) : $s;
756 if (!isset($scores[$s])) {
763 // Return matching snippet and number of added words
764 return array("d.data ". ($not ? 'NOT ' : '') ."LIKE '%% %s %%'", $count);
768 * Do a query on the full-text search index for a word or words.
770 * This function is normally only called by each module that supports the
771 * indexed search (and thus, implements hook_update_index()).
773 * Two queries are performed which can be extended by the caller.
775 * The first query selects a set of possible matches based on the search index
776 * and any extra given restrictions. This is the classic "OR" search.
778 * SELECT i.type, i.sid, SUM(i.score*t.count) AS relevance
779 * FROM {search_index} i
780 * INNER JOIN {search_total} t ON i.word = t.word
782 * WHERE $where1 AND (...)
783 * GROUP BY i.type, i.sid
785 * The second query further refines this set by verifying advanced text
786 * conditions (such as AND, negative or phrase matches), and orders the results
787 * on the column or expression 'score':
789 * SELECT i.type, i.sid, $select2
790 * FROM temp_search_sids i
791 * INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type
794 * ORDER BY score DESC
797 * A search string as entered by the user.
800 * A string identifying the calling module.
803 * (optional) Inserted into the JOIN part of the first SQL query.
804 * For example "INNER JOIN {node} n ON n.nid = i.sid".
807 * (optional) Inserted into the WHERE part of the first SQL query.
808 * For example "(n.status > %d)".
811 * (optional) Extra SQL arguments belonging to the first query.
814 * (optional) Inserted into the SELECT part of the second query. Must contain
815 * a column selected as 'score'.
816 * defaults to 'i.relevance AS score'
819 * (optional) Inserted into the JOIN part of the second SQL query.
820 * For example "INNER JOIN {node_comment_statistics} n ON n.nid = i.sid"
823 * (optional) Extra SQL arguments belonging to the second query parameter.
825 * @param $sort_parameters
826 * (optional) SQL arguments for sorting the final results.
827 * Default: 'ORDER BY score DESC'
830 * An array of SIDs for the search results.
834 function do_search($keywords, $type, $join1 = '', $where1 = '1', $arguments1 = array(), $select2 = 'i.relevance AS score', $join2 = '', $arguments2 = array(), $sort_parameters = 'ORDER BY score DESC') {
835 $query = search_parse_query($keywords);
837 if ($query[2] == '') {
838 form_set_error('keys', t('You must include at least one positive keyword with %count characters or more.', array('%count' => variable_get('minimum_word_size', 3))));
840 if ($query === NULL || $query[0] == '' || $query[2] == '') {
844 // First pass: select all possible matching sids, doing a simple index-based OR matching on the keywords.
845 // 'matches' is used to reject those items that cannot possibly match the query.
846 $conditions = $where1 .' AND ('. $query[2] .") AND i.type = '%s'";
847 $arguments = array_merge($arguments1, $query[3], array($type, $query[4]));
848 $result = db_query_temporary("SELECT i.type, i.sid, SUM(i.score * t.count) AS relevance, COUNT(*) AS matches FROM {search_index} i INNER JOIN {search_total} t ON i.word = t.word $join1 WHERE $conditions GROUP BY i.type, i.sid HAVING COUNT(*) >= %d", $arguments, 'temp_search_sids');
850 // Calculate maximum relevance, to normalize it
851 $normalize = db_result(db_query('SELECT MAX(relevance) FROM temp_search_sids'));
855 $select2 = str_replace('i.relevance', '('. (1.0 / $normalize) .' * i.relevance)', $select2);
857 // Second pass: only keep items that match the complicated keywords conditions (phrase search, negative keywords, ...)
858 $conditions = '('. $query[0] .')';
859 $arguments = array_merge($arguments2, $query[1]);
860 $result = db_query_temporary("SELECT i.type, i.sid, $select2 FROM temp_search_sids i INNER JOIN {search_dataset} d ON i.sid = d.sid AND i.type = d.type $join2 WHERE $conditions $sort_parameters", $arguments, 'temp_search_results');
861 if (($count = db_result(db_query('SELECT COUNT(*) FROM temp_search_results'))) == 0) {
864 $count_query = "SELECT $count";
866 // Do actual search query
867 $result = pager_query("SELECT * FROM temp_search_results", 10, 0, $count_query);
869 while ($item = db_fetch_object($result)) {
876 * Helper function for grabbing search keys.
878 function search_get_keys() {
879 // Extract keys as remainder of path
880 // Note: support old GET format of searches for existing links.
881 $path = explode('/', $_GET['q'], 3);
882 return count($path) == 3 ? $path[2] : $_REQUEST['keys'];
886 * Menu callback; presents the search form and/or search results.
888 function search_view() {
891 // Search form submits with POST but redirects to GET. This way we can keep
892 // the search query URL clean as a whistle:
893 // search/type/keyword+keyword
894 if (!isset($_POST['edit']['form_id'])) {
896 // Note: search/node can not be a default tab because it would take on the
897 // path of its parent (search). It would prevent remembering keywords when
898 // switching tabs. This is why we drupal_goto to it from the parent instead.
899 drupal_goto('search/node');
902 $keys = search_get_keys();
903 // Only perform search if there is non-whitespace search term:
905 // Log the search keys:
906 watchdog('search', t('Search: %keys (%type).', array('%keys' => theme('placeholder', $keys), '%type' => module_invoke($type, 'search', 'name'))), WATCHDOG_NOTICE, l(t('results'), 'search/'. $type .'/'. $keys));
908 // Collect the search results:
909 $results = search_data($keys, $type);
912 $results = theme('box', t('Search results'), $results);
915 $results = theme('box', t('Your search yielded no results'), search_help('search#noresults'));
919 // Construct the search form.
920 $output = search_form(NULL, $keys, $type);
926 return search_form(NULL, $keys, $type);
930 * @defgroup search Search interface
932 * The Drupal search interface manages a global search mechanism.
934 * Modules may plug into this system to provide searches of different types of
935 * data. Most of the system is handled by search.module, so this must be enabled
936 * for all of the search features to work.
938 * There are three ways to interact with the search system:
939 * - Specifically for searching nodes, you can implement nodeapi('update index')
940 * and nodeapi('search result'). However, note that the search system already
941 * indexes all visible output of a node, i.e. everything displayed normally
942 * by hook_view() and hook_nodeapi('view'). This is usually sufficient.
943 * You should only use this mechanism if you want additional, non-visible data
945 * - Implement hook_search(). This will create a search tab for your module on
946 * the /search page with a simple keyword search form. You may optionally
947 * implement hook_search_item() to customize the display of your results.
948 * - Implement hook_update_index(). This allows your module to use Drupal's
949 * HTML indexing mechanism for searching full text efficiently.
951 * If your module needs to provide a more complicated search form, then you need
952 * to implement it yourself without hook_search(). In that case, you should
953 * define it as a local task (tab) under the /search page (e.g. /search/mymodule)
954 * so that users can easily find it.
958 * Render a search form.
961 * Form action. Defaults to "search".
963 * The search string entered by the user, containing keywords for the search.
965 * The type of search to render the form for. Must be the name of a module
966 * which implements hook_search(). Defaults to 'node'.
968 * A piece of text to put before the form (e.g. "Enter your keywords")
970 * An HTML string containing the search form.
972 function search_form($action = '', $keys = '', $type = NULL, $prompt = NULL) {
974 $action = url('search/'. $type);
976 if (is_null($prompt)) {
977 $prompt = t('Enter your keywords');
981 '#action' => $action,
982 '#attributes' => array('class' => 'search-form'),
984 $form['module'] = array('#type' => 'value', '#value' => $type);
985 $form['basic'] = array('#type' => 'item', '#title' => $prompt);
986 $form['basic']['inline'] = array('#prefix' => '<div class="container-inline">', '#suffix' => '</div>');
987 $form['basic']['inline']['keys'] = array(
988 '#type' => 'textfield',
990 '#default_value' => $keys,
991 '#size' => $prompt ? 40 : 20,
994 // processed_keys is used to coordinate keyword passing between other forms
995 // that hook into the basic search form.
996 $form['basic']['inline']['processed_keys'] = array('#type' => 'value', '#value' => array());
997 $form['basic']['inline']['submit'] = array('#type' => 'submit', '#value' => t('Search'));
999 return drupal_get_form('search_form', $form);
/**
 * Set the 'processed_keys' value of the basic search form.
 *
 * As the search form collates keys from other modules hooked in via
 * hook_form_alter, the validation itself takes place in _submit. This
 * function is used solely to set the 'processed_keys' form value for the
 * basic search form.
 */
function search_form_validate($form_id, $form_values, $form) {
  // Strip surrounding whitespace from the submitted keywords before storing
  // them on the 'processed_keys' element.
  $trimmed_keys = trim($form_values['keys']);
  form_set_value($form['basic']['inline']['processed_keys'], $trimmed_keys);
}
1013 * Process a search form submission.
1015 function search_form_submit($form_id, $form_values) {
1016 $keys = $form_values['processed_keys'];
1018 form_set_error('keys', t('Please enter some keywords.'));
1019 // Fall through to the drupal_goto() call.
1022 $type = $form_values['module'] ? $form_values['module'] : 'node';
1023 return 'search/'. $type .'/'. $keys;
1027 * Output a search form for the search block and the theme's search box.
1029 function search_box($form_id = 'search_theme_form') {
1030 // Name the field $form_id .'_keys' instead of plain 'keys' to avoid ID conflicts with the search block.
1031 $form[$form_id .'_keys'] = array(
1032 '#type' => 'textfield',
1034 '#default_value' => '',
1035 '#attributes' => array('title' => t('Enter the terms you wish to search for.')),
1037 $form['submit'] = array('#type' => 'submit', '#value' => t('Search'));
1039 return drupal_get_form($form_id, $form, 'search_box_form');
/**
 * Process a block search form submission.
 *
 * @param $form_id
 *   The form ID; the keyword field is looked up as $form_id .'_keys'.
 * @param $form_values
 *   The submitted form values.
 * @return
 *   The path 'search/node/' followed by the trimmed keywords.
 */
function search_box_form_submit($form_id, $form_values) {
  $field = $form_id .'_keys';
  $keywords = trim($form_values[$field]);
  return 'search/node/'. $keywords;
}
/**
 * Theme the theme search form.
 *
 * @param $form
 *   The form to render.
 * @return
 *   The rendered form wrapped in a div with ID "search" and class
 *   "container-inline".
 */
function theme_search_theme_form($form) {
  $rendered = form_render($form);
  return '<div id="search" class="container-inline">'. $rendered .'</div>';
}
/**
 * Theme the block search form.
 *
 * @param $form
 *   The form to render.
 * @return
 *   The rendered form wrapped in a div with class "container-inline".
 */
function theme_search_block_form($form) {
  $rendered = form_render($form);
  return '<div class="container-inline">'. $rendered .'</div>';
}
1064 * Perform a standard search on the given keys, and return the formatted results.
1066 function search_data($keys = NULL, $type = 'node') {
1068 if (module_hook($type, 'search')) {
1069 $results = module_invoke($type, 'search', 'search', $keys);
1070 if (isset($results) && is_array($results) && count($results)) {
1071 if (module_hook($type, 'search_page')) {
1072 return module_invoke($type, 'search_page', $results);
1075 return theme('search_page', $results, $type);
1083 * Returns snippets from a piece of text, with certain keywords highlighted.
1084 * Used for formatting search results.
1087 * A string containing a search query.
1090 * The text to extract fragments from.
1093 * A string containing HTML for the excerpt.
1095 function search_excerpt($keys, $text) {
1096 // We highlight around non-indexable or CJK characters.
1097 $boundary = '(?:(?<=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .'])|(?=['. PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK .']))';
1099 // Extract positive keywords and phrases
1100 preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' '. $keys, $matches);
1101 $keys = array_merge($matches[2], $matches[3]);
1104 $text = ' '. strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text)) .' ';
1105 array_walk($keys, '_search_excerpt_replace');
1108 // Extract a fragment per keyword for at most 4 keywords.
1109 // First we collect ranges of text around each keyword, starting/ending
1111 // If the sum of all fragments is too short, we look for second occurrences.
1113 $included = array();
1115 while ($length < 256 && count($workkeys)) {
1116 foreach ($workkeys as $k => $key) {
1117 if (strlen($key) == 0) {
1118 unset($workkeys[$k]);
1122 if ($length >= 256) {
1125 // Remember occurrence of key so we can skip over it if more occurrences
1127 if (!isset($included[$key])) {
1128 $included[$key] = 0;
1130 // Locate a keyword (position $p), then locate a space in front (position
1131 // $q) and behind it (position $s)
1132 if (preg_match('/'. $boundary . $key . $boundary .'/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
1134 if (($q = strpos($text, ' ', max(0, $p - 60))) !== false) {
1135 $end = substr($text, $p, 80);
1136 if (($s = strrpos($end, ' ')) !== false) {
1137 $ranges[$q] = $p + $s;
1138 $length += $p + $s - $q;
1139 $included[$key] = $p + 1;
1142 unset($workkeys[$k]);
1146 unset($workkeys[$k]);
1150 unset($workkeys[$k]);
1155 // If we didn't find anything, return the beginning.
1156 if (count($ranges) == 0) {
1157 return truncate_utf8($text, 256) . ' ...';
1160 // Sort the text ranges by starting position.
1163 // Now we collapse overlapping text ranges into one. The sorting makes it O(n).
1164 $newranges = array();
1165 foreach ($ranges as $from2 => $to2) {
1166 if (!isset($from1)) {
1171 if ($from2 <= $to1) {
1172 $to1 = max($to1, $to2);
1175 $newranges[$from1] = $to1;
1180 $newranges[$from1] = $to1;
1184 foreach ($newranges as $from => $to) {
1185 $out[] = substr($text, $from, $to - $from);
1187 $text = (isset($newranges[0]) ? '' : '... '). implode(' ... ', $out) .' ...';
1189 // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
1190 $text = preg_replace('/'. $boundary .'('. implode('|', $keys) .')'. $boundary .'/iu', '<strong>\0</strong>', $text);
1195 * @} End of "defgroup search".
/**
 * Helper function for array_walk in search_excerpt().
 *
 * Escapes regular expression special characters, and the '/' delimiter, in
 * the given string so that it can be embedded literally in a pattern.
 *
 * @param $text
 *   The string to escape; it is modified in place.
 */
function _search_excerpt_replace(&$text) {
  $escaped = preg_quote($text, '/');
  $text = $escaped;
}
1206 * Format a single result entry of a search query. This function is normally
1207 * called by theme_search_page() or hook_search_page().
1210 * A single search result as returned by hook_search(). The result should be
1211 * an array with keys "link", "title", "type", "user", "date", and "snippet".
1212 * Optionally, "extra" can be an array of extra info to show along with the
1215 * The type of item found, such as "user" or "node".
1217 * @ingroup themeable
1219 function theme_search_item($item, $type) {
1220 $output = ' <dt class="title"><a href="'. check_url($item['link']) .'">'. check_plain($item['title']) .'</a></dt>';
1222 if ($item['type']) {
1223 $info[] = $item['type'];
1225 if ($item['user']) {
1226 $info[] = $item['user'];
1228 if ($item['date']) {
1229 $info[] = format_date($item['date'], 'small');
1231 if (is_array($item['extra'])) {
1232 $info = array_merge($info, $item['extra']);
1234 $output .= ' <dd>'. ($item['snippet'] ? '<p>'. $item['snippet'] . '</p>' : '') . '<p class="search-info">' . implode(' - ', $info) .'</p></dd>';
1239 * Format the result page of a search query.
1241 * Modules may implement hook_search_page() in order to override this default
1242 * function to display search results. In that case it is expected they provide
1243 * their own themeable functions.
1246 * All search results as returned by hook_search().
1248 * The type of item found, such as "user" or "node".
1250 * @ingroup themeable
1252 function theme_search_page($results, $type) {
1253 $output = '<dl class="search-results">';
1255 foreach ($results as $entry) {
1256 $output .= theme('search_item', $entry, $type);
1259 $output .= theme('pager', NULL, 10, 0);