MDL-65806 Search: Solr does not find words in italic

This commit is contained in:
sam marshall 2019-05-30 16:19:57 +01:00
parent f3507273e9
commit c207289127
2 changed files with 88 additions and 1 deletions

View file

@ -272,7 +272,7 @@ class engine extends \core_search\engine {
$query = new \SolrDisMaxQuery();
$this->set_query($query, $data->q);
$this->set_query($query, self::replace_underlines($data->q));
$this->add_fields($query);
// Search filters applied, we don't cache these filters as we don't want to pollute the cache with tmp filters
@ -750,6 +750,23 @@ class engine extends \core_search\engine {
return true;
}
/**
 * Removes underlines at the edges of words in the content.
 *
 * For example '_frogs_' will become 'frogs', '_frogs and toads_' will become 'frogs and toads',
 * and 'frogs_and_toads' will be left as 'frogs_and_toads'.
 *
 * The reason for this is that for italic content_to_text puts _italic_ underlines at the start
 * and end of the italicised phrase (not between words). Solr treats underlines as part of the
 * word, which means that if you search for a word in italic then you can't find it.
 *
 * The pattern is matched in Unicode mode ('u' modifier) so that word boundaries are evaluated
 * per character rather than per byte. In byte mode the bytes of a multibyte letter count as
 * non-word characters, so the underline in a word such as 'café_latte' would wrongly be removed.
 *
 * @param string $str String to replace
 * @return string Replaced string
 */
protected static function replace_underlines(string $str): string {
    $result = preg_replace('~\b_|_\b~u', '', $str);
    if ($result === null) {
        // Unicode matching fails (returns null) when the subject is not valid UTF-8; retry in
        // byte mode so the content is still cleaned up on a best-effort basis.
        $result = preg_replace('~\b_|_\b~', '', $str);
    }
    // Never return null from a method declared to return string: fall back to the input.
    return $result ?? $str;
}
/**
* Adds a text document to the search engine.
*
@ -758,6 +775,14 @@ class engine extends \core_search\engine {
*/
protected function add_solr_document($doc) {
$solrdoc = new \SolrInputDocument();
// Remove underlines at the edges of words in the content. The reason for this is that for
// italic text, content_to_text puts _italic_ underlines. Solr treats underlines as part of the
// word, which means that if you search for a word in italic then you can't find it.
if (array_key_exists('content', $doc)) {
$doc['content'] = self::replace_underlines($doc['content']);
}
foreach ($doc as $field => $value) {
$solrdoc->addField($field, $value);
}