MDL-53516 search: Adding file indexing support to Solr

Expand the Solr engine to use the built in (to Solr) Tika
file indexing engine. Files are shipped off for indexing with
curl. Each file must have its own document. So grouping is
used to keep files and Moodle docs associated.
This commit is contained in:
Eric Merrill 2016-03-17 01:19:40 -04:00
parent 091973dbd7
commit cd894f84b3
5 changed files with 808 additions and 9 deletions

View file

@ -33,6 +33,49 @@ defined('MOODLE_INTERNAL') || die();
* @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
*/
class document extends \core_search\document {
/**
 * Indicates the file contents were not indexed due to an error.
 */
const INDEXED_FILE_ERROR = -1;

/**
 * Indicates the file contents were not indexed due to filtering/settings.
 */
const INDEXED_FILE_FALSE = 0;

/**
 * Indicates the file contents are indexed with the record.
 */
const INDEXED_FILE_TRUE = 1;

/**
 * Any fields that are engine specific. These are fields that are solely used by a search engine plugin
 * for internal purposes.
 *
 * @var array
 */
protected static $enginefields = array(
    // Groups a document together with its attached-file documents in Solr results.
    'solr_filegroupingid' => array(
        'type' => 'string',
        'stored' => true,
        'indexed' => true
    ),
    // The Moodle stored_file id this (file) document was built from.
    'solr_fileid' => array(
        'type' => 'string',
        'stored' => true,
        'indexed' => false
    ),
    // Content hash of the indexed file, used to detect changed file contents.
    'solr_filecontenthash' => array(
        'type' => 'string',
        'stored' => true,
        'indexed' => false
    ),
    // One of the INDEXED_FILE_* constants above.
    'solr_fileindexedcontent' => array(
        'type' => 'int',
        'stored' => true,
        'indexed' => true
    )
);
/**
* Formats the timestamp according to the search engine needs.
@ -74,4 +117,43 @@ class document extends \core_search\document {
protected function get_text_format() {
return FORMAT_MARKDOWN;
}
/**
 * Apply any defaults to unset fields before export. Called after document building, but before export.
 *
 * Sub-classes of this should make sure to call parent::apply_defaults().
 */
protected function apply_defaults() {
    parent::apply_defaults();

    if (isset($this->data['solr_filegroupingid'])) {
        // A grouping id was already provided; nothing to default.
        return;
    }

    // Fall back to grouping the document with itself, so it and its file
    // documents always share a group.
    $this->data['solr_filegroupingid'] = $this->data['id'];
}
/**
 * Export the data for the given file in relation to this document.
 *
 * @param \stored_file $file The stored file we are talking about.
 * @return array
 */
public function export_file_for_engine($file) {
    $data = $this->export_for_engine();

    // The text content is indexed in the main document, so strip it from the file record.
    unset($data['content'], $data['description1'], $data['description2']);

    // Build a unique id by appending the file id to the parent document id.
    $data['id'] .= '-solrfile' . $file->get_id();
    $data['type'] = \core_search\manager::TYPE_FILE;

    // Engine bookkeeping fields used later to match index state against Moodle files.
    $data['solr_fileid'] = $file->get_id();
    $data['solr_filecontenthash'] = $file->get_contenthash();
    $data['solr_fileindexedcontent'] = self::INDEXED_FILE_TRUE;
    $data['title'] = $file->get_filename();

    return $data;
}
}

View file

@ -88,7 +88,12 @@ class engine extends \core_search\engine {
}
$query = new \SolrQuery();
$this->set_query($query, $data->q);
$maxrows = \core_search\manager::MAX_RESULTS;
if ($this->file_indexing_enabled()) {
// When using file indexing and grouping, we are going to collapse results, so we want extra results.
$maxrows *= 2;
}
$this->set_query($query, $data->q, $maxrows);
$this->add_fields($query);
// Search filters applied, we don't cache these filters as we don't want to pollute the cache with tmp filters
@ -140,7 +145,15 @@ class engine extends \core_search\engine {
}
try {
return $this->query_response($this->client->query($query));
if ($this->file_indexing_enabled()) {
// Now group records by solr_filegroupingid. Limit to 3 results per group.
$query->setGroup(true);
$query->setGroupLimit(3);
$query->addGroupField('solr_filegroupingid');
return $this->grouped_files_query_response($this->client->query($query));
} else {
return $this->query_response($this->client->query($query));
}
} catch (\SolrClientException $ex) {
debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
$this->queryerror = $ex->getMessage();
@ -156,9 +169,13 @@ class engine extends \core_search\engine {
/**
* Prepares a new query by setting the query, start offset and rows to return.
* @param SolrQuery $query
* @param object $q Containing query and filters.
* @param object $q Containing query and filters.
* @param null|int $maxresults The number of results to limit. manager::MAX_RESULTS if not set.
*/
protected function set_query($query, $q) {
protected function set_query($query, $q, $maxresults = null) {
if (!is_numeric($maxresults)) {
$maxresults = \core_search\manager::MAX_RESULTS;
}
// Set hightlighting.
$query->setHighlight(true);
@ -172,7 +189,7 @@ class engine extends \core_search\engine {
$query->setQuery($q);
// A reasonable max.
$query->setRows(\core_search\manager::MAX_RESULTS);
$query->setRows($maxresults);
}
/**
@ -193,6 +210,11 @@ class engine extends \core_search\engine {
* @param object $response containing results.
*/
public function add_highlight_content($response) {
if (!isset($response->highlighting)) {
// There is no highlighting to add.
return;
}
$highlightedobject = $response->highlighting;
foreach ($response->response->docs as $doc) {
$x = $doc->id;
@ -291,6 +313,155 @@ class engine extends \core_search\engine {
return $docs;
}
/**
 * Processes grouped file results into documents, with attached matching files.
 *
 * Each Solr group holds one "master" document plus up to the group limit of
 * matching file documents. Access is checked per group; groups whose master
 * document was not returned are fetched afterwards via get_missing_docs().
 *
 * @param SolrQueryResponse $queryresponse The response returned from solr server
 * @return array Final results to be displayed.
 */
protected function grouped_files_query_response($queryresponse) {
    $response = $queryresponse->getResponse();

    // If we can't find the grouping, or there are no matches in the grouping, return empty.
    if (!isset($response->grouped->solr_filegroupingid) || empty($response->grouped->solr_filegroupingid->matches)) {
        return array();
    }

    $numgranted = 0;

    // Results are rebuilt in passes: ids in result order, fully-built docs,
    // and groups whose main document is missing from the grouped response.
    $orderedids = array();
    $completedocs = array();
    $incompletedocs = array();

    $highlightingobj = $response->highlighting;

    // Each group represents a "master document".
    $groups = $response->grouped->solr_filegroupingid->groups;
    foreach ($groups as $group) {
        $groupid = $group->groupValue;
        $groupdocs = $group->doclist->docs;
        $firstdoc = reset($groupdocs);

        if (!$searcharea = $this->get_search_area($firstdoc->areaid)) {
            // Well, this is a problem.
            continue;
        }

        // Check for access.
        $access = $searcharea->check_access($firstdoc->itemid);
        switch ($access) {
            case \core_search\manager::ACCESS_DELETED:
                // If deleted from Moodle, delete from index and then continue.
                $this->delete_by_id($firstdoc->id);
                // Skip this group entirely (continue the outer foreach).
                continue 2;
                break; // Unreachable: the continue 2 above already left the switch.
            case \core_search\manager::ACCESS_DENIED:
                // This means we should just skip for the current user.
                continue 2;
                break; // Unreachable: the continue 2 above already left the switch.
        }
        $numgranted++;

        $maindoc = false;
        $fileids = array();
        // Separate the main document and any files returned.
        foreach ($groupdocs as $groupdoc) {
            if ($groupdoc->id == $groupid) {
                $maindoc = $groupdoc;
            } else if (isset($groupdoc->solr_fileid)) {
                $fileids[] = $groupdoc->solr_fileid;
            }
        }

        // Store the id of this group, in order, for later merging.
        $orderedids[] = $groupid;

        if (!$maindoc) {
            // We don't have the main doc, store what we know for later building.
            $incompletedocs[$groupid] = $fileids;
        } else {
            if (isset($highlightingobj->$groupid)) {
                // Merge the highlighting for this doc.
                $this->merge_highlight_field_values($maindoc, $highlightingobj->$groupid);
            }
            $docdata = $this->standarize_solr_obj($maindoc);
            $doc = $this->to_document($searcharea, $docdata);
            // Now we need to attach the result files to the doc.
            foreach ($fileids as $fileid) {
                $doc->add_stored_file($fileid);
            }
            $completedocs[$groupid] = $doc;
        }

        if ($numgranted >= \core_search\manager::MAX_RESULTS) {
            // We have hit the max results, we will just ignore the rest.
            break;
        }
    }

    // Fetch the main documents we did not receive in the grouped response.
    $incompletedocs = $this->get_missing_docs($incompletedocs);

    $out = array();
    // Now merge the complete and incomplete documents, in results order.
    foreach ($orderedids as $docid) {
        if (isset($completedocs[$docid])) {
            $out[] = $completedocs[$docid];
        } else if (isset($incompletedocs[$docid])) {
            $out[] = $incompletedocs[$docid];
        }
    }

    return $out;
}
/**
 * Retrieve any missing main documents and attach provided files.
 *
 * The missingdocs array should be an array, indexed by document id, of main documents we need to retrieve. The value
 * associated to the key should be an array of stored_files or stored file ids to attach to the result document.
 *
 * Return array also indexed by document id.
 *
 * @param array $missingdocs An array, indexed by document id, with arrays of files/ids to attach.
 * @return document[]
 */
protected function get_missing_docs($missingdocs) {
    if (empty($missingdocs)) {
        return array();
    }

    $docids = array_keys($missingdocs);

    // Build a custom query that will get all the missing documents.
    $query = new \SolrQuery();
    // Match everything; the filter query below narrows it to the wanted ids.
    $this->set_query($query, '*', count($docids));
    $this->add_fields($query);
    // {!cache=false} keeps this one-off id lookup out of the Solr filter cache.
    $query->addFilterQuery('{!cache=false}id:(' . implode(' OR ', $docids) . ')');

    try {
        $results = $this->query_response($this->get_search_client()->query($query));
    } catch (\SolrClientException $ex) {
        // Best effort: on failure the affected results are simply dropped from output.
        return array();
    } catch (\SolrServerException $ex) {
        return array();
    }

    $out = array();
    foreach ($results as $result) {
        $resultid = $result->get('id');
        if (!isset($missingdocs[$resultid])) {
            // We got a result we didn't expect. Skip it.
            continue;
        }
        // Attach the files.
        foreach ($missingdocs[$resultid] as $filedoc) {
            $result->add_stored_file($filedoc);
        }
        $out[$resultid] = $result;
    }

    return $out;
}
/**
* Returns a standard php array from a \SolrObject instance.
*
@ -321,20 +492,25 @@ class engine extends \core_search\engine {
public function add_document($document, $fileindexing = false) {
$docdata = $document->export_for_engine();
if (!$this->add_text_document($docdata)) {
if (!$this->add_solr_document($docdata)) {
return false;
}
if ($fileindexing) {
// This will take care of updating all attached files in the index.
$this->process_document_files($document);
}
return true;
}
/**
* Adds a text document to the search engine.
*
* @param array $filedoc
* @param array $doc
* @return bool
*/
protected function add_text_document($doc) {
protected function add_solr_document($doc) {
$solrdoc = new \SolrInputDocument();
foreach ($doc as $field => $value) {
$solrdoc->addField($field, $value);
@ -354,6 +530,293 @@ class engine extends \core_search\engine {
return false;
}
/**
 * Index files attached to the document, ensuring the index matches the current document files.
 *
 * For documents that aren't known to be new, we check the index for existing files.
 * - New files we will add.
 * - Existing and unchanged files we will skip.
 * - Files that are in the index but not on the document will be deleted from the index.
 * - Files that have changed will be re-indexed.
 *
 * @param document $document
 */
protected function process_document_files($document) {
    if (!$this->file_indexing_enabled()) {
        return;
    }

    // Maximum rows to process at a time.
    $rows = 500;

    // Get the attached files.
    $files = $document->get_files();

    // If this isn't a new document, we need to check the existing indexed files.
    if (!$document->get_is_new()) {
        // We do this progressively, so we can handle lots of files cleanly.
        list($numfound, $indexedfiles) = $this->get_indexed_files($document, 0, $rows);
        $count = 0;
        $idstodelete = array();

        do {
            // Go through each indexed file. We want to not index any stored and unchanged ones, delete any missing ones.
            foreach ($indexedfiles as $indexedfile) {
                $fileid = $indexedfile->solr_fileid;

                if (isset($files[$fileid])) {
                    // Check for changes that would mean we need to re-index the file. If so, just leave in $files.
                    // Filelib does not guarantee time modified is updated, so we will check important values.
                    if ($indexedfile->modified < $files[$fileid]->get_timemodified()) {
                        continue;
                    }
                    if (strcmp($indexedfile->title, $files[$fileid]->get_filename()) !== 0) {
                        continue;
                    }
                    if ($indexedfile->solr_filecontenthash != $files[$fileid]->get_contenthash()) {
                        continue;
                    }
                    if ($indexedfile->solr_fileindexedcontent == document::INDEXED_FILE_FALSE &&
                            $this->file_is_indexable($files[$fileid])) {
                        // This means that the last time we indexed this file, filtering blocked it.
                        // Current settings say it is indexable, so we will allow it to be indexed.
                        continue;
                    }

                    // If the file is already indexed, we can just remove it from the files array and skip it.
                    unset($files[$fileid]);
                } else {
                    // This means we have found a file that is no longer attached, so we need to delete from the index.
                    // We do it later, since this is progressive, and it could reorder results.
                    $idstodelete[] = $indexedfile->id;
                }
            }
            $count += $rows;

            if ($count < $numfound) {
                // If we haven't hit the total count yet, fetch the next batch.
                list($numfound, $indexedfiles) = $this->get_indexed_files($document, $count, $rows);
            }
        } while ($count < $numfound);

        // Delete files that are no longer attached.
        foreach ($idstodelete as $id) {
            // We directly delete the item using the client, as the engine delete_by_id won't work on file docs.
            $this->get_search_client()->deleteById($id);
        }
    }

    // Now we can actually index all the remaining files.
    foreach ($files as $file) {
        $this->add_stored_file($document, $file);
    }
}
/**
 * Get the currently indexed files for a particular document, returns the total count, and a subset of files.
 *
 * @param document $document
 * @param int $start The row to start the results on. Zero indexed.
 * @param int $rows The number of rows to fetch
 * @return array A two element array, the first is the total number of available results, the second is an array
 *               of documents for the current request.
 */
protected function get_indexed_files($document, $start = 0, $rows = 500) {
    // Build a custom query that will get any document files that are in our solr_filegroupingid.
    $query = new \SolrQuery();

    // We want to get all file records tied to a document.
    // For efficiency, we are building our own, stripped down, query.
    $query->setQuery('*');
    $query->setRows($rows);
    $query->setStart($start);
    // We want a consistent sorting.
    $query->addSortField('id');

    // We only want the bare minimum of fields.
    $query->addField('id');
    $query->addField('modified');
    $query->addField('title');
    $query->addField('solr_fileid');
    $query->addField('solr_filecontenthash');
    $query->addField('solr_fileindexedcontent');

    // {!cache=false} keeps these per-document lookups out of the Solr filter cache.
    $query->addFilterQuery('{!cache=false}solr_filegroupingid:(' . $document->get('id') . ')');
    $query->addFilterQuery('type:' . \core_search\manager::TYPE_FILE);

    try {
        $response = $this->get_search_client()->query($query);
        $responsedoc = $response->getResponse();

        if (empty($responsedoc->response->numFound)) {
            return array(0, array());
        }
        $numfound = $responsedoc->response->numFound;

        return array($numfound, $this->convert_file_results($responsedoc));
    } catch (\SolrClientException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array(0, array());
    } catch (\SolrServerException $ex) {
        debugging('Error executing the provided query: ' . $ex->getMessage(), DEBUG_DEVELOPER);
        $this->queryerror = $ex->getMessage();
        return array(0, array());
    }
}
/**
 * A very lightweight handler for getting information about already indexed files from a Solr response.
 *
 * @param SolrObject $responsedoc A Solr response document
 * @return stdClass[] An array of objects that contain the basic information for file processing.
 */
protected function convert_file_results($responsedoc) {
    $docs = $responsedoc->response->docs;
    if (!$docs) {
        return array();
    }

    $results = array();
    foreach ($docs as $doc) {
        // Copy only the fields that file processing needs.
        $summary = new \stdClass();
        $summary->id = $doc->id;
        $summary->modified = document::import_time_from_engine($doc->modified);
        $summary->title = $doc->title;
        $summary->solr_fileid = $doc->solr_fileid;
        $summary->solr_filecontenthash = $doc->solr_filecontenthash;
        $summary->solr_fileindexedcontent = $doc->solr_fileindexedcontent;
        $results[] = $summary;
    }

    return $results;
}
/**
 * Adds a file to the search engine.
 *
 * Notes about Solr and Tika indexing. We do not send the mime type, only the filename.
 * Tika has much better content type detection than Moodle, and we will have many more doc failures
 * if we try to send mime types.
 *
 * On any failure, a stub document (with INDEXED_FILE_ERROR and no file content) is stored
 * instead, so the index still knows about the file.
 *
 * @param document $document
 * @param \stored_file $storedfile
 * @return void
 */
protected function add_stored_file($document, $storedfile) {
    $filedoc = $document->export_file_for_engine($storedfile);

    if (!$this->file_is_indexable($storedfile)) {
        // For files that we don't consider indexable, we will still place a reference in the search engine.
        $filedoc['solr_fileindexedcontent'] = document::INDEXED_FILE_FALSE;
        $this->add_solr_document($filedoc);
        return;
    }

    $curl = $this->get_curl_object();

    $url = $this->get_connection_url('/update/extract');

    // This will prevent solr from automatically making fields for every tika output.
    $url->param('uprefix', 'ignored_');

    // These are common fields that matches the standard *_point dynamic field and causes an error.
    $url->param('fmap.media_white_point', 'ignored_mwp');
    $url->param('fmap.media_black_point', 'ignored_mbp');

    // Copy each key to the url with literal.
    // We place in a temp name then copy back to the true field, which prevents errors or Tika overwriting common field names.
    foreach ($filedoc as $key => $value) {
        // This will take any fields from tika that match our schema and discard them, so they don't overwrite ours.
        $url->param('fmap.'.$key, 'ignored_'.$key);
        // Place data in a tmp field.
        $url->param('literal.mdltmp_'.$key, $value);
        // Then move to the final field.
        $url->param('fmap.mdltmp_'.$key, $key);
    }

    // This sets the true filename for Tika.
    $url->param('resource.name', $storedfile->get_filename());

    // A giant block of code that is really just error checking around the curl request.
    try {
        // Now actually do the request.
        $result = $curl->post($url->out(false), array('myfile' => $storedfile));

        $code = $curl->get_errno();
        $info = $curl->get_info();

        // Now error handling. It is just informational, since we aren't tracking per file/doc results.
        if ($code != 0) {
            // This means an internal cURL error occurred error is in result.
            $message = 'Curl error '.$code.' while indexing file with document id '.$filedoc['id'].': '.$result.'.';
            debugging($message, DEBUG_DEVELOPER);
        } else if (isset($info['http_code']) && ($info['http_code'] !== 200)) {
            // Unexpected HTTP response code.
            $message = 'Error while indexing file with document id '.$filedoc['id'];
            // Try to get error message out of msg or title if it exists.
            if (preg_match('|<str [^>]*name="msg"[^>]*>(.*?)</str>|i', $result, $matches)) {
                $message .= ': '.$matches[1];
            } else if (preg_match('|<title[^>]*>([^>]*)</title>|i', $result, $matches)) {
                $message .= ': '.$matches[1];
            }
            // This is a common error, happening whenever a file fails to index for any reason, so we will make it quieter.
            if (CLI_SCRIPT && !PHPUNIT_TEST) {
                mtrace($message);
            }
        } else {
            // Check for the expected status field in the Solr XML response body.
            if (preg_match('|<int [^>]*name="status"[^>]*>(\d*)</int>|i', $result, $matches)) {
                // Now check for the expected status of 0, if not, error.
                if ((int)$matches[1] !== 0) {
                    $message = 'Unexpected Solr status code '.(int)$matches[1];
                    $message .= ' while indexing file with document id '.$filedoc['id'].'.';
                    debugging($message, DEBUG_DEVELOPER);
                } else {
                    // The document was successfully indexed.
                    return;
                }
            } else {
                // We received an unprocessable response.
                $message = 'Unexpected Solr response while indexing file with document id '.$filedoc['id'].': ';
                $message .= strtok($result, "\n");
                debugging($message, DEBUG_DEVELOPER);
            }
        }
    } catch (\Exception $e) {
        // There was an error, but we are not tracking per-file success, so we just continue on.
        debugging('Unknown exception while indexing file "'.$storedfile->get_filename().'".', DEBUG_DEVELOPER);
    }

    // If we get here, the document was not indexed due to an error. So we will index just the base info without the file.
    $filedoc['solr_fileindexedcontent'] = document::INDEXED_FILE_ERROR;
    $this->add_solr_document($filedoc);
}
/**
 * Checks to see if a passed file is indexable.
 *
 * @param \stored_file $file The file to check
 * @return bool True if the file can be indexed
 */
protected function file_is_indexable($file) {
    // Enforce the admin-configured size cap; the setting is in kB, sizes in bytes.
    // An empty/zero setting means no cap.
    $maxbytes = empty($this->config->maxindexfilekb) ? 0 : ($this->config->maxindexfilekb * 1024);
    if ($maxbytes && ($file->get_filesize() > $maxbytes)) {
        return false;
    }

    // Moodle backup files contain nothing usefully indexable, so skip them.
    if ($file->get_mimetype() == 'application/vnd.moodle.backup') {
        return false;
    }

    return true;
}
/**
* Commits all pending changes.
*
@ -379,6 +842,15 @@ class engine extends \core_search\engine {
return true;
}
/**
 * Return true if file indexing is supported and enabled. False otherwise.
 *
 * @return bool
 */
public function file_indexing_enabled() {
    // Use empty() so a never-saved fileindexing setting reads as disabled instead of
    // raising an undefined-property notice, as a plain (bool) cast would.
    return !empty($this->config->fileindexing);
}
/**
* Defragments the index.
*
@ -395,7 +867,8 @@ class engine extends \core_search\engine {
* @return void
*/
public function delete_by_id($id) {
    // NOTE(review): both delete calls appear in this view; the deleteById line looks like a
    // leftover of the pre-grouping implementation (diff artifact) — confirm that only the
    // deleteByQuery call is intended to remain.
    $this->get_search_client()->deleteById($id);
    // We need to make sure we delete the item and all related files, which can be done with solr_filegroupingid.
    $this->get_search_client()->deleteByQuery('solr_filegroupingid:' . $id);
    $this->commit();
}

View file

@ -23,9 +23,15 @@
*/
$string['connectionerror'] = 'The specified Solr server is not available or the specified index does not exist';
$string['connectionsettings'] = 'Connection settings';
$string['errorcreatingschema'] = 'Error creating the Solr schema: {$a}';
$string['errorvalidatingschema'] = 'Error validating Solr schema, field {$a->fieldname} does not exist. Please <a href="{$a->setupurl}">follow this link</a> to setup the fields required by Moodle.';
$string['extensionerror'] = 'The Apache Solr PHP extension is not installed. Please check the documentation.';
$string['fileindexing'] = 'Enable file indexing';
$string['fileindexing_help'] = 'If your Solr install supports it, this feature allows Moodle to send files to be indexed.';
$string['fileindexsettings'] = 'File indexing settings';
$string['maxindexfilekb'] = 'Maximum file size to index (kB)';
$string['maxindexfilekb_help'] = 'Files larger than this number of kilobytes will be skipped for search indexing. 0 to index files of any size.';
$string['missingconfig'] = 'Your Apache Solr server is not yet configured in Moodle.';
$string['multivaluedfield'] = 'Field "{$a}" returned an array instead of a scalar, the field is probably defined in Solr with "Multivalued" to true, this means that Solr autocreated the field for you when you indexed data because you forgot to run search/engine/solr/cli/setup_schema.php. Please delete the current index, create a new one and run setup_schema.php before indexing data in Solr.';
$string['nodatafromserver'] = 'No data from server';

View file

@ -31,6 +31,8 @@ if ($ADMIN->fulltree) {
$settings->add(new admin_setting_heading('search_solr_settings', '', get_string('extensionerror', 'search_solr')));
} else {
$settings->add(new admin_setting_heading('search_solr_connection',
new lang_string('connectionsettings', 'search_solr'), ''));
$settings->add(new admin_setting_configtext('search_solr/server_hostname', new lang_string('solrserverhostname', 'search_solr'), new lang_string('solrserverhostname_desc', 'search_solr'), '127.0.0.1', PARAM_TEXT));
$settings->add(new admin_setting_configtext('search_solr/indexname', new lang_string('solrindexname', 'search_solr'), '', 'moodle', PARAM_TEXT));
$settings->add(new admin_setting_configcheckbox('search_solr/secure', new lang_string('solrsecuremode', 'search_solr'), '', 0, 1, 0));
@ -46,6 +48,15 @@ if ($ADMIN->fulltree) {
$settings->add(new admin_setting_configtext('search_solr/ssl_keypassword', new lang_string('solrsslkeypassword', 'search_solr'), new lang_string('solrsslkeypassword_desc', 'search_solr'), '', PARAM_RAW));
$settings->add(new admin_setting_configtext('search_solr/ssl_cainfo', new lang_string('solrsslcainfo', 'search_solr'), new lang_string('solrsslcainfo_desc', 'search_solr'), '', PARAM_RAW));
$settings->add(new admin_setting_configtext('search_solr/ssl_capath', new lang_string('solrsslcapath', 'search_solr'), new lang_string('solrsslcapath_desc', 'search_solr'), '', PARAM_RAW));
// File indexing settings: master enable switch plus a per-file size cap.
$settings->add(new admin_setting_heading('search_solr_fileindexing',
        new lang_string('fileindexsettings', 'search_solr'), ''));
$settings->add(new admin_setting_configcheckbox('search_solr/fileindexing',
        new lang_string('fileindexing', 'search_solr'),
        new lang_string('fileindexing_help', 'search_solr'), 1));
// NOTE(review): the default '20197152' (~19GB in kB) looks like a typo for 2097152
// (2GB expressed in kB) — confirm the intended default.
$settings->add(new admin_setting_configtext('search_solr/maxindexfilekb',
        new lang_string('maxindexfilekb', 'search_solr'),
        new lang_string('maxindexfilekb_help', 'search_solr'), '20197152', PARAM_INT));
}
}
}

View file

@ -23,6 +23,7 @@
* - define('TEST_SEARCH_SOLR_INDEXNAME', 'unittest');
*
* Optional params:
* - define('TEST_SEARCH_SOLR_DISABLE_FILE_INDEXING', 1);
* - define('TEST_SEARCH_SOLR_USERNAME', '');
* - define('TEST_SEARCH_SOLR_PASSWORD', '');
* - define('TEST_SEARCH_SOLR_SSLCERT', '');
@ -99,6 +100,14 @@ class search_solr_engine_testcase extends advanced_testcase {
set_config('ssl_cainfo', TEST_SEARCH_SOLR_CAINFOCERT, 'search_solr');
}
if (defined('TEST_SEARCH_SOLR_DISABLE_FILE_INDEXING') && (TEST_SEARCH_SOLR_DISABLE_FILE_INDEXING == 1)) {
set_config('fileindexing', 0, 'search_solr');
} else {
set_config('fileindexing', 1, 'search_solr');
}
// We are only test indexing small string files, so setting this as low as we can.
set_config('maxindexfilekb', 1, 'search_solr');
// Inject search solr engine into the testable core search as we need to add the mock
// search component to it.
@ -275,4 +284,222 @@ class search_solr_engine_testcase extends advanced_testcase {
$this->assertEquals(0, $results[0]->get('owneruserid'));
$this->assertEquals($originalid, $results[0]->get('id'));
}
/**
 * Smoke test: after a full index run, the fixture file contents are searchable.
 */
public function test_index_file() {
    if (defined('TEST_SEARCH_SOLR_DISABLE_FILE_INDEXING') && (TEST_SEARCH_SOLR_DISABLE_FILE_INDEXING == 1)) {
        $this->markTestSkipped('Solr file indexing not enabled.');
        return;
    }

    // Index everything, then search for the phrase stored in the fixture files.
    $this->search->index();

    $query = new stdClass();
    $query->q = '"File contents"';
    $this->assertCount(2, $this->search->search($query));
}
/**
 * Tests that re-indexing a document adds new files, keeps unchanged ones and
 * removes detached ones — including across the 500-row indexed-file fetch boundary.
 */
public function test_reindexing_files() {
    if (defined('TEST_SEARCH_SOLR_DISABLE_FILE_INDEXING') && (TEST_SEARCH_SOLR_DISABLE_FILE_INDEXING == 1)) {
        $this->markTestSkipped('Solr file indexing not enabled.');
        return;
    }

    // Get engine and area to work with.
    $engine = $this->search->get_engine();
    $areaid = \core_search\manager::generate_areaid('core_mocksearch', 'role_capabilities');
    $area = \core_search\manager::get_search_area($areaid);

    // Get a single record to make a doc from.
    $recordset = $area->get_recordset_by_timestamp(0);
    $record = $recordset->current();
    $recordset->close();

    $doc = $area->get_document($record);

    // Now we are going to make some files.
    $fs = get_file_storage();
    $syscontext = \context_system::instance();
    $files = array();
    $filerecord = array(
        'contextid' => $syscontext->id,
        'component' => 'core',
        'filearea'  => 'unittest',
        'itemid'    => 0,
        'filepath'  => '/',
    );

    // We make enough so that we pass the 500 files threshold. That is the boundary when getting files.
    $boundary = 500;
    $top = (int)($boundary * 1.1);
    for ($i = 0; $i < $top; $i++) {
        $filerecord['filename'] = 'searchfile'.$i;
        $file = $fs->create_file_from_string($filerecord, 'Some FileContents'.$i);
        $doc->add_stored_file($file);
        $files[] = $file;
    }

    // Add the doc with lots of files, then commit.
    $engine->add_document($doc, true);
    $engine->area_index_complete($area->get_area_id());

    // Indexes we are going to check. 0 means we will delete, 1 means we will keep.
    $checkfiles = array(
        0 => 0,                            // Check the beginning of the set.
        1 => 1,
        2 => 0,
        ($top - 3) => 0,                   // Check the end of the set.
        ($top - 2) => 1,
        ($top - 1) => 0,
        ($boundary - 2) => 0,              // Check at the boundary between fetch groups.
        ($boundary - 1) => 0,
        $boundary => 0,
        ($boundary + 1) => 0,
        ((int)($boundary * 0.5)) => 1,     // Make sure we keep some middle ones.
        ((int)($boundary * 1.05)) => 1
    );

    $querydata = new stdClass();

    // First, check that all the files are currently there.
    foreach ($checkfiles as $key => $unused) {
        $querydata->q = 'FileContents'.$key;
        $this->assertCount(1, $this->search->search($querydata));
        $querydata->q = 'searchfile'.$key;
        $this->assertCount(1, $this->search->search($querydata));
    }

    // Remove the files we want removed from the files array.
    foreach ($checkfiles as $key => $keep) {
        if (!$keep) {
            unset($files[$key]);
        }
    }

    // And make us a new file to add.
    $filerecord['filename'] = 'searchfileNew';
    $files[] = $fs->create_file_from_string($filerecord, 'Some FileContentsNew');
    $checkfiles['New'] = 1;

    // Rebuild the document with only the files we kept, plus the new one.
    $doc = $area->get_document($record);
    foreach($files as $file) {
        $doc->add_stored_file($file);
    }

    // Reindex the document with the changed files.
    $engine->add_document($doc, true);
    $engine->area_index_complete($area->get_area_id());

    // Purge cached results so the follow-up searches hit the engine.
    cache_helper::purge_by_definition('core', 'search_results');

    // Go through our check array, and see if the file is there or not.
    foreach ($checkfiles as $key => $keep) {
        $querydata->q = 'FileContents'.$key;
        $this->assertCount($keep, $this->search->search($querydata));
        $querydata->q = 'searchfile'.$key;
        $this->assertCount($keep, $this->search->search($querydata));
    }

    // Now check that we get one result when we search from something in all of them.
    $querydata->q = 'Some';
    $this->assertCount(1, $this->search->search($querydata));
}
/**
 * Tests that a file over the maxindexfilekb limit has its filename indexed
 * but not its contents, while a small file gets both indexed.
 */
public function test_index_filtered_file() {
    if (defined('TEST_SEARCH_SOLR_DISABLE_FILE_INDEXING') && (TEST_SEARCH_SOLR_DISABLE_FILE_INDEXING == 1)) {
        $this->markTestSkipped('Solr file indexing not enabled.');
        return;
    }

    // Get engine and area to work with.
    $engine = $this->search->get_engine();
    $areaid = \core_search\manager::generate_areaid('core_mocksearch', 'role_capabilities');
    $area = \core_search\manager::get_search_area($areaid);

    // Get a single record to make a doc from.
    $recordset = $area->get_recordset_by_timestamp(0);
    $record = $recordset->current();
    $recordset->close();

    $doc = $area->get_document($record);

    // Now we are going to make some files.
    $fs = get_file_storage();
    $syscontext = \context_system::instance();
    $files = array();
    $filerecord = array(
        'contextid' => $syscontext->id,
        'component' => 'core',
        'filearea'  => 'unittest',
        'itemid'    => 0,
        'filepath'  => '/',
        'filename'  => 'largefile'
    );

    // We need to make a file greater than 1kB in size, which is the lowest filter size.
    // (maxindexfilekb is set to 1 in setUp.)
    $contents = 'Some LargeFindContent to find.';
    for ($i = 0; $i < 200; $i++) {
        $contents .= ' The quick brown fox jumps over the lazy dog.';
    }

    $this->assertGreaterThan(1024, strlen($contents));
    $file = $fs->create_file_from_string($filerecord, $contents);
    $doc->add_stored_file($file);

    $filerecord['filename'] = 'smallfile';
    $file = $fs->create_file_from_string($filerecord, 'Some SmallFindContent to find.');
    $doc->add_stored_file($file);

    $engine->add_document($doc, true);
    $engine->area_index_complete($area->get_area_id());

    $querydata = new stdClass();
    // We shouldn't be able to find the large file contents.
    $querydata->q = 'LargeFindContent';
    $this->assertCount(0, $this->search->search($querydata));

    // But we should be able to find the filename.
    $querydata->q = 'largefile';
    $this->assertCount(1, $this->search->search($querydata));

    // We should be able to find the small file contents.
    $querydata->q = 'SmallFindContent';
    $this->assertCount(1, $this->search->search($querydata));

    // And we should be able to find the filename.
    $querydata->q = 'smallfile';
    $this->assertCount(1, $this->search->search($querydata));
}
/**
 * Tests that deleting a document by id removes it (and its files) from results.
 */
public function test_delete_by_id() {
    if (defined('TEST_SEARCH_SOLR_DISABLE_FILE_INDEXING') && (TEST_SEARCH_SOLR_DISABLE_FILE_INDEXING == 1)) {
        $this->markTestSkipped('Solr file indexing not enabled.');
        return;
    }

    // Index the fixture data so there is something to delete.
    $this->search->index();
    $engine = $this->search->get_engine();

    // Both fixture documents should match the file contents phrase.
    $query = new stdClass();
    $query->q = '"File contents"';
    $initial = $this->search->search($query);
    $this->assertCount(2, $initial);

    // Delete one of them by id.
    $removedid = reset($initial)->get('id');
    $engine->delete_by_id($removedid);

    // Purge cached results so the next search hits the engine.
    cache_helper::purge_by_definition('core', 'search_results');

    // Only the other document should remain.
    $remaining = $this->search->search($query);
    $this->assertCount(1, $remaining);
    $this->assertNotEquals($removedid, reset($remaining)->get('id'));
}
}