/*
 * QueryEngine.java
 *
 * Copyright (c) 2007-2011, The University of Sheffield.
 *
 * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
 * and is free software, licenced under the GNU Lesser General Public License,
 * Version 3, June 2007 (also included with this distribution as file
 * LICENCE-LGPL3.html).
 *
 * Valentin Tablan, 04 Mar 2009
 *
 * $Id$
 */
package gate.mimir.search;

import gate.LanguageAnalyser;
import gate.mimir.DocumentMetadataHelper;
import gate.mimir.DocumentRenderer;
import gate.mimir.IndexConfig;
import gate.mimir.IndexConfig.SemanticIndexerConfig;
import gate.mimir.MimirIndex;
import gate.mimir.SemanticAnnotationHelper;
import gate.mimir.index.AtomicAnnotationIndex;
import gate.mimir.index.AtomicTokenIndex;
import gate.mimir.index.DocumentData;
import gate.mimir.index.IndexException;
import gate.mimir.search.query.AnnotationQuery;
import gate.mimir.search.query.Binding;
import gate.mimir.search.query.QueryExecutor;
import gate.mimir.search.query.QueryNode;
import gate.mimir.search.query.parser.ParseException;
import gate.mimir.search.query.parser.QueryParser;
import gate.mimir.search.score.MimirScorer;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.Executor;

import org.apache.log4j.Logger;

/**
 * This class represents the entry point to the Mimir search API.
 */
public class QueryEngine {

  /**
   * Represents the type of index that should be searched. Mimir uses two
   * types of indexes: token indexes (which index the document text) and
   * annotation indexes (which index semantic annotations).
   */
  public static enum IndexType {
    /**
     * Value representing token indexes, used for the document text.
     */
    TOKENS,
    /**
     * Value representing annotation indexes, used for the document semantic
     * annotations.
     */
    ANNOTATIONS
  }

  /**
   * The maximum size of an index that can be loaded in memory (by default
   * 64 MB).
   */
  public static final long MAX_IN_MEMORY_INDEX = 64 * 1024 * 1024;

  /**
   * The default value for the document block size.
   *
   * @see #setDocumentBlockSize(int)
   */
  public static final int DEFAULT_DOCUMENT_BLOCK_SIZE = 1000;

  /**
   * The index being searched.
   */
  protected final MimirIndex index;

  /**
   * The index configuration this index was built from.
   */
  protected IndexConfig indexConfig;

  /**
   * Should sub-bindings be generated when searching?
   */
  protected boolean subBindingsEnabled;

  /**
   * A callable that produces new {@link MimirScorer} instances on request.
   */
  protected Callable<MimirScorer> scorerSource;

  protected static final Logger logger = Logger.getLogger(QueryEngine.class);

  /**
   * The tokeniser (technically any GATE {@link LanguageAnalyser}) used to
   * split the text segments found in queries into individual tokens. The
   * same tokeniser used to create the indexed documents should be used here.
   * If this value is not set, then a default ANNIE tokeniser will be used.
   */
  protected LanguageAnalyser queryTokeniser;

  /**
   * The executor used to run tasks for query execution. If the value is not
   * set, then new threads are created as needed.
   */
  protected Executor executor;

  /**
   * How many documents get ranked in one ranking stage.
   */
  private int documentBlockSize = DEFAULT_DOCUMENT_BLOCK_SIZE;

  /**
   * A list of currently active QueryRunners. This is used to close all
   * active runners when the query engine itself is closed (thus releasing
   * all open files).
   */
  private List<QueryRunner> activeQueryRunners;
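  // Typical usage (an illustrative sketch, not part of the API). The
  // "mimirIndex" variable and the query string are assumptions made for the
  // example; runners should be closed when no longer needed, and closing the
  // engine also closes any still-active runners:
  //
  //   QueryEngine engine = new QueryEngine(mimirIndex);
  //   QueryRunner runner = engine.getQueryRunner("\"search engine\"");
  //   // ... consume hits through the QueryRunner API ...
  //   runner.close();
  //   engine.close();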
  /**
   * Are sub-bindings used in this query engine? Sub-bindings are used to
   * associate sub-queries with segments of the returned hits. This can be
   * useful for showing high-level details about the returned hits. By
   * default, sub-bindings are not used.
   *
   * @return the subBindingsEnabled
   */
  public boolean isSubBindingsEnabled() {
    return subBindingsEnabled;
  }

  /**
   * @param subBindingsEnabled
   *          the subBindingsEnabled to set
   */
  public void setSubBindingsEnabled(boolean subBindingsEnabled) {
    this.subBindingsEnabled = subBindingsEnabled;
  }

  /**
   * Gets the configuration parameter specifying the number of documents that
   * get processed as a block. This is used to optimise the search process by
   * limiting the number of results that get calculated by default.
   *
   * @return the current document block size.
   */
  public int getDocumentBlockSize() {
    return documentBlockSize;
  }

  /**
   * Sets the configuration parameter specifying the number of documents that
   * get processed in one go (e.g. the number of documents that get ranked
   * when enumerating results). This is used to optimise the search process
   * by limiting the number of results that get calculated by default.
   * Defaults to {@link #DEFAULT_DOCUMENT_BLOCK_SIZE}.
   *
   * @param documentBlockSize the new document block size.
   */
  public void setDocumentBlockSize(int documentBlockSize) {
    this.documentBlockSize = documentBlockSize;
  }

  /**
   * Gets the current source of scorers.
   *
   * @see #setScorerSource(Callable)
   * @return the {@link Callable} used to obtain new scorer instances.
   */
  public Callable<MimirScorer> getScorerSource() {
    return scorerSource;
  }

  /**
   * Provides a {@link Callable} that the query engine can use for obtaining
   * new instances of {@link MimirScorer} to be used for ranking new queries.
   *
   * @param scorerSource the new source of scorers.
   */
  public void setScorerSource(Callable<MimirScorer> scorerSource) {
    this.scorerSource = scorerSource;
  }

  /**
   * Gets the executor used by this query engine.
   *
   * @return an executor that can be used for running tasks pertinent to this
   *         QueryEngine.
   */
  public Executor getExecutor() {
    return executor;
  }

  /**
   * Sets the {@link Executor} used for executing tasks required for running
   * queries. This allows the use of some type of thread pooling, if needed.
   * If this value is not set, then new threads are created as required.
   *
   * @param executor the executor to use.
   */
  public void setExecutor(Executor executor) {
    this.executor = executor;
  }

  /**
   * Sets the tokeniser (technically any GATE analyser) used to split the
   * text segments found in queries into individual tokens. The same
   * tokeniser used to create the indexed documents should be used here. If
   * this value is not set, then a default ANNIE tokeniser will be used.
   *
   * @param queryTokeniser
   *          the new tokeniser to be used for parsing queries.
   */
  public void setQueryTokeniser(LanguageAnalyser queryTokeniser) {
    this.queryTokeniser = queryTokeniser;
  }
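  // Configuration sketch (illustrative; all of these setters are optional).
  // The fixed-size thread pool, the "tokeniser" variable and the
  // makeScorer() factory are assumptions made for the example: any
  // java.util.concurrent.Executor will do, and the tokeniser should be the
  // same LanguageAnalyser used at indexing time:
  //
  //   engine.setDocumentBlockSize(500); // rank 500 documents per stage
  //   engine.setExecutor(java.util.concurrent.Executors.newFixedThreadPool(4));
  //   engine.setQueryTokeniser(tokeniser);
  //   engine.setScorerSource(new Callable<MimirScorer>() {
  //     public MimirScorer call() {
  //       return makeScorer(); // hypothetical factory for a concrete scorer
  //     }
  //   });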
  /**
   * Finds the location for a given sub-index in the arrays returned by
   * {@link #getIndexes()} and {@link #getDirectIndexes()}.
   *
   * @param indexType the IndexType of the requested sub-index (tokens or
   *          annotations).
   * @param indexName the "name" of the requested sub-index (the indexed
   *          feature name for {@link IndexType#TOKENS} indexes, or the
   *          annotation type in the case of {@link IndexType#ANNOTATIONS}
   *          indexes).
   * @return the position in the indexes array for the requested index, or -1
   *         if the requested index does not exist.
   */
  public int getSubIndexPosition(IndexType indexType, String indexName) {
    if(indexType == IndexType.TOKENS) {
      for(int i = 0; i < indexConfig.getTokenIndexers().length; i++) {
        if(indexConfig.getTokenIndexers()[i].getFeatureName()
            .equals(indexName)) {
          return i;
        }
      }
      return -1;
    } else if(indexType == IndexType.ANNOTATIONS) {
      for(int i = 0; i < indexConfig.getSemanticIndexers().length; i++) {
        for(String aType :
            indexConfig.getSemanticIndexers()[i].getAnnotationTypes()) {
          if(aType.equals(indexName)) {
            return indexConfig.getTokenIndexers().length + i;
          }
        }
      }
      return -1;
    } else {
      throw new IllegalArgumentException(
          "Don't understand sub-indexes of type " + indexType);
    }
  }

  /**
   * Returns the index that stores the data for a particular feature of token
   * annotations.
   *
   * @param featureName the name of the indexed token feature.
   * @return the {@link AtomicTokenIndex} for the given feature.
   */
  public AtomicTokenIndex getTokenIndex(String featureName) {
    return index.getTokenIndex(featureName);
  }

  /**
   * Returns the index that stores the data for a particular semantic
   * annotation type.
   *
   * @param annotationType the annotation type.
   * @return the {@link AtomicAnnotationIndex} for the given type.
   */
  public AtomicAnnotationIndex getAnnotationIndex(String annotationType) {
    return index.getAnnotationIndex(annotationType);
  }

  /**
   * Gets the {@link SemanticAnnotationHelper} registered for a given
   * annotation type, or <code>null</code> if the type is not known to this
   * index.
   */
  public SemanticAnnotationHelper getAnnotationHelper(String annotationType) {
    for(int i = 0; i < indexConfig.getSemanticIndexers().length; i++) {
      String[] annTypes =
          indexConfig.getSemanticIndexers()[i].getAnnotationTypes();
      for(int j = 0; j < annTypes.length; j++) {
        if(annTypes[j].equals(annotationType)) {
          return indexConfig.getSemanticIndexers()[i].getHelpers()[j];
        }
      }
    }
    return null;
  }

  /**
   * Gets the index this query engine is searching.
   *
   * @return the {@link MimirIndex} being searched.
   */
  public MimirIndex getIndex() {
    return index;
  }

  /**
   * @return the index configuration for this index
   */
  public IndexConfig getIndexConfig() {
    return indexConfig;
  }

  /**
   * Constructs a new query engine for a {@link MimirIndex}.
   *
   * @param index the index to be searched.
   */
  public QueryEngine(MimirIndex index) {
    this.index = index;
    this.indexConfig = index.getIndexConfig();
    activeQueryRunners =
        Collections.synchronizedList(new ArrayList<QueryRunner>());
    subBindingsEnabled = false;
  }
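  // Sub-index layout sketch (hypothetical configuration): token indexes come
  // first, followed by one entry per semantic indexer. Assuming token
  // indexers for the features {"string", "root"} and two semantic indexers
  // covering the annotation types "Person" and "Location" respectively:
  //
  //   engine.getSubIndexPosition(IndexType.TOKENS, "root");          // 1
  //   engine.getSubIndexPosition(IndexType.ANNOTATIONS, "Person");   // 2
  //   engine.getSubIndexPosition(IndexType.ANNOTATIONS, "Location"); // 3
  //   engine.getTokenIndex("root");         // AtomicTokenIndex for "root"
  //   engine.getAnnotationHelper("Person"); // helper set at indexing time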
  // /**
  //  * Constructs a new {@link QueryEngine} for a specified Mimir index. The
  //  * mimir semantic repository will be initialized using the default
  //  * location in the filesystem, provided by the IndexConfig.
  //  *
  //  * @param indexDir
  //  *          the directory containing an index.
  //  * @throws IndexException
  //  *           if there are problems while opening the indexes.
  //  */
  // public QueryEngine(File indexDir) throws gate.mimir.index.IndexException {
  //   // read the index config
  //   try {
  //     indexConfig = IndexConfig.readConfigFromFile(
  //         new File(indexDir, Indexer.INDEX_CONFIG_FILENAME), indexDir);
  //     initMG4J();
  //     // initialise the semantic indexers
  //     if(indexConfig.getSemanticIndexers() != null &&
  //        indexConfig.getSemanticIndexers().length > 0) {
  //       for(SemanticIndexerConfig sic : indexConfig.getSemanticIndexers()){
  //         for(SemanticAnnotationHelper sah : sic.getHelpers()){
  //           sah.init(this);
  //           if(sah.getMode() == SemanticAnnotationHelper.Mode.DOCUMENT &&
  //              documentSizes == null) {
  //             // we need to load the document sizes from a token index
  //             documentSizes = getIndexes()[0].getIndex().sizes;
  //           }
  //         }
  //       }
  //     }
  //
  //     activeQueryRunners = Collections.synchronizedList(
  //         new ArrayList<QueryRunner>());
  //   } catch(FileNotFoundException e) {
  //     throw new IndexException("File not found!", e);
  //   } catch(IOException e) {
  //     throw new IndexException("Input/output exception!", e);
  //   }
  //   subBindingsEnabled = false;
  // }

  /**
   * Get the {@link SemanticAnnotationHelper} corresponding to a query's
   * annotation type.
   *
   * @throws IllegalArgumentException if the annotation helper for this type
   *           cannot be found.
   */
  public SemanticAnnotationHelper getAnnotationHelper(AnnotationQuery query) {
    for(SemanticIndexerConfig semConfig : indexConfig.getSemanticIndexers()) {
      for(int i = 0; i < semConfig.getAnnotationTypes().length; i++) {
        if(query.getAnnotationType().equals(
            semConfig.getAnnotationTypes()[i])) {
          return semConfig.getHelpers()[i];
        }
      }
    }
    throw new IllegalArgumentException("Semantic annotation type \""
        + query.getAnnotationType() + "\" not known to this query engine.");
  }

  /**
   * Obtains a query runner for a given {@link QueryNode}.
   *
   * @param query
   *          the query to be executed.
   * @return a {@link QueryRunner} for the provided query, running over the
   *         indexes in this query engine.
   * @throws IOException
   *           if the index files cannot be accessed.
   */
  public QueryRunner getQueryRunner(QueryNode query) throws IOException {
    logger.info("Executing query: " + query.toString());
    QueryExecutor qExecutor = query.getQueryExecutor(this);
    QueryRunner qRunner;
    MimirScorer scorer = null;
    try {
      scorer = scorerSource == null ? null : scorerSource.call();
    } catch(Exception e) {
      logger.error("Could not obtain a scorer. Running query unranked.", e);
    }
    qRunner = new RankingQueryRunnerImpl(qExecutor, scorer);
    activeQueryRunners.add(qRunner);
    return qRunner;
  }

  /**
   * Notifies the QueryEngine that the given QueryRunner has been closed.
   *
   * @param qRunner the runner to remove from the list of active runners.
   */
  public void releaseQueryRunner(QueryRunner qRunner) {
    activeQueryRunners.remove(qRunner);
  }

  /**
   * Obtains a query runner for a given query, expressed as a String.
   *
   * @param query
   *          the query to be executed.
   * @return a {@link QueryRunner} for the provided query, running over the
   *         indexes in this query engine.
   * @throws IOException
   *           if the index files cannot be accessed.
   * @throws ParseException
   *           if the string provided for the query cannot be parsed.
   */
  public QueryRunner getQueryRunner(String query) throws IOException,
      ParseException {
    logger.info("Executing query: " + query);
    QueryNode qNode = (queryTokeniser == null)
        ? QueryParser.parse(query)
        : QueryParser.parse(query, queryTokeniser);
    return getQueryRunner(qNode);
  }
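  // Query sketch (illustrative): the query strings below are assumptions
  // about the indexed data, written in the Mimir query language. If no
  // scorer source is set, or obtaining a scorer fails, the query still runs,
  // just unranked:
  //
  //   QueryRunner plain = engine.getQueryRunner("search engine");
  //   QueryRunner annot = engine.getQueryRunner("{Person} said");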
  /**
   * Obtains the document text for a given search hit.
   *
   * @param hit
   *          the search hit for which the text is sought.
   * @param leftContext
   *          the number of tokens to the left of the hit to be included in
   *          the result.
   * @param rightContext
   *          the number of tokens to the right of the hit to be included in
   *          the result.
   * @return an array of arrays of {@link String}s, representing the tokens
   *         and spaces at the location of the search hit. The first element
   *         of the array is an array of tokens, the second element contains
   *         the spaces. The first element of each array corresponds to the
   *         first token of the left context.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public String[][] getHitText(Binding hit, int leftContext,
      int rightContext) throws IndexException {
    return getText(hit.getDocumentId(), hit.getTermPosition() - leftContext,
        leftContext + hit.getLength() + rightContext);
  }

  /**
   * Gets the text covered by a given binding.
   *
   * @param hit
   *          the binding.
   * @return an array of two string arrays, the first representing the tokens
   *         covered by the binding and the second the spaces after each
   *         token.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public String[][] getHitText(Binding hit) throws IndexException {
    return getText(hit.getDocumentId(), hit.getTermPosition(),
        hit.getLength());
  }

  /**
   * Get the text to the left of the given binding.
   *
   * @param hit
   *          the binding.
   * @param numTokens
   *          the maximum number of tokens of context to return. The actual
   *          number of tokens returned may be smaller than this if the hit
   *          starts within <code>numTokens</code> tokens of the start of the
   *          document.
   * @return an array of two string arrays, the first representing the tokens
   *         before the binding and the second the spaces after each token.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public String[][] getLeftContext(Binding hit, int numTokens)
      throws IndexException {
    int startOffset = hit.getTermPosition() - numTokens;
    // if numTokens is greater than the start offset of the hit then we need
    // to return all the document text up to the token before the hit
    // position (possibly no tokens...)
    if(startOffset < 0) {
      // startOffset is negative, so this subtracts from numTokens
      numTokens += startOffset;
      startOffset = 0;
    }
    return getText(hit.getDocumentId(), startOffset, numTokens);
  }

  /**
   * Get the text to the right of the given binding.
   *
   * @param hit
   *          the binding.
   * @param numTokens
   *          the maximum number of tokens of context to return. The actual
   *          number of tokens returned may be smaller than this if the hit
   *          ends within <code>numTokens</code> tokens of the end of the
   *          document.
   * @return an array of two string arrays, the first representing the tokens
   *         after the binding and the second the spaces after each token.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public String[][] getRightContext(Binding hit, int numTokens)
      throws IndexException {
    DocumentData docData;
    try {
      docData = index.getDocumentData(hit.getDocumentId());
    } catch(IOException e) {
      throw new IndexException(e);
    }
    int startOffset = hit.getTermPosition() + hit.getLength();
    if(startOffset >= docData.getTokens().length) {
      // hit is at the end of the document
      return new String[][]{new String[0], new String[0]};
    }
    if(startOffset + numTokens > docData.getTokens().length) {
      // fewer than numTokens tokens of right context available, adjust
      numTokens = docData.getTokens().length - startOffset;
    }
    return getText(hit.getDocumentId(), startOffset, numTokens);
  }
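  // Worked example of the context clamping (hypothetical values):
  //
  //   Binding hit = ...;             // assume hit.getTermPosition() == 2
  //   engine.getLeftContext(hit, 5); // startOffset: 2 - 5 = -3, clamped to
  //                                  // 0; numTokens shrinks to 2, so only
  //                                  // the two tokens that actually precede
  //                                  // the hit are returned
  //
  // getRightContext() clamps against the end of the document the same way.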
  /**
   * Obtains the text for a specified region of a document. The return value
   * is a pair of parallel arrays, one of tokens and the other of the spaces
   * between them. If <code>length >= 0</code>, the two parallel arrays will
   * always be exactly <code>length</code> items long, but any token
   * positions that do not exist in the document (i.e. before the start or
   * beyond the end of the text) will be <code>null</code>. If
   * <code>length < 0</code> the arrays will be of sufficient length to hold
   * all the tokens from <code>termPosition</code> to the end of the
   * document, with no trailing <code>null</code>s (there may be leading
   * <code>null</code>s if <code>termPosition < 0</code>).
   *
   * @param documentID
   *          the document ID
   * @param termPosition
   *          the position of the first term required
   * @param length
   *          the number of terms to return. May be negative, in which case
   *          all terms from termPosition to the end of the document will be
   *          returned.
   * @return an array of two string arrays. The first represents the tokens
   *         and the second represents the spaces between them.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public String[][] getText(long documentID, int termPosition, int length)
      throws IndexException {
    try {
      return index.getDocumentData(documentID).getText(termPosition, length);
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Renders a document and a list of hits.
   *
   * @param docID
   *          the document to be rendered.
   * @param hits
   *          the list of hits to be rendered.
   * @param output
   *          the {@link Appendable} used to write the output.
   * @throws IOException
   *           if the output cannot be written to.
   * @throws IndexException
   *           if no document renderer is available.
   */
  public void renderDocument(long docID, List<Binding> hits,
      Appendable output) throws IOException, IndexException {
    DocumentRenderer docRenderer = indexConfig.getDocumentRenderer();
    if(docRenderer == null) {
      throw new IndexException(
          "No document renderer is configured for this index!");
    }
    docRenderer.render(index.getDocumentData(docID), hits, output);
  }

  /**
   * Gets the title of a document, as stored at indexing time.
   */
  public String getDocumentTitle(long docID) throws IndexException {
    try {
      return index.getDocumentData(docID).getDocumentTitle();
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Gets the URI of a document, as stored at indexing time.
   */
  public String getDocumentURI(long docID) throws IndexException {
    try {
      return index.getDocumentData(docID).getDocumentURI();
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Obtains an arbitrary document metadata field from the stored document
   * data. {@link DocumentMetadataHelper}s used at indexing time can add
   * arbitrary {@link Serializable} values as metadata fields for the
   * documents being indexed. This method is used at search time to retrieve
   * those values.
   *
   * @param docID the ID of the document for which the metadata is sought.
   * @param fieldName the name of the metadata field to be obtained.
   * @return the de-serialised value stored at indexing time for the given
   *         field name and document.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public Serializable getDocumentMetadataField(long docID, String fieldName)
      throws IndexException {
    try {
      return index.getDocumentData(docID).getMetadataField(fieldName);
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Closes this {@link QueryEngine} and releases all resources.
   */
  public void close() {
    // close all active query runners
    List<QueryRunner> runnersCopy =
        new ArrayList<QueryRunner>(activeQueryRunners);
    for(QueryRunner aRunner : runnersCopy) {
      try {
        logger.debug("Closing query runner: " + aRunner.toString());
        aRunner.close();
      } catch(IOException e) {
        // log and ignore
        logger.error("Exception while closing query runner.", e);
      }
    }
  }
}
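// Worked example of the getText() padding rules (hypothetical document whose
// tokens are ["The", "cat", "sat"]):
//
//   engine.getText(doc, -1, 3); // tokens {null, "The", "cat"}: length >= 0,
//                               // so exactly 3 slots, with null for the
//                               // non-existent position -1
//   engine.getText(doc, 1, -1); // tokens {"cat", "sat"}: negative length
//                               // returns everything from position 1 to the
//                               // document end, with no trailing nulls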