/*
 * QueryEngine.java
 *
 * Copyright (c) 2007-2011, The University of Sheffield.
 *
 * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
 * and is free software, licenced under the GNU Lesser General Public License,
 * Version 3, June 2007 (also included with this distribution as file
 * LICENCE-LGPL3.html).
 *
 * Valentin Tablan, 04 Mar 2009
 *
 * $Id$
 */
package gate.mimir.search;

import gate.LanguageAnalyser;
import gate.mimir.DocumentMetadataHelper;
import gate.mimir.DocumentRenderer;
import gate.mimir.IndexConfig;
import gate.mimir.IndexConfig.SemanticIndexerConfig;
import gate.mimir.MimirIndex;
import gate.mimir.SemanticAnnotationHelper;
import gate.mimir.index.AtomicAnnotationIndex;
import gate.mimir.index.AtomicTokenIndex;
import gate.mimir.index.DocumentData;
import gate.mimir.index.IndexException;
import gate.mimir.search.query.AnnotationQuery;
import gate.mimir.search.query.Binding;
import gate.mimir.search.query.QueryExecutor;
import gate.mimir.search.query.QueryNode;
import gate.mimir.search.query.parser.ParseException;
import gate.mimir.search.query.parser.QueryParser;
import gate.mimir.search.score.MimirScorer;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.Executor;

import org.apache.log4j.Logger;

/**
 * This class represents the entry point to the Mimir search API.
 */
public class QueryEngine {

  /**
   * Represents the type of index that should be searched. Mimir uses two
   * types of indexes: token indexes (which index the document text) and
   * annotation indexes (which index semantic annotations).
   */
  public static enum IndexType {
    /**
     * Value representing token indexes, used for the document text.
     */
    TOKENS,
    /**
     * Value representing annotation indexes, used for the document semantic
     * annotations.
     */
    ANNOTATIONS
  }

  /**
   * The maximum size of an index that can be loaded in memory (by default
   * 64 MB).
   */
  public static final long MAX_IN_MEMORY_INDEX = 64 * 1024 * 1024;

  /**
   * The default value for the document block size.
   *
   * @see #setDocumentBlockSize(int)
   */
  public static final int DEFAULT_DOCUMENT_BLOCK_SIZE = 1000;

  /**
   * The index being searched.
   */
  protected final MimirIndex index;

  /**
   * The index configuration this index was built from.
   */
  protected IndexConfig indexConfig;

  /**
   * Should sub-bindings be generated when searching?
   */
  protected boolean subBindingsEnabled;

  /**
   * A callable that produces new {@link MimirScorer} instances on request.
   */
  protected Callable<MimirScorer> scorerSource;

  protected static final Logger logger = Logger.getLogger(QueryEngine.class);

  /**
   * The tokeniser (technically any GATE {@link LanguageAnalyser}) used to
   * split the text segments found in queries into individual tokens. The
   * same tokeniser used to create the indexed documents should be used here.
   * If this value is not set, then a default ANNIE tokeniser will be used.
   */
  protected LanguageAnalyser queryTokeniser;

  /**
   * The executor used to run tasks for query execution. If the value is not
   * set, then new threads are created as needed.
   */
  protected Executor executor;

  /**
   * How many documents get ranked in one ranking stage.
   */
  private int documentBlockSize = DEFAULT_DOCUMENT_BLOCK_SIZE;

  /**
   * A list of currently active QueryRunners. This is used to close all
   * active runners when the query engine itself is closed (thus releasing
   * all open files).
   */
  private List<QueryRunner> activeQueryRunners;
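  // Typical usage (an illustrative sketch, not part of the API). The
  // "mimirIndex" variable and the query string are assumptions made for the
  // example; runners should be closed when no longer needed, and closing the
  // engine also closes any still-active runners:
  //
  //   QueryEngine engine = new QueryEngine(mimirIndex);
  //   QueryRunner runner = engine.getQueryRunner("\"search engine\"");
  //   // ... consume hits through the QueryRunner API ...
  //   runner.close();
  //   engine.close();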
  /**
   * Are sub-bindings used in this query engine? Sub-bindings are used to
   * associate sub-queries with segments of the returned hits. This can be
   * useful for showing high-level details about the returned hits. By
   * default, sub-bindings are not used.
   *
   * @return the subBindingsEnabled
   */
  public boolean isSubBindingsEnabled() {
    return subBindingsEnabled;
  }

  /**
   * @param subBindingsEnabled
   *          the subBindingsEnabled to set
   */
  public void setSubBindingsEnabled(boolean subBindingsEnabled) {
    this.subBindingsEnabled = subBindingsEnabled;
  }

  /**
   * Gets the configuration parameter specifying the number of documents that
   * get processed as a block. This is used to optimise the search process by
   * limiting the number of results that get calculated by default.
   *
   * @return the current document block size.
   */
  public int getDocumentBlockSize() {
    return documentBlockSize;
  }

  /**
   * Sets the configuration parameter specifying the number of documents that
   * get processed in one go (e.g. the number of documents that get ranked
   * when enumerating results). This is used to optimise the search process
   * by limiting the number of results that get calculated by default.
   * Defaults to {@link #DEFAULT_DOCUMENT_BLOCK_SIZE}.
   *
   * @param documentBlockSize the new document block size.
   */
  public void setDocumentBlockSize(int documentBlockSize) {
    this.documentBlockSize = documentBlockSize;
  }

  /**
   * Gets the current source of scorers.
   *
   * @see #setScorerSource(Callable)
   * @return the {@link Callable} used to obtain new scorer instances.
   */
  public Callable<MimirScorer> getScorerSource() {
    return scorerSource;
  }

  /**
   * Provides a {@link Callable} that the query engine can use for obtaining
   * new instances of {@link MimirScorer} to be used for ranking new queries.
   *
   * @param scorerSource the new source of scorers.
   */
  public void setScorerSource(Callable<MimirScorer> scorerSource) {
    this.scorerSource = scorerSource;
  }

  /**
   * Gets the executor used by this query engine.
   *
   * @return an executor that can be used for running tasks pertinent to this
   *         QueryEngine.
   */
  public Executor getExecutor() {
    return executor;
  }

  /**
   * Sets the {@link Executor} used for executing tasks required for running
   * queries. This allows the use of some type of thread pooling, if needed.
   * If this value is not set, then new threads are created as required.
   *
   * @param executor the executor to use.
   */
  public void setExecutor(Executor executor) {
    this.executor = executor;
  }

  /**
   * Sets the tokeniser (technically any GATE analyser) used to split the
   * text segments found in queries into individual tokens. The same
   * tokeniser used to create the indexed documents should be used here. If
   * this value is not set, then a default ANNIE tokeniser will be used.
   *
   * @param queryTokeniser
   *          the new tokeniser to be used for parsing queries.
   */
  public void setQueryTokeniser(LanguageAnalyser queryTokeniser) {
    this.queryTokeniser = queryTokeniser;
  }
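  // Configuration sketch (illustrative; all of these setters are optional).
  // The fixed-size thread pool, the "tokeniser" variable and the
  // makeScorer() factory are assumptions made for the example: any
  // java.util.concurrent.Executor will do, and the tokeniser should be the
  // same LanguageAnalyser used at indexing time:
  //
  //   engine.setDocumentBlockSize(500); // rank 500 documents per stage
  //   engine.setExecutor(java.util.concurrent.Executors.newFixedThreadPool(4));
  //   engine.setQueryTokeniser(tokeniser);
  //   engine.setScorerSource(new Callable<MimirScorer>() {
  //     public MimirScorer call() {
  //       return makeScorer(); // hypothetical factory for a concrete scorer
  //     }
  //   });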
  /**
   * Finds the location for a given sub-index in the arrays returned by
   * {@link #getIndexes()} and {@link #getDirectIndexes()}.
   *
   * @param indexType the IndexType of the requested sub-index (tokens or
   *          annotations).
   * @param indexName the "name" of the requested sub-index (the indexed
   *          feature name for {@link IndexType#TOKENS} indexes, or the
   *          annotation type in the case of {@link IndexType#ANNOTATIONS}
   *          indexes).
   * @return the position in the indexes array for the requested index, or -1
   *         if the requested index does not exist.
   */
  public int getSubIndexPosition(IndexType indexType, String indexName) {
    if(indexType == IndexType.TOKENS) {
      for(int i = 0; i < indexConfig.getTokenIndexers().length; i++) {
        if(indexConfig.getTokenIndexers()[i].getFeatureName()
            .equals(indexName)) {
          return i;
        }
      }
      return -1;
    } else if(indexType == IndexType.ANNOTATIONS) {
      for(int i = 0; i < indexConfig.getSemanticIndexers().length; i++) {
        for(String aType :
            indexConfig.getSemanticIndexers()[i].getAnnotationTypes()) {
          if(aType.equals(indexName)) {
            return indexConfig.getTokenIndexers().length + i;
          }
        }
      }
      return -1;
    } else {
      throw new IllegalArgumentException(
          "Don't understand sub-indexes of type " + indexType);
    }
  }

  /**
   * Returns the index that stores the data for a particular feature of token
   * annotations.
   *
   * @param featureName the name of the indexed token feature.
   * @return the {@link AtomicTokenIndex} for the given feature.
   */
  public AtomicTokenIndex getTokenIndex(String featureName) {
    return index.getTokenIndex(featureName);
  }

  /**
   * Returns the index that stores the data for a particular semantic
   * annotation type.
   *
   * @param annotationType the annotation type.
   * @return the {@link AtomicAnnotationIndex} for the given type.
   */
  public AtomicAnnotationIndex getAnnotationIndex(String annotationType) {
    return index.getAnnotationIndex(annotationType);
  }

  /**
   * Gets the {@link SemanticAnnotationHelper} registered for a given
   * annotation type, or <code>null</code> if the type is not known to this
   * index.
   */
  public SemanticAnnotationHelper getAnnotationHelper(String annotationType) {
    for(int i = 0; i < indexConfig.getSemanticIndexers().length; i++) {
      String[] annTypes =
          indexConfig.getSemanticIndexers()[i].getAnnotationTypes();
      for(int j = 0; j < annTypes.length; j++) {
        if(annTypes[j].equals(annotationType)) {
          return indexConfig.getSemanticIndexers()[i].getHelpers()[j];
        }
      }
    }
    return null;
  }

  /**
   * Gets the index this query engine is searching.
   *
   * @return the {@link MimirIndex} being searched.
   */
  public MimirIndex getIndex() {
    return index;
  }

  /**
   * @return the index configuration for this index
   */
  public IndexConfig getIndexConfig() {
    return indexConfig;
  }

  /**
   * Constructs a new query engine for a {@link MimirIndex}.
   *
   * @param index the index to be searched.
   */
  public QueryEngine(MimirIndex index) {
    this.index = index;
    this.indexConfig = index.getIndexConfig();
    activeQueryRunners =
        Collections.synchronizedList(new ArrayList<QueryRunner>());
    subBindingsEnabled = false;
  }
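  // Sub-index layout sketch (hypothetical configuration): token indexes come
  // first, followed by one entry per semantic indexer. Assuming token
  // indexers for the features {"string", "root"} and two semantic indexers
  // covering the annotation types "Person" and "Location" respectively:
  //
  //   engine.getSubIndexPosition(IndexType.TOKENS, "root");          // 1
  //   engine.getSubIndexPosition(IndexType.ANNOTATIONS, "Person");   // 2
  //   engine.getSubIndexPosition(IndexType.ANNOTATIONS, "Location"); // 3
  //   engine.getTokenIndex("root");         // AtomicTokenIndex for "root"
  //   engine.getAnnotationHelper("Person"); // helper set at indexing time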
  // /**
  //  * Constructs a new {@link QueryEngine} for a specified Mimir index. The
  //  * mimir semantic repository will be initialized using the default
  //  * location in the filesystem, provided by the IndexConfig.
  //  *
  //  * @param indexDir
  //  *          the directory containing an index.
  //  * @throws IndexException
  //  *           if there are problems while opening the indexes.
  //  */
  // public QueryEngine(File indexDir) throws gate.mimir.index.IndexException {
  //   // read the index config
  //   try {
  //     indexConfig = IndexConfig.readConfigFromFile(
  //         new File(indexDir, Indexer.INDEX_CONFIG_FILENAME), indexDir);
  //     initMG4J();
  //     // initialise the semantic indexers
  //     if(indexConfig.getSemanticIndexers() != null &&
  //        indexConfig.getSemanticIndexers().length > 0) {
  //       for(SemanticIndexerConfig sic : indexConfig.getSemanticIndexers()){
  //         for(SemanticAnnotationHelper sah : sic.getHelpers()){
  //           sah.init(this);
  //           if(sah.getMode() == SemanticAnnotationHelper.Mode.DOCUMENT &&
  //              documentSizes == null) {
  //             // we need to load the document sizes from a token index
  //             documentSizes = getIndexes()[0].getIndex().sizes;
  //           }
  //         }
  //       }
  //     }
  //
  //     activeQueryRunners = Collections.synchronizedList(
  //         new ArrayList<QueryRunner>());
  //   } catch(FileNotFoundException e) {
  //     throw new IndexException("File not found!", e);
  //   } catch(IOException e) {
  //     throw new IndexException("Input/output exception!", e);
  //   }
  //   subBindingsEnabled = false;
  // }

  /**
   * Get the {@link SemanticAnnotationHelper} corresponding to a query's
   * annotation type.
   *
   * @throws IllegalArgumentException if the annotation helper for this type
   *           cannot be found.
   */
  public SemanticAnnotationHelper getAnnotationHelper(AnnotationQuery query) {
    for(SemanticIndexerConfig semConfig : indexConfig.getSemanticIndexers()) {
      for(int i = 0; i < semConfig.getAnnotationTypes().length; i++) {
        if(query.getAnnotationType().equals(
            semConfig.getAnnotationTypes()[i])) {
          return semConfig.getHelpers()[i];
        }
      }
    }
    throw new IllegalArgumentException("Semantic annotation type \""
        + query.getAnnotationType() + "\" not known to this query engine.");
  }

  /**
   * Obtains a query runner for a given {@link QueryNode}.
   *
   * @param query
   *          the query to be executed.
   * @return a {@link QueryRunner} for the provided query, running over the
   *         indexes in this query engine.
   * @throws IOException
   *           if the index files cannot be accessed.
   */
  public QueryRunner getQueryRunner(QueryNode query) throws IOException {
    logger.info("Executing query: " + query.toString());
    QueryExecutor qExecutor = query.getQueryExecutor(this);
    QueryRunner qRunner;
    MimirScorer scorer = null;
    try {
      scorer = scorerSource == null ? null : scorerSource.call();
    } catch(Exception e) {
      logger.error("Could not obtain a scorer. Running query unranked.", e);
    }
    qRunner = new RankingQueryRunnerImpl(qExecutor, scorer);
    activeQueryRunners.add(qRunner);
    return qRunner;
  }

  /**
   * Notifies the QueryEngine that the given QueryRunner has been closed.
   *
   * @param qRunner the runner to remove from the list of active runners.
   */
  public void releaseQueryRunner(QueryRunner qRunner) {
    activeQueryRunners.remove(qRunner);
  }

  /**
   * Obtains a query runner for a given query, expressed as a String.
   *
   * @param query
   *          the query to be executed.
   * @return a {@link QueryRunner} for the provided query, running over the
   *         indexes in this query engine.
   * @throws IOException
   *           if the index files cannot be accessed.
   * @throws ParseException
   *           if the string provided for the query cannot be parsed.
   */
  public QueryRunner getQueryRunner(String query) throws IOException,
      ParseException {
    logger.info("Executing query: " + query);
    QueryNode qNode = (queryTokeniser == null)
        ? QueryParser.parse(query)
        : QueryParser.parse(query, queryTokeniser);
    return getQueryRunner(qNode);
  }
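  // Query sketch (illustrative): the query strings below are assumptions
  // about the indexed data, written in the Mimir query language. If no
  // scorer source is set, or obtaining a scorer fails, the query still runs,
  // just unranked:
  //
  //   QueryRunner plain = engine.getQueryRunner("search engine");
  //   QueryRunner annot = engine.getQueryRunner("{Person} said");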
  /**
   * Obtains the document text for a given search hit.
   *
   * @param hit
   *          the search hit for which the text is sought.
   * @param leftContext
   *          the number of tokens to the left of the hit to be included in
   *          the result.
   * @param rightContext
   *          the number of tokens to the right of the hit to be included in
   *          the result.
   * @return an array of arrays of {@link String}s, representing the tokens
   *         and spaces at the location of the search hit. The first element
   *         of the array is an array of tokens, the second element contains
   *         the spaces. The first element of each array corresponds to the
   *         first token of the left context.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public String[][] getHitText(Binding hit, int leftContext,
      int rightContext) throws IndexException {
    return getText(hit.getDocumentId(), hit.getTermPosition() - leftContext,
        leftContext + hit.getLength() + rightContext);
  }

  /**
   * Gets the text covered by a given binding.
   *
   * @param hit
   *          the binding.
   * @return an array of two string arrays, the first representing the tokens
   *         covered by the binding and the second the spaces after each
   *         token.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public String[][] getHitText(Binding hit) throws IndexException {
    return getText(hit.getDocumentId(), hit.getTermPosition(),
        hit.getLength());
  }

  /**
   * Get the text to the left of the given binding.
   *
   * @param hit
   *          the binding.
   * @param numTokens
   *          the maximum number of tokens of context to return. The actual
   *          number of tokens returned may be smaller than this if the hit
   *          starts within <code>numTokens</code> tokens of the start of the
   *          document.
   * @return an array of two string arrays, the first representing the tokens
   *         before the binding and the second the spaces after each token.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public String[][] getLeftContext(Binding hit, int numTokens)
      throws IndexException {
    int startOffset = hit.getTermPosition() - numTokens;
    // if numTokens is greater than the start offset of the hit then we need
    // to return all the document text up to the token before the hit
    // position (possibly no tokens...)
    if(startOffset < 0) {
      // startOffset is negative, so this subtracts from numTokens
      numTokens += startOffset;
      startOffset = 0;
    }
    return getText(hit.getDocumentId(), startOffset, numTokens);
  }

  /**
   * Get the text to the right of the given binding.
   *
   * @param hit
   *          the binding.
   * @param numTokens
   *          the maximum number of tokens of context to return. The actual
   *          number of tokens returned may be smaller than this if the hit
   *          ends within <code>numTokens</code> tokens of the end of the
   *          document.
   * @return an array of two string arrays, the first representing the tokens
   *         after the binding and the second the spaces after each token.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public String[][] getRightContext(Binding hit, int numTokens)
      throws IndexException {
    DocumentData docData;
    try {
      docData = index.getDocumentData(hit.getDocumentId());
    } catch(IOException e) {
      throw new IndexException(e);
    }
    int startOffset = hit.getTermPosition() + hit.getLength();
    if(startOffset >= docData.getTokens().length) {
      // hit is at the end of the document
      return new String[][]{new String[0], new String[0]};
    }
    if(startOffset + numTokens > docData.getTokens().length) {
      // fewer than numTokens tokens of right context available, adjust
      numTokens = docData.getTokens().length - startOffset;
    }
    return getText(hit.getDocumentId(), startOffset, numTokens);
  }
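  // Worked example of the context clamping (hypothetical values):
  //
  //   Binding hit = ...;             // assume hit.getTermPosition() == 2
  //   engine.getLeftContext(hit, 5); // startOffset: 2 - 5 = -3, clamped to
  //                                  // 0; numTokens shrinks to 2, so only
  //                                  // the two tokens that actually precede
  //                                  // the hit are returned
  //
  // getRightContext() clamps against the end of the document the same way.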
  /**
   * Obtains the text for a specified region of a document. The return value
   * is a pair of parallel arrays, one of tokens and the other of the spaces
   * between them. If <code>length >= 0</code>, the two parallel arrays will
   * always be exactly <code>length</code> items long, but any token
   * positions that do not exist in the document (i.e. before the start or
   * beyond the end of the text) will be <code>null</code>. If
   * <code>length < 0</code> the arrays will be of sufficient length to hold
   * all the tokens from <code>termPosition</code> to the end of the
   * document, with no trailing <code>null</code>s (there may be leading
   * <code>null</code>s if <code>termPosition < 0</code>).
   *
   * @param documentID
   *          the document ID
   * @param termPosition
   *          the position of the first term required
   * @param length
   *          the number of terms to return. May be negative, in which case
   *          all terms from termPosition to the end of the document will be
   *          returned.
   * @return an array of two string arrays. The first represents the tokens
   *         and the second represents the spaces between them.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public String[][] getText(long documentID, int termPosition, int length)
      throws IndexException {
    try {
      return index.getDocumentData(documentID).getText(termPosition, length);
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Renders a document and a list of hits.
   *
   * @param docID
   *          the document to be rendered.
   * @param hits
   *          the list of hits to be rendered.
   * @param output
   *          the {@link Appendable} used to write the output.
   * @throws IOException
   *           if the output cannot be written to.
   * @throws IndexException
   *           if no document renderer is available.
   */
  public void renderDocument(long docID, List<Binding> hits,
      Appendable output) throws IOException, IndexException {
    DocumentRenderer docRenderer = indexConfig.getDocumentRenderer();
    if(docRenderer == null) {
      throw new IndexException(
          "No document renderer is configured for this index!");
    }
    docRenderer.render(index.getDocumentData(docID), hits, output);
  }

  /**
   * Gets the title of a document, as stored at indexing time.
   */
  public String getDocumentTitle(long docID) throws IndexException {
    try {
      return index.getDocumentData(docID).getDocumentTitle();
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Gets the URI of a document, as stored at indexing time.
   */
  public String getDocumentURI(long docID) throws IndexException {
    try {
      return index.getDocumentData(docID).getDocumentURI();
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Obtains an arbitrary document metadata field from the stored document
   * data. {@link DocumentMetadataHelper}s used at indexing time can add
   * arbitrary {@link Serializable} values as metadata fields for the
   * documents being indexed. This method is used at search time to retrieve
   * those values.
   *
   * @param docID the ID of the document for which the metadata is sought.
   * @param fieldName the name of the metadata field to be obtained.
   * @return the de-serialised value stored at indexing time for the given
   *         field name and document.
   * @throws IndexException
   *           if the document data cannot be accessed.
   */
  public Serializable getDocumentMetadataField(long docID, String fieldName)
      throws IndexException {
    try {
      return index.getDocumentData(docID).getMetadataField(fieldName);
    } catch(IOException e) {
      throw new IndexException(e);
    }
  }

  /**
   * Closes this {@link QueryEngine} and releases all resources.
   */
  public void close() {
    // close all active query runners
    List<QueryRunner> runnersCopy =
        new ArrayList<QueryRunner>(activeQueryRunners);
    for(QueryRunner aRunner : runnersCopy) {
      try {
        logger.debug("Closing query runner: " + aRunner.toString());
        aRunner.close();
      } catch(IOException e) {
        // log and ignore
        logger.error("Exception while closing query runner.", e);
      }
    }
  }
}
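// Worked example of the getText() padding rules (hypothetical document whose
// tokens are ["The", "cat", "sat"]):
//
//   engine.getText(doc, -1, 3); // tokens {null, "The", "cat"}: length >= 0,
//                               // so exactly 3 slots, with null for the
//                               // non-existent position -1
//   engine.getText(doc, 1, -1); // tokens {"cat", "sat"}: negative length
//                               // returns everything from position 1 to the
//                               // document end, with no trailing nulls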