IndexConfig.java example

Explorer

mimir-master
- etc
  - generated-header.java
- mimir-client
  - src
    - gate
      - mimir
        index
        MimirConnector.java
        MimirIndexingPR.java
        search
        RemoteQueryRunner.java
        tool
        WebUtils.java
- mimir-cloud
  - archive-unpacker
    - src
      - gate
        mimir
        util
        MultiFileInputStream.java
        UnpackWizard.java
  - src
    - java
      - gate
        mimir
        util
        IndexArchiveState.java
        MultiFileOutputStream.java
- mimir-core
  - src
    - gate
      - mimir
        AbstractSemanticAnnotationHelper.java
        Constraint.java
        ConstraintType.java
        DocumentMetadataHelper.java
        DocumentRenderer.java
        IndexConfig.java
        MimirIndex.java
        SemanticAnnotationHelper.java
        index
        AtomicAnnotationIndex.java
        AtomicIndex.java
        AtomicTokenIndex.java
        DocumentCollection.java
        DocumentData.java
        GATEDocument.java
        GATEDocumentFactory.java
        IndexException.java
        Mention.java
        OriginalMarkupMetadataHelper.java
        package-info.java
        search
        FederatedQueryRunner.java
        IndexReaderPool.java
        QueryEngine.java
        QueryRunner.java
        RankingQueryRunnerImpl.java
        query
        AbstractIntersectionQueryExecutor.java
        AbstractOverlapQuery.java
        AbstractQueryExecutor.java
        AndQuery.java
        AnnotationQuery.java
        Binding.java
        ConstQuery.java
        ContainsQuery.java
        ExecutorsList.java
        GapQuery.java
        MinusQuery.java
        OrQuery.java
        QueryExecutor.java
        QueryNode.java
        RepeatsQuery.java
        SequenceQuery.java
        TermQuery.java
        WithinQuery.java
        parser
        ParseException.java
        Query.java
        QueryParser.java
        QueryParserConstants.java
        QueryParserTokenManager.java
        SimpleCharStream.java
        Token.java
        TokenMgrError.java
        score
        BindingScorer.java
        DelegatingScoringQueryExecutor.java
        MimirScorer.java
        terms
        AbstractCompoundTermsQuery.java
        AbstractDocumentsBasedTermsQuery.java
        AbstractIndexTermsQuery.java
        AndTermsQuery.java
        AnnotationTermsQuery.java
        CompoundTermsQuery.java
        ConstTermsQuery.java
        DocumentTermsQuery.java
        DocumentsAndTermsQuery.java
        DocumentsBasedTermsQuery.java
        DocumentsOrTermsQuery.java
        LimitTermsQuery.java
        OrTermsQuery.java
        SortedTermsQuery.java
        TermTypeTermsQuery.java
        TermsQuery.java
        TermsResultSet.java
        util
        DefaultMentionDescriber.java
        DelegatingSemanticAnnotationHelper.java
        DocumentFeaturesMetadataHelper.java
        IgnoreEmptiesTermProcessor.java
        IndexUpgrader.java
        MG4JTools.java
        NormalizingTermProcessor.java
        OntologyMentionDescriber.java
        TruncateIndex.java
- mimir-test
  - src
    - gate
      - mimir
        test
        QueryTests.java
        RenderZipCollection.java
        Scratch.java
        ScratchConsole.java
        TestQueryParser.java
        TestUtils.java
- mimir-web
  - src
    - gwt
      - gate
        mimir
        web
        client
        UI.java
    - java
      - gate
        mimir
        util
        LogAnalyser.java
        web
        client
        DocumentData.java
        GwtRpcService.java
        GwtRpcServiceAsync.java
        MimirSearchException.java
        ResultsData.java
- plugins
  - db-h2
    - src
      - gate
        mimir
        db
        AnnotationTemplateCache.java
        DBSemanticAnnotationHelper.java
  - measurements
    - src
      - gate
        mimir
        measurements
        MeasurementAnnotationHelper.java
        MeasurementPluginResource.java
  - sparql
    - src
      - gate
        mimir
        sparql
        RequestMethod.java
        SPARQLResultSet.java
        SPARQLSemanticAnnotationHelper.java

/*
 *  IndexConfig.java
 *
 *  Copyright (c) 2007-2011, The University of Sheffield.
 *
 *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), 
 *  and is free software, licenced under the GNU Lesser General Public License,
 *  Version 3, June 2007 (also included with this distribution as file
 *  LICENCE-LGPL3.html).
 *
 * Valentin Tablan, 18 Feb 2009
 *
 *  $Id$
 */
package gate.mimir;

import gate.Gate;
import gate.mimir.index.IndexException;
import it.unimi.di.big.mg4j.index.NullTermProcessor;
import it.unimi.di.big.mg4j.index.TermProcessor;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.io.Writer;
import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.HierarchicalStreamReader;
import com.thoughtworks.xstream.io.HierarchicalStreamWriter;
import com.thoughtworks.xstream.io.xml.PrettyPrintWriter;
import com.thoughtworks.xstream.io.xml.QNameMap;
import com.thoughtworks.xstream.io.xml.StaxDriver;
import com.thoughtworks.xstream.io.xml.StaxReader;

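/**
 * Holds the configuration of a Mímir index: the index directory, the token and
 * semantic annotation indexers, the document metadata helpers, the document
 * renderer and a map of free-form options. Instances can be saved to, and
 * loaded from, XML files.
 */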
public class IndexConfig implements Serializable {
  
  /**
   * Common base for the per-indexer configuration objects. It only records
   * whether a direct index was requested for the indexer.
   */
  public static class IndexerConfig implements Serializable {

    /**
     * Serialisation ID.
     */
    private static final long serialVersionUID = -3980825689154182192L;

    /**
     * Set to <code>true</code> when a direct index should be built as well.
     */
    private boolean directIndexEnabled;

    /**
     * Creates a new indexer configuration.
     *
     * @param directIndexEnabled should a direct index also be built?
     */
    public IndexerConfig(boolean directIndexEnabled) {
      this.directIndexEnabled = directIndexEnabled;
    }

    /**
     * Tells whether a direct index should be built in addition to the regular
     * one.
     *
     * @return <code>true</code> if a direct index was requested.
     */
    public boolean isDirectIndexEnabled() {
      return this.directIndexEnabled;
    }

  }
  
  /**
   * Configuration for a single Token indexer: which Token feature gets
   * indexed, and which term processor is applied.
   */
  public static class TokenIndexerConfig extends IndexerConfig {
    /**
     * Serialisation ID.
     */
    private static final long serialVersionUID = 1868954146230945676L;

    /**
     * Name of the feature, on Token annotations, whose values get indexed.
     */
    private String featureName;

    /**
     * Term processor applied by this indexer; the constructor never leaves it
     * <code>null</code>.
     */
    private TermProcessor termProcessor;

    /**
     * Builds the configuration for a token indexer.
     * 
     * @param featureName
     *          the name of the feature (on Token annotations) that needs to be
     *          indexed.
     * @param termProcessor
     *          the {@link TermProcessor} to be used by this indexer; when
     *          <code>null</code>, the shared {@link NullTermProcessor}
     *          instance is used instead.
     * @param directIndexEnabled should a direct index also be built?
     */
    public TokenIndexerConfig(String featureName, TermProcessor termProcessor, 
                              boolean directIndexEnabled) {
      super(directIndexEnabled);
      this.featureName = featureName;
      if(termProcessor != null) {
        this.termProcessor = termProcessor;
      } else {
        // no processor supplied: fall back to the no-op implementation
        this.termProcessor = NullTermProcessor.getInstance();
      }
    }

    /**
     * Returns the name of the Token feature indexed by this token indexer.
     * 
     * @return the feature name.
     */
    public String getFeatureName() {
      return this.featureName;
    }

    /**
     * Returns the {@link TermProcessor} that this token indexer must use.
     * 
     * @return the term processor.
     */
    public TermProcessor getTermProcessor() {
      return this.termProcessor;
    }
  }

  /**
   * Configuration for a single semantic annotation indexer: the annotation
   * types it covers and the helper used for each of them.
   */
  public static class SemanticIndexerConfig extends IndexerConfig {
    /**
     * Serialisation ID.
     */
    private static final long serialVersionUID = -8714423642897958538L;

    /**
     * Annotation types handled by this indexer.
     */
    private String[] annotationTypes;

    /**
     * The {@link SemanticAnnotationHelper}s used by this indexer, positionally
     * matched with {@link #annotationTypes}.
     */
    private SemanticAnnotationHelper[] helpers;

    /**
     * Builds the configuration for a semantic annotation indexer. Both arrays
     * are expected to have the same length: the helper found at index
     * <code>i</code> indexes the annotations whose type is found at index
     * <code>i</code> of the types array. The arrays are stored as given (no
     * copy is made, and the lengths are not checked here).
     * 
     * @param annotationTypes
     *          the types of the annotations that need to be indexed by this
     *          indexer.
     * @param helpers
     *          the {@link SemanticAnnotationHelper}s used by this indexer.
     * @param directIndexEnabled should a direct index also be built?
     */
    public SemanticIndexerConfig(String[] annotationTypes,
            SemanticAnnotationHelper[] helpers, boolean directIndexEnabled) {
      super(directIndexEnabled);
      this.helpers = helpers;
      this.annotationTypes = annotationTypes;
    }

    /**
     * Returns the annotation types indexed by this indexer.
     * 
     * @return the array of annotation types (the stored array, not a copy).
     */
    public String[] getAnnotationTypes() {
      return this.annotationTypes;
    }

    /**
     * Returns the {@link SemanticAnnotationHelper}s used to index annotations.
     * 
     * @return the array of helpers (the stored array, not a copy).
     */
    public SemanticAnnotationHelper[] getHelpers() {
      return this.helpers;
    }
  }

  /**
   * Serialisation ID.
   */
  private static final long serialVersionUID = -8127630936829037489L;
  
  /**
   * The current format version for the XML files containing serialisations of 
   * IndexConfig instances. Newly constructed instances are stamped with this
   * value, and config files declaring a higher version are rejected on load.
   * Version numbers:
   * <dl>
   * <dt>4</dt><dd>First version number used. Indexes previous to this did not 
   * save their version.</dd>
   * <dt>5</dt><dd>Mimir indexes are now built with MG4J-big (64 bits).</dd>
   * <dt>6</dt><dd>Added support for direct indexes.</dd>
   * <dt>7</dt><dd>Mímir 5.0 live index.</dd>
   * </dl>
   */
  public static final int FORMAT_VERSION = 7;

  /**
   * The default feature name for obtaining document URIs (provided as features
   * on documents).
   */
  public static final String DOCUMENT_URI_FEATURE_DEFAULT_NAME =
          "gate.mimir.uri";
  
  /**
   * The default value for {@link #timeBetweenBatches}: 1 hour, expressed in
   * milliseconds.
   */
  public static final int DEFAULT_TIME_BETWEEN_BATCHES = 3600 * 1000;
  
  
  /**
   * The default value for {@link #maximumBatches}.
   */
  public static final int DEFAULT_MAXIMUM_BATCHES = 20;
  
  /**
   * A Map storing values that need to be passed between the various pluggable
   * components used by this index (e.g. ORDI-based annotation helpers may
   * pass references to the ORDI Factory between each other). The field is
   * transient and is created on demand by {@link #getContext()}.
   */
  private transient Map<String, Object> context;
  

  
  /**
   * Gets the map used for passing values between the various pluggable elements
   * in this index (such as annotation helpers). The returned map is live, 
   * meaning that all changes made to it are available to all other clients 
   * requesting it.
   * <p>
   * The map is created on first use. This method is synchronized so that
   * concurrent first calls cannot each create (and hand out) a different map,
   * which would break the "live, shared" guarantee above.
   * 
   * @return a {@link Map}, with {@link String} keys and arbitrary values;
   *         never <code>null</code>.
   */
  public synchronized Map<String, Object> getContext() {
    // lazy creation: the field is transient, so it starts out null on any
    // instance that was not given a context yet
    if(context == null) {
      context = Collections.synchronizedMap(new HashMap<String, Object>());
    }
    return context;
  }

  /**
   * Builds a new index configuration. The format version is set to
   * {@link #FORMAT_VERSION} and the options map starts out empty.
   * 
   * @param indexDirectory
   *          the top level directory to be used for storing the index.
   * @param tokenAnnotationSetName
   *          the name for the annotation set where token annotations can be
   *          found. Use <tt>null</tt> for the default annotation set.
   * @param tokenAnnotationType
   *          the type of annotations used as tokens.
   * @param semanticAnnotationSetName
   *          the name for the annotation set where semantic annotations should
   *          be collected from.
   * @param tokenIndexers
   *          an array of {@link TokenIndexerConfig} values, describing the
   *          configuration for the indexing of each token feature.
   * @param semanticIndexers
   *          an array of {@link SemanticIndexerConfig} values, describing the
   *          configuration for indexing semantic annotations.
   * @param docMetadataHelpers
   *          the helpers used for generating document metadata.
   * @param documentRenderer
   *          the renderer used for displaying documents and hits.
   */
  public IndexConfig(File indexDirectory, String tokenAnnotationSetName,
          String tokenAnnotationType, String semanticAnnotationSetName,
          TokenIndexerConfig[] tokenIndexers,
          SemanticIndexerConfig[] semanticIndexers,
          DocumentMetadataHelper[] docMetadataHelpers,
          DocumentRenderer documentRenderer) {
    // location and format
    this.formatVersion = FORMAT_VERSION;
    this.indexDirectory = indexDirectory;
    // token-related settings
    this.tokenAnnotationType = tokenAnnotationType;
    this.tokenAnnotationSetName = tokenAnnotationSetName;
    this.tokenIndexers = tokenIndexers;
    // semantic-annotation-related settings
    this.semanticIndexers = semanticIndexers;
    this.semanticAnnotationSetName = semanticAnnotationSetName;
    // document-level helpers
    this.documentRenderer = documentRenderer;
    this.docMetadataHelpers = docMetadataHelpers;
    // free-form options start out empty
    this.options = new HashMap<String, String>();
  }

  
  
  /**
   * Returns the format version recorded for this configuration.
   * 
   * @return the format version; see {@link #FORMAT_VERSION} for the meaning
   *         of the individual values.
   */
  public int getFormatVersion() {
    return this.formatVersion;
  }

  /**
   * Changes the format version recorded for this configuration.
   * 
   * @param formatVersion
   *          the new format version; see {@link #FORMAT_VERSION} for the
   *          meaning of the individual values.
   */
  public void setFormatVersion(int formatVersion) {
    this.formatVersion = formatVersion;
    return;
  }

  /**
   * Returns the top level directory used for storing the index.
   * 
   * @return the index directory, as a {@link File}.
   */
  public File getIndexDirectory() {
    return this.indexDirectory;
  }

  /**
   * Returns the type of the annotations that are used as tokens.
   * 
   * @return the token annotation type.
   */
  public String getTokenAnnotationType() {
    return this.tokenAnnotationType;
  }

  /**
   * Returns the name of the annotation set in which token annotations are
   * looked up.
   * 
   * @return the token annotation set name, exactly as supplied to the
   *         constructor.
   */
  public String getTokenAnnotationSetName() {
    return this.tokenAnnotationSetName;
  }

  /**
   * Returns the configurations of all the token indexers.
   * 
   * @return the array of {@link TokenIndexerConfig} values held by this
   *         object (not a copy).
   */
  public TokenIndexerConfig[] getTokenIndexers() {
    return this.tokenIndexers;
  }

  /**
   * Returns the name of the annotation set from which semantic annotations
   * are collected.
   * 
   * @return the semantic annotation set name.
   */
  public String getSemanticAnnotationSetName() {
    return this.semanticAnnotationSetName;
  }

  /**
   * Returns the configurations of all the semantic annotation indexers.
   * 
   * @return the array of {@link SemanticIndexerConfig} values held by this
   *         object (not a copy).
   */
  public SemanticIndexerConfig[] getSemanticIndexers() {
    return this.semanticIndexers;
  }
  
  /**
   * Returns the time interval (in milliseconds) between saving one batch and
   * the next. This is the longest time documents submitted for indexing are
   * kept in RAM (and are thus not searchable).
   * 
   * Defaults to {@value #DEFAULT_TIME_BETWEEN_BATCHES}.
   * 
   * @return the interval between batches, in milliseconds.
   */
  public int getTimeBetweenBatches() {
    return this.timeBetweenBatches;
  }

  /**
   * Changes the time interval (in milliseconds) between saving one batch and
   * the next. This is the longest time documents submitted for indexing are
   * kept in RAM (and are thus not searchable).
   * 
   * Defaults to {@value #DEFAULT_TIME_BETWEEN_BATCHES}.
   * 
   * @param timeBetweenBatches
   *          the new interval between batches, in milliseconds.
   */
  public void setTimeBetweenBatches(int timeBetweenBatches) {
    this.timeBetweenBatches = timeBetweenBatches;
    return;
  }

  /**
   * Returns the number of on-disk index batches that may accumulate before an
   * index compaction is triggered.
   * 
   * Defaults to {@value #DEFAULT_MAXIMUM_BATCHES}.
   * 
   * @return the maximum number of batches.
   */
  public int getMaximumBatches() {
    return this.maximumBatches;
  }

  
  /**
   * Changes the number of on-disk index batches that may accumulate before an
   * index compaction is triggered.
   * 
   * Defaults to {@value #DEFAULT_MAXIMUM_BATCHES}.
   * 
   * @param maximumBatches
   *          the new maximum number of batches.
   */
  public void setMaximumBatches(int maximumBatches) {
    this.maximumBatches = maximumBatches;
    return;
  }

  /**
   * Gets the options map - a Map with arbitrary configuration options, which 
   * is made available to all sub-elements of this index (e.g. the various 
   * annotation helpers). The returned map is the live one held by this
   * object, not a copy.
   * 
   * @return the options map; never <code>null</code>.
   */
  public Map<String, String> getOptions() {
    // Within this class the field is only assigned by the constructor, so an
    // instance that was materialised without running it (presumably the case
    // for configs unmarshalled from XML lacking an options element - verify
    // against the XStream reflection provider in use) may still hold null.
    if(options == null) {
      options = new HashMap<String, String>();
    }
    return options;
  }

  /**
   * Returns the renderer that is used when displaying documents and hits.
   * 
   * @return the document renderer.
   */
  public DocumentRenderer getDocumentRenderer() {
    return this.documentRenderer;
  }

  /**
   * Replaces the renderer that is used when displaying documents and hits.
   * 
   * @param documentRenderer
   *          the new document renderer.
   */
  public void setDocumentRenderer(DocumentRenderer documentRenderer) {
    this.documentRenderer = documentRenderer;
    return;
  }

  /**
   * Returns the helpers used for generating document metadata.
   * 
   * @return the array of {@link DocumentMetadataHelper}s held by this object
   *         (not a copy).
   */
  public DocumentMetadataHelper[] getDocMetadataHelpers() {
    return this.docMetadataHelpers;
  }

  /**
   * Returns the name of the document feature that holds the document URI.
   * Unless changed, this is {@link #DOCUMENT_URI_FEATURE_DEFAULT_NAME}.
   * 
   * @return the document URI feature name.
   */
  public String getDocumentUriFeatureName() {
    return this.documentUriFeatureName;
  }

  /**
   * Changes the name of the document feature that holds the document URI.
   * 
   * @param documentUriFeatureName
   *          the new document URI feature name.
   */
  public void setDocumentUriFeatureName(String documentUriFeatureName) {
    this.documentUriFeatureName = documentUriFeatureName;
    return;
  }

  /**
   * Builds an {@link XStream} instance configured for reading and writing
   * Mimir index configurations: StAX-based, using the GATE class loader, with
   * short element names for the config classes and package aliases for the
   * MG4J package names found in older index configurations.
   * 
   * @return a freshly configured {@link XStream}.
   */
  private static XStream newXStream() {
    XStream xstream = new XStream(new StaxDriver());
    // classes referenced by a config must be resolvable through GATE
    xstream.setClassLoader(Gate.getClassLoader());

    // short element names for the configuration classes
    xstream.alias("indexConfig", IndexConfig.class);
    xstream.alias("tokenIndexer", TokenIndexerConfig.class);
    xstream.alias("semanticIndexer", SemanticIndexerConfig.class);

    // Package aliases; the registration order is kept as-is on purpose.
    // Old indexes use the package name without the '.big.' segment...
    xstream.aliasPackage("it.unimi.dsi.mg4j", "it.unimi.di.big.mg4j");
    // ...and pre-5.0 indexes use the 'dsi' package name.
    xstream.aliasPackage("it.unimi.dsi.big.mg4j", "it.unimi.di.big.mg4j");
    return xstream;
  }

  /**
   * Saves an {@link IndexConfig} object to a file via XML serialisation.
   * <p>
   * The file is written as UTF-8: the output carries no XML declaration, so
   * that is the encoding an XML parser will assume when reading it back. The
   * underlying writer is always closed, even when serialisation fails.
   * 
   * @param config
   *          the object to be saved.
   * @param file
   *          the file to write to.
   * @throws IOException
   *           if the file cannot be opened, written or closed.
   */
  public static void writeConfigToFile(IndexConfig config, File file)
          throws IOException {
    Writer fileWriter = new OutputStreamWriter(
            new FileOutputStream(file), "UTF-8");
    try {
      HierarchicalStreamWriter xmlWriter = new PrettyPrintWriter(fileWriter);
      newXStream().marshal(config, xmlWriter);
      // push anything still buffered by the XML writer down to the file
      xmlWriter.flush();
    } finally {
      // releases the file handle on both the success and the failure path
      fileWriter.close();
    }
  }

  /**
   * Reads an index configuration from a file, which should have been produced
   * by {@link #writeConfigToFile(IndexConfig, File)}. The file is converted to
   * a URL and the work is delegated to {@link #readConfigFromUrl(URL)}.
   * 
   * @param file
   *          the file to read.
   * @return an {@link IndexConfig} object.
   * @throws IOException
   *           if the provided config file cannot be found.
   * @throws IndexException
   *           if the parsing of the config file fails.
   */
  public static IndexConfig readConfigFromFile(File file) throws IOException,
          IndexException {
    URL fileUrl = file.toURI().toURL();
    return readConfigFromUrl(fileUrl);
  }

  /**
   * Loads an index config object from a URL. The file should have been created
   * using the {@link #writeConfigToFile(IndexConfig, File)} method.
   * <p>
   * The input stream is closed on every path, including failures while the
   * XML reader is being set up. DTD processing and external entity resolution
   * are switched off, as the XML comes from an arbitrary URL and config files
   * written by this class contain neither.
   * 
   * @param u
   *          the URL to read.
   * @return an {@link IndexConfig} object.
   * @throws IOException
   *           if the provided config file cannot be found.
   * @throws IndexException
   *           if the parsing of the config file fails.
   * @throws UnsupportedOperationException
   *           if the config declares a format version greater than
   *           {@link #FORMAT_VERSION}.
   */
  public static IndexConfig readConfigFromUrl(URL u) throws IOException,
          IndexException {
    InputStream configStream = new BufferedInputStream(u.openStream());
    try {
      XMLInputFactory inputFactory = XMLInputFactory.newInstance();
      // guard against XXE: no external entities, no DTDs
      inputFactory.setProperty(
              XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
      inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
      XMLStreamReader xsr =
              inputFactory.createXMLStreamReader(configStream);
      HierarchicalStreamReader xmlReader = new StaxReader(new QNameMap(), xsr);
      try {
        IndexConfig theConfig = (IndexConfig)newXStream().unmarshal(xmlReader);
        // check the version number
        if(theConfig.formatVersion > FORMAT_VERSION){
          throw new UnsupportedOperationException(
            "The version of the IndexConfig at \"" + u.toExternalForm() + 
            "\" is greater than the maximum supported version by this Mímir " +
            "implementation (" + theConfig.formatVersion + " > " + FORMAT_VERSION +
            ").");
        }
        return theConfig;
      } finally {
        xmlReader.close();
      }
    } catch(XMLStreamException e) {
      throw new IndexException("Exception while reading config from " + u, e);
    } finally {
      // outermost finally: runs even if the XML reader could not be created,
      // or if closing the XML reader itself failed
      configStream.close();
    }
  }

  /**
   * Reads an index configuration from a file and then replaces the index
   * directory stored in it with the one supplied by the caller. This is
   * useful if the index was created on one machine but is being used on
   * another.
   * 
   * @param configFile
   *          the file to read
   * @param indexDir
   *          the top-level index directory, which will be used instead of the
   *          value stored in the config file.
   * @return the loaded {@link IndexConfig}, with its index directory set to
   *         <code>indexDir</code>.
   * @throws IOException
   *           if the provided config file cannot be found.
   * @throws IndexException
   *           if the parsing of the config file fails.
   */
  public static IndexConfig readConfigFromFile(File configFile, File indexDir)
          throws IOException, IndexException {
    IndexConfig loaded = readConfigFromFile(configFile);
    // the field is private, but we are inside IndexConfig, so direct
    // assignment is allowed
    loaded.indexDirectory = indexDir;
    return loaded;
  }

  /**
   * Reads an index configuration from a URL and then replaces the index
   * directory stored in it with the one supplied by the caller. This is
   * useful if the index was created on one machine but is being used on
   * another.
   * 
   * @param configFile
   *          the URL of the config file to read
   * @param indexDir
   *          the top-level index directory, which will be used instead of the
   *          value stored in the config file.
   * @return the loaded {@link IndexConfig}, with its index directory set to
   *         <code>indexDir</code>.
   * @throws IOException
   *           if the provided config file cannot be found.
   * @throws IndexException
   *           if the parsing of the config file fails.
   */
  public static IndexConfig readConfigFromUrl(URL configFile, File indexDir)
          throws IOException, IndexException {
    IndexConfig loaded = readConfigFromUrl(configFile);
    // the field is private, but we are inside IndexConfig, so direct
    // assignment is allowed
    loaded.indexDirectory = indexDir;
    return loaded;
  }

  /**
   * The top level directory of the index. Besides the constructor, the
   * two-argument <code>readConfigFrom...</code> methods assign this field
   * directly, to override the value loaded from the config file.
   */
  private File indexDirectory;

  /**
   * The format version for this index config instance. The constructor sets it
   * to {@link #FORMAT_VERSION}.
   */
  private int formatVersion;
  
  /**
   * The annotation type used for tokens.
   */
  private String tokenAnnotationType;

  /**
   * The annotation set where token annotations can be found.
   */
  private String tokenAnnotationSetName;

  /**
   * The configuration for all the token indexers used.
   */
  private TokenIndexerConfig[] tokenIndexers;

  /**
   * The configuration for all the semantic indexers used.
   */
  private SemanticIndexerConfig[] semanticIndexers;

  /**
   * The helpers used for generating document metadata.
   */
  private DocumentMetadataHelper[] docMetadataHelpers;

  /**
   * The document renderer used to render documents and hits.
   */
  private DocumentRenderer documentRenderer;

  /**
   * The name of the annotation set containing the semantic annotations.
   */
  private String semanticAnnotationSetName;

  /**
   * The name for the document feature containing the document URI. Defaults to
   * {@link #DOCUMENT_URI_FEATURE_DEFAULT_NAME}.
   */
  private String documentUriFeatureName = DOCUMENT_URI_FEATURE_DEFAULT_NAME;
  
  
  /**
   * The maximum amount of time (in milliseconds) between dumping batches to
   * disk, i.e. the maximum amount of time a document may be stored in RAM
   * after having been submitted for indexing and before it becomes searchable.
   * Defaults to {@link #DEFAULT_TIME_BETWEEN_BATCHES}.
   */
  private int timeBetweenBatches = DEFAULT_TIME_BETWEEN_BATCHES;
  
  
  /**
   * The maximum number of constituent batches in any atomic index before a 
   * compact operation is triggered. Defaults to
   * {@link #DEFAULT_MAXIMUM_BATCHES}.
   */
  private int maximumBatches = DEFAULT_MAXIMUM_BATCHES;
  
  /**
   * A Map with arbitrary configuration options, which is made available to all
   * sub-elements of this index (e.g. the various annotation helpers). The
   * constructor initialises it to an empty map.
   */
  private Map<String, String> options;
}