/*
 * AtomicIndex.java
 *
 * Copyright (c) 2007-2013, The University of Sheffield.
 *
 * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
 * and is free software, licenced under the GNU Lesser General Public License,
 * Version 3, June 2007 (also included with this distribution as file
 * LICENCE-LGPL3.html).
 *
 * Valentin Tablan, 1 Nov 2013
 *
 * $Id$
 */
package gate.mimir.index;

import gate.Annotation;
import gate.mimir.MimirIndex;
import gate.mimir.search.IndexReaderPool.IndexDictionary;
import gate.util.GateRuntimeException;
import it.unimi.di.big.mg4j.index.BitStreamIndex;
import it.unimi.di.big.mg4j.index.CompressionFlags;
import it.unimi.di.big.mg4j.index.CompressionFlags.Coding;
import it.unimi.di.big.mg4j.index.CompressionFlags.Component;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.Index.UriKeys;
import it.unimi.di.big.mg4j.index.IndexIterator;
import it.unimi.di.big.mg4j.index.IndexReader;
import it.unimi.di.big.mg4j.index.IndexWriter;
import it.unimi.di.big.mg4j.index.NullTermProcessor;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
import it.unimi.di.big.mg4j.index.SkipBitStreamIndexWriter;
import it.unimi.di.big.mg4j.index.TermProcessor;
import it.unimi.di.big.mg4j.index.cluster.ContiguousDocumentalStrategy;
import it.unimi.di.big.mg4j.index.cluster.ContiguousLexicalStrategy;
import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
import it.unimi.di.big.mg4j.index.cluster.DocumentalConcatenatedCluster;
import it.unimi.di.big.mg4j.index.cluster.LexicalCluster;
import it.unimi.di.big.mg4j.io.IOFactory;
import it.unimi.di.big.mg4j.tool.Combine;
import it.unimi.di.big.mg4j.tool.Combine.IndexType;
import it.unimi.di.big.mg4j.tool.Concatenate;
import it.unimi.di.big.mg4j.tool.Scan;
import it.unimi.dsi.big.io.FileLinesCollection;
import it.unimi.dsi.big.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.big.util.StringMap;
import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.Swapper;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntBigArrayBigList;
import it.unimi.dsi.fastutil.ints.IntBigList;
import it.unimi.dsi.fastutil.ints.IntComparator;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.objects.Object2LongAVLTreeMap;
import it.unimi.dsi.fastutil.objects.Object2LongMap;
import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectBigArrayBigList;
import it.unimi.dsi.fastutil.objects.ObjectBigList;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.util.Properties;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.RunnableFuture;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;

import com.google.common.io.PatternFilenameFilter;

/**
 * <p>
 * An inverted index associating terms with documents. Terms can be either
 * token feature values, or annotations. Optionally, a direct index may also
 * be present.
 * </p>
 * <p>
 * An atomic index manages a head index (the principal data) and a set of tail
 * indexes (batches containing updates). Additionally, the data representing
 * all the new documents that have been queued for indexing since the last
 * tail was written are stored in RAM.
 * </p>
 * <p>
 * When direct indexing is enabled, the term IDs in the direct index are
 * different from the term IDs in the inverted index. In the inverted index
 * the term IDs are their position in the lexicographically sorted list of all
 * terms. In the direct index, the term IDs are their position in the list
 * sorted by the time they were first seen during indexing.
 * </p>
 * <p>
 * The head and tail batches can be combined into a new head by a
 * <em>compact</em> operation.
 * </p>
 */
public abstract class AtomicIndex implements Runnable {

  /**
   * A callable that does nothing. This is used to produce instances of
   * {@link CustomFuture} which are only used to wait for the completion of an
   * operation that returns nothing.
   */
  private static final Callable<Void> noOpVoid = new Callable<Void>() {
    @Override
    public Void call() throws Exception {
      return null;
    }
  };

  /**
   * An in-RAM representation of a postings list.
   */
  protected static class PostingsList {

    /**
     * The first document pointer added to this postings list.
     */
    private long firstDocumentPointer = -1;

    /**
     * The last seen document pointer.
     */
    private long lastDocumentPointer = -1;

    /**
     * The list of document pointer differentials (differences from the
     * previous document pointer). The first document pointer value is stored
     * in {@link #firstDocumentPointer}, and we store a <tt>0</tt> in the
     * first position of this list. The actual value of a document pointer for
     * a given position is:
     * <dl>
     * <dt>pos 0</dt>
     * <dd>{@link #firstDocumentPointer}</dd>
     * <dt>pos i</dt>
     * <dd>pointer at position (i-1) + documentPointersDifferential[i]</dd>
     * </dl>
     */
    private IntList documentPointersDifferential;

    /**
     * The count (number of terms) for each document. This list is aligned
     * with {@link #documentPointersDifferential}.
     */
    private IntList counts;

    /**
     * The list of positions in this postings list. For each document at
     * position <tt>i</tt>, there will be counts[i] positions stored in this
     * list. This value is <code>non-null</code> only if positions are stored,
     * which is configured through a construction-time parameter.
     */
    private IntArrayList positions;

    /**
     * The last seen position in the current document.
     */
    private int lastPosition = -1;

    /**
     * The number of positions in the current document.
     */
    private int count = 0;

    /**
     * The maximum term count of all the stored documents.
     */
    private int maxCount = 0;

    /**
     * The number of document pointers contained.
     */
    private long frequency = 0;

    /**
     * The total number of term occurrences in all stored documents.
     */
    private long occurrences = 0;
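    /*
     * Worked example of the encoding above (illustrative numbers only): if a
     * term occurs in documents 3, 7 and 10, then firstDocumentPointer = 3 and
     * documentPointersDifferential = [0, 4, 3]. Reconstructing:
     *   pos 0 -> 3 (firstDocumentPointer)
     *   pos 1 -> 3 + 4 = 7
     *   pos 2 -> 7 + 3 = 10
     * Storing small differences rather than absolute pointers keeps the
     * in-RAM postings compact.
     */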
    /**
     * The sum of the maximum positions for each document.
     */
    private long sumMaxPos = 0;

    public PostingsList(boolean storePositions) {
      firstDocumentPointer = -1;
      documentPointersDifferential = new IntArrayList();
      counts = new IntArrayList();
      if(storePositions) {
        positions = new IntArrayList();
      }
    }

    /**
     * Start storing the data for a new document.
     * @param pointer
     */
    public void newDocumentPointer(long pointer) {
      // is this really a new document?
      if(pointer != lastDocumentPointer) {
        if(firstDocumentPointer < 0) firstDocumentPointer = pointer;
        if(lastDocumentPointer == -1) {
          // this is the first document
          documentPointersDifferential.add(0);
        } else {
          // close previous document
          flush();
          // add the new document
          documentPointersDifferential.add(
              (int)(pointer - lastDocumentPointer));
        }
        lastDocumentPointer = pointer;
        // reset the lastPosition when moving to a new document
        lastPosition = -1;
        frequency++;
      }
    }

    public void addPosition(int pos) {
      // ignore if the position hasn't changed: we don't store two identical
      // records
      if(pos != lastPosition) {
        positions.add(pos);
        count++;
        // and update lastPosition
        lastPosition = pos;
      }
    }

    /**
     * When storing positions, the count is automatically calculated. When not
     * storing positions, it needs to be explicitly set by calling this
     * method.
     * @param count
     */
    public void setCount(int count) {
      this.count = count;
    }

    /**
     * Checks whether the given position is valid (i.e. greater than the last
     * seen position). If the position is invalid, this means that a call to
     * {@link #addPosition(int)} with the same value would actually be a
     * no-operation.
     * @param pos
     * @return
     */
    public boolean checkPosition(int pos) {
      return pos > lastPosition;
    }

    /**
     * Notifies this postings list that it has received all the data.
     */
    public void flush() {
      if(count > 0) {
        // we have some new positions for the last document: they were already
        // added to positions, but we now need to store their count
        counts.add(count);
        if(count > maxCount) maxCount = count;
        sumMaxPos += lastPosition;
        occurrences += count;
      }
      count = 0;
    }

    /**
     * Empties all the data from this postings list, making it ready to be
     * reused.
     */
    public void clear() {
      documentPointersDifferential.clear();
      count = 0;
      counts.clear();
      maxCount = 0;
      occurrences = 0;
      if(positions != null) {
        positions.clear();
        lastPosition = -1;
        sumMaxPos = 0;
      }
      firstDocumentPointer = -1;
      lastDocumentPointer = -1;
      frequency = 0;
    }

    /**
     * Writes the data contained in this postings list to an index writer.
     * @param indexWriter
     * @throws IOException
     */
    public void write(IndexWriter indexWriter) throws IOException {
      flush();
      if(indexWriter instanceof QuasiSuccinctIndexWriter) {
        ((QuasiSuccinctIndexWriter)indexWriter).newInvertedList(
            frequency, occurrences, positions != null ?
            sumMaxPos : 0);
      } else {
        indexWriter.newInvertedList();
      }
      indexWriter.writeFrequency(frequency);
      long currDocumentPointer = firstDocumentPointer;
      int positionsStart = 0;
      for(int docId = 0; docId < documentPointersDifferential.size();
          docId++) {
        currDocumentPointer += documentPointersDifferential.get(docId);
        int currCount = counts.get(docId);
        OutputBitStream obs = indexWriter.newDocumentRecord();
        indexWriter.writeDocumentPointer(obs, currDocumentPointer);
        indexWriter.writePositionCount(obs, currCount);
        if(positions != null) {
          indexWriter.writeDocumentPositions(obs, positions.elements(),
              positionsStart, currCount, -1);
          positionsStart += currCount;
        }
      }
    }

    @Override
    public String toString() {
      StringBuilder str = new StringBuilder();
      long docPointer = firstDocumentPointer;
      int positionsPointer = 0;
      boolean firstDoc = true;
      for(int i = 0; i < documentPointersDifferential.size(); i++) {
        docPointer += documentPointersDifferential.get(i);
        int count = counts.getInt(i);
        if(firstDoc) {
          firstDoc = false;
        } else {
          str.append("; ");
        }
        str.append(docPointer).append("(");
        boolean firstPos = true;
        for(int j = positionsPointer; j < positionsPointer + count; j++) {
          if(firstPos) {
            firstPos = false;
          } else {
            str.append(", ");
          }
          str.append(positions.getInt(j));
        }
        str.append(") ");
      }
      return str.toString();
    }
  }

  /**
   * Class representing an MG4J index batch, such as the head or any of the
   * tails.
   */
  protected static class MG4JIndex {
    protected File indexDir;

    protected Index invertedIndex;

    protected Index directIndex;

    protected BloomFilter<Void> invertedTermFilter;

    protected BloomFilter<Void> directTermFilter;

    protected String indexName;

    public MG4JIndex(File indexDir, String indexName, Index invertedIndex,
            BloomFilter<Void> invertedTermFilter, Index directIndex,
            BloomFilter<Void> directTermFilter) {
      this.indexDir = indexDir;
      this.indexName = indexName;
      this.invertedIndex = invertedIndex;
      this.invertedTermFilter = invertedTermFilter;
      this.directIndex = directIndex;
      this.directTermFilter = directTermFilter;
    }
  }

  /**
   * Given a terms file (text file with one term per line) this method
   * generates the corresponding termmap file (binary representation of a
   * StringMap). Optionally, a {@link BloomFilter} can also be generated, if
   * the suitable target file is provided.
   *
   * @param termsFile the input file
   * @param termmapFile the output termmap file, or <code>null</code> if a
   * termmap is not required.
   * @param bloomFilterFile the file to be used for writing the
   * {@link BloomFilter} for the index, or <code>null</code> if a Bloom filter
   * is not required.
   * @throws IOException
   */
  public static void generateTermMap(File termsFile, File termmapFile,
          File bloomFilterFile) throws IOException {
    FileLinesCollection fileLinesCollection =
        new FileLinesCollection(termsFile.getAbsolutePath(), "UTF-8");
    if(termmapFile != null) {
      StringMap<CharSequence> terms = new ShiftAddXorSignedStringMap(
          fileLinesCollection.iterator(),
          new LcpMonotoneMinimalPerfectHashFunction<CharSequence>(
              fileLinesCollection,
              TransformationStrategies.prefixFreeUtf16()));
      BinIO.storeObject(terms, termmapFile);
    }
    if(bloomFilterFile != null) {
      BloomFilter<Void> bloomFilter =
          BloomFilter.create(fileLinesCollection.size64());
      for(MutableString term : fileLinesCollection) {
        bloomFilter.add(term);
      }
      BinIO.storeObject(bloomFilter, bloomFilterFile);
    }
  }
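  /*
   * A minimal usage sketch for generateTermMap (the file names below are
   * hypothetical, chosen only for illustration):
   *
   *   File terms = new File("/tmp/batch/index.terms"); // one term per line
   *   generateTermMap(terms,
   *       new File("/tmp/batch/index.termmap"), // StringMap: term -> term ID
   *       new File("/tmp/batch/index.bloom"));  // quick membership pre-check
   *
   * Either output file may be passed as null when that artefact is not
   * required, as documented above.
   */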
  /**
   * Creates a documental cluster from a list of {@link MG4JIndex} values.
   *
   * @param batches the indexes to be combined into a cluster
   * @param termProcessor the term processor to be used (can be null)
   * @return a documental cluster view of the list of indexes provided.
   */
  protected final static Index openInvertedIndexCluster(
      List<MG4JIndex> batches, TermProcessor termProcessor) {
    if(batches == null || batches.size() == 0) return null;
    if(batches.size() == 1) return batches.get(0).invertedIndex;
    // prepare the documental cluster
    Index[] indexes = new Index[batches.size()];
    // cut points between the batches - there are numBatches+1 cutpoints,
    // cutPoints[0] is always zero, and cutPoints[i] is the sum of the
    // sizes of batches 0 to i-1 inclusive
    long[] cutPoints = new long[indexes.length + 1];
    cutPoints[0] = 0;
    int numberOfTerms = -1;
    int numberOfDocuments = -1;
    long numberOfPostings = -1;
    long numberOfOccurrences = -1;
    int maxCount = -1;
    int indexIdx = 0;
    IntBigList sizes = new IntBigArrayBigList();
    @SuppressWarnings("unchecked")
    BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
    for(MG4JIndex aSubIndex : batches) {
      indexes[indexIdx] = aSubIndex.invertedIndex;
      cutPoints[indexIdx + 1] = cutPoints[indexIdx] +
          aSubIndex.invertedIndex.numberOfDocuments;
      numberOfTerms += aSubIndex.invertedIndex.numberOfTerms;
      numberOfDocuments += aSubIndex.invertedIndex.numberOfDocuments;
      numberOfPostings += aSubIndex.invertedIndex.numberOfPostings;
      numberOfOccurrences += aSubIndex.invertedIndex.numberOfOccurrences;
      if(maxCount < aSubIndex.invertedIndex.maxCount) {
        maxCount = aSubIndex.invertedIndex.maxCount;
      }
      bloomFilters[indexIdx] = aSubIndex.invertedTermFilter;
      sizes.addAll(aSubIndex.invertedIndex.sizes);
      indexIdx++;
    }
    return new DocumentalConcatenatedCluster(indexes,
        new ContiguousDocumentalStrategy(cutPoints),
        false, // flat = all component indexes have the same term list
        bloomFilters, // Bloom filters
        numberOfDocuments,
        numberOfTerms,
        numberOfPostings,
        numberOfOccurrences,
        maxCount,
        null, // payload
        true, // hasCounts
        true, // hasPositions
        termProcessor,
        null, // field
        sizes, // sizes
        null // properties
    );
  }
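  /*
   * Illustration of the cut-points arithmetic above: for three batches
   * holding 100, 50 and 25 documents, cutPoints = {0, 100, 150, 175}. A
   * global document ID d is served by the sub-index i for which
   * cutPoints[i] <= d < cutPoints[i + 1], with d - cutPoints[i] as the local
   * document ID; that translation is what ContiguousDocumentalStrategy
   * provides to the cluster.
   */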
  /**
   * Opens the direct index files from all the batches and combines them into
   * a {@link LexicalCluster}.
   * @param batches the batches to be opened.
   * @return
   */
  protected final static Index openDirectIndexCluster(
      List<MG4JIndex> batches) {
    if(batches == null || batches.size() == 0) return null;
    if(batches.size() == 1) return batches.get(0).directIndex;
    // prepare the lexical cluster
    Index[] indexes = new Index[batches.size()];
    int[] cutPoints = new int[indexes.length];
    cutPoints[0] = 0;
    String[] cutPointTerms = new String[indexes.length];
    cutPointTerms[0] = longToTerm(0);
    int numberOfTerms = -1;
    int numberOfDocuments = -1;
    long numberOfPostings = -1;
    long numberOfOccurrences = -1;
    int maxCount = -1;
    int indexIdx = 0;
    @SuppressWarnings("unchecked")
    BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
    for(MG4JIndex aSubIndex : batches) {
      indexes[indexIdx] = aSubIndex.directIndex;
      // we build this based on the inverted index, as the cut-points for the
      // lexical partitioning are based on document IDs
      if(indexIdx < cutPoints.length - 1) {
        cutPoints[indexIdx + 1] = cutPoints[indexIdx] +
            (int)aSubIndex.invertedIndex.numberOfDocuments;
        cutPointTerms[indexIdx + 1] = longToTerm(cutPoints[indexIdx + 1]);
      }
      numberOfTerms += aSubIndex.directIndex.numberOfTerms;
      numberOfDocuments += aSubIndex.directIndex.numberOfDocuments;
      numberOfPostings += aSubIndex.directIndex.numberOfPostings;
      numberOfOccurrences += aSubIndex.directIndex.numberOfOccurrences;
      if(maxCount < aSubIndex.directIndex.maxCount) {
        maxCount = aSubIndex.directIndex.maxCount;
      }
      bloomFilters[indexIdx] = aSubIndex.directTermFilter;
      indexIdx++;
    }
    cutPointTerms[cutPointTerms.length - 1] = null;
    return new LexicalCluster(indexes,
        new ContiguousLexicalStrategy(cutPoints, cutPointTerms),
        bloomFilters, // Bloom filters
        numberOfDocuments,
        numberOfTerms,
        numberOfPostings,
        numberOfOccurrences,
        maxCount,
        null, // payload
        true, // hasCounts
        false, // hasPositions
        NullTermProcessor.getInstance(),
        null, // field
        null, // sizes
        null // properties
    );
  }

  /**
   * Converts a long value into a String containing a zero-padded Hex
   * representation of the input value. The lexicographic ordering of the
   * generated strings is the same as the natural order of the corresponding
   * long values.
   *
   * @param value the value to convert.
   * @return the string representation.
   */
  public static final String longToTerm(long value) {
    String valueStr = Long.toHexString(value);
    return "0000000000000000".substring(valueStr.length()) + valueStr;
  }
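  /*
   * For example, longToTerm(255) returns "00000000000000ff" and
   * longToTerm(4096) returns "0000000000001000". Because every value is
   * zero-padded to 16 hex digits, sorting the strings lexicographically
   * yields the same order as sorting the original long values numerically,
   * which is what lets document IDs pose as index terms.
   */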
  /**
   * The file name (under the current directory for this atomic index) which
   * stores the principal index.
   */
  public static final String HEAD_FILE_NAME = "head";

  /**
   * The file extension used for the temporary directory where the updated
   * head is being built.
   */
  public static final String HEAD_NEW_EXT = ".new";

  /**
   * The file extension used for the temporary directory where the old head
   * index is being stored while the newly updated one is being installed.
   */
  public static final String HEAD_OLD_EXT = ".old";

  /**
   * The prefix used for file names (under the current directory for this
   * atomic index) for updates to the head index.
   */
  public static final String TAIL_FILE_NAME_PREFIX = "tail-";

  public static final String DIRECT_TERMS_FILENAME = "direct.terms";

  /**
   * Files belonging to the direct index get this suffix added to their
   * basename.
   */
  public static final String DIRECT_INDEX_NAME_SUFFIX = "-dir";

  /**
   * The file name (under the current directory for this atomic index) for the
   * directory containing the documents that have been queued for indexing,
   * but not yet indexed.
   */
  public static final String DOCUMENTS_QUEUE_FILE_NAME = "queued-documents";

  /** The initial size of the term map. */
  private static final int INITIAL_TERM_MAP_SIZE = 1024;

  /**
   * A marker value that gets queued to indicate a request to write the in-RAM
   * data to a new index batch.
   */
  private static final GATEDocument DUMP_BATCH = new GATEDocument(){};

  /**
   * A marker value that gets queued to indicate a request to combine all the
   * on-disk batches into a new head.
   */
  private static final GATEDocument COMPACT_INDEX = new GATEDocument(){};

  private static Logger logger = Logger.getLogger(AtomicIndex.class);

  protected static final PatternFilenameFilter TAILS_FILENAME_FILTER =
      new PatternFilenameFilter("\\Q" + TAIL_FILE_NAME_PREFIX + "\\E\\d+");

  /**
   * The name of this atomic index.
   */
  protected String name;

  /**
   * The directory where this atomic index stores its files.
   */
  protected File indexDirectory;

  /**
   * The term processor used to process the feature values being indexed.
   */
  protected TermProcessor termProcessor = null;

  /**
   * The size (number of terms) for the longest document indexed but not yet
   * saved.
   */
  protected int maxDocSizeInRAM = -1;

  /**
   * The number of occurrences represented in RAM and not yet written to disk.
   */
  protected long occurrencesInRAM = 0;

  /**
   * The {@link MimirIndex} that this atomic index is a member of.
   */
  protected MimirIndex parent;

  /**
   * A list containing the head and tails of this index.
   */
  protected List<MG4JIndex> batches;

  /**
   * The cluster-view of all the MG4J indexes that are part of this index
   * (i.e. the head and all the tails).
   */
  protected Index invertedIndex;

  /**
   * The direct index for this atomic index. If
   * <code>{@link #hasDirectIndex()}</code> is false, then this index will be
   * <code>null</code>.
   */
  protected Index directIndex;

  /**
   * A set of properties added to the ones obtained from the index writer when
   * writing out batches.
   */
  protected Properties additionalProperties;

  /**
   * A set of properties added to the ones obtained from the direct index
   * writer when writing out batches.
   */
  protected Properties additionalDirectProperties;

  /**
   * Is the direct indexing enabled? Direct indexes are used to find terms
   * occurring in given documents. This is the reverse operation to the
   * typical search, which finds documents containing a given set of terms.
   */
  protected boolean hasDirectIndex;

  /**
   * This map associates direct index terms with their IDs. See the note at
   * the top-level javadocs for this class for a discussion on direct and
   * inverted term IDs.
   */
  protected Object2LongMap<String> directTermIds;

  /**
   * The terms in the direct index, in the order they were first seen during
   * indexing.
   */
  protected ObjectBigList<String> directTerms;

  /**
   * The single thread used to index documents. All writes to the index files
   * are done from this thread.
   */
  protected Thread indexingThread;

  /**
   * Documents to be indexed are queued in this queue.
   */
  protected BlockingQueue<GATEDocument> inputQueue;

  /**
   * Documents that have been indexed are passed on to this queue.
   */
  protected BlockingQueue<GATEDocument> outputQueue;

  /**
   * The position of the current (or most-recently used) token in the current
   * document.
   */
  protected int tokenPosition;

  /**
   * A mutable string used to create instances of MutableString on the cheap.
   */
  protected MutableString currentTerm;

  /**
   * The number of documents currently stored in RAM.
   */
  protected int documentsInRAM;
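  /*
   * Document lifecycle, for orientation (see run() below): the parent
   * MimirIndex puts GATEDocument values on inputQueue; the single indexing
   * thread takes them, updates the in-RAM postings, and passes them on to
   * outputQueue. The DUMP_BATCH and COMPACT_INDEX markers defined above
   * travel through the same input queue, which serialises batch writing and
   * compaction with ordinary document indexing on that one thread.
   */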
  /**
   * An in-memory inverted index that gets dumped to files for each batch.
   */
  protected Object2ReferenceOpenHashMap<MutableString, PostingsList> termMap;

  /**
   * The sizes (numbers of terms) for all the documents indexed in RAM.
   */
  protected IntArrayList documentSizesInRAM;

  /**
   * If a request was made to compact the index (combine all sub-indexes into
   * a new head) this value will be non-null. The operation will be performed
   * on the indexing thread at the first opportunity. At that point this
   * future will complete, and the value will be set back to null.
   */
  protected RunnableFuture<Void> compactIndexTask;

  /**
   * If a request was made to write the in-RAM index data to disk this value
   * will be non-null. The operation will be performed on the indexing thread
   * at the first opportunity. At that point the Future will complete, and the
   * value will be set back to null.
   */
  protected RunnableFuture<Long> batchWriteTask;

  /**
   * Creates a new AtomicIndex.
   *
   * @param parent the {@link MimirIndex} containing this atomic index.
   * @param name the name of the sub-index, e.g. <em>token-i</em> or
   * <em>mentions-j</em>
   * @param hasDirectIndex should a direct index be used?
   * @param termProcessor the term processor used to process the feature
   * values being indexed.
   * @param inputQueue the input queue for documents to be indexed.
   * @param outputQueue the output queue for documents that have been indexed.
   * @throws IndexException
   * @throws IOException
   */
  protected AtomicIndex(MimirIndex parent, String name,
          boolean hasDirectIndex, TermProcessor termProcessor,
          BlockingQueue<GATEDocument> inputQueue,
          BlockingQueue<GATEDocument> outputQueue)
          throws IOException, IndexException {
    this.parent = parent;
    this.name = name;
    this.indexDirectory = new File(parent.getIndexDirectory(), name);
    this.hasDirectIndex = hasDirectIndex;
    this.termProcessor = termProcessor;
    this.inputQueue = inputQueue;
    this.outputQueue = outputQueue;
    this.currentTerm = new MutableString();
    this.additionalProperties = new Properties();
    // save the term processor
    additionalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
        ObjectParser.toSpec(termProcessor));
    if(hasDirectIndex) {
      additionalDirectProperties = new Properties();
      additionalDirectProperties.setProperty(
          Index.PropertyKeys.TERMPROCESSOR,
          ObjectParser.toSpec(NullTermProcessor.getInstance()));
    }
    initIndex();
  }

  /**
   * Opens the index and prepares it for indexing and searching.
   * @throws IndexException
   * @throws IOException
   */
  protected void initIndex() throws IOException, IndexException {
    // open the index
    batches = new ArrayList<AtomicIndex.MG4JIndex>();
    if(indexDirectory.exists()) {
      // opening an existing index
      List<String> batchNames = new ArrayList<String>();
      File headDir = new File(indexDirectory, HEAD_FILE_NAME);
      if(headDir.exists()) {
        batchNames.add(HEAD_FILE_NAME);
      }
      Map<Integer, String> tails = new TreeMap<Integer, String>();
      for(String aTail : indexDirectory.list(TAILS_FILENAME_FILTER)) {
        tails.put(
            Integer.parseInt(aTail.substring(TAIL_FILE_NAME_PREFIX.length())),
            aTail);
      }
      // add the tails in order
      batchNames.addAll(tails.values());
      // modify internal state
      synchronized(this) {
        // load all batches, in order
        for(String batchName : batchNames) {
          batches.add(openSubIndex(batchName));
        }
      }
    } else {
      // new index creation
      indexDirectory.mkdirs();
    }
    synchronized(this) {
      invertedIndex = openInvertedIndexCluster(batches, termProcessor);
    }
    // open direct index
    if(hasDirectIndex) {
      directTerms = new ObjectBigArrayBigList<String>();
      directTermIds = new Object2LongAVLTreeMap<String>();
      directTermIds.defaultReturnValue(-1);
      File directTermsFile = new File(indexDirectory, DIRECT_TERMS_FILENAME);
      if(directTermsFile.exists()) {
        FileLinesCollection fileLines = new FileLinesCollection(
            directTermsFile.getAbsolutePath(), "UTF-8");
        Iterator<MutableString> termsIter = fileLines.iterator();
        long termID = 0;
        while(termsIter.hasNext()) {
          String term = termsIter.next().toString();
          directTerms.add(term);
          directTermIds.put(term, termID++);
        }
      }
      synchronized(this) {
        directIndex = openDirectIndexCluster(batches);
      }
    }
  }

  /**
   * Gets the name of this atomic index. This is used as the file name for the
   * directory storing the index files.
   * @return
   */
  public String getName() {
    return name;
  }

  /**
   * Is a direct index configured for this atomic index?
   * @return
   */
  public boolean hasDirectIndex() {
    return hasDirectIndex;
  }

  /**
   * Starts a new MG4J batch. First time around this will be the head,
   * subsequent calls will start a new tail.
   */
  protected void newBatch() {
    occurrencesInRAM = 0;
    maxDocSizeInRAM = -1;
    documentsInRAM = 0;
    if(termMap == null) {
      termMap = new Object2ReferenceOpenHashMap<MutableString, PostingsList>(
          INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR);
    } else {
      termMap.clear();
      termMap.trim(INITIAL_TERM_MAP_SIZE);
    }
    if(documentSizesInRAM == null) {
      documentSizesInRAM = new IntArrayList();
    } else {
      documentSizesInRAM.clear();
    }
  }

  /**
   * Writes all the data currently stored in RAM to a new index batch. The
   * first batch is the head index, all other batches are tail indexes.
   * @throws IOException
   * @throws IndexException
   * @return the number of occurrences written to disk
   */
  protected long writeCurrentBatch() throws IOException, IndexException {
    if(documentsInRAM == 0) return 0;
    // find the name for the new tail
    int tailNo = -1;
    File headDir = new File(indexDirectory, HEAD_FILE_NAME);
    if(headDir.exists()) {
      // we have a head, calculate the tail number for this new tail
      String[] existingTails = indexDirectory.list(TAILS_FILENAME_FILTER);
      for(String aTail : existingTails) {
        int aTailNo =
            Integer.parseInt(aTail.substring(TAIL_FILE_NAME_PREFIX.length()));
        if(aTailNo > tailNo) tailNo = aTailNo;
      }
      tailNo++;
    }
    // Open an index writer for the new tail
    String newTailName = tailNo == -1 ?
        HEAD_FILE_NAME : (TAIL_FILE_NAME_PREFIX + Integer.toString(tailNo));
    File newTailDir = new File(indexDirectory, newTailName);
    newTailDir.mkdir();
    String mg4jBasename = new File(newTailDir, name).getAbsolutePath();
    QuasiSuccinctIndexWriter indexWriter = new QuasiSuccinctIndexWriter(
        IOFactory.FILESYSTEM_FACTORY, mg4jBasename, documentsInRAM,
        Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
        QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE,
        CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX,
        ByteOrder.nativeOrder());
    // write the data from RAM
    int numTermsInRAM = termMap.size();
    logger.info("Generating index for batch " + newTailName + "; documents: "
        + documentsInRAM + "; terms: " + numTermsInRAM + "; occurrences: "
        + occurrencesInRAM + " / " + parent.getOccurrencesInRam());
    // We write down all terms in appearance order in termArray.
    final MutableString[] termArray =
        termMap.keySet().toArray(new MutableString[numTermsInRAM]);
    // We sort the terms appearing in the batch and write them on disk.
    Arrays.quickSort(0, termArray.length, new IntComparator() {
      @Override
      public int compare(Integer one, Integer other) {
        return compare(one.intValue(), other.intValue());
      }

      @Override
      public int compare(int one, int other) {
        return termArray[one].compareTo(termArray[other]);
      }
    }, new Swapper() {
      @Override
      public void swap(int one, int other) {
        MutableString temp = termArray[one];
        termArray[one] = termArray[other];
        termArray[other] = temp;
      }
    });
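    /*
     * Note on the sort above: fastutil's Arrays.quickSort(from, to,
     * comparator, swapper) sorts *positions*, delegating both comparison and
     * swapping to callbacks, so termArray is reordered in place without an
     * auxiliary copy. A minimal sketch of the same pattern (assuming a
     * fastutil version where IntComparator and Swapper are functional
     * interfaces; with the older API, anonymous classes as above are needed):
     *
     *   final int[] a = {3, 1, 2};
     *   Arrays.quickSort(0, a.length,
     *       (i, j) -> Integer.compare(a[i], a[j]),
     *       (i, j) -> { int t = a[i]; a[i] = a[j]; a[j] = t; });
     *   // a is now {1, 2, 3}
     */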
    // write the terms, termmap, and bloom filter files
    // make sure we can't create a Bloom filter of expected size 0
    BloomFilter<Void> termFilter =
        BloomFilter.create(Math.max(numTermsInRAM, 1));
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(
        new FastBufferedOutputStream(new FileOutputStream(
            mg4jBasename + DiskBasedIndex.TERMS_EXTENSION), 64 * 1024),
        "UTF-8"));
    for(MutableString t : termArray) {
      t.println(pw);
      termFilter.add(t);
    }
    pw.close();
    generateTermMap(new File(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
        new File(mg4jBasename + DiskBasedIndex.TERMMAP_EXTENSION), null);
    // write the bloom filter
    BinIO.storeObject(termFilter,
        new File(mg4jBasename + DocumentalCluster.BLOOM_EXTENSION));
    // write the sizes file
    File sizesFile = new File(mg4jBasename + DiskBasedIndex.SIZES_EXTENSION);
    OutputBitStream sizesStream = new OutputBitStream(sizesFile);
    for(int docSize : documentSizesInRAM.elements()) {
      sizesStream.writeGamma(docSize);
    }
    sizesStream.close();
    // write the actual index
    int maxCount = 0;
    for(int i = 0; i < numTermsInRAM; i++) {
      PostingsList postingsList = termMap.get(termArray[i]);
      if(maxCount < postingsList.maxCount) maxCount = postingsList.maxCount;
      postingsList.write(indexWriter);
    }
    indexWriter.close();
    // write the index properties
    try {
      Properties properties = indexWriter.properties();
      additionalProperties.setProperty(Index.PropertyKeys.SIZE,
          indexWriter.writtenBits());
      // -1 means unknown
      additionalProperties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
          maxDocSizeInRAM);
      additionalProperties.setProperty(Index.PropertyKeys.MAXCOUNT, maxCount);
      additionalProperties.setProperty(Index.PropertyKeys.OCCURRENCES,
          occurrencesInRAM);
      properties.addAll(additionalProperties);
      Scan.saveProperties(IOFactory.FILESYSTEM_FACTORY, properties,
          mg4jBasename + DiskBasedIndex.PROPERTIES_EXTENSION);
      // write stats
      PrintStream statsPs = new PrintStream(new File(mg4jBasename
          + DiskBasedIndex.STATS_EXTENSION));
      indexWriter.printStats(statsPs);
      statsPs.close();
    } catch(ConfigurationException e) {
      // this should never happen
      throw new IndexException("Error while saving tail properties", e);
    }
    if(hasDirectIndex) {
      writeDirectIndex(newTailDir);
    }
    // update parent
    long res = occurrencesInRAM;
    // clear out internal state, in preparation for the next tail
    newBatch();
    // merge new tail into index cluster
    try {
      // modify internal state
      synchronized(this) {
        batches.add(openSubIndex(newTailName));
        invertedIndex = openInvertedIndexCluster(batches, termProcessor);
        if(hasDirectIndex) {
          directIndex = openDirectIndexCluster(batches);
        }
      }
    } catch(Exception e) {
      throw new IndexException("Could not open the index just written to "
          + mg4jBasename, e);
    }
    return res;
  }

  /**
   * Writes the in-RAM data to a new direct index batch.
   * @param batchDir
   */
  protected void writeDirectIndex(File batchDir) throws IOException,
          IndexException {
    // The index we are writing is a direct index, so we give it new terms
    // which are actually document IDs, and they have posting lists containing
    // document IDs, which are actually termIDs.
    // The document pointers in RAM are zero-based, so we need to add all the
    // documents on disk to this.
    long docsOnDisk = 0;
    for(MG4JIndex index : batches) {
      docsOnDisk += index.invertedIndex.numberOfDocuments;
    }
    // 1. invert index data in RAM
    Object2ReferenceOpenHashMap<MutableString, PostingsList> docMap =
        new Object2ReferenceOpenHashMap<MutableString, PostingsList>(
            INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR);
    MutableString docIdStr = new MutableString();
    // make sure all the terms about to be indexed have a direct ID
    for(MutableString termMS : termMap.keySet()) {
      String termString = termMS.toString();
      long directTermId = directTermIds.getLong(termString);
      if(directTermId == directTermIds.defaultReturnValue()) {
        // term not seen before
        directTerms.add(termString);
        directTermId = directTerms.size64() - 1;
        directTermIds.put(termString, directTermId);
      }
    }
    // we now read the posting lists for all the terms, in ascending term
    // order
    MutableString termMS = new MutableString();
    for(long directTermId = 0; directTermId < directTerms.size64();
        directTermId++) {
      String termString = directTerms.get(directTermId);
      termMS.replace(termString);
      PostingsList termPostings = termMap.get(termMS);
      if(termPostings != null) {
        long docPointer = docsOnDisk + termPostings.firstDocumentPointer;
        for(int i = 0; i < termPostings.documentPointersDifferential.size();
            i++) {
          docPointer += termPostings.documentPointersDifferential.get(i);
          int count = termPostings.counts.getInt(i);
          // convert data to the correct type
          docIdStr.replace(longToTerm(docPointer));
          // at this point we have term, document, counts so we can write the
          // data to the in-RAM direct index
          PostingsList docPostings = docMap.get(docIdStr);
          if(docPostings == null) {
            docPostings = new PostingsList(false);
            docMap.put(docIdStr.copy(), docPostings);
          }
          docPostings.newDocumentPointer(directTermId);
          docPostings.setCount(count);
          docPostings.flush();
        }
      }
    }
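    /*
     * Worked example of the inversion above (illustrative numbers, assuming
     * docsOnDisk == 0): if the in-RAM inverted data is
     *   "cat" (direct term ID 0) -> doc 5 (count 2), doc 9 (count 1)
     *   "dog" (direct term ID 1) -> doc 5 (count 3)
     * then docMap ends up containing
     *   longToTerm(5) -> [term 0 (count 2), term 1 (count 3)]
     *   longToTerm(9) -> [term 0 (count 1)]
     * i.e. documents have become "terms" and direct term IDs have become
     * "documents", which is exactly the shape a direct index needs.
     */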
    // 2. write the data from RAM
    String mg4jBasename = new File(batchDir,
        name + DIRECT_INDEX_NAME_SUFFIX).getAbsolutePath();
    // copy the default compression flags, and remove positions
    Map<Component, Coding> flags = new HashMap<Component, Coding>(
        CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX);
    flags.remove(Component.POSITIONS);
    QuasiSuccinctIndexWriter directIndexWriter = new QuasiSuccinctIndexWriter(
        IOFactory.FILESYSTEM_FACTORY, mg4jBasename, directTerms.size64(),
        Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
        QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE, flags,
        ByteOrder.nativeOrder());
    // sort all the docIds
    final MutableString[] docArray =
        docMap.keySet().toArray(new MutableString[docMap.size()]);
    // We sort the terms appearing in the batch and write them on disk.
    Arrays.quickSort(0, docArray.length, new IntComparator() {
      @Override
      public int compare(Integer one, Integer other) {
        return compare(one.intValue(), other.intValue());
      }

      @Override
      public int compare(int one, int other) {
        return docArray[one].compareTo(docArray[other]);
      }
    }, new Swapper() {
      @Override
      public void swap(int one, int other) {
        MutableString temp = docArray[one];
        docArray[one] = docArray[other];
        docArray[other] = temp;
      }
    });
    BloomFilter<Void> docBloomFilter = BloomFilter.create(docArray.length);
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(
        new FastBufferedOutputStream(new FileOutputStream(
            mg4jBasename + DiskBasedIndex.TERMS_EXTENSION), 64 * 1024),
        "UTF-8"));
    for(MutableString t : docArray) {
      t.println(pw);
      docBloomFilter.add(t);
    }
    pw.close();
    generateTermMap(new File(mg4jBasename + DiskBasedIndex.TERMS_EXTENSION),
        new File(mg4jBasename + DiskBasedIndex.TERMMAP_EXTENSION), null);
    // write the bloom filter
    BinIO.storeObject(docBloomFilter,
        new File(mg4jBasename + DocumentalCluster.BLOOM_EXTENSION));
    // write the sizes file
    // this is a list of document sizes (directTerms in our case)
    File sizesFile = new File(mg4jBasename + DiskBasedIndex.SIZES_EXTENSION);
    OutputBitStream sizesStream = new OutputBitStream(sizesFile);
    int maxTermSize = -1; // -1 means unknown
    // for(MutableString term : termArray) {
    for(long directTermId = 0; directTermId < directTerms.size64();
        directTermId++) {
      String termString = directTerms.get(directTermId);
      termMS.replace(termString);
      PostingsList termPostings = termMap.get(termMS);
      int termSize = termPostings != null ?
          (int)termPostings.frequency : 0;
      sizesStream.writeGamma(termSize);
      if(termSize > maxTermSize) maxTermSize = termSize;
    }
    sizesStream.close();
    // write the actual index
    int maxCount = 0;
    long occurrences = 0;
    for(int i = 0; i < docArray.length; i++) {
      PostingsList postingsList = docMap.get(docArray[i]);
      if(maxCount < postingsList.maxCount) maxCount = postingsList.maxCount;
      postingsList.write(directIndexWriter);
      occurrences += postingsList.occurrences;
    }
    directIndexWriter.close();
    // write the index properties
    try {
      Properties properties = directIndexWriter.properties();
      additionalDirectProperties.setProperty(Index.PropertyKeys.SIZE,
          directIndexWriter.writtenBits());
      // -1 means unknown
      additionalDirectProperties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
          maxTermSize);
      additionalDirectProperties.setProperty(Index.PropertyKeys.MAXCOUNT,
          maxCount);
      additionalDirectProperties.setProperty(Index.PropertyKeys.OCCURRENCES,
          occurrences);
      properties.addAll(additionalDirectProperties);
      Scan.saveProperties(IOFactory.FILESYSTEM_FACTORY, properties,
          mg4jBasename + DiskBasedIndex.PROPERTIES_EXTENSION);
      // write stats
      PrintStream statsPs = new PrintStream(new File(mg4jBasename
          + DiskBasedIndex.STATS_EXTENSION));
      directIndexWriter.printStats(statsPs);
      statsPs.close();
    } catch(ConfigurationException e) {
      // this should never happen
      throw new IndexException("Error while saving tail properties", e);
    }
    // update the index-wide direct terms file
    File newDirectTermsFile = new File(indexDirectory,
        DIRECT_TERMS_FILENAME + HEAD_NEW_EXT);
    pw = new PrintWriter(new OutputStreamWriter(new FastBufferedOutputStream(
        new FileOutputStream(newDirectTermsFile), 64 * 1024), "UTF-8"));
    for(String t : directTerms) {
      pw.println(t);
    }
    pw.close();
    File directTermsFile = new File(indexDirectory, DIRECT_TERMS_FILENAME);
    File oldDirectTermsFile = new File(indexDirectory,
        DIRECT_TERMS_FILENAME + HEAD_OLD_EXT);
    if(!directTermsFile.exists()
        || directTermsFile.renameTo(oldDirectTermsFile)) {
      if(newDirectTermsFile.renameTo(directTermsFile)) {
        oldDirectTermsFile.delete();
      } else {
        throw new IndexException("Unable to save direct terms file");
      }
    }
  }

  /**
   * Combines all the currently existing batches, generating a new head index.
   * @throws IndexException
   * @throws IOException
   * @throws ConfigurationException
   */
  protected void compactIndex() throws IndexException, IOException,
          ConfigurationException {
    File headDirNew = new File(indexDirectory, HEAD_FILE_NAME + HEAD_NEW_EXT);
    // make a local copy of the sub-indexes
    List<MG4JIndex> indexesToMerge =
        new ArrayList<AtomicIndex.MG4JIndex>(batches);
    if(!headDirNew.mkdir()) {
      throw new IndexException("Could not create new head directory at "
          + headDirNew.getAbsolutePath() + "!");
    }
    Map<Component, Coding> codingFlags =
        CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX;
    String outputBaseName = new File(headDirNew, name).getAbsolutePath();
    String[] inputBaseNames = new String[indexesToMerge.size()];
    for(int i = 0; i < inputBaseNames.length; i++) {
      inputBaseNames[i] = new File(indexesToMerge.get(i).indexDir, name)
          .getAbsolutePath();
    }
    try {
      new Concatenate(IOFactory.FILESYSTEM_FACTORY, outputBaseName,
          inputBaseNames,
          false, // metadataOnly
          Combine.DEFAULT_BUFFER_SIZE, codingFlags, IndexType.QUASI_SUCCINCT,
          true, // skips
          // BitStreamIndex.DEFAULT_QUANTUM, // replaced with optimised
          // automatic calculation
          -5,
          BitStreamIndex.DEFAULT_HEIGHT,
          SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE,
          ProgressLogger.DEFAULT_LOG_INTERVAL).run();
      // generate term map
      generateTermMap(
          new File(outputBaseName + DiskBasedIndex.TERMS_EXTENSION),
          new File(outputBaseName + DiskBasedIndex.TERMMAP_EXTENSION),
          new File(outputBaseName + DocumentalCluster.BLOOM_EXTENSION));
    } catch(Exception e) {
      throw new IndexException("Exception while combining sub-indexes", e);
    }
    if(hasDirectIndex()) {
      combineDirectIndexes(indexesToMerge,
          new File(headDirNew, name + DIRECT_INDEX_NAME_SUFFIX)
              .getAbsolutePath());
    }
    // update the internal state
    synchronized(this) {
      // remove the indexes that were merged
      batches.removeAll(indexesToMerge);
      // insert the new head at the front of the list
      File headDir = new File(indexDirectory, HEAD_FILE_NAME);
      File headDirOld = new File(indexDirectory,
          HEAD_FILE_NAME + HEAD_OLD_EXT);
      if(headDir.exists() && headDir.renameTo(headDirOld)) {
        if(headDirNew.renameTo(headDir)) {
          batches.add(0, openSubIndex(HEAD_FILE_NAME));
          invertedIndex = openInvertedIndexCluster(batches, termProcessor);
          if(hasDirectIndex) {
            directIndex = openDirectIndexCluster(batches);
          }
          // clean-up: delete old head, used-up tails
          if(!gate.util.Files.rmdir(headDirOld)) {
            throw new IndexException(
                "Could not fully delete old sub-index at: " + headDirOld);
          }
          for(MG4JIndex aSubIndex : indexesToMerge) {
            if(!aSubIndex.indexDir.equals(headDir)) {
              if(!gate.util.Files.rmdir(aSubIndex.indexDir)) {
                throw new IndexException(
                    "Could not fully delete old sub-index at: "
                    + aSubIndex.indexDir);
              }
            }
          }
        } else {
          throw new IndexException("Could not rename new head at "
              + headDirNew.getAbsolutePath() + " to " + headDir);
        }
      } else {
        throw new IndexException("Could not rename head at "
            + headDir.getAbsolutePath() + " to " + headDirOld);
      }
    }
  }
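  /*
   * The directory shuffle performed by compactIndex() above, step by step:
   *   1. write the merged data into head.new/
   *   2. rename head/     -> head.old/
   *   3. rename head.new/ -> head/
   *   4. delete head.old/ and the tail-N/ directories that were merged
   * The intent is that, whichever step fails, at least one complete copy of
   * the merged or original data is still present on disk under one of these
   * names.
   */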
  /**
   * Given a set of direct indexes (MG4J indexes, with counts, but no
   * positions, that form a lexical cluster) this method produces one single
   * output index containing the data from all the input indexes.
   * @param inputIndexes
   * @param outputBasename
   * @throws IOException
   * @throws ConfigurationException
   */
  protected static void combineDirectIndexes(List<MG4JIndex> inputIndexes,
          String outputBasename) throws IOException, ConfigurationException {
    long noOfDocuments = 0;
    long noOfTerms = 0;
    for(MG4JIndex index : inputIndexes) {
      noOfDocuments += index.directIndex.numberOfDocuments;
      noOfTerms += index.directIndex.numberOfTerms;
    }
    // open the output writer
    // copy the default compression flags, and remove positions
    Map<Component, Coding> flags = new HashMap<Component, Coding>(
        CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX);
    flags.remove(Component.POSITIONS);
    QuasiSuccinctIndexWriter outputIndexWriter = new QuasiSuccinctIndexWriter(
        IOFactory.FILESYSTEM_FACTORY, outputBasename, noOfDocuments,
        Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
        QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE, flags,
        ByteOrder.nativeOrder());
    BloomFilter<Void> bloomFilter = BloomFilter.create(noOfTerms);
    PrintWriter termsPw = new PrintWriter(new OutputStreamWriter(
        new FastBufferedOutputStream(new FileOutputStream(
            outputBasename + DiskBasedIndex.TERMS_EXTENSION), 64 * 1024),
        "UTF-8"));
    // write the index
    long occurrences = 0;
    int maxCount = 0;
    PostingsList postingsList = new PostingsList(false);
    for(MG4JIndex inputIndex : inputIndexes) {
      IndexReader inputReader = inputIndex.directIndex.getReader();
      File directTermsFile = new File(inputIndex.indexDir,
          inputIndex.indexName + DIRECT_INDEX_NAME_SUFFIX
          + DiskBasedIndex.TERMS_EXTENSION);
      FileLinesCollection.FileLinesIterator termsIter =
          new FileLinesCollection(directTermsFile.getAbsolutePath(),
              "UTF-8").iterator();
      MutableString termMS = null;
      IndexIterator inputIterator = inputReader.nextIterator();
      while(inputIterator != null && termsIter.hasNext()) {
        termMS = termsIter.next();
        bloomFilter.add(termMS);
        termMS.println(termsPw);
        long docPointer = inputIterator.nextDocument();
        while(docPointer != IndexIterator.END_OF_LIST) {
          postingsList.newDocumentPointer(docPointer);
          postingsList.setCount(inputIterator.count());
          docPointer = inputIterator.nextDocument();
        }
        postingsList.flush();
        occurrences += postingsList.occurrences;
        if(maxCount < postingsList.maxCount) maxCount = postingsList.maxCount;
        postingsList.write(outputIndexWriter);
        postingsList.clear();
        inputIterator = inputReader.nextIterator();
      }
      inputReader.close();
    }
    outputIndexWriter.close();
    termsPw.close();
    generateTermMap(
        new File(outputBasename + DiskBasedIndex.TERMS_EXTENSION),
        new File(outputBasename + DiskBasedIndex.TERMMAP_EXTENSION), null);
    // write the bloom filter
    BinIO.storeObject(bloomFilter,
        new File(outputBasename + DocumentalCluster.BLOOM_EXTENSION));
    // direct indexes don't store positions, so sizes are not needed
    // write the index properties
    Properties properties = outputIndexWriter.properties();
    properties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
        ObjectParser.toSpec(NullTermProcessor.getInstance()));
    properties.setProperty(Index.PropertyKeys.SIZE,
        outputIndexWriter.writtenBits());
    // -1 means unknown
    properties.setProperty(Index.PropertyKeys.MAXDOCSIZE, -1);
    properties.setProperty(Index.PropertyKeys.MAXCOUNT, maxCount);
    properties.setProperty(Index.PropertyKeys.OCCURRENCES, occurrences);
    Scan.saveProperties(IOFactory.FILESYSTEM_FACTORY, properties,
        outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION);
    // write stats
    PrintStream statsPs = new PrintStream(new File(outputBasename
        + DiskBasedIndex.STATS_EXTENSION));
    outputIndexWriter.printStats(statsPs);
    statsPs.close();
  }

  /**
   * Instructs this index to dump to disk all the in-RAM index data at the
   * first opportunity.
   * @return a {@link Future} value that, upon completion, will return the
   * number of occurrences written to disk.
   * @throws InterruptedException if this thread is interrupted while trying
   * to queue the dump request.
   */
  public Future<Long> requestSyncToDisk() throws InterruptedException {
    if(batchWriteTask == null) {
      batchWriteTask = new FutureTask<Long>(new Callable<Long>() {
        @Override
        public Long call() throws Exception {
          return writeCurrentBatch();
        }
      });
      inputQueue.put(DUMP_BATCH);
    }
    return batchWriteTask;
  }

  /**
   * Requests this atomic index to compact its on-disk batches into a single
   * batch.
   *
   * @return a {@link Future} which can be used to find out when the
   * compaction operation has completed.
   * @throws InterruptedException if this thread is interrupted while trying
   * to queue the compaction request.
   */
  public Future<Void> requestCompactIndex() throws InterruptedException {
    if(compactIndexTask == null) {
      compactIndexTask = new FutureTask<Void>(new Callable<Void>() {
        @Override
        public Void call() throws Exception {
          compactIndex();
          return null;
        }
      });
      inputQueue.put(COMPACT_INDEX);
    }
    return compactIndexTask;
  }
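  /*
   * A usage sketch for the two request methods above (caller-side code,
   * hypothetical):
   *
   *   Future<Long> sync = atomicIndex.requestSyncToDisk();
   *   long occurrencesWritten = sync.get(); // blocks until the batch is on disk
   *   atomicIndex.requestCompactIndex().get(); // blocks until compaction ends
   *
   * Both methods merely enqueue a marker document; the actual work happens
   * later, on the indexing thread, inside run().
   */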
  /**
   * Opens one sub-index, specified as a directory inside this atomic index's
   * index directory.
   * @param subIndexDirname
   * @return
   * @throws IOException
   * @throws IndexException
   */
  protected MG4JIndex openSubIndex(String subIndexDirname)
          throws IOException, IndexException {
    Index invertedIndex = null;
    File subIndexDir = new File(indexDirectory, subIndexDirname);
    String mg4jBasename = new File(subIndexDir, name).getAbsolutePath();
    try {
      try {
        invertedIndex = Index.getInstance(mg4jBasename + "?"
            + UriKeys.MAPPED.name().toLowerCase() + "=1;", true, true);
      } catch(IOException e) {
        // memory mapping failed
        logger.info("Memory mapping failed for index " + mg4jBasename
            + ". Loading as file index instead");
        // now try to open it as a plain on-disk index
        invertedIndex = Index.getInstance(mg4jBasename, true, true);
      }
    } catch(Exception e) {
      throw new IndexException("Could not open the sub-index at "
          + mg4jBasename, e);
    }
    // read the Bloom filter
    File bloomFile = new File(mg4jBasename
        + DocumentalCluster.BLOOM_EXTENSION);
    BloomFilter<Void> invertedTermFilter = null;
    try {
      if(bloomFile.exists()) {
        invertedTermFilter = (BloomFilter<Void>)BinIO.loadObject(bloomFile);
      }
    } catch(ClassNotFoundException e) {
      // this should never happen. If it does, it's not fatal
      logger.warn("Exception while loading the Bloom filter", e);
    }
    Index directIndex = null;
    BloomFilter<Void> directTermFilter = null;
    if(hasDirectIndex) {
      // open direct index
      mg4jBasename = new File(subIndexDir,
          name + DIRECT_INDEX_NAME_SUFFIX).getAbsolutePath();
      try {
        try {
          directIndex = Index.getInstance(mg4jBasename + "?"
              + UriKeys.MAPPED.name().toLowerCase() + "=1;", true, false);
        } catch(IOException e) {
          // memory mapping failed
          logger.info("Memory mapping failed for index " + mg4jBasename
              + ". Loading as file index instead");
          // now try to open it as a plain on-disk index
          directIndex = Index.getInstance(mg4jBasename, true, false);
        }
      } catch(Exception e) {
        throw new IndexException("Could not open the sub-index at "
            + mg4jBasename, e);
      }
      // read the Bloom filter
      bloomFile = new File(mg4jBasename + DocumentalCluster.BLOOM_EXTENSION);
      try {
        if(bloomFile.exists()) {
          directTermFilter = (BloomFilter<Void>)BinIO.loadObject(bloomFile);
        }
      } catch(ClassNotFoundException e) {
        // this should never happen. If it does, it's not fatal
        logger.warn("Exception while loading the Bloom filter", e);
      }
    }
    MG4JIndex newIndexData = new MG4JIndex(subIndexDir, name, invertedIndex,
        invertedTermFilter, directIndex, directTermFilter);
    return newIndexData;
  }

  /**
   * Runnable implementation: the logic of this run method is simply indexing
   * documents queued to the input queue. To stop it, send a
   * {@link GATEDocument#END_OF_QUEUE} value to the input queue.
   */
  public void run() {
    indexingThread = Thread.currentThread();
    GATEDocument aDocument;
    try {
      // start in-RAM indexing
      newBatch();
      if(inputQueue != null) {
        do {
          aDocument = inputQueue.take();
          if(aDocument != GATEDocument.END_OF_QUEUE) {
            if(aDocument == DUMP_BATCH) {
              // dump batch was requested
              if(batchWriteTask != null) {
                batchWriteTask.run();
              }
              batchWriteTask = null;
            } else if(aDocument == COMPACT_INDEX) {
              // compact index was requested
              if(compactIndexTask != null) {
                compactIndexTask.run();
              }
              compactIndexTask = null;
            } else {
              try {
                long occurrencesBefore = occurrencesInRAM;
                processDocument(aDocument);
                aDocument.addOccurrences(
                    occurrencesInRAM - occurrencesBefore);
              } catch(Throwable e) {
                logger.error("Problem while indexing document!", e);
              }
            }
          } else {
            // close down
            writeCurrentBatch();
            flush();
          }
          if(aDocument != DUMP_BATCH && aDocument != COMPACT_INDEX) {
            outputQueue.put(aDocument);
          }
        } while(aDocument != GATEDocument.END_OF_QUEUE);
      }
    } catch(InterruptedException e) {
      Thread.currentThread().interrupt();
    } catch(Exception e) {
      logger.error("Exception during indexing!", e);
      throw new GateRuntimeException("Exception during indexing!", e);
    } finally {
      indexingThread = null;
    }
  }

  /**
   * Closes all file-based resources.
   * @throws IOException
   */
  abstract protected void flush() throws IOException;

  /**
   * Notifies this index to stop its indexing operations, and waits for all
   * data to be written.
   * @throws InterruptedException if the waiting thread is interrupted before
   * the indexing thread has finished writing all the data.
   */
  public void close() throws InterruptedException {
    inputQueue.put(GATEDocument.END_OF_QUEUE);
    if(indexingThread != null) {
      indexingThread.join();
    }
  }

  /**
   * Hook for subclasses, called before processing the annotations for this
   * document. The default implementation is a no-op.
   */
  protected void documentStarting(GATEDocument gateDocument)
          throws IndexException {
  }

  /**
   * Hook for subclasses, called after annotations for this document have been
   * processed. The default implementation is a no-op.
   */
  protected void documentEnding(GATEDocument gateDocument)
          throws IndexException {
  }

  /**
   * Get the annotations that are to be processed for a document, in
   * increasing order of offset.
   */
  protected abstract Annotation[] getAnnotsToProcess(
      GATEDocument gateDocument) throws IndexException;

  /**
   * Calculate the starting position for the given annotation, storing it in
   * {@link #tokenPosition}. The starting position is the index of the token
   * within the document where the annotation starts, and <em>must</em> be
   * >= the previous value of tokenPosition.
   * @param ann
   * @param gateDocument
   */
  protected abstract void calculateStartPositionForAnnotation(Annotation ann,
      GATEDocument gateDocument) throws IndexException;

  /**
   * Determine the string (or strings, if there are alternatives) that should
   * be stored in the index for the given annotation.
   *
   * If a single string value should be returned, it is more efficient to
   * store the value in {@link #currentTerm}, in which case <code>null</code>
   * should be returned instead.
   *
   * If the current term should not be indexed
   * (e.g. it's a stop word), then the implementation should return an empty
   * String array.
   *
   * @param ann
   * @param gateDocument
   */
  protected abstract String[] calculateTermStringForAnnotation(Annotation ann,
      GATEDocument gateDocument) throws IndexException;

  /**
   * Adds the supplied document to the in-RAM index.
   * @param gateDocument the document to index
   * @throws IndexException
   */
  protected void processDocument(GATEDocument gateDocument)
          throws IndexException {
    // zero document related counters
    tokenPosition = 0;
    documentStarting(gateDocument);
    // get the annotations to be processed
    Annotation[] annotsToProcess = getAnnotsToProcess(gateDocument);
    logger.debug("Starting document "
        + gateDocument.getDocument().getName() + ". "
        + annotsToProcess.length + " annotations to process");
    try {
      // process the annotations one by one.
      for(Annotation ann : annotsToProcess) {
        processAnnotation(ann, gateDocument);
      }
      // the current document is finished
      int docLength = tokenPosition + 1;
      if(docLength > maxDocSizeInRAM) maxDocSizeInRAM = docLength;
      documentSizesInRAM.add(docLength);
    } finally {
      documentEnding(gateDocument);
      documentsInRAM++;
    }
  }

  /**
   * Indexes one annotation (either a Token or a semantic annotation).
   * @param ann the annotation to be indexed
   * @param gateDocument the GATEDocument containing the annotation
   * @throws IndexException
   * @throws IOException
   */
  protected void processAnnotation(Annotation ann, GATEDocument gateDocument)
          throws IndexException {
    // calculate the position and string for this annotation
    calculateStartPositionForAnnotation(ann, gateDocument);
    String[] terms = calculateTermStringForAnnotation(ann, gateDocument);
    if(terms == null) {
      // the value was already stored in #currentTerm by the implementation.
      indexCurrentTerm();
    } else if(terms.length == 0) {
      // we received an empty array -> we should NOT index the current term
    } else {
      // we have received multiple values from the implementation
      for(String aTerm : terms) {
        currentTerm.replace(aTerm == null ? "" : aTerm);
        indexCurrentTerm();
      }
    }
  }

  /**
   * Adds the value in {@link #currentTerm} to the index.
   * @throws IOException
   */
  protected void indexCurrentTerm() {
    // check if we have seen this mention before
    PostingsList termPostings = termMap.get(currentTerm);
    if(termPostings == null) {
      // new term -> create a new postings list.
      termMap.put(currentTerm.copy(),
          termPostings = new PostingsList(true));
    }
    // add the current posting to the current postings list
    // In a documental cluster, each sub-index is zero-based. This is why we
    // use the local document pointer here.
    termPostings.newDocumentPointer(documentsInRAM);
    // this is needed so that we don't increment the number of occurrences
    // for duplicate values.
    if(termPostings.checkPosition(tokenPosition)) {
      termPostings.addPosition(tokenPosition);
      occurrencesInRAM++;
    } else {
      logger.debug("Duplicate position");
    }
  }
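  /*
   * A hypothetical subclass sketch showing the
   * calculateTermStringForAnnotation contract used by processAnnotation
   * above (the feature name "root" is invented for illustration):
   *
   *   protected String[] calculateTermStringForAnnotation(Annotation ann,
   *           GATEDocument gateDocument) {
   *     Object value = ann.getFeatures().get("root");
   *     if(value == null) return new String[0]; // do not index this term
   *     currentTerm.replace(value.toString());
   *     return null; // single value, already stored in currentTerm
   *   }
   */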
  /**
   * Gets the top level directory for this atomic index. This will be a
   * directory contained in the top level directory of the {@link MimirIndex}
   * which includes this atomic index.
   * @return
   */
  public File getIndexDirectory() {
    return indexDirectory;
  }

  /**
   * Gets the top level {@link MimirIndex} to which this atomic index belongs.
   * @return
   */
  public MimirIndex getParent() {
    return parent;
  }

  /**
   * Gets the input queue used by this atomic index. This queue is used to
   * submit documents for indexing.
   * @return
   */
  public BlockingQueue<GATEDocument> getInputQueue() {
    return inputQueue;
  }

  /**
   * Gets the output queue used by this atomic index. This is used to
   * "return" documents that have finished indexing. Notably, values in this
   * queue will have their occurrences value (see
   * {@link GATEDocument#getOccurrences()}) increased by the number of
   * occurrences generated by indexing the document in this atomic index.
   *
   * @return
   */
  public BlockingQueue<GATEDocument> getOutputQueue() {
    return outputQueue;
  }

  /**
   * Gets the inverted index (an {@link Index} value) that can be used to
   * search this atomic index. This will normally be a
   * {@link DocumentalCluster} view over all the batches contained.
   * @return
   */
  public Index getIndex() {
    return invertedIndex;
  }

  /**
   * Gets the direct index for this atomic index. The returned value is
   * <code>non-null</code> only if the atomic index was configured to have a
   * direct index upon its construction (see
   * {@link #AtomicIndex(MimirIndex, String, boolean, TermProcessor, BlockingQueue, BlockingQueue)}).
   * You can check if a direct index has been configured by calling
   * {@link #hasDirectIndex()}.
   * @return an Index in which terms and documents are reversed. When querying
   * the returned index, the "terms" provided should be String
   * representations of document IDs (as produced by
   * {@link #longToTerm(long)}). The search result is a set of
   * "document IDs", which are actually term IDs. The actual term string
   * corresponding to the returned term IDs can be obtained by calling
   * {@link #getDirectTerm(long)}.
   */
  public Index getDirectIndex() {
    return directIndex;
  }

  /**
   * Gets the term string for a given direct term ID. The term ID must have
   * been obtained from the direct index of this index.
   * @param termId the ID for the term being sought.
   * @return the string for the given term.
   */
  public CharSequence getDirectTerm(long termId) {
    return directTerms.get(termId);
  }

  /**
   * Gets the list of direct terms for this index. The terms are sorted by
   * the order in which they were first seen, and <strong>not</strong>
   * lexicographically.
   * @return
   */
  public ObjectBigList<? extends CharSequence> getDirectTerms() {
    return directTerms;
  }

  /**
   * Gets the occurrence count in the whole index for a given direct term,
   * specified by a direct term ID (which must have been obtained from the
   * direct index of this index).
   *
   * @param directTermId
   * @return
   * @throws IOException
   */
  public long getDirectTermOccurenceCount(long directTermId)
          throws IOException {
    String termStr = directTerms.get(directTermId);
    // we need to sum up all the counts for this term in the inverted index
    long count = 0;
    IndexIterator idxItr = invertedIndex.documents(termStr);
    long docId = idxItr.nextDocument();
    while(docId != IndexIterator.END_OF_LIST) {
      count += idxItr.count();
      docId = idxItr.nextDocument();
    }
    return count;
  }

  /**
   * Returns the number of batches in this atomic index.
   * @return
   */
  public int getBatchCount() {
    return batches.size();
  }
}