/*
 * TruncateIndex.java
 *
 * Copyright (c) 2007-2016, The University of Sheffield.
 *
 * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
 * and is free software, licenced under the GNU Lesser General Public License,
 * Version 3, June 2007 (also included with this distribution as file
 * LICENCE-LGPL3.html).
 *
 * Ian Roberts, 1st September 2016
 *
 * $Id$
 */
package gate.mimir.util;

import gate.Gate;
import gate.mimir.IndexConfig;
import gate.mimir.IndexConfig.SemanticIndexerConfig;
import gate.mimir.IndexConfig.TokenIndexerConfig;
import gate.mimir.MimirIndex;
import gate.mimir.index.AtomicIndex;
import gate.mimir.index.DocumentCollection;
import it.unimi.di.big.mg4j.index.CompressionFlags;
import it.unimi.di.big.mg4j.index.CompressionFlags.Coding;
import it.unimi.di.big.mg4j.index.CompressionFlags.Component;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.IndexIterator;
import it.unimi.di.big.mg4j.index.IndexReader;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
import it.unimi.di.big.mg4j.io.IOFactory;
import it.unimi.di.big.mg4j.tool.Scan;
import it.unimi.dsi.big.io.FileLinesCollection;
import it.unimi.dsi.big.io.FileLinesCollection.FileLinesIterator;
import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.util.Properties;

import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.log4j.Logger;

/**
 * Utility class to fix up a Mimir index that has been corrupted, e.g.
 * by an unclean shutdown or an out-of-memory condition. The index must
 * be closed before using this tool, which means either that the Mimir
 * webapp is not running, or that the index has been deleted from the
 * running Mimir. It is very strongly recommended to back up an index
 * before attempting this procedure. The clean-up process will
 * unavoidably remove some number of documents from the tail of the
 * index, but will attempt to keep the number of lost documents to a
 * minimum.
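 *
 * <p>Typical invocation, sketched with illustrative paths (the
 * optional, repeatable {@code -p} arguments register extra GATE plugin
 * directories before the repair runs, matching the argument parsing in
 * {@link #main}):</p>
 * <pre>
 * java gate.mimir.util.TruncateIndex \
 *     -p /path/to/required-plugin /data/indexes/broken-index.mimir
 * </pre>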
 *
 * @author ian
 */
public class TruncateIndex {

  private static final Logger log = Logger.getLogger(TruncateIndex.class);

  /**
   * Comparator that orders mimir zip collection files by number (e.g.
   * mimir-collection-16.zip comes after mimir-collection-12-15.zip but
   * before mimir-collection-100-120.zip).
   */
  public static final Comparator<File> ZIP_COLLECTION_COMPARATOR =
      new Comparator<File>() {
        public int compare(File a, File b) {
          int numA = Integer.parseInt(a.getName().substring(
              a.getName().lastIndexOf('-') + 1, a.getName().length() - 4));
          int numB = Integer.parseInt(b.getName().substring(
              b.getName().lastIndexOf('-') + 1, b.getName().length() - 4));
          return numA - numB;
        }
      };

  /**
   * Comparator that orders batch names: "head" first, then the tails in
   * numeric order.
   */
  public static final Comparator<String> BATCH_COMPARATOR =
      new Comparator<String>() {
        public int compare(String a, String b) {
          if(a.equals("head")) {
            if(b.equals("head")) {
              // both heads
              return 0;
            } else {
              // head before tail
              return -1;
            }
          } else {
            if(b.equals("head")) {
              // tail after head
              return 1;
            } else {
              // both tails, compare by number
              int numA = Integer.parseInt(a.substring(a.lastIndexOf('-') + 1));
              int numB = Integer.parseInt(b.substring(b.lastIndexOf('-') + 1));
              return numA - numB;
            }
          }
        }
      };

  public static final FilenameFilter INDEX_NAME_FILTER = new FilenameFilter() {
    private Pattern pat = Pattern.compile("(?:token|mention)-\\d+");

    @Override
    public boolean accept(File dir, String name) {
      return pat.matcher(name).matches();
    }
  };

  public static final FilenameFilter BATCH_NAME_FILTER = new FilenameFilter() {
    private Pattern pat = Pattern.compile("head|tail-\\d+");

    @Override
    public boolean accept(File dir, String name) {
      return pat.matcher(name).matches();
    }
  };

  /**
   * Entry point: any number of {@code -p pluginDir} options, followed
   * by the top-level directory of the index to repair.
   */
  public static void main(String... args) throws Exception {
    Gate.runInSandbox(true);
    Gate.init();
    int i = 0;
    while(i < args.length && "-p".equals(args[i])) {
      Gate.getCreoleRegister().registerDirectories(
          new File(args[++i]).toURI().toURL());
      i++;
    }
    truncateIndex(new File(args[i]));
  }

  /**
   * Attempt to fix up a corrupted Mimir index by truncating some number
   * of documents off the end. There will be a certain number of
   * documents in complete index batches, and a (possibly different)
   * number of documents successfully persisted to disk in the zip files
   * of the DocumentCollection; the index will be truncated to the
   * smaller of those two numbers.
   *
   * @param indexDirectory the top-level directory of the Mimir index
   *          (containing config.xml)
   */
  public static void truncateIndex(File indexDirectory) throws Exception {
    // 1. Repair the last zip file in the DocumentCollection
    repairLastZip(indexDirectory);
    // 2. Determine the last "good" batch (the greatest numbered head or
    // tail that is fully written to disk in every AtomicIndex) and
    // stash the bad ones
    String lastGoodBatch = determineLastGoodBatch(indexDirectory);
    if(lastGoodBatch == null) {
      throw new RuntimeException(
          "All batches are corrupt, sorry, this index is a write-off");
    }
    // 3. If the zip collection is at least as long as the sum of the
    // good batches, truncate it to match the batches and we're done.
    BatchDetails batches = batchEndPoints(indexDirectory);
    long totalDocsInBatches = batches.endPoints[batches.endPoints.length - 1];
    long totalDocsInZips = totalDocumentsInZipCollection(indexDirectory);
    if(totalDocsInBatches == totalDocsInZips) {
      log.info("We're in luck, the batches and zips line up exactly");
      return;
    } else if(totalDocsInZips > totalDocsInBatches) {
      truncateZipCollectionTo(indexDirectory, totalDocsInBatches);
      return;
    } else if(totalDocsInZips == 0) {
      throw new RuntimeException("Zip collection is empty");
    }
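    // Reaching this point means 0 < totalDocsInZips < totalDocsInBatches,
    // i.e. the zip collection gives out part-way through the batches.
    // Worked example (illustrative numbers): batches {head, tail-0,
    // tail-1} with cumulative endPoints {100, 180, 260} and a zip
    // collection holding 200 documents put the cut 20 documents into
    // tail-1, which the loop below finds as endBatch == 2.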
    // 4. Otherwise, the zip collection stops in the middle of a batch B
    int endBatch = -1;
    for(int i = 0; i < batches.names.length; i++) {
      if(batches.endPoints[i] >= totalDocsInZips) {
        endBatch = i;
        break;
      }
    }
    log.info("Zip collection ends within " + batches.names[endBatch]);
    if(batches.endPoints[endBatch] == totalDocsInZips) {
      // special case - zip collection ends exactly at the end of a
      // batch. Stash subsequent batches and we're done
      log.info("Zip collection ends exactly at the end of batch "
          + batches.names[endBatch]);
      log.info("Stashing subsequent batches");
      stashBatches(indexDirectory, java.util.Arrays.asList(batches.names)
          .subList(endBatch + 1, batches.endPoints.length));
      log.info("Done");
      return;
    }
    // 4.1. Stash B (for every AtomicIndex) and any batches beyond it.
    stashBatches(indexDirectory, java.util.Arrays.asList(batches.names)
        .subList(endBatch, batches.endPoints.length));
    // 4.2. Read each stashed B and re-write it, but with documents
    // beyond the end of the zip collection omitted
    long endOfPreviousBatch = 0L;
    if(endBatch > 0) {
      endOfPreviousBatch = batches.endPoints[endBatch - 1];
    }
    trimBatch(indexDirectory, batches.names[endBatch],
        totalDocsInZips - endOfPreviousBatch);
    // 4.3. Truncate the direct indexes for those AtomicIndexes that
    // require it
    IndexConfig indexConfig = IndexConfig.readConfigFromFile(
        new File(indexDirectory, MimirIndex.INDEX_CONFIG_FILENAME));
    TokenIndexerConfig[] tokenIndexes = indexConfig.getTokenIndexers();
    for(int i = 0; i < tokenIndexes.length; i++) {
      if(tokenIndexes[i].isDirectIndexEnabled()) {
        truncateDirectIndex(indexDirectory, "token-" + i,
            batches.names[endBatch], totalDocsInZips - 1);
      }
    }
    SemanticIndexerConfig[] semanticIndexes =
        indexConfig.getSemanticIndexers();
    for(int i = 0; i < semanticIndexes.length; i++) {
      if(semanticIndexes[i].isDirectIndexEnabled()) {
        truncateDirectIndex(indexDirectory, "mention-" + i,
            batches.names[endBatch], totalDocsInZips - 1);
      }
    }
  }

  /**
   * Rewrites the last zip file of the document collection, dropping any
   * trailing entry that was left truncated by an unclean shutdown.
   *
   * @param indexDirectory the top-level directory of the Mimir index
   */
  public static void repairLastZip(File indexDirectory) throws IOException {
    log.info("Ensuring last zip file in " + indexDirectory.getAbsolutePath()
        + " is complete");
    File[] zipCollectionFiles = indexDirectory
        .listFiles(DocumentCollection.CollectionFile.FILENAME_FILTER);
    if(zipCollectionFiles.length > 0) {
      java.util.Arrays.sort(zipCollectionFiles, ZIP_COLLECTION_COMPARATOR);
      File lastZip = zipCollectionFiles[zipCollectionFiles.length - 1];
      log.info("Last zip is " + lastZip.getName());
      File brokenBatches = new File(indexDirectory, "broken-batches");
      brokenBatches.mkdirs();
      File movedLastZip = new File(brokenBatches, lastZip.getName());
      if(movedLastZip.exists()) {
        movedLastZip.delete();
      }
      if(!lastZip.renameTo(movedLastZip)) {
        throw new RuntimeException("Could not stash " + lastZip.getName()
            + " in broken-batches");
      }
      log.debug("Moved " + lastZip.getName() + " to broken-batches");
      String lastGoodDoc = null;
      try(FileInputStream oldIn = new FileInputStream(movedLastZip);
          ZipInputStream zipIn = new ZipInputStream(oldIn);
          FileOutputStream newOut = new FileOutputStream(lastZip);
          ZipOutputStream zipOut = new ZipOutputStream(newOut)) {
        ZipEntry entry = null;
        try {
          while((entry = zipIn.getNextEntry()) != null) {
            ByteArrayOutputStream data = new ByteArrayOutputStream();
            IOUtils.copy(zipIn, data);
            // if we get here the input zip was not truncated mid-entry,
            // so it's safe to write this entry
            zipOut.putNextEntry(entry);
            IOUtils.write(data.toByteArray(), zipOut);
            zipOut.closeEntry();
            lastGoodDoc = entry.getName();
          }
        } catch(EOFException eof) {
          // this is expected if the zip was not properly closed
        }
      }
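      // if the loop above copied nothing, lastGoodDoc stays null and the
      // rewritten zip is empty; otherwise it names the last complete
      // entry that survived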
log.info("Last good document ID was " + lastGoodDoc); } else { log.warn("No files in zip collection"); } } /** * Determines the last "good" batch name (head or tail-N) for the * given index, and stashes any bad batches in the broken-batches * directory. * * @param indexDirectory * @return * @throws IOException */ public static String determineLastGoodBatch(File indexDirectory) throws IOException { String lastGood = null; File[] subIndexes = indexDirectory.listFiles(INDEX_NAME_FILTER); if(subIndexes.length == 0) { throw new RuntimeException("Index has no AtomicIndexes!"); } String[] batches = subIndexes[0].list(BATCH_NAME_FILTER); java.util.Arrays.sort(batches, BATCH_COMPARATOR); BATCH: for(String batch : batches) { for(File subIndex : subIndexes) { if(!new File(new File(subIndex, batch), subIndex.getName() + ".properties").exists()) { break BATCH; } } // if we get to here we know this batch exists in all sub-indexes lastGood = batch; } if(lastGood != null) { File brokenBatches = new File(indexDirectory, "broken-batches"); // stash bad batches for(File subIndex : subIndexes) { File[] thisIndexBatches = subIndex.listFiles(BATCH_NAME_FILTER); for(File b : thisIndexBatches) { if(BATCH_COMPARATOR.compare(lastGood, b.getName()) < 0) { // this is a bad batch, stash it File movedB = new File(brokenBatches, subIndex.getName() + "-" + b.getName()); if(movedB.exists()) { FileUtils.deleteDirectory(movedB); } if(!b.renameTo(movedB)) { throw new RuntimeException("Could not stash " + movedB.getName()); } } } } } return lastGood; } public static class BatchDetails { String[] names; long[] endPoints; } public static BatchDetails batchEndPoints(File indexDirectory) throws IOException, ConfigurationException { BatchDetails details = new BatchDetails(); long totalDocs = 0; File[] subIndexes = indexDirectory.listFiles(INDEX_NAME_FILTER); if(subIndexes.length == 0) { throw new RuntimeException("Index has no AtomicIndexes!"); } details.names = subIndexes[0].list(BATCH_NAME_FILTER); java.util.Arrays.sort(details.names, BATCH_COMPARATOR); details.endPoints = new long[details.names.length]; for(int i = 0; i < details.names.length; i++) { Properties batchProps = new Properties(); try(FileInputStream propsIn = new FileInputStream(new File(new File(subIndexes[0], details.names[i]), subIndexes[0].getName() + ".properties"))) { batchProps.load(propsIn); } totalDocs += batchProps.getLong("documents"); details.endPoints[i] = totalDocs; } return details; } public static long totalDocumentsInZipCollection(File indexDirectory) throws IOException { long totalDocs = 0; File[] zipCollectionFiles = indexDirectory .listFiles(DocumentCollection.CollectionFile.FILENAME_FILTER); for(File zip : zipCollectionFiles) { try(ZipFile zf = new ZipFile(zip)) { totalDocs += zf.size(); } } return totalDocs; } public static void truncateZipCollectionTo(File indexDirectory, long numDocs) throws IOException { File[] zipCollectionFiles = indexDirectory .listFiles(DocumentCollection.CollectionFile.FILENAME_FILTER); java.util.Arrays.sort(zipCollectionFiles, ZIP_COLLECTION_COMPARATOR); // the truncation point is somewhere within the last zip file whose // first entry is less than numDocs (document IDs are zero based, so // the document named numDocs is actually the (numDocs+1)th one). 
    int targetFile = -1;
    for(int i = 0; i < zipCollectionFiles.length; i++) {
      try(FileInputStream fis = new FileInputStream(zipCollectionFiles[i]);
          ZipInputStream zipIn = new ZipInputStream(fis)) {
        ZipEntry firstEntry = zipIn.getNextEntry();
        if(firstEntry != null) {
          long documentId = Long.parseLong(firstEntry.getName());
          if(documentId >= numDocs) {
            break;
          } else {
            targetFile = i;
          }
        }
      }
    }
    if(targetFile < 0) {
      throw new RuntimeException("Zip collection broken beyond repair - "
          + "there is no zip file containing the cut point");
    }
    // we know that document (numDocs-1) is somewhere in
    // zipCollectionFiles[targetFile]. Move that file out of the way and
    // rewrite it, truncated appropriately.
    File origFile = zipCollectionFiles[targetFile];
    File brokenBatches = new File(indexDirectory, "broken-batches");
    brokenBatches.mkdirs();
    File movedFile = new File(brokenBatches,
        "to-truncate-" + origFile.getName());
    if(movedFile.exists()) {
      movedFile.delete();
    }
    if(!origFile.renameTo(movedFile)) {
      throw new RuntimeException("Could not stash " + origFile.getName()
          + " in broken-batches");
    }
    String lastEntryName = String.valueOf(numDocs - 1);
    try(FileInputStream oldIn = new FileInputStream(movedFile);
        ZipInputStream zipIn = new ZipInputStream(oldIn);
        FileOutputStream newOut = new FileOutputStream(origFile);
        ZipOutputStream zipOut = new ZipOutputStream(newOut)) {
      ZipEntry entry = null;
      try {
        while((entry = zipIn.getNextEntry()) != null) {
          ByteArrayOutputStream data = new ByteArrayOutputStream();
          IOUtils.copy(zipIn, data);
          // if we get here the input zip was not truncated mid-entry,
          // so it's safe to write this entry
          zipOut.putNextEntry(entry);
          IOUtils.write(data.toByteArray(), zipOut);
          zipOut.closeEntry();
          if(lastEntryName.equals(entry.getName())) {
            // reached the cut point, stop copying
            break;
          }
        }
      } catch(EOFException eof) {
        // this is expected if the zip was not properly closed
      }
    }
    log.info("Truncated zip collection file " + origFile + " to document "
        + lastEntryName);
  }

  /**
   * Moves the named batches of every sub-index into the broken-batches
   * directory.
   */
  public static void stashBatches(File indexDirectory, List<String> batches)
      throws IOException {
    File brokenBatches = new File(indexDirectory, "broken-batches");
    File[] subIndexes = indexDirectory.listFiles(INDEX_NAME_FILTER);
    for(File subIndex : subIndexes) {
      for(String batchName : batches) {
        File b = new File(subIndex, batchName);
        File movedB = new File(brokenBatches,
            subIndex.getName() + "-" + batchName);
        if(movedB.exists()) {
          FileUtils.deleteDirectory(movedB);
        }
        if(!b.renameTo(movedB)) {
          throw new RuntimeException("Could not stash " + movedB.getName());
        }
      }
    }
  }

  /**
   * Trim the given batch in all sub-indexes to the given length in
   * documents. Assumes the batch has already been stashed as
   * broken-batches/subindex-batchName.
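   *
   * <p>Worked example (illustrative numbers): if the zip collection
   * holds 200 documents in total and the batches before this one end at
   * document 180, the batch is trimmed to {@code numDocs == 20}, i.e.
   * only its first 20 documents are kept.</p>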
   *
   * @param indexDirectory top level index directory
   * @param batchName name of the batch to trim
   * @param numDocs number of documents to which the batch should be
   *          trimmed
   */
  public static void trimBatch(File indexDirectory, String batchName,
      long numDocs) throws Exception {
    File brokenBatches = new File(indexDirectory, "broken-batches");
    File[] subIndexes = indexDirectory.listFiles(INDEX_NAME_FILTER);
    for(File subIndex : subIndexes) {
      File stashedBatch = new File(brokenBatches,
          subIndex.getName() + "-" + batchName);
      if(!stashedBatch.exists()) {
        throw new RuntimeException("Stashed batch " + stashedBatch
            + " not found");
      }
      File batchDir = new File(subIndex, batchName);
      batchDir.mkdirs();
      log.info("Trimming batch " + batchDir);
      String stashedIndexBasename =
          new File(stashedBatch, subIndex.getName()).getAbsolutePath();
      String outputIndexBasename =
          new File(batchDir, subIndex.getName()).getAbsolutePath();
      Index stashedIndex = Index.getInstance(stashedIndexBasename, true, true);
      // when you read through an index sequentially, the IndexIterators
      // don't tell you what term they were for, so we need to read the
      // .terms file from the stashed batch in step with the index
      // reader.
      File stashedTermsFile =
          new File(stashedIndexBasename + DiskBasedIndex.TERMS_EXTENSION);
      FileLinesCollection termsColl =
          new FileLinesCollection(stashedTermsFile.getAbsolutePath(), "UTF-8");
      long numTerms = termsColl.size64();
      Iterator<MutableString> termsIter = termsColl.iterator();
      File newTermsFile =
          new File(outputIndexBasename + DiskBasedIndex.TERMS_EXTENSION);
      // there will certainly be no *more* than numTerms terms in the
      // final index; there may be fewer
      BloomFilter<Void> termFilter = BloomFilter.create(Math.max(numTerms, 1));
      Properties writerProperties = null;
      long writtenBits = 0;
      int maxDocSize = 0;
      int maxCount = 0;
      long totalOccurrences = 0;
      try(IndexReader indexReader = stashedIndex.getReader();
          FileOutputStream termsOS = new FileOutputStream(newTermsFile);
          OutputStreamWriter termsOSW =
              new OutputStreamWriter(termsOS, "UTF-8");
          PrintWriter termsWriter = new PrintWriter(termsOSW)) {
        QuasiSuccinctIndexWriter indexWriter = new QuasiSuccinctIndexWriter(
            IOFactory.FILESYSTEM_FACTORY, outputIndexBasename, numDocs,
            Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
            QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE,
            CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX,
            ByteOrder.nativeOrder());
        IndexIterator iter;
        while((iter = indexReader.nextIterator()) != null) {
          MutableString term = termsIter.next();
          // we can't stream the inverted list, because we need to know
          // up front how many documents the term is found in so we can
          // write that number before writing the positions.
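          // Instead each list is buffered in memory: document pointers,
          // per-document counts and per-document positions are collected
          // below, and only postings with pointer < numDocs survive.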
          LongList docPointers = new LongArrayList();
          IntList counts = new IntArrayList();
          List<IntArrayList> positions = new ArrayList<>();
          long frequency = 0;
          long curPointer;
          long occurrences = 0;
          long sumMaxPos = 0;
          while((curPointer = iter.nextDocument()) != IndexIterator.END_OF_LIST) {
            if(curPointer < numDocs) {
              frequency++;
              docPointers.add(curPointer);
              counts.add(iter.count());
              IntArrayList thisDocPositions = new IntArrayList(iter.count());
              positions.add(thisDocPositions);
              occurrences += iter.count();
              totalOccurrences += iter.count();
              if(iter.count() > maxCount) {
                maxCount = iter.count();
              }
              int pos;
              int lastPos = 0;
              while((pos = iter.nextPosition()) != IndexIterator.END_OF_POSITIONS) {
                thisDocPositions.add(pos);
                lastPos = pos;
              }
              sumMaxPos += lastPos;
              if(lastPos > maxDocSize) {
                maxDocSize = lastPos;
              }
            } else {
              break;
            }
          }
          if(frequency > 0) {
            // this term occurred in at least one document that we're
            // not truncating, so now we know it's safe to write the
            // (truncated) inverted list to the new index and the term
            // to the terms file.
            term.println(termsWriter);
            termFilter.add(term);
            indexWriter.newInvertedList(frequency, occurrences, sumMaxPos);
            indexWriter.writeFrequency(frequency);
            for(int i = 0; i < frequency; i++) {
              OutputBitStream obs = indexWriter.newDocumentRecord();
              indexWriter.writeDocumentPointer(obs, docPointers.get(i));
              indexWriter.writePositionCount(obs, counts.get(i));
              indexWriter.writeDocumentPositions(obs,
                  positions.get(i).elements(), 0, positions.get(i).size(), -1);
            }
          }
        }
        indexWriter.close();
        writerProperties = indexWriter.properties();
        // write stats file
        try(PrintStream statsPs = new PrintStream(
            new File(outputIndexBasename + DiskBasedIndex.STATS_EXTENSION))) {
          indexWriter.printStats(statsPs);
        }
        writtenBits = indexWriter.writtenBits();
      }
      // regenerate the term map from the (possibly shorter) terms file
      AtomicIndex.generateTermMap(
          new File(outputIndexBasename + DiskBasedIndex.TERMS_EXTENSION),
          new File(outputIndexBasename + DiskBasedIndex.TERMMAP_EXTENSION),
          null);
      // write the bloom filter
      BinIO.storeObject(termFilter,
          new File(outputIndexBasename + DocumentalCluster.BLOOM_EXTENSION));
      // write the truncated sizes file
      File stashedSizesFile =
          new File(stashedIndexBasename + DiskBasedIndex.SIZES_EXTENSION);
      File sizesFile =
          new File(outputIndexBasename + DiskBasedIndex.SIZES_EXTENSION);
      try(InputBitStream stashedSizesStream =
              new InputBitStream(stashedSizesFile);
          OutputBitStream sizesStream = new OutputBitStream(sizesFile)) {
        for(long i = 0; i < numDocs; i++) {
          sizesStream.writeGamma(stashedSizesStream.readGamma());
        }
      }
      // generate the index properties
      Properties stashedProps = new Properties();
      try(FileInputStream stashedPropsStream = new FileInputStream(
          stashedIndexBasename + DiskBasedIndex.PROPERTIES_EXTENSION)) {
        stashedProps.load(stashedPropsStream);
      }
      Properties newProps = new Properties();
      newProps.setProperty(Index.PropertyKeys.TERMPROCESSOR,
          stashedProps.getProperty(Index.PropertyKeys.TERMPROCESSOR));
      newProps.setProperty(Index.PropertyKeys.SIZE, writtenBits);
      // -1 means unknown
      newProps.setProperty(Index.PropertyKeys.MAXDOCSIZE, maxDocSize);
      newProps.setProperty(Index.PropertyKeys.MAXCOUNT, maxCount);
      newProps.setProperty(Index.PropertyKeys.OCCURRENCES, totalOccurrences);
      writerProperties.addAll(newProps);
      Scan.saveProperties(IOFactory.FILESYSTEM_FACTORY, writerProperties,
          outputIndexBasename + DiskBasedIndex.PROPERTIES_EXTENSION);
    }
  }

  /**
   * Truncate the given direct index to remove documents beyond the
   * given lastDocId. The original version of the batch is assumed to
   * have been stashed as broken-batches/subIndexName-batchName.
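   *
   * <p>Worked example (illustrative): with {@code lastDocId == 499},
   * every entry whose zero-padded hex "term" string is lexicographically
   * less than or equal to the hex form of 499 is kept, i.e. exactly
   * documents 0 to 499 survive.</p>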
   *
   * @param indexDirectory the top-level index directory
   * @param subIndexName the name of the sub-index (token-N or
   *          mention-N)
   * @param batchName the name of the batch (head or tail-N)
   * @param lastDocId the last valid document ID
   */
  public static void truncateDirectIndex(File indexDirectory,
      String subIndexName, String batchName, long lastDocId)
      throws Exception {
    File brokenBatches = new File(indexDirectory, "broken-batches");
    File stashedBatch = new File(brokenBatches,
        subIndexName + "-" + batchName);
    if(!stashedBatch.exists()) {
      throw new RuntimeException("Stashed batch " + stashedBatch
          + " not found");
    }
    File batchDir = new File(new File(indexDirectory, subIndexName),
        batchName);
    batchDir.mkdirs();
    log.info("Trimming direct index for batch " + batchDir);
    String stashedIndexBasename = new File(stashedBatch,
        subIndexName + AtomicIndex.DIRECT_INDEX_NAME_SUFFIX)
            .getAbsolutePath();
    String outputIndexBasename = new File(batchDir,
        subIndexName + AtomicIndex.DIRECT_INDEX_NAME_SUFFIX)
            .getAbsolutePath();
    // A direct index is modelled in Mimir as an inverted index where
    // the terms are documents and vice versa. The "term string" is a
    // zero-padded hex representation of the document ID, so we simply
    // need to stream "inverted" lists from the stashed index to the new
    // one until we reach the term string that is the hex representation
    // of lastDocId.
    MutableString lastDocIdAsHex =
        new MutableString(AtomicIndex.longToTerm(lastDocId));
    // determine the number of documents in this direct index (i.e. the
    // number of entries in the .terms file that are lexicographically
    // <= lastDocIdAsHex)
    long numDocsInIndex = 0;
    File stashedTermsFile =
        new File(stashedIndexBasename + DiskBasedIndex.TERMS_EXTENSION);
    FileLinesCollection termsColl =
        new FileLinesCollection(stashedTermsFile.getAbsolutePath(), "UTF-8");
    try(FileLinesIterator docIdsIter = termsColl.iterator()) {
      while(docIdsIter.hasNext()
          && docIdsIter.next().compareTo(lastDocIdAsHex) <= 0) {
        numDocsInIndex++;
      }
    }
    log.info("Trimmed index will contain " + numDocsInIndex + " documents");
    // write the truncated "terms" file, term map and bloom filter
    BloomFilter<Void> docBloomFilter = BloomFilter.create(numDocsInIndex);
    try(FileLinesIterator docIdsIter = termsColl.iterator();
        PrintWriter pw = new PrintWriter(new OutputStreamWriter(
            new FastBufferedOutputStream(new FileOutputStream(
                outputIndexBasename + DiskBasedIndex.TERMS_EXTENSION),
                64 * 1024), "UTF-8"))) {
      for(long i = 0; i < numDocsInIndex; i++) {
        MutableString t = docIdsIter.next();
        t.println(pw);
        docBloomFilter.add(t);
      }
    }
    AtomicIndex.generateTermMap(
        new File(outputIndexBasename + DiskBasedIndex.TERMS_EXTENSION),
        new File(outputIndexBasename + DiskBasedIndex.TERMMAP_EXTENSION),
        null);
    BinIO.storeObject(docBloomFilter,
        new File(outputIndexBasename + DocumentalCluster.BLOOM_EXTENSION));
    // stream "inverted lists" (i.e. documents) from the stashed to the
    // new index, and build up a cache of "document sizes" (i.e. the
    // number of documents that contain each term referenced in this
    // index). We can't simply use the sizes from the stashed index
    // because they will include the counts from the inverted lists
    // we're trimming off.
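    // termSizes maps each MG4J "document" of this direct index (in
    // reality a term from the direct terms file) to the sum of its
    // counts across the surviving lists; it is written out as the new
    // .sizes file further down.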
    Long2IntOpenHashMap termSizes = new Long2IntOpenHashMap();
    termSizes.defaultReturnValue(0);
    // we need the total potential number of direct terms to create the
    // index writer
    File directTermsFile = new File(new File(indexDirectory, subIndexName),
        AtomicIndex.DIRECT_TERMS_FILENAME);
    FileLinesCollection directTerms =
        new FileLinesCollection(directTermsFile.getAbsolutePath(), "UTF-8");
    Index stashedIndex = Index.getInstance(stashedIndexBasename, true, false);
    int maxCount = 0;
    long totalOccurrences = 0;
    long writtenBits = 0;
    int maxTermSize = -1; // -1 means unknown
    Properties writerProperties;
    try(IndexReader indexReader = stashedIndex.getReader()) {
      // copy the default compression flags, and remove positions
      Map<Component, Coding> flags = new HashMap<Component, Coding>(
          CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX);
      flags.remove(Component.POSITIONS);
      QuasiSuccinctIndexWriter directIndexWriter =
          new QuasiSuccinctIndexWriter(IOFactory.FILESYSTEM_FACTORY,
              outputIndexBasename, directTerms.size64(),
              Fast.mostSignificantBit(QuasiSuccinctIndex.DEFAULT_QUANTUM),
              QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE, flags,
              ByteOrder.nativeOrder());
      IndexIterator iter;
      int docCounter = 0;
      while((iter = indexReader.nextIterator()) != null
          && ++docCounter <= numDocsInIndex) {
        // annoyingly we can't stream straight from the old inverted
        // list to the new one, as we need to know up front the total
        // occurrences value, which is not exposed through any public
        // API.
        LongList docPointers = new LongArrayList();
        IntList counts = new IntArrayList();
        long frequency = iter.frequency();
        // per-list occurrence total, reset for each inverted list (it
        // is passed to newInvertedList below)
        long occurrences = 0;
        long curPointer;
        while((curPointer = iter.nextDocument()) != IndexIterator.END_OF_LIST) {
          docPointers.add(curPointer);
          counts.add(iter.count());
          termSizes.put(curPointer, termSizes.get(curPointer) + iter.count());
          occurrences += iter.count();
          totalOccurrences += iter.count();
          if(iter.count() > maxCount) {
            maxCount = iter.count();
          }
        }
        directIndexWriter.newInvertedList(frequency, occurrences, 0);
        directIndexWriter.writeFrequency(frequency);
        for(int i = 0; i < frequency; i++) {
          OutputBitStream obs = directIndexWriter.newDocumentRecord();
          directIndexWriter.writeDocumentPointer(obs, docPointers.get(i));
          directIndexWriter.writePositionCount(obs, counts.get(i));
          // no positions in a direct index
        }
      }
      directIndexWriter.close();
      writtenBits = directIndexWriter.writtenBits();
      // write the new sizes file
      File sizesFile =
          new File(outputIndexBasename + DiskBasedIndex.SIZES_EXTENSION);
      try(OutputBitStream sizesStream = new OutputBitStream(sizesFile)) {
        for(long i = 0; i < directTerms.size64(); i++) {
          int termSize = termSizes.get(i);
          sizesStream.writeGamma(termSize);
          if(termSize > maxTermSize) {
            maxTermSize = termSize;
          }
        }
      }
      writerProperties = directIndexWriter.properties();
      // write stats file
      try(PrintStream statsPs = new PrintStream(
          new File(outputIndexBasename + DiskBasedIndex.STATS_EXTENSION))) {
        directIndexWriter.printStats(statsPs);
      }
    }
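    // SIZE, MAXDOCSIZE, MAXCOUNT and OCCURRENCES are all invalidated by
    // dropping inverted lists, so rather than copying the stashed
    // properties wholesale they are rebuilt from the values gathered
    // during the rewrite; only TERMPROCESSOR is carried over unchanged.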
    // generate the index properties
    Properties stashedProps = new Properties();
    try(FileInputStream stashedPropsStream = new FileInputStream(
        stashedIndexBasename + DiskBasedIndex.PROPERTIES_EXTENSION)) {
      stashedProps.load(stashedPropsStream);
    }
    Properties newProps = new Properties();
    newProps.setProperty(Index.PropertyKeys.TERMPROCESSOR,
        stashedProps.getProperty(Index.PropertyKeys.TERMPROCESSOR));
    newProps.setProperty(Index.PropertyKeys.SIZE, writtenBits);
    // -1 means unknown
    newProps.setProperty(Index.PropertyKeys.MAXDOCSIZE, maxTermSize);
    newProps.setProperty(Index.PropertyKeys.MAXCOUNT, maxCount);
    newProps.setProperty(Index.PropertyKeys.OCCURRENCES, totalOccurrences);
    writerProperties.addAll(newProps);
    Scan.saveProperties(IOFactory.FILESYSTEM_FACTORY, writerProperties,
        outputIndexBasename + DiskBasedIndex.PROPERTIES_EXTENSION);
  }
}