DocumentCollection.java example

Explorer

mimir-master
- etc
  - generated-header.java
- mimir-client
  - src
    - gate
      - mimir
        index
        MimirConnector.java
        MimirIndexingPR.java
        search
        RemoteQueryRunner.java
        tool
        WebUtils.java
- mimir-cloud
  - archive-unpacker
    - src
      - gate
        mimir
        util
        MultiFileInputStream.java
        UnpackWizard.java
  - src
    - java
      - gate
        mimir
        util
        IndexArchiveState.java
        MultiFileOutputStream.java
- mimir-core
  - src
    - gate
      - mimir
        AbstractSemanticAnnotationHelper.java
        Constraint.java
        ConstraintType.java
        DocumentMetadataHelper.java
        DocumentRenderer.java
        IndexConfig.java
        MimirIndex.java
        SemanticAnnotationHelper.java
        index
        AtomicAnnotationIndex.java
        AtomicIndex.java
        AtomicTokenIndex.java
        DocumentCollection.java
        DocumentData.java
        GATEDocument.java
        GATEDocumentFactory.java
        IndexException.java
        Mention.java
        OriginalMarkupMetadataHelper.java
        package-info.java
        search
        FederatedQueryRunner.java
        IndexReaderPool.java
        QueryEngine.java
        QueryRunner.java
        RankingQueryRunnerImpl.java
        query
        AbstractIntersectionQueryExecutor.java
        AbstractOverlapQuery.java
        AbstractQueryExecutor.java
        AndQuery.java
        AnnotationQuery.java
        Binding.java
        ConstQuery.java
        ContainsQuery.java
        ExecutorsList.java
        GapQuery.java
        MinusQuery.java
        OrQuery.java
        QueryExecutor.java
        QueryNode.java
        RepeatsQuery.java
        SequenceQuery.java
        TermQuery.java
        WithinQuery.java
        parser
        ParseException.java
        Query.java
        QueryParser.java
        QueryParserConstants.java
        QueryParserTokenManager.java
        SimpleCharStream.java
        Token.java
        TokenMgrError.java
        score
        BindingScorer.java
        DelegatingScoringQueryExecutor.java
        MimirScorer.java
        terms
        AbstractCompoundTermsQuery.java
        AbstractDocumentsBasedTermsQuery.java
        AbstractIndexTermsQuery.java
        AndTermsQuery.java
        AnnotationTermsQuery.java
        CompoundTermsQuery.java
        ConstTermsQuery.java
        DocumentTermsQuery.java
        DocumentsAndTermsQuery.java
        DocumentsBasedTermsQuery.java
        DocumentsOrTermsQuery.java
        LimitTermsQuery.java
        OrTermsQuery.java
        SortedTermsQuery.java
        TermTypeTermsQuery.java
        TermsQuery.java
        TermsResultSet.java
        util
        DefaultMentionDescriber.java
        DelegatingSemanticAnnotationHelper.java
        DocumentFeaturesMetadataHelper.java
        IgnoreEmptiesTermProcessor.java
        IndexUpgrader.java
        MG4JTools.java
        NormalizingTermProcessor.java
        OntologyMentionDescriber.java
        TruncateIndex.java
- mimir-test
  - src
    - gate
      - mimir
        test
        QueryTests.java
        RenderZipCollection.java
        Scratch.java
        ScratchConsole.java
        TestQueryParser.java
        TestUtils.java
- mimir-web
  - src
    - gwt
      - gate
        mimir
        web
        client
        UI.java
    - java
      - gate
        mimir
        util
        LogAnalyser.java
        web
        client
        DocumentData.java
        GwtRpcService.java
        GwtRpcServiceAsync.java
        MimirSearchException.java
        ResultsData.java
- plugins
  - db-h2
    - src
      - gate
        mimir
        db
        AnnotationTemplateCache.java
        DBSemanticAnnotationHelper.java
  - measurements
    - src
      - gate
        mimir
        measurements
        MeasurementAnnotationHelper.java
        MeasurementPluginResource.java
  - sparql
    - src
      - gate
        mimir
        sparql
        RequestMethod.java
        SPARQLResultSet.java
        SPARQLSemanticAnnotationHelper.java

/*
 *  DocumentCollection.java
 *
 *  Copyright (c) 2007-2011, The University of Sheffield.
 *
 *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), 
 *  and is free software, licenced under the GNU Lesser General Public License,
 *  Version 3, June 2007 (also included with this distribution as file
 *  LICENCE-LGPL3.html).
 *
 *  Valentin Tablan, 15 Apr 2009
 *
 *  $Id$
 */
package gate.mimir.index;


import gate.mimir.MimirIndex;
import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.ObjectStreamClass;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;
import java.util.zip.ZipOutputStream;

import org.apache.log4j.Logger;



/**
 * A Mimir document collection. Consists of one or more zip files containing 
 * serialised {@link DocumentData} values. Each {@link MimirIndex} contains a
 * document collection.
 */
public class DocumentCollection {
  
  /**
   * The maximum number of documents to be stored in the in-RAM document cache.
   */
  protected static final int DOCUMENT_DATA_CACHE_SIZE = 100;
  
  /**
   * Class representing one of the collection (zip) files.
   */
  public static class CollectionFile implements Comparable<CollectionFile> {
    /**
     * The filename for the zip collection.
     */
    public static final String MIMIR_COLLECTION_BASENAME = "mimir-collection-";
    
    /**
     * The file extension used for the mimir-specific relocatable zip collection
     * definition.
     */
    public static final String MIMIR_COLLECTION_EXTENSION = ".zip";
    
    /**
     * Regex pattern that recognises a valid collection file name and its parts.
     * The following capturing groups can be used when a match occurs:
     * <ul>
     *   <li>1: the collection file ID</li>
     *   <li>2: the collection file number (the numeric part of the ID)</li>
     *   <li>3: (optional) the collection file suffix (the non-numeric part of the ID)</li>
     * </ul>   
     */
    protected static final Pattern MIMIR_COLLECTION_PATTERN = Pattern.compile(
        "\\Q" + MIMIR_COLLECTION_BASENAME + "\\E((\\d+)(?:-([-0-9a-zA-Z]+))?)\\Q"+
        MIMIR_COLLECTION_EXTENSION + "\\E");
    
    public static FilenameFilter FILENAME_FILTER = new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        return MIMIR_COLLECTION_PATTERN.matcher(name).matches();
      }
    };
    
    
    protected File file;
    
	  protected ZipFile zipFile;
	  
	  protected long firstEntry;
	  
	  protected long lastEntry;

	  /**
	   * Each collection file has a number, and optionally a suffix. For example
	   * in "mimir-collection-0-a.zip", the number is 0, and the suffix
	   * is a.
	   */
	  protected int collectionFileNumber;
	  
	  
	  /**
	   * The size in bytes of the underlying file.
	   */
	  protected long length;
	  
	  /**
	   * The number of documents contained.
	   */
	  protected int documentCount;

	  /**
     * Given the name of a zip file, this method returns its ID: the part of the 
     * file name between the prefix ({@value DocumentCollection#MIMIR_COLLECTION_BASENAME}) and
     * the suffix ({@value DocumentCollection#MIMIR_COLLECTION_EXTENSION}), or <code>null</code> if 
     * the name is not that of a valid collection file.
     * @param fileName the file name to be parsed.
     * @return the ID of the file, or <code>null</code>.
     */
    protected static String getCollectionFileId(String fileName){
      Matcher m = MIMIR_COLLECTION_PATTERN.matcher(fileName);
      return m.matches() ? m.group(1) : null;
    }

    protected static int getCollectionFileNumber(String fileName){
      Matcher m = MIMIR_COLLECTION_PATTERN.matcher(fileName);
      return m.matches() ? Integer.parseInt(m.group(2)) : -1;
    }
    
    public static String getCollectionFileName(String id) {
      return MIMIR_COLLECTION_BASENAME + id + MIMIR_COLLECTION_EXTENSION;
    }
    
	  public CollectionFile(File file) throws ZipException, IOException {
	    this.file = file;
      zipFile = new ZipFile(file);
      collectionFileNumber = getCollectionFileNumber(file.getName());
      Enumeration<? extends ZipEntry> entries = zipFile.entries();
      firstEntry = Long.MAX_VALUE;
      lastEntry = -1;
      documentCount = 0;
      while(entries.hasMoreElements()) {
        ZipEntry anEntry = entries.nextElement();
        String entryName = anEntry.getName();
        try {
          long entryId = Long.parseLong(entryName);
          //update the current maximum and minimum
          if(entryId > lastEntry) lastEntry = entryId;
          if(entryId < firstEntry) firstEntry = entryId;
          documentCount++;
        } catch(NumberFormatException e) {
          //not parseable -> we'll ignore this entry.
          logger.warn("Unparseable zip entry name: " + entryName);
        }
      }
      if(firstEntry == Long.MAX_VALUE) firstEntry = -1;
      length = file.length();
	  }
	  
    @Override
    public int compareTo(CollectionFile o) {
      return Long.compare(firstEntry, o.firstEntry);
    }
    
    public boolean containsDocument(long documentID) {
      return firstEntry <= documentID && 
          documentID <= lastEntry &&
          zipFile.getEntry(Long.toString(documentID)) != null;
    }
    
    public DocumentData getDocumentData(Long documentID) throws IOException {
      ZipEntry entry = zipFile.getEntry(Long.toString(documentID));
      if(entry == null) throw new NoSuchElementException(
          "No entry found for document ID " + documentID);
      CustomObjectInputStream ois = null;
      try {
        ois = new CustomObjectInputStream(zipFile.getInputStream(entry));
        return (DocumentData) ois.readObject();
      } catch(ClassNotFoundException e) {
        //invalid data read from the zip file
        throw new IOException("Invalid data read from zip file!", e);
      } finally {
        if(ois != null) ois.close();
      }
    }
    
    
    public void close() throws IOException {
      zipFile.close();
    }
  }

  /**
   * Custom implementation of {@link ObjectInputStream} that handles reading
   * old mimir archive files where the contents include serialised classes
   * with old (pre-Mímir-5) class names.
   */
  protected static class CustomObjectInputStream extends ObjectInputStream {

    public CustomObjectInputStream() throws IOException, SecurityException {
      super();
    }

    public CustomObjectInputStream(InputStream in) throws IOException {
      super(in);
    }

    @Override
    protected Class<?> resolveClass(ObjectStreamClass desc) throws IOException,
        ClassNotFoundException {
      if("gate.mimir.index.mg4j.zipcollection.DocumentData".equals(desc.getName())) {
        desc = ObjectStreamClass.lookup(Class.forName("gate.mimir.index.DocumentData"));
      }
      return super.resolveClass(desc);
    }
  }
  
  /**
   * Class that handles the creation of collection files.
   */
  protected static class CollectionFileWriter {
    /**
     * The number of documents kept in memory until a new zip file is written. As 
     * new documents are submitted, they get written to the currently open zip 
     * file but they cannot be read from the file. To account for this, we keep 
     * them in memory, in the {@link #inputBuffer} structure.
     */
    protected static final int INPUT_BUFFER_SIZE = 1000;
    
    
    /**
     * Document data objects that have been written to the zip file currently 
     * being created and have to be kept in RAM until the file is closed and can 
     * be open in read mode. 
     */
    protected Long2ObjectLinkedOpenHashMap<DocumentData> inputBuffer;
    
    /**
     * The zip file managed by this collection.
     */
    protected ZipOutputStream zipOuputStream;
    
    /**
     * The zip file to which we are currently writing.
     */
    protected File zipFile;
    
    /**
     * The number of entries written so far to the current zip file.
     */
    protected int currentEntries;
    
    /**
     * The amount of bytes written so far to the current zip file.
     */
    protected long currentLength;
    
    /**
     * A {@link ByteArrayOutputStream} used to temporarily store serialised 
     * document data objects.
     */
    protected ByteArrayOutputStream byteArrayOS;
    
    public CollectionFileWriter(File file) throws IndexException {
      this.zipFile = file;
      if(zipFile.exists()) throw new IndexException("Collection zip file (" + 
          file.getAbsolutePath() + ") already exists!");
      byteArrayOS = new ByteArrayOutputStream();
      
      try {
        zipOuputStream = new ZipOutputStream(new BufferedOutputStream(
                new  FileOutputStream(zipFile)));
      } catch(FileNotFoundException e) {
        throw new IndexException("Cannot write to collection zip file (" + 
                zipFile.getAbsolutePath() + ")", e);
      }
      currentEntries = 0;
      currentLength = 0;
      inputBuffer = new Long2ObjectLinkedOpenHashMap<DocumentData>();
    }
    
    /**
     * 
     * @param entryName
     * @param document
     * @return true if the document was written successfully, false if this 
     * collection file is full and cannot take the extra content.
     * 
     * @throws IOException
     */
    public boolean writeDocumentData(long documentId, DocumentData document) throws IOException {
      //write the new document to the byte array
      ObjectOutputStream objectOutStream = new ObjectOutputStream(byteArrayOS);
      objectOutStream.writeObject(document);
      objectOutStream.close();

      // check if this will take us over size
      if(currentLength + byteArrayOS.size() > ZIP_FILE_MAX_SIZE ||
         currentEntries >= ZIP_FILE_MAX_ENTRIES ||
         inputBuffer.size() >= INPUT_BUFFER_SIZE) return false;
      
      // create a new entry in the current zip file
      ZipEntry entry = new ZipEntry(Long.toString(documentId));
      zipOuputStream.putNextEntry(entry);
      //write the data
      byteArrayOS.writeTo(zipOuputStream);
      zipOuputStream.closeEntry();
      currentLength += entry.getCompressedSize();
      
      //clean up the byte array for next time
      byteArrayOS.reset();
      currentEntries++;
      // save the document data to the input buffer
      inputBuffer.put(documentId, document);
      return true;
    }

    public void close() throws IOException {
      if(zipOuputStream != null) zipOuputStream.close();
      inputBuffer.clear();
    }
  }
  
  /**
   * The zip files containing the document collection.
   */
  protected List<CollectionFile> collectionFiles = null;
  
  protected CollectionFileWriter collectionFileWriter;
  
  
  private static Logger logger = Logger.getLogger(DocumentCollection.class);
  
  /**
   * The top level directory for the index.
   */
  protected File indexDirectory;
  
  /**
   * A cache of {@link DocumentData} values used for returning the various
   * document details (title, URI, text).
   */
  protected Long2ObjectLinkedOpenHashMap<DocumentData> documentCache;
  

  
  /**
   * Flag that gets set to true when the collection is closed (and blocks all 
   * subsequent operations).
   */
  private volatile boolean closed = false; 
  
  /**
   * The maximum number of bytes to write to a single zip file.
   */
  public static final long ZIP_FILE_MAX_SIZE = 2 * 1000 * 1000 * 1000; 
    
  /**
   * The maximum number of entries to write to a single zip file.
   * Java 1.5 only support 2^16 entries.
   * If running on Java 1.6, this limit can safely be increased, however, the
   * total size of the file (as specified by {@link #ZIP_FILE_MAX_SIZE}) should 
   * not be greater than 4GB, in either case.
   */
  public static final int ZIP_FILE_MAX_ENTRIES = 250000;
  

  
  /**
   * The ID for the next document to be written in this collection. This value 
   * is initialised to 0 and then is automatically incremented whenever a new 
   * document is written.
   */
  protected long nextDocumentId;
  
  
  /**
   * Creates a DocumentCollection object for accessing the document data.
   * @param indexDirectory
   * @throws IOException 
   */
  public DocumentCollection(File indexDirectory) throws IOException {
    this.indexDirectory = indexDirectory;
    
    collectionFiles = new ArrayList<CollectionFile>();
    // prepare for reading
    for(File aCollectionFile : indexDirectory.listFiles(CollectionFile.FILENAME_FILTER)) {
      collectionFiles.add(new CollectionFile(aCollectionFile));
    }
    Collections.sort(collectionFiles);
    // sanity check
    for(int i = 0;  i < collectionFiles.size() - 1; i++) {
      CollectionFile first = collectionFiles.get(i);
      CollectionFile second = collectionFiles.get(i + 1);
      if(first.lastEntry >= second.firstEntry) {
        throw new IOException(
            "Invalid entries distribution: collection file " + 
            second.zipFile.getName() + 
            " contains an entry named \"" + second.firstEntry + 
            "\", but an entry with a larger-or-equal ID was " +
            "already seen in a previous collection file!");          
      }
    }
    documentCache = new Long2ObjectLinkedOpenHashMap<DocumentData>();
    
    // prepare for writing
    nextDocumentId = collectionFiles.isEmpty() ? 0 : 
        (collectionFiles.get(collectionFiles.size() - 1).lastEntry + 1);
    
  }
  

  /**
   * Gets the document data for a given document ID.
   * @param documentID the ID of the document to be retrieved.
   * @return a {@link DocumentData} object for the requested document ID.
   * @throws IOException if there are problems accessing the underlying zip file; 
   * @throws NoSuchElementException if the requested document ID is not found.
   */
  public DocumentData getDocumentData(long documentID) throws IOException{
    if(closed) throw new IllegalStateException(
            "This document collection has already been closed!");
    DocumentData documentData = null;
    if(collectionFiles.isEmpty() ||
       documentID > collectionFiles.get(collectionFiles.size() - 1).lastEntry) {
      // it's a new document that's not yet available from the zip files
      documentData = collectionFileWriter != null ?
          collectionFileWriter.inputBuffer.get(documentID):
          null;
    } else {
      // it's an old document. Try the cache first
      documentData = documentCache.getAndMoveToFirst(documentID);
      if(documentData == null) {
        // cache miss: we need to actually load it
        //locate the right zip file
        synchronized(collectionFiles) {
          files: for(CollectionFile aColFile : collectionFiles) {
            if(aColFile.containsDocument(documentID)) {
              // we found the collection file containing the document
              documentData = aColFile.getDocumentData(documentID);
              documentCache.putAndMoveToFirst(documentID, documentData);
              if(documentCache.size() > DOCUMENT_DATA_CACHE_SIZE) {
                documentCache.removeLast();
              }
              break files;
            }
          } 
        }
      }
    }
    if(documentData == null) throw new NoSuchElementException(
        "No entry found for document ID " + documentID);
    return documentData;  
  }
  
  /**
   * Writes a new document to the underlying zip file. The documents added 
   * through this method will get automatically generated names starting from 
   * "0", and continuing with "1", "2", etc.   
   * @param document
   * @throws IndexException if there are any problems while accessing the zip 
   * collection file(s).
   */
  public void writeDocument(DocumentData document) throws IndexException{
    if(collectionFileWriter == null) openCollectionWriter();
    
    try{
      boolean success = false;
      while(!success) {
        success = collectionFileWriter.writeDocumentData(nextDocumentId, 
            document);
        if(!success) {
          // the current collection file is full: close it
          collectionFileWriter.close();
          synchronized(collectionFiles) {
            // open the newly saved zip file
            collectionFiles.add(new CollectionFile(collectionFileWriter.zipFile));
          }
          // open a new one and try again
          openCollectionWriter();
        }   
      }
    } catch(IOException e){
      throw new IndexException("Problem while accessing the collection file", e);
    } finally {
      nextDocumentId++;
    }
  }
  
  /**
   * Opens the current zip file and sets the {@link #zipFile} and 
   * {@link #zipOuputStream} values accordingly. 
   * @throws IndexException if the collection zip file already exists, or cannot
   * be opened for writing.
   */
  protected void openCollectionWriter() throws IndexException{
    int zipFileNumber = 0;
    synchronized(collectionFiles) {
      zipFileNumber = collectionFiles.isEmpty() ? 0 :
        collectionFiles.get(collectionFiles.size() - 1).collectionFileNumber + 1; 
    }
    collectionFileWriter = new CollectionFileWriter(
        new File(indexDirectory,
            CollectionFile.getCollectionFileName(
                Integer.toString(zipFileNumber))));
  }
  
  /**
   * Close this document collection and release all allocated resources (such 
   * as open file handles). 
   * @throws IOException 
   * @throws IndexException 
   */
  public void close() throws IOException {
    // close the writer
    if(collectionFileWriter != null) collectionFileWriter.close();
    // close the reader
    closed = true;
    if(collectionFiles != null){
      for(CollectionFile colFile : collectionFiles){
        try {
          colFile.close();
        } catch(IOException e) {
          // ignore
        }
      }
      collectionFiles.clear();
      collectionFiles = null;      
    }
    documentCache.clear();
  }
  
  
  /**
   * Returns the number of archive files in this collection.
   * @return
   */
  public int getArchiveCount() {
    return collectionFiles.size();
  }
  
  /**
   * Combines multiple smaller collection files into larger ones. If multiple
   * consecutive collection files can be combined without exceeding the maximum
   * permitted sizes ({@link #ZIP_FILE_MAX_ENTRIES} and 
   * {@link #ZIP_FILE_MAX_SIZE}), then they are combined.
   * 
   * @throws ZipException
   * @throws IOException
   * @throws IndexException 
   */
  public synchronized void compact() throws ZipException, IOException, IndexException {
    logger.debug("Starting collection compact.");
    // find an interval of files that can be joined together
    // we search from the end toward the start so that we can modify the 
    // list without changing the yet-unvisited IDs.
    CollectionFile[] colFilesArr = collectionFiles.toArray(
        new CollectionFile[collectionFiles.size()]);
    int intervalEnd = -1;
    int intervalLength = 0;
    int intervalEntries = 0;
    long intervalBytes = 0;
    for(int i = colFilesArr.length -1; i >= 0; i--) {
      // is the current file small?
      boolean smallFile = 
          colFilesArr[i].documentCount < ZIP_FILE_MAX_ENTRIES &&
          colFilesArr[i].length < ZIP_FILE_MAX_SIZE;
      if(intervalEnd < 0) { // we're looking for the first 'small' file
        if(smallFile) {
          // we found a small file: start a new interval
          intervalEnd = i;
          intervalLength = 1;
          intervalEntries = colFilesArr[i].documentCount;
          intervalBytes = colFilesArr[i].length;
        }
      } else { // we're trying to extend the current interval
        boolean currentFileAccepted = 
            intervalEntries + colFilesArr[i].documentCount < ZIP_FILE_MAX_ENTRIES &&
            intervalBytes + colFilesArr[i].length < ZIP_FILE_MAX_SIZE;
        if(currentFileAccepted) {
          // extend the current interval
          intervalEntries += colFilesArr[i].documentCount;
          intervalBytes += colFilesArr[i].length;
          intervalLength = intervalEnd - i + 1;
        }
        if(!currentFileAccepted || i == 0) {
          // end the current interval
          if(intervalLength > 1){
            int intervalStart = intervalEnd - intervalLength + 1;
            // combine the files
            // create the new file
            String newFileName = "temp-" + 
                CollectionFile.MIMIR_COLLECTION_BASENAME + 
                colFilesArr[intervalStart].collectionFileNumber + 
                "-" + 
                colFilesArr[intervalEnd].collectionFileNumber + 
                CollectionFile.MIMIR_COLLECTION_EXTENSION;
            File newZipFile = new File(indexDirectory, newFileName);
            ZipOutputStream  zos = new ZipOutputStream(new BufferedOutputStream(
                      new  FileOutputStream(newZipFile)));
            byte[] buff = new byte[1024 * 1024];
            for(int j = intervalStart; j <= intervalEnd; j++) {
              Enumeration<? extends ZipEntry> entries = 
                  colFilesArr[j].zipFile.entries();
              while(entries.hasMoreElements()) {
                ZipEntry anEntry = entries.nextElement();
                zos.putNextEntry(new ZipEntry(anEntry));
                InputStream is = colFilesArr[j].zipFile.getInputStream(anEntry);
                int read = is.read(buff);
                while(read >= 0){
                  zos.write(buff, 0, read);
                  read = is.read(buff);
                }
                zos.closeEntry();
              }
            }
            zos.close();
            
            //update the collection
            synchronized(colFilesArr) {
              //confirm that the collection files have not changed since we started
              for(int j = intervalStart; j <= intervalEnd; j++) {
                if(colFilesArr[j] != collectionFiles.get(j)) {
                  logger.warn("Collection files have changed since the "
                      + "compacting operation started. Compact aborted." + 
                      "Details: " + colFilesArr[j].file.getAbsolutePath() + 
                      " not the same as " + 
                      collectionFiles.get(j).file.getAbsolutePath());

                  // delete the newly created collection file
                  newZipFile.delete();
                  return;
                }
              }
              // build name for new collection file
              File newCollectionFile = new File(indexDirectory, 
                  CollectionFile.getCollectionFileName(
                      Integer.toString(colFilesArr[intervalStart].collectionFileNumber) + 
                      "-" + 
                      Integer.toString(colFilesArr[intervalEnd].collectionFileNumber)));
              // delete the old files
              for(int j = intervalStart; j <= intervalEnd; j++) {
                CollectionFile oldColFile = collectionFiles.remove(intervalStart);
                if(!oldColFile.file.delete()) {
                  throw new IndexException(
                      "Could not delete old collection file " + 
                      oldColFile.file + "! " + 
                      "Document collection now inconsistent.");
                }
              }
              // rename temp file to new name
              newZipFile.renameTo(newCollectionFile);
              // add new collection file
              collectionFiles.add(intervalStart, new CollectionFile(newCollectionFile));
            }
          }
          // we found and merged an interval, 
          // or we only found an interval of length 1: start again
          intervalEnd = -1;
          intervalLength = 0;
          intervalEntries = 0;
          intervalBytes = 0;
        }
      }
    }
  }
}