InstantiatedIndexReader.java example

Explorer
solrcene-master
package org.apache.lucene.store.instantiated;

/**
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Comparator;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Bits;

/**
 * An InstantiatedIndexReader is not a snapshot in time, it is completely in
 * sync with the latest commit to the store!
 * <p>
 * Consider using InstantiatedIndex as if it was immutable.
 */
public class InstantiatedIndexReader extends IndexReader {

  private final InstantiatedIndex index;

  public InstantiatedIndexReader(InstantiatedIndex index) {
    super();
    this.index = index;
  }

  /**
   * @return always true.
   */
  @Override
  public boolean isOptimized() {
    return true;
  }

  /**
   * An InstantiatedIndexReader is not a snapshot in time, it is completely in
   * sync with the latest commit to the store!
   * 
   * @return output from {@link InstantiatedIndex#getVersion()} in associated instantiated index.
   */
  @Override
  public long getVersion() {
    return index.getVersion();
  }

  @Override
  public Directory directory() {
    throw new UnsupportedOperationException();
  }

  /**
   * An InstantiatedIndexReader is always current!
   * 
   * Check whether this IndexReader is still using the current (i.e., most
   * recently committed) version of the index. If a writer has committed any
   * changes to the index since this reader was opened, this will return
   * <code>false</code>, in which case you must open a new IndexReader in
   * order to see the changes. See the description of the <a
   * href="IndexWriter.html#autoCommit"><code>autoCommit</code></a> flag
   * which controls when the {@link IndexWriter} actually commits changes to the
   * index.
   * 
   * @return always true
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   * @throws UnsupportedOperationException unless overridden in subclass
   */
  @Override
  public boolean isCurrent() throws IOException {
    return true;
  }

  public InstantiatedIndex getIndex() {
    return index;
  }

  @Override
  public Bits getDeletedDocs() {
    return new Bits() {
      public boolean get(int n) {
        return (index.getDeletedDocuments() != null && index.getDeletedDocuments().get(n))
          || (uncommittedDeletedDocuments != null && uncommittedDeletedDocuments.get(n));
      }

      public int length() {
        return maxDoc();
      }
    };
  }

  private BitVector uncommittedDeletedDocuments;

  private Map<String,List<NormUpdate>> uncommittedNormsByFieldNameAndDocumentNumber = null;

  private class NormUpdate {
    private int doc;
    private byte value;

    public NormUpdate(int doc, byte value) {
      this.doc = doc;
      this.value = value;
    }
  }

  @Override
  public int numDocs() {
    // todo i suppose this value could be cached, but array#length and bitvector#count is fast.
    int numDocs = getIndex().getDocumentsByNumber().length;
    if (uncommittedDeletedDocuments != null) {
      numDocs -= uncommittedDeletedDocuments.count();
    }
    if (index.getDeletedDocuments() != null) {
      numDocs -= index.getDeletedDocuments().count();
    }
    return numDocs;
  }

  @Override
  public int maxDoc() {
    return getIndex().getDocumentsByNumber().length;
  }

  @Override
  public boolean hasDeletions() {
    return index.getDeletedDocuments() != null || uncommittedDeletedDocuments != null;
  }

  @Override
  protected void doDelete(int docNum) throws IOException {

    // dont delete if already deleted
    if ((index.getDeletedDocuments() != null && index.getDeletedDocuments().get(docNum))
        || (uncommittedDeletedDocuments != null && uncommittedDeletedDocuments.get(docNum))) {
      return;
    }

    if (uncommittedDeletedDocuments == null) {
      uncommittedDeletedDocuments = new BitVector(maxDoc());
    }

    uncommittedDeletedDocuments.set(docNum);
  }

  @Override
  protected void doUndeleteAll() throws IOException {
    // todo: read/write lock
    uncommittedDeletedDocuments = null;
    // todo: read/write unlock
  }

  @Override
  protected void doCommit(Map<String,String> commitUserData) throws IOException {
    // todo: read/write lock

    // 1. update norms
    if (uncommittedNormsByFieldNameAndDocumentNumber != null) {
      for (Map.Entry<String,List<NormUpdate>> e : uncommittedNormsByFieldNameAndDocumentNumber.entrySet()) {
        byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(e.getKey());
        for (NormUpdate normUpdate : e.getValue()) {
          norms[normUpdate.doc] = normUpdate.value;
        }
      }
      uncommittedNormsByFieldNameAndDocumentNumber = null;
    }

    // 2. remove deleted documents
    if (uncommittedDeletedDocuments != null) {
      if (index.getDeletedDocuments() == null) {
        index.setDeletedDocuments(uncommittedDeletedDocuments);
      } else {
        for (int d = 0; d< uncommittedDeletedDocuments.size(); d++) {
          if (uncommittedDeletedDocuments.get(d)) {
            index.getDeletedDocuments().set(d);
          }
        }
      }
      uncommittedDeletedDocuments = null;
    }

    // todo unlock read/writelock
  }

  @Override
  protected void doClose() throws IOException {
    // ignored
    // todo perhaps release all associated instances?
  }

  @Override
  public Collection<String> getFieldNames(FieldOption fieldOption) {
    Set<String> fieldSet = new HashSet<String>();
    for (FieldSetting fi : index.getFieldSettings().values()) {
      if (fieldOption == IndexReader.FieldOption.ALL) {
        fieldSet.add(fi.fieldName);
      } else if (!fi.indexed && fieldOption == IndexReader.FieldOption.UNINDEXED) {
        fieldSet.add(fi.fieldName);
      } else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) {
        fieldSet.add(fi.fieldName);
      } else if (fi.indexed && fieldOption == IndexReader.FieldOption.INDEXED) {
        fieldSet.add(fi.fieldName);
      } else if (fi.indexed && fi.storeTermVector == false && fieldOption == IndexReader.FieldOption.INDEXED_NO_TERMVECTOR) {
        fieldSet.add(fi.fieldName);
      } else if (fi.storeTermVector == true && fi.storePositionWithTermVector == false && fi.storeOffsetWithTermVector == false
          && fieldOption == IndexReader.FieldOption.TERMVECTOR) {
        fieldSet.add(fi.fieldName);
      } else if (fi.indexed && fi.storeTermVector && fieldOption == IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR) {
        fieldSet.add(fi.fieldName);
      } else if (fi.storePositionWithTermVector && fi.storeOffsetWithTermVector == false
          && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION) {
        fieldSet.add(fi.fieldName);
      } else if (fi.storeOffsetWithTermVector && fi.storePositionWithTermVector == false
          && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET) {
        fieldSet.add(fi.fieldName);
      } else if ((fi.storeOffsetWithTermVector && fi.storePositionWithTermVector)
          && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET) {
        fieldSet.add(fi.fieldName);
      } 
    }
    return fieldSet;
  }

  /**
   * Return the {@link org.apache.lucene.document.Document} at the <code>n</code><sup>th</sup>
   * position.
     <p>
   * <b>Warning!</b>
   * The resulting document is the actual stored document instance
   * and not a deserialized clone as retuned by an IndexReader
   * over a {@link org.apache.lucene.store.Directory}.
   * I.e., if you need to touch the document, clone it first!
   * <p>
   * This can also be seen as a feature for live changes of stored values,
   * but be careful! Adding a field with an name unknown to the index
   * or to a field with previously no stored values will make
   * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption)}
   * out of sync, causing problems for instance when merging the
   * instantiated index to another index.
     <p>
   * This implementation ignores the field selector! All stored fields are always returned!
   * <p>
   *
   * @param n document number
   * @param fieldSelector ignored
   * @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   * 
   * @see org.apache.lucene.document.Fieldable
   * @see org.apache.lucene.document.FieldSelector
   * @see org.apache.lucene.document.SetBasedFieldSelector
   * @see org.apache.lucene.document.LoadFirstFieldSelector
   */
  @Override
  public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
    return document(n);
  }

  /**
   * Returns the stored fields of the <code>n</code><sup>th</sup>
   * <code>Document</code> in this index.
   * <p>
   * <b>Warning!</b>
   * The resulting document is the actual stored document instance
   * and not a deserialized clone as retuned by an IndexReader
   * over a {@link org.apache.lucene.store.Directory}.
   * I.e., if you need to touch the document, clone it first!
   * <p>
   * This can also be seen as a feature for live changes of stored values,
   * but be careful! Adding a field with an name unknown to the index
   * or to a field with previously no stored values will make
   * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption)}
   * out of sync, causing problems for instance when merging the
   * instantiated index to another index.
   *
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
   */

  @Override
  public Document document(int n) throws IOException {
    return getIndex().getDocumentsByNumber()[n].getDocument();
  }

  /**
   * never ever touch these values. it is the true values, unless norms have
   * been touched.
   */
  @Override
  public byte[] norms(String field) throws IOException {
    byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(field);
    if (norms == null) {
      return new byte[0]; // todo a static final zero length attribute?
    }
    if (uncommittedNormsByFieldNameAndDocumentNumber != null) {
      norms = norms.clone();
      List<NormUpdate> updated = uncommittedNormsByFieldNameAndDocumentNumber.get(field);
      if (updated != null) {
        for (NormUpdate normUpdate : updated) {
          norms[normUpdate.doc] = normUpdate.value;
        }
      }
    }
    return norms;
  }

  @Override
  public void norms(String field, byte[] bytes, int offset) throws IOException {
    byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(field);
    if (norms == null) {
      return;
    }
    System.arraycopy(norms, 0, bytes, offset, norms.length);
  }

  @Override
  protected void doSetNorm(int doc, String field, byte value) throws IOException {
    if (uncommittedNormsByFieldNameAndDocumentNumber == null) {
      uncommittedNormsByFieldNameAndDocumentNumber = new HashMap<String,List<NormUpdate>>(getIndex().getNormsByFieldNameAndDocumentNumber().size());
    }
    List<NormUpdate> list = uncommittedNormsByFieldNameAndDocumentNumber.get(field);
    if (list == null) {
      list = new LinkedList<NormUpdate>();
      uncommittedNormsByFieldNameAndDocumentNumber.put(field, list);
    }
    list.add(new NormUpdate(doc, value));
  }

  @Override
  public int docFreq(Term t) throws IOException {
    InstantiatedTerm term = getIndex().findTerm(t);
    if (term == null) {
      return 0;
    } else {
      return term.getAssociatedDocuments().length;
    }
  }

  @Override
  public Fields fields() {
    if (getIndex().getOrderedTerms().length == 0) {
      return null;
    }

    return new Fields() {
      @Override
      public FieldsEnum iterator() {
        final InstantiatedTerm[] orderedTerms = getIndex().getOrderedTerms();

        return new FieldsEnum() {
          int upto = -1;
          String currentField;

          @Override
          public String next() {
            do {
              upto++;
              if (upto >= orderedTerms.length) {
                return null;
              }
            } while(orderedTerms[upto].field() == currentField);
            
            currentField = orderedTerms[upto].field();
            return currentField;
          }

          @Override
          public TermsEnum terms() {
            return new InstantiatedTermsEnum(orderedTerms, upto, currentField);
          }
        };
      }

      @Override
      public Terms terms(final String field) {
        final InstantiatedTerm[] orderedTerms = getIndex().getOrderedTerms();
        int i = Arrays.binarySearch(orderedTerms, new Term(field), InstantiatedTerm.termComparator);
        if (i < 0) {
          i = -i - 1;
        }
        if (i >= orderedTerms.length || !orderedTerms[i].field().equals(field)) {
          // field does not exist
          return null;
        }
        final int startLoc = i;

        return new Terms() {
          @Override 
          public TermsEnum iterator() {
            return new InstantiatedTermsEnum(orderedTerms, startLoc, field);
          }

          @Override
          public Comparator<BytesRef> getComparator() {
            return BytesRef.getUTF8SortedAsUnicodeComparator();
          }
        };
      }
    };
  }

  @Override
  public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
    if (doc.getVectorSpace() == null) {
      return null;
    }
    TermFreqVector[] ret = new TermFreqVector[doc.getVectorSpace().size()];
    Iterator<String> it = doc.getVectorSpace().keySet().iterator();
    for (int i = 0; i < ret.length; i++) {
      ret[i] = new InstantiatedTermPositionVector(getIndex().getDocumentsByNumber()[docNumber], it.next());
    }
    return ret;
  }

  @Override
  public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException {
    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
    if (doc.getVectorSpace() == null || doc.getVectorSpace().get(field) == null) {
      return null;
    } else {
      return new InstantiatedTermPositionVector(doc, field);
    }
  }

  @Override
  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
    if (doc.getVectorSpace() != null && doc.getVectorSpace().get(field) == null) {
      List<InstantiatedTermDocumentInformation> tv = doc.getVectorSpace().get(field);
      mapper.setExpectations(field, tv.size(), true, true);
      for (InstantiatedTermDocumentInformation tdi : tv) {
        mapper.map(tdi.getTerm().getTerm().bytes(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions());
      }
    }
  }

  @Override
  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
    InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
    for (Map.Entry<String, List<InstantiatedTermDocumentInformation>> e : doc.getVectorSpace().entrySet()) {
      mapper.setExpectations(e.getKey(), e.getValue().size(), true, true);
      for (InstantiatedTermDocumentInformation tdi : e.getValue()) {
        mapper.map(tdi.getTerm().getTerm().bytes(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions());
      }
    }
  }
}