RAMOnlyPostingsFormat.java example

Explorer
solr-analytics-master
- lucene
- solr
package org.apache.lucene.codecs.ramonly;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/** Stores all postings data in RAM, and writes only a small
 *  token (a codec header plus a single vInt) to the directory;
 *  the token identifies which "slot" of the in-RAM HashMap
 *  holds the index.
 *
 *  NOTE: this codec sorts terms by reverse-unicode-order! */

public final class RAMOnlyPostingsFormat extends PostingsFormat {

  // For fun, test that we can override how terms are
  // sorted, and basic things still work -- this comparator
  // sorts in reversed unicode code point order:
  /**
   * Comparator that orders terms in the reverse of unsigned byte-wise order.
   *
   * <p>The two terms are compared byte by byte (as unsigned values) over their
   * common prefix; the first differing byte decides, with the larger byte
   * sorting first. When one term is a prefix of the other (or they are equal),
   * the longer term sorts first. For UTF-8 encoded terms this is, per the
   * original author's note, reversed Unicode code point order.
   */
  private static final Comparator<BytesRef> reverseUnicodeComparator = new Comparator<BytesRef>() {
      @Override
      public int compare(BytesRef t1, BytesRef t2) {
        final byte[] leftBytes = t1.bytes;
        final byte[] rightBytes = t2.bytes;
        // Only the shared prefix can contain a deciding byte.
        final int sharedLength = Math.min(t1.length, t2.length);

        for (int i = 0; i < sharedLength; i++) {
          final int left = leftBytes[t1.offset + i] & 0xff;
          final int right = rightBytes[t2.offset + i] & 0xff;
          if (left != right) {
            // Operands are swapped relative to a natural comparison: that is the "reverse".
            return right - left;
          }
        }

        // One term is a prefix of the other, or they are equal.
        return t2.length - t1.length;
      }

      @Override
      public boolean equals(Object other) {
        // Identity only: there is a single shared instance.
        return this == other;
      }
    };

  /**
   * Creates the format, passing the name {@code "RAMOnly"} to the
   * {@link PostingsFormat} superclass constructor.
   */
  public RAMOnlyPostingsFormat() {
    super("RAMOnly");
  }
    
  // Postings state:
  /**
   * In-memory postings for one index "slot": a sorted map from field name to
   * the {@link RAMField} holding that field's terms.
   */
  static class RAMPostings extends FieldsProducer {
    /** Field name to per-field term data, sorted by field name. */
    final Map<String,RAMField> fieldToTerms = new TreeMap<String,RAMField>();

    /** Returns the terms for {@code field}, or {@code null} if the field was never added. */
    @Override
    public Terms terms(String field) {
      final RAMField ramField = fieldToTerms.get(field);
      return ramField;
    }

    /** Returns the number of fields stored. */
    @Override
    public int size() {
      final int fieldCount = fieldToTerms.size();
      return fieldCount;
    }

    /** Returns a read-only iterator over the field names, in sorted order. */
    @Override
    public Iterator<String> iterator() {
      final Iterator<String> fieldNames =
          Collections.unmodifiableSet(fieldToTerms.keySet()).iterator();
      return fieldNames;
    }

    /** Does nothing. */
    @Override
    public void close() {
    }
  }

  /**
   * In-memory terms dictionary and statistics for a single field.
   *
   * <p>Terms are kept as Strings in a {@link TreeMap} using natural String
   * ordering; the statistics fields are filled in by the terms consumer when
   * the field is finished.
   */
  static class RAMField extends Terms {
    final String field;
    /** Term text to postings, sorted by natural String order. */
    final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
    long sumTotalTermFreq;
    long sumDocFreq;
    int docCount;
    final FieldInfo info;

    /**
     * @param field name of the field
     * @param info  field metadata, consulted for index options and payloads
     */
    RAMField(String field, FieldInfo info) {
      this.field = field;
      this.info = info;
    }

    /** Returns the number of distinct terms stored for this field. */
    @Override
    public long size() {
      return termToDocs.size();
    }

    @Override
    public long getSumTotalTermFreq() {
      return sumTotalTermFreq;
    }

    @Override
    public long getSumDocFreq() throws IOException {
      return sumDocFreq;
    }

    @Override
    public int getDocCount() throws IOException {
      return docCount;
    }

    /** Returns a fresh enum over this field's terms; {@code reuse} is ignored. */
    @Override
    public TermsEnum iterator(TermsEnum reuse) {
      return new RAMTermsEnum(this);
    }

    /**
     * Returns the comparator describing the order in which terms are enumerated.
     *
     * <p>This must agree with the comparator reported by RAMTermsConsumer and
     * RAMTermsEnum, both of which return
     * {@code BytesRef.getUTF8SortedAsUnicodeComparator()}, and with the ascending
     * order of {@link #termToDocs}. Previously this returned the reverse-order
     * comparator, which contradicted the actual iteration order (the class-level
     * NOTE about reverse ordering does not reflect how terms are stored).
     */
    @Override
    public Comparator<BytesRef> getComparator() {
      // NOTE(review): natural String order is UTF-16 code unit order, which can
      // differ from code point order for supplementary characters -- confirm
      // whether that matters for the tests using this codec.
      return BytesRef.getUTF8SortedAsUnicodeComparator();
    }

    /** True when the field's index options include offsets. */
    @Override
    public boolean hasOffsets() {
      return info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    }

    /** True when the field's index options include positions. */
    @Override
    public boolean hasPositions() {
      return info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    }

    /** Delegates to {@link FieldInfo#hasPayloads()}. */
    @Override
    public boolean hasPayloads() {
      return info.hasPayloads();
    }
  }

  /** Postings for a single term: its text, total frequency, and per-document entries. */
  static class RAMTerm {
    /** The term text. */
    final String term;
    /** Total term frequency; assigned by the terms consumer when the term is finished. */
    long totalTermFreq;
    /** One entry per document containing the term, in the order they were added. */
    final List<RAMDoc> docs;

    /**
     * @param term the term text
     */
    public RAMTerm(String term) {
      this.term = term;
      this.docs = new ArrayList<RAMDoc>();
    }
  }

  /** Postings of one term within one document: positions and optional payloads. */
  static class RAMDoc {
    /** The document number. */
    final int docID;
    /** One slot per occurrence; the array length is the frequency passed to the constructor. */
    final int[] positions;
    /** Per-position payload bytes; stays {@code null} until something assigns it. */
    byte[][] payloads;

    /**
     * @param docID document number
     * @param freq  number of position slots to allocate
     */
    public RAMDoc(int docID, int freq) {
      this.positions = new int[freq];
      this.docID = docID;
    }
  }

  // Classes for writing to the postings state
  /**
   * Receives fields at write time and records each one in the owning
   * {@link RAMPostings}. A single terms consumer is re-targeted for every field.
   */
  private static class RAMFieldsConsumer extends FieldsConsumer {

    private final RAMPostings postings;
    // Reused across fields; reset() points it at the field being written.
    private final RAMTermsConsumer termsConsumer = new RAMTermsConsumer();

    /**
     * @param postings the in-RAM structure that added fields are stored into
     */
    public RAMFieldsConsumer(RAMPostings postings) {
      this.postings = postings;
    }

    /**
     * Registers a new field and returns the consumer that will receive its terms.
     *
     * @throws UnsupportedOperationException if the field's index options include offsets
     */
    @Override
    public TermsConsumer addField(FieldInfo field) {
      final boolean indexesOffsets =
          field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
      if (indexesOffsets) {
        throw new UnsupportedOperationException("this codec cannot index offsets");
      }

      final RAMField target = new RAMField(field.name, field);
      termsConsumer.reset(target);
      postings.fieldToTerms.put(field.name, target);
      return termsConsumer;
    }

    /** Does nothing. */
    @Override
    public void close() {
      // TODO: finalize stuff
    }
  }

  /**
   * Receives the terms of one field and stores each finished term in the
   * field's term map. A single postings writer is re-targeted for every term.
   */
  private static class RAMTermsConsumer extends TermsConsumer {
    private RAMField field;
    // Reused across terms; reset() points it at the term being written.
    private final RAMPostingsWriterImpl postingsWriter = new RAMPostingsWriterImpl();
    // The term currently being written (between startTerm and finishTerm).
    RAMTerm current;

    /** Points this consumer at the field whose terms will be written next. */
    void reset(RAMField field) {
      this.field = field;
    }

    /** Begins a new term and returns the consumer that will receive its postings. */
    @Override
    public PostingsConsumer startTerm(BytesRef text) {
      current = new RAMTerm(text.utf8ToString());
      postingsWriter.reset(current);
      return postingsWriter;
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      return BytesRef.getUTF8SortedAsUnicodeComparator();
    }

    /**
     * Records the term's total frequency and stores the term in the field.
     * The {@code text} argument is not consulted; the term started last is used.
     */
    @Override
    public void finishTerm(BytesRef text, TermStats stats) {
      final RAMTerm finished = current;
      assert stats.docFreq > 0;
      assert stats.docFreq == finished.docs.size();
      finished.totalTermFreq = stats.totalTermFreq;
      field.termToDocs.put(finished.term, finished);
    }

    /** Copies the field-level statistics onto the current field. */
    @Override
    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) {
      final RAMField target = field;
      target.sumTotalTermFreq = sumTotalTermFreq;
      target.sumDocFreq = sumDocFreq;
      target.docCount = docCount;
    }
  }

  /**
   * Receives the documents and positions of one term and appends them to the
   * term's in-RAM document list.
   */
  static class RAMPostingsWriterImpl extends PostingsConsumer {
    private RAMTerm term;
    private RAMDoc current;
    // Index of the next position slot to fill in the current document.
    private int posUpto = 0;

    /** Points this writer at the term whose postings will be written next. */
    public void reset(RAMTerm term) {
      this.term = term;
    }

    /** Starts a new document with {@code freq} position slots and appends it to the term. */
    @Override
    public void startDoc(int docID, int freq) {
      final RAMDoc doc = new RAMDoc(docID, freq);
      current = doc;
      term.docs.add(doc);
      posUpto = 0;
    }

    /**
     * Stores the next position of the current document, plus a private copy of
     * the payload when one is supplied and non-empty. Offsets must both be -1.
     */
    @Override
    public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) {
      assert startOffset == -1;
      assert endOffset == -1;

      final int slot = posUpto;
      current.positions[slot] = position;

      final boolean hasPayload = payload != null && payload.length > 0;
      if (hasPayload) {
        // The payload table is allocated lazily, on the first payload of the document.
        if (current.payloads == null) {
          current.payloads = new byte[current.positions.length][];
        }
        final byte[] copy = new byte[payload.length];
        current.payloads[slot] = copy;
        System.arraycopy(payload.bytes, payload.offset, copy, 0, payload.length);
      }

      posUpto = slot + 1;
    }

    /** Checks (when assertions are enabled) that every position slot was filled. */
    @Override
    public void finishDoc() {
      assert posUpto == current.positions.length;
    }
  }

  /**
   * Enumerates the terms of one {@link RAMField} in the order of its term map.
   *
   * <p>{@link #seekCeil} positions the enum on the smallest stored term that is
   * greater than or equal to the target, and a following {@link #next()}
   * continues with the term after it.
   */
  static class RAMTermsEnum extends TermsEnum {
    // Iterator whose next element follows {@code current}; null until first use.
    Iterator<String> it;
    // Term the enum is positioned on; null when unpositioned.
    String current;
    private final RAMField ramField;

    /**
     * @param field the field whose terms are enumerated
     */
    public RAMTermsEnum(RAMField field) {
      this.ramField = field;
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      return BytesRef.getUTF8SortedAsUnicodeComparator();
    }

    /**
     * Advances to the next term.
     *
     * @return the new current term, or {@code null} when no terms remain
     */
    @Override
    public BytesRef next() {
      if (it == null) {
        // seekCeil() always installs an iterator, so this is normally only the
        // very first call on a fresh enum (current == null).
        if (current == null) {
          it = ramField.termToDocs.keySet().iterator();
        } else {
          it = ramField.termToDocs.tailMap(current).keySet().iterator();
        }
      }
      if (it.hasNext()) {
        current = it.next();
        return new BytesRef(current);
      } else {
        return null;
      }
    }

    /**
     * Positions the enum on the smallest stored term {@code >= term}.
     *
     * @param term     the target term
     * @param useCache ignored by this implementation
     * @return {@code FOUND} if the target itself is stored, {@code NOT_FOUND}
     *         if the enum is positioned on a larger term, or {@code END} if no
     *         stored term is {@code >=} the target (including an empty field),
     *         in which case the enum is left unpositioned
     */
    @Override
    public SeekStatus seekCeil(BytesRef term, boolean useCache) {
      final String target = term.utf8ToString();
      // tailMap is inclusive, so its first key (if any) is the ceiling term.
      it = ramField.termToDocs.tailMap(target).keySet().iterator();
      if (!it.hasNext()) {
        current = null;
        return SeekStatus.END;
      }
      // Consuming the ceiling here leaves the iterator just past it, so the
      // next call to next() does not return the same term again.
      current = it.next();
      return current.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
    }

    /** Not supported: this enum has no term ordinals. */
    @Override
    public void seekExact(long ord) {
      throw new UnsupportedOperationException();
    }

    /** Not supported: this enum has no term ordinals. */
    @Override
    public long ord() {
      throw new UnsupportedOperationException();
    }

    /** Returns a new BytesRef holding the current term; must not be called when unpositioned. */
    @Override
    public BytesRef term() {
      // TODO: reuse BytesRef
      return new BytesRef(current);
    }

    /** Returns the number of documents recorded for the current term. */
    @Override
    public int docFreq() {
      return ramField.termToDocs.get(current).docs.size();
    }

    @Override
    public long totalTermFreq() {
      return ramField.termToDocs.get(current).totalTermFreq;
    }

    /** Returns a new docs enum for the current term; {@code reuse} and {@code flags} are ignored. */
    @Override
    public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) {
      return new RAMDocsEnum(ramField.termToDocs.get(current), liveDocs);
    }

    /** Returns a new positions enum for the current term; {@code reuse} and {@code flags} are ignored. */
    @Override
    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) {
      return new RAMDocsAndPositionsEnum(ramField.termToDocs.get(current), liveDocs);
    }
  }

  /**
   * Iterates the documents of one {@link RAMTerm}, skipping documents that
   * {@code liveDocs} marks as deleted.
   */
  private static class RAMDocsEnum extends DocsEnum {
    private final RAMTerm ramTerm;
    private final Bits liveDocs;
    private RAMDoc current;
    // Index into ramTerm.docs: -1 before iteration, docs.size() once exhausted.
    int upto = -1;

    /**
     * @param ramTerm  the term whose documents are iterated
     * @param liveDocs bits of live documents, or {@code null} to accept every document
     */
    public RAMDocsEnum(RAMTerm ramTerm, Bits liveDocs) {
      this.ramTerm = ramTerm;
      this.liveDocs = liveDocs;
    }

    /**
     * Advances to the first live document after the current one whose ID is
     * {@code >= targetDocID}.
     *
     * @return that document's ID, or {@code NO_MORE_DOCS} if there is none
     */
    @Override
    public int advance(int targetDocID) {
      // NO_MORE_DOCS compares >= any target, so exhaustion also ends the loop.
      int doc;
      do {
        doc = nextDoc();
      } while (doc < targetDocID);
      return doc;
    }

    // TODO: override bulk read, for better perf
    /**
     * Moves to the next live document.
     *
     * @return its ID, or {@code NO_MORE_DOCS} when the list is exhausted
     */
    @Override
    public int nextDoc() {
      final int docCount = ramTerm.docs.size();
      while (++upto < docCount) {
        current = ramTerm.docs.get(upto);
        if (liveDocs == null || liveDocs.get(current.docID)) {
          return current.docID;
        }
      }
      // Clamp so that repeated calls after exhaustion cannot overflow the index.
      upto = docCount;
      return NO_MORE_DOCS;
    }

    /** Returns the number of positions recorded for the current document. */
    @Override
    public int freq() throws IOException {
      return current.positions.length;
    }

    /**
     * Returns -1 before iteration starts, {@code NO_MORE_DOCS} once exhausted,
     * and otherwise the ID of the current document.
     */
    @Override
    public int docID() {
      if (upto < 0) {
        return -1;
      }
      if (upto >= ramTerm.docs.size()) {
        return NO_MORE_DOCS;
      }
      return current.docID;
    }
  }

  /**
   * Iterates the documents of one {@link RAMTerm} together with their
   * positions and payloads, skipping documents that {@code liveDocs} marks as
   * deleted. Offsets are not stored and are always reported as -1.
   */
  private static class RAMDocsAndPositionsEnum extends DocsAndPositionsEnum {
    private final RAMTerm ramTerm;
    private final Bits liveDocs;
    private RAMDoc current;
    // Index into ramTerm.docs: -1 before iteration, docs.size() once exhausted.
    int upto = -1;
    // Index of the next position to return for the current document.
    int posUpto = 0;

    /**
     * @param ramTerm  the term whose documents are iterated
     * @param liveDocs bits of live documents, or {@code null} to accept every document
     */
    public RAMDocsAndPositionsEnum(RAMTerm ramTerm, Bits liveDocs) {
      this.ramTerm = ramTerm;
      this.liveDocs = liveDocs;
    }

    /**
     * Advances to the first live document after the current one whose ID is
     * {@code >= targetDocID}.
     *
     * @return that document's ID, or {@code NO_MORE_DOCS} if there is none
     */
    @Override
    public int advance(int targetDocID) {
      // NO_MORE_DOCS compares >= any target, so exhaustion also ends the loop.
      int doc;
      do {
        doc = nextDoc();
      } while (doc < targetDocID);
      return doc;
    }

    // TODO: override bulk read, for better perf
    /**
     * Moves to the next live document and rewinds the position cursor.
     *
     * @return its ID, or {@code NO_MORE_DOCS} when the list is exhausted
     */
    @Override
    public int nextDoc() {
      final int docCount = ramTerm.docs.size();
      while (++upto < docCount) {
        current = ramTerm.docs.get(upto);
        if (liveDocs == null || liveDocs.get(current.docID)) {
          posUpto = 0;
          return current.docID;
        }
      }
      // Clamp so that repeated calls after exhaustion cannot overflow the index.
      upto = docCount;
      return NO_MORE_DOCS;
    }

    /** Returns the number of positions recorded for the current document. */
    @Override
    public int freq() throws IOException {
      return current.positions.length;
    }

    /**
     * Returns -1 before iteration starts, {@code NO_MORE_DOCS} once exhausted,
     * and otherwise the ID of the current document.
     */
    @Override
    public int docID() {
      if (upto < 0) {
        return -1;
      }
      if (upto >= ramTerm.docs.size()) {
        return NO_MORE_DOCS;
      }
      return current.docID;
    }

    /** Returns the next position of the current document and advances the cursor. */
    @Override
    public int nextPosition() {
      return current.positions[posUpto++];
    }

    @Override
    public int startOffset() {
      return -1;
    }

    @Override
    public int endOffset() {
      return -1;
    }

    /**
     * Returns the payload of the position most recently returned by
     * {@link #nextPosition()}, wrapped in a new BytesRef, or {@code null} if
     * that position has none.
     */
    @Override
    public BytesRef getPayload() {
      if (current.payloads != null && current.payloads[posUpto-1] != null) {
        return new BytesRef(current.payloads[posUpto-1]);
      } else {
        return null;
      }
    }
  }

  // Holds all indexes created, keyed by the ID assigned in fieldsConsumer.
  // Every access in this file is made while synchronized on the map itself;
  // nothing in this file ever removes an entry.
  private final Map<Integer,RAMPostings> state = new HashMap<Integer,RAMPostings>();

  // Source of the slot IDs written to the per-segment ID file.
  private final AtomicInteger nextID = new AtomicInteger();

  // Codec name written to / checked in the header of the ID file. A true
  // constant, so it is static rather than a per-instance field.
  private static final String RAM_ONLY_NAME = "RAMOnly";
  private static final int VERSION_START = 0;
  private static final int VERSION_LATEST = VERSION_START;

  // File extension of the per-segment file that stores the slot ID.
  private static final String ID_EXTENSION = "id";

  /**
   * Allocates a new slot ID, writes it (after a codec header) to the segment's
   * ID file, and registers a fresh {@link RAMPostings} under that ID.
   *
   * @param writeState supplies the directory, segment name/suffix and IO context
   * @return a consumer that stores all postings into the new in-RAM slot
   * @throws IOException if creating, writing or closing the ID file fails
   */
  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState writeState) throws IOException {
    final int slot = nextID.getAndIncrement();

    // TODO -- ok to do this up front instead of
    // on close....?  should be ok?
    // Write our ID:
    final String fileName = IndexFileNames.segmentFileName(writeState.segmentInfo.name, writeState.segmentSuffix, ID_EXTENSION);
    final IndexOutput idOut = writeState.directory.createOutput(fileName, writeState.context);
    boolean written = false;
    try {
      CodecUtil.writeHeader(idOut, RAM_ONLY_NAME, VERSION_LATEST);
      idOut.writeVInt(slot);
      written = true;
    } finally {
      // On failure, close quietly so the original exception is the one that propagates.
      if (written) {
        IOUtils.close(idOut);
      } else {
        IOUtils.closeWhileHandlingException(idOut);
      }
    }

    final RAMPostings postings = new RAMPostings();
    synchronized(state) {
      state.put(slot, postings);
    }
    return new RAMFieldsConsumer(postings);
  }

  /**
   * Reads the slot ID from the segment's ID file (after validating the codec
   * header) and returns the {@link RAMPostings} registered under that ID.
   *
   * @param readState supplies the directory, segment name/suffix and IO context
   * @return the postings registered for the slot, or {@code null} if this
   *         instance has nothing registered under the ID that was read
   * @throws IOException if opening, reading or closing the ID file fails
   */
  @Override
  public FieldsProducer fieldsProducer(SegmentReadState readState)
    throws IOException {

    // Load our ID:
    final String fileName = IndexFileNames.segmentFileName(readState.segmentInfo.name, readState.segmentSuffix, ID_EXTENSION);
    final IndexInput idIn = readState.dir.openInput(fileName, readState.context);
    final int slot;
    boolean read = false;
    try {
      CodecUtil.checkHeader(idIn, RAM_ONLY_NAME, VERSION_START, VERSION_LATEST);
      slot = idIn.readVInt();
      read = true;
    } finally {
      // On failure, close quietly so the original exception is the one that propagates.
      if (read) {
        IOUtils.close(idIn);
      } else {
        IOUtils.closeWhileHandlingException(idIn);
      }
    }

    final RAMPostings postings;
    synchronized(state) {
      postings = state.get(slot);
    }
    return postings;
  }
}