TestTermInfosReaderIndex.java example

Explorer
solr-analytics-master
- lucene
- solr
package org.apache.lucene.codecs.lucene3x;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.codecs.FieldInfosReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;

public class TestTermInfosReaderIndex extends LuceneTestCase {
  
  private static int NUMBER_OF_DOCUMENTS;
  private static int NUMBER_OF_FIELDS;
  private static TermInfosReaderIndex index;
  private static Directory directory;
  private static SegmentTermEnum termEnum;
  private static int indexDivisor;
  private static int termIndexInterval;
  private static IndexReader reader;
  private static List<Term> sampleTerms;
  
  /** we will manually instantiate preflex-rw here */
  @BeforeClass
  public static void beforeClass() throws Exception {
    LuceneTestCase.PREFLEX_IMPERSONATION_IS_ACTIVE = true;
    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT, 
        new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
    
    termIndexInterval = config.getTermIndexInterval();
    indexDivisor = _TestUtil.nextInt(random(), 1, 10);
    NUMBER_OF_DOCUMENTS = atLeast(100);
    NUMBER_OF_FIELDS = atLeast(Math.max(10, 3*termIndexInterval*indexDivisor/NUMBER_OF_DOCUMENTS));
    
    directory = newDirectory();

    config.setCodec(new PreFlexRWCodec());
    LogMergePolicy mp = newLogMergePolicy();
    // turn off compound file, this test will open some index files directly.
    mp.setUseCompoundFile(false);
    config.setMergePolicy(mp);

    
    populate(directory, config);

    DirectoryReader r0 = IndexReader.open(directory);
    SegmentReader r = LuceneTestCase.getOnlySegmentReader(r0);
    String segment = r.getSegmentName();
    r.close();

    FieldInfosReader infosReader = new PreFlexRWCodec().fieldInfosFormat().getFieldInfosReader();
    FieldInfos fieldInfos = infosReader.read(directory, segment, IOContext.READONCE);
    String segmentFileName = IndexFileNames.segmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_INDEX_EXTENSION);
    long tiiFileLength = directory.fileLength(segmentFileName);
    IndexInput input = directory.openInput(segmentFileName, newIOContext(random()));
    termEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_EXTENSION), newIOContext(random())), fieldInfos, false);
    int totalIndexInterval = termEnum.indexInterval * indexDivisor;
    
    SegmentTermEnum indexEnum = new SegmentTermEnum(input, fieldInfos, true);
    index = new TermInfosReaderIndex(indexEnum, indexDivisor, tiiFileLength, totalIndexInterval);
    indexEnum.close();
    input.close();
    
    reader = IndexReader.open(directory);
    sampleTerms = sample(random(),reader,1000);
  }
  
  @AfterClass
  public static void afterClass() throws Exception {
    termEnum.close();
    reader.close();
    directory.close();
    termEnum = null;
    reader = null;
    directory = null;
    index = null;
    sampleTerms = null;
  }
  
  public void testSeekEnum() throws CorruptIndexException, IOException {
    int indexPosition = 3;
    SegmentTermEnum clone = termEnum.clone();
    Term term = findTermThatWouldBeAtIndex(clone, indexPosition);
    SegmentTermEnum enumerator = clone;
    index.seekEnum(enumerator, indexPosition);
    assertEquals(term, enumerator.term());
    clone.close();
  }
  
  public void testCompareTo() throws IOException {
    Term term = new Term("field" + random().nextInt(NUMBER_OF_FIELDS) ,getText());
    for (int i = 0; i < index.length(); i++) {
      Term t = index.getTerm(i);
      int compareTo = term.compareTo(t);
      assertEquals(compareTo, index.compareTo(term, i));
    }
  }
  
  public void testRandomSearchPerformance() throws CorruptIndexException, IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    for (Term t : sampleTerms) {
      TermQuery query = new TermQuery(t);
      TopDocs topDocs = searcher.search(query, 10);
      assertTrue(topDocs.totalHits > 0);
    }
  }

  private static List<Term> sample(Random random, IndexReader reader, int size) throws IOException {
    List<Term> sample = new ArrayList<Term>();
    Fields fields = MultiFields.getFields(reader);
    for (String field : fields) {
      Terms terms = fields.terms(field);
      assertNotNull(terms);
      TermsEnum termsEnum = terms.iterator(null);
      while (termsEnum.next() != null) {
        if (sample.size() >= size) {
          int pos = random.nextInt(size);
          sample.set(pos, new Term(field, termsEnum.term()));
        } else {
          sample.add(new Term(field, termsEnum.term()));
        }
      }
    }
    Collections.shuffle(sample);
    return sample;
  }

  private Term findTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index) throws IOException {
    int termPosition = index * termIndexInterval * indexDivisor;
    for (int i = 0; i < termPosition; i++) {
      // TODO: this test just uses random terms, so this is always possible
      assumeTrue("ran out of terms", termEnum.next());
    }
    final Term term = termEnum.term();
    // An indexed term is only written when the term after
    // it exists, so, if the number of terms is 0 mod
    // termIndexInterval, the last index term will not be
    // written; so we require a term after this term
    // as well:
    assumeTrue("ran out of terms", termEnum.next());
    return term;
  }

  private static void populate(Directory directory, IndexWriterConfig config) throws CorruptIndexException, LockObtainFailedException, IOException {
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);
    for (int i = 0; i < NUMBER_OF_DOCUMENTS; i++) {
      Document document = new Document();
      for (int f = 0; f < NUMBER_OF_FIELDS; f++) {
        document.add(newStringField("field" + f, getText(), Field.Store.NO));
      }
      writer.addDocument(document);
    }
    writer.forceMerge(1);
    writer.close();
  }

  private static String getText() {
    return Long.toString(random().nextLong(),Character.MAX_RADIX);
  }
}