/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.*;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

public class TestTermsEnum2 extends LuceneTestCase {
  private Directory dir;
  private IndexReader reader;
  private IndexSearcher searcher;
  private SortedSet<BytesRef> terms; // the terms we put in the index
  private Automaton termsAutomaton; // automata of the same
  int numIterations;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    numIterations = atLeast(50);
    dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false))
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 50, 1000)));
    Document doc = new Document();
    Field field = newStringField("field", "", Field.Store.YES);
    doc.add(field);
    terms = new TreeSet<>();

    int num = atLeast(200);
    for (int i = 0; i < num; i++) {
      String s = TestUtil.randomUnicodeString(random());
      field.setStringValue(s);
      terms.add(new BytesRef(s));
      writer.addDocument(doc);
    }

    termsAutomaton = Automata.makeStringUnion(terms);

    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
  }

  @Override
  public void tearDown() throws Exception {
    reader.close();
    dir.close();
    super.tearDown();
  }

  /** tests a pre-intersected automaton against the original */
  public void testFiniteVersusInfinite() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton(),
          DEFAULT_MAX_DETERMINIZED_STATES);
      final List<BytesRef> matchedTerms = new ArrayList<>();
      for (BytesRef t : terms) {
        if (Operations.run(automaton, t.utf8ToString())) {
          matchedTerms.add(t);
        }
      }

      Automaton alternate = Automata.makeStringUnion(matchedTerms);
      //System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + " states, sigma=" + alternate.getStartPoints().length);
      //AutomatonTestUtil.minimizeSimple(alternate);
      //System.out.println("minimize done");
      AutomatonQuery a1 = new AutomatonQuery(new Term("field", ""), automaton);
      AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate, Integer.MAX_VALUE);

      ScoreDoc[] origHits = searcher.search(a1, 25).scoreDocs;
      ScoreDoc[] newHits = searcher.search(a2, 25).scoreDocs;
      CheckHits.checkEqual(a1, origHits, newHits);
    }
  }

  /** seeks to every term accepted by some automata */
  public void testSeeking() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton(),
          DEFAULT_MAX_DETERMINIZED_STATES);
      TermsEnum te = MultiFields.getTerms(reader, "field").iterator();
      ArrayList<BytesRef> unsortedTerms = new ArrayList<>(terms);
      Collections.shuffle(unsortedTerms, random());

      for (BytesRef term : unsortedTerms) {
        if (Operations.run(automaton, term.utf8ToString())) {
          // term is accepted
          if (random().nextBoolean()) {
            // seek exact
            assertTrue(te.seekExact(term));
          } else {
            // seek ceil
            assertEquals(SeekStatus.FOUND, te.seekCeil(term));
            assertEquals(term, te.term());
          }
        }
      }
    }
  }

  /** mixes up seek and next for all terms */
  public void testSeekingAndNexting() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      TermsEnum te = MultiFields.getTerms(reader, "field").iterator();

      for (BytesRef term : terms) {
        int c = random().nextInt(3);
        if (c == 0) {
          assertEquals(term, te.next());
        } else if (c == 1) {
          assertEquals(SeekStatus.FOUND, te.seekCeil(term));
          assertEquals(term, te.term());
        } else {
          assertTrue(te.seekExact(term));
        }
      }
    }
  }

  /** tests intersect: TODO start at a random term! */
  public void testIntersect() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
      CompiledAutomaton ca = new CompiledAutomaton(automaton, Operations.isFinite(automaton), false);
      TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
      Automaton expected = Operations.determinize(Operations.intersection(termsAutomaton, automaton),
          DEFAULT_MAX_DETERMINIZED_STATES);
      TreeSet<BytesRef> found = new TreeSet<>();
      while (te.next() != null) {
        found.add(BytesRef.deepCopyOf(te.term()));
      }

      Automaton actual = Operations.determinize(Automata.makeStringUnion(found),
          DEFAULT_MAX_DETERMINIZED_STATES);
      assertTrue(Operations.sameLanguage(expected, actual));
    }
  }
}