/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.*;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

public class TestTermsEnum2 extends LuceneTestCase {
  private Directory dir;
  private IndexReader reader;
  private IndexSearcher searcher;
  private SortedSet<BytesRef> terms; // the terms we put in the index
  private Automaton termsAutomaton; // automata of the same
  int numIterations;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    numIterations = atLeast(50);
    dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false))
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 50, 1000)));
    Document doc = new Document();
    Field field = newStringField("field", "", Field.Store.YES);
    doc.add(field);
    terms = new TreeSet<>();

    int num = atLeast(200);
    for (int i = 0; i < num; i++) {
      String s = TestUtil.randomUnicodeString(random());
      field.setStringValue(s);
      terms.add(new BytesRef(s));
      writer.addDocument(doc);
    }

    termsAutomaton = Automata.makeStringUnion(terms);

    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
  }

  @Override
  public void tearDown() throws Exception {
    reader.close();
    dir.close();
    super.tearDown();
  }

  /** tests a pre-intersected automaton against the original */
  public void testFiniteVersusInfinite() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton(),
          DEFAULT_MAX_DETERMINIZED_STATES);
      final List<BytesRef> matchedTerms = new ArrayList<>();
      for (BytesRef t : terms) {
        if (Operations.run(automaton, t.utf8ToString())) {
          matchedTerms.add(t);
        }
      }

      Automaton alternate = Automata.makeStringUnion(matchedTerms);
      //System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + " states, sigma=" + alternate.getStartPoints().length);
      //AutomatonTestUtil.minimizeSimple(alternate);
      //System.out.println("minimize done");
      AutomatonQuery a1 = new AutomatonQuery(new Term("field", ""), automaton);
      AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate, Integer.MAX_VALUE);

      ScoreDoc[] origHits = searcher.search(a1, 25).scoreDocs;
      ScoreDoc[] newHits = searcher.search(a2, 25).scoreDocs;
      CheckHits.checkEqual(a1, origHits, newHits);
    }
  }

  /** seeks to every term accepted by some automata */
  public void testSeeking() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = Operations.determinize(new RegExp(reg, RegExp.NONE).toAutomaton(),
          DEFAULT_MAX_DETERMINIZED_STATES);
      TermsEnum te = MultiFields.getTerms(reader, "field").iterator();
      ArrayList<BytesRef> unsortedTerms = new ArrayList<>(terms);
      Collections.shuffle(unsortedTerms, random());

      for (BytesRef term : unsortedTerms) {
        if (Operations.run(automaton, term.utf8ToString())) {
          // term is accepted
          if (random().nextBoolean()) {
            // seek exact
            assertTrue(te.seekExact(term));
          } else {
            // seek ceil
            assertEquals(SeekStatus.FOUND, te.seekCeil(term));
            assertEquals(term, te.term());
          }
        }
      }
    }
  }

  /** mixes up seek and next for all terms */
  public void testSeekingAndNexting() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      TermsEnum te = MultiFields.getTerms(reader, "field").iterator();

      for (BytesRef term : terms) {
        int c = random().nextInt(3);
        if (c == 0) {
          assertEquals(term, te.next());
        } else if (c == 1) {
          assertEquals(SeekStatus.FOUND, te.seekCeil(term));
          assertEquals(term, te.term());
        } else {
          assertTrue(te.seekExact(term));
        }
      }
    }
  }

  /** tests intersect: TODO start at a random term! */
  public void testIntersect() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
      CompiledAutomaton ca = new CompiledAutomaton(automaton, Operations.isFinite(automaton), false);
      TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
      Automaton expected = Operations.determinize(Operations.intersection(termsAutomaton, automaton),
          DEFAULT_MAX_DETERMINIZED_STATES);
      TreeSet<BytesRef> found = new TreeSet<>();
      while (te.next() != null) {
        found.add(BytesRef.deepCopyOf(te.term()));
      }

      Automaton actual = Operations.determinize(Automata.makeStringUnion(found),
          DEFAULT_MAX_DETERMINIZED_STATES);
      assertTrue(Operations.sameLanguage(expected, actual));
    }
  }
}