package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

/**
 * Exercises seeking ({@code IndexReader.terms(Term)}) and stepping
 * ({@code TermEnum.next()}) over the term dictionary, comparing what the
 * enum reports against an independently sorted list of the expected terms.
 */
public class TestTermsEnum extends LuceneTestCase {

  /**
   * Indexes random line-file documents, collects every term of the "body"
   * field, then randomly interleaves seeks and next() calls, checking each
   * result against the collected list.
   */
  public void test() throws Exception {
    final LineFileDocs docs = new LineFileDocs(random);
    final Directory d = newDirectory();
    final RandomIndexWriter w = new RandomIndexWriter(random, d);
    final int numDocs = atLeast(10);
    for(int docCount=0;docCount<numDocs;docCount++) {
      w.addDocument(docs.nextDoc());
    }
    final IndexReader r = w.getReader();
    w.close();

    // Collect all "body" terms in index order.  The enum returned by
    // r.terms(Term) is already positioned, hence do/while rather than while.
    final List<Term> terms = new ArrayList<Term>();
    TermEnum termEnum = r.terms(new Term("body"));
    do {
      Term term = termEnum.term();
      if (term == null || !"body".equals(term.field())) {
        break;
      }
      terms.add(term);
    } while (termEnum.next());

    if (VERBOSE) {
      System.out.println("TEST: " + terms.size() + " terms");
    }

    // Index into terms of the term the enum currently sits on, or -1 when
    // the enum is exhausted / unpositioned and the next step must be a seek.
    int upto = -1;
    final int iters = atLeast(200);
    for(int iter=0;iter<iters;iter++) {
      if (upto != -1 && random.nextBoolean()) {
        // next
        if (VERBOSE) {
          System.out.println("TEST: iter next");
        }
        termEnum.next();
        final boolean isEnd = termEnum.term() == null || !"body".equals(termEnum.term().field());
        upto++;
        if (isEnd) {
          if (VERBOSE) {
            System.out.println(" end");
          }
          assertEquals(upto, terms.size());
          upto = -1;
        } else {
          // Check the bound before touching terms.get(upto) so that a
          // failure is reported as an assertion, not an IndexOutOfBounds.
          assertTrue(upto < terms.size());
          if (VERBOSE) {
            System.out.println(" got term=" + termEnum.term() + " expected=" + terms.get(upto));
          }
          assertEquals(terms.get(upto), termEnum.term());
        }
      } else {

        final Term target;
        final String exists;
        if (random.nextBoolean()) {
          // likely fake term
          if (random.nextBoolean()) {
            target = new Term("body", _TestUtil.randomSimpleString(random));
          } else {
            target = new Term("body", _TestUtil.randomRealisticUnicodeString(random));
          }
          exists = "likely not";
        } else {
          // real term
          target = terms.get(random.nextInt(terms.size()));
          exists = "yes";
        }

        upto = Collections.binarySearch(terms, target);

        if (VERBOSE) {
          System.out.println("TEST: iter seekCeil target=" + target + " exists=" + exists);
        }

        // A seek is done by opening a fresh enum; release the previous one.
        termEnum.close();
        termEnum = r.terms(target);
        final Term actualTerm = termEnum.term();

        if (VERBOSE) {
          System.out.println(" got term=" + actualTerm);
        }

        if (upto < 0) {
          // Not an exact match: convert to the insertion point, i.e. the
          // index of the smallest term greater than the target.
          upto = -(upto+1);
          if (upto >= terms.size()) {
            assertTrue(actualTerm == null || !"body".equals(actualTerm.field()));
            upto = -1;
          } else {
            assertTrue(actualTerm != null && "body".equals(actualTerm.field()));
            assertEquals(terms.get(upto), actualTerm);
          }
        } else {
          assertEquals(terms.get(upto), actualTerm);
        }
      }
    }

    termEnum.close();
    r.close();
    d.close();
  }

  private Directory d;
  private IndexReader r;

  private final String FIELD = "field";

  /**
   * Builds a fresh index with one document per given term (un-analyzed, in
   * field {@link #FIELD}), stores the new directory/reader in the {@code d}
   * and {@code r} fields and returns the reader.  A previously opened reader
   * is closed first.
   */
  private IndexReader makeIndex(String... terms) throws Exception {
    d = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));

    /*
    CoreCodecProvider cp = new CoreCodecProvider();
    cp.unregister(cp.lookup("Standard"));
    cp.register(new StandardCodec(minTermsInBlock, maxTermsInBlock));
    cp.setDefaultFieldCodec("Standard");
    iwc.setCodecProvider(cp);
    */

    final RandomIndexWriter w = new RandomIndexWriter(random, d, iwc);
    w.w.setInfoStream(VERBOSE ? System.out : null);
    for(String term : terms) {
      Document doc = new Document();
      Field f = newField(FIELD, term, Field.Store.NO, Field.Index.NOT_ANALYZED);
      doc.add(f);
      w.addDocument(doc);
    }
    if (r != null) {
      close();
    }
    r = w.getReader();
    w.close();
    return r;
  }

  /**
   * Closes the current reader and the directory it was opened on.  The
   * directory is taken from the reader's first sub-reader rather than from
   * the {@code d} field, because makeIndex has already pointed {@code d} at
   * the new directory by the time it calls this method.
   */
  private void close() throws Exception {
    final Directory dir = ((SegmentReader) r.getSequentialSubReaders()[0]).directory();
    r.close();
    dir.close();
  }

  /** Returns the document frequency of {@code term} in field {@link #FIELD}. */
  private int docFreq(IndexReader r, String term) throws Exception {
    return r.docFreq(new Term(FIELD, term));
  }

  /** Point lookups (hits and misses, forwards and backwards) on a small index. */
  public void testEasy() throws Exception {
    // No floor arcs:
    r = makeIndex("aa0", "aa1", "aa2", "aa3", "bb0", "bb1", "bb2", "bb3", "aa");

    // First term in block:
    assertEquals(1, docFreq(r, "aa0"));

    // Scan forward to another term in same block
    assertEquals(1, docFreq(r, "aa2"));

    assertEquals(1, docFreq(r, "aa"));

    // Reset same block then scan forwards
    assertEquals(1, docFreq(r, "aa1"));

    // Not found, in same block
    assertEquals(0, docFreq(r, "aa5"));

    // Found, in same block
    assertEquals(1, docFreq(r, "aa2"));

    // Not found in index:
    assertEquals(0, docFreq(r, "b0"));

    // Found:
    assertEquals(1, docFreq(r, "aa2"));

    // Found, rewind:
    assertEquals(1, docFreq(r, "aa0"));

    // First term in block:
    assertEquals(1, docFreq(r, "bb0"));

    // Scan forward to another term in same block
    assertEquals(1, docFreq(r, "bb2"));

    // Reset same block then scan forwards
    assertEquals(1, docFreq(r, "bb1"));

    // Not found, in same block
    assertEquals(0, docFreq(r, "bb5"));

    // Found, in same block
    assertEquals(1, docFreq(r, "bb2"));

    // Not found in index:
    assertEquals(0, docFreq(r, "b0"));

    // Found:
    assertEquals(1, docFreq(r, "bb2"));

    // Found, rewind:
    assertEquals(1, docFreq(r, "bb0"));

    close();
  }

  // tests:
  //   - test same prefix has non-floor block and floor block (ie, has 2 long outputs on same term prefix)
  //   - term that's entirely in the index

  /**
   * Point lookups on an index with many terms sharing the prefix "aa",
   * followed by a full enumeration and a round of random seeks.
   */
  public void testFloorBlocks() throws Exception {
    final String[] terms = new String[] {"aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9", "aa", "xx"};
    r = makeIndex(terms);
    //r = makeIndex("aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9");

    // First term in first block:
    assertEquals(1, docFreq(r, "aa0"));
    assertEquals(1, docFreq(r, "aa4"));

    // No block
    assertEquals(0, docFreq(r, "bb0"));

    // Second block
    assertEquals(1, docFreq(r, "aa4"));

    // Backwards to prior floor block:
    assertEquals(1, docFreq(r, "aa0"));

    // Forwards to last floor block:
    assertEquals(1, docFreq(r, "aa9"));

    assertEquals(0, docFreq(r, "a"));
    assertEquals(1, docFreq(r, "aa"));
    assertEquals(0, docFreq(r, "a"));
    assertEquals(1, docFreq(r, "aa"));

    // Forwards to last floor block:
    assertEquals(1, docFreq(r, "xx"));
    assertEquals(1, docFreq(r, "aa1"));
    assertEquals(0, docFreq(r, "yy"));

    assertEquals(1, docFreq(r, "xx"));
    assertEquals(1, docFreq(r, "aa9"));

    assertEquals(1, docFreq(r, "xx"));
    assertEquals(1, docFreq(r, "aa4"));

    // Walk the whole enum once; only checks that iteration completes.
    final TermEnum te = r.terms(new Term(FIELD));
    try {
      while(te.next()) {
        //System.out.println("TEST: next term=" + te.term().utf8ToString());
      }
    } finally {
      te.close();
    }

    testRandomSeeks(r, terms);
    close();
  }

  /**
   * Verifies that a field whose only document was deleted and merged away
   * exposes no terms.
   */
  public void testZeroTerms() throws Exception {
    d = newDirectory();
    final RandomIndexWriter w = new RandomIndexWriter(random, d);
    w.w.setInfoStream(VERBOSE ? System.out : null);
    Document doc = new Document();
    doc.add(newField("field", "one two three", Field.Store.NO, Field.Index.ANALYZED));
    // This document must actually be indexed; otherwise the delete below is
    // a no-op and "field" never existed in the first place.
    w.addDocument(doc);
    doc = new Document();
    doc.add(newField("field2", "one two three", Field.Store.NO, Field.Index.ANALYZED));
    w.addDocument(doc);
    w.commit();
    w.deleteDocuments(new Term("field", "one"));
    w.forceMerge(1);
    IndexReader r = w.getReader();
    w.close();
    assertEquals(1, r.numDocs());
    assertEquals(1, r.maxDoc());
    TermEnum terms = r.terms(new Term("field"));
    if (terms != null) {
      try {
        // The enum comes back already positioned on the first term at or
        // after the target, so that term has to be checked before next().
        final Term first = terms.term();
        assertTrue(first == null || !"field".equals(first.field()));
        assertTrue(!terms.next() || !"field".equals(terms.term().field()));
      } finally {
        terms.close();
      }
    }
    r.close();
    d.close();
  }

  /** Returns a random term text. */
  private String getRandomString() {
    //return _TestUtil.randomSimpleString(random);
    return _TestUtil.randomRealisticUnicodeString(random);
  }

  /**
   * Indexes a random set of unique terms (sometimes with a block of terms
   * sharing a common prefix of at least 5 chars) and runs random seeks on it.
   */
  public void testRandomTerms() throws Exception {
    final String[] terms = new String[_TestUtil.nextInt(random, 1, atLeast(1000))];
    final Set<String> seen = new HashSet<String>();

    final boolean allowEmptyString = random.nextBoolean();

    if (random.nextInt(10) == 7 && terms.length > 2) {
      // Sometimes add a bunch of terms sharing a longish common prefix:
      final int numTermsSamePrefix = random.nextInt(terms.length/2);
      if (numTermsSamePrefix > 0) {
        String prefix;
        do {
          prefix = getRandomString();
        } while (prefix.length() < 5);
        while(seen.size() < numTermsSamePrefix) {
          final String t = prefix + getRandomString();
          // add() returns true only for a new term, which then occupies
          // the slot just before the set's new size.
          if (seen.add(t)) {
            terms[seen.size()-1] = t;
          }
        }
      }
    }

    while(seen.size() < terms.length) {
      final String t = getRandomString();
      if ((allowEmptyString || t.length() != 0) && seen.add(t)) {
        terms[seen.size()-1] = t;
      }
    }
    r = makeIndex(terms);
    testRandomSeeks(r, terms);
    close();
  }

  /**
   * Returns a random term that is not contained in {@code terms}.
   *
   * @param terms the valid terms, sorted with
   *        {@code BytesRef.getUTF8SortedAsUTF16Comparator()}; the lookup uses
   *        that same comparator, since a binary search with any other
   *        ordering over this array is not reliable
   */
  private BytesRef getNonExistTerm(BytesRef[] terms) {
    while(true) {
      final BytesRef t = new BytesRef(getRandomString());
      if (Arrays.binarySearch(terms, t, BytesRef.getUTF8SortedAsUTF16Comparator()) < 0) {
        return t;
      }
    }
  }

  /**
   * Repeatedly seeks to a random (valid or invalid) term and then steps
   * forward a random number of times, checking every reported term against
   * the sorted list of valid terms.
   *
   * @param r reader over an index containing exactly the given terms in field {@link #FIELD}
   * @param validTermStrings the terms present in the index, in any order
   */
  private void testRandomSeeks(IndexReader r, String... validTermStrings) throws IOException {
    final BytesRef[] validTerms = new BytesRef[validTermStrings.length];
    for(int termIDX=0;termIDX<validTermStrings.length;termIDX++) {
      validTerms[termIDX] = new BytesRef(validTermStrings[termIDX]);
    }
    Arrays.sort(validTerms, BytesRef.getUTF8SortedAsUTF16Comparator());
    if (VERBOSE) {
      System.out.println("TEST: " + validTerms.length + " terms:");
      for(int idx=0;idx<validTerms.length;idx++) {
        System.out.println(" " + idx + ": " + validTerms[idx]);
      }
    }

    // binarySearch result for a term that sorts after every valid term
    final int END_LOC = -validTerms.length-1;

    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {

      final BytesRef t;
      int loc;
      if (random.nextInt(6) == 4) {
        // pick term that doesn't exist:
        t = getNonExistTerm(validTerms);
        if (VERBOSE) {
          System.out.println("\nTEST: invalid term=" + t.utf8ToString());
        }
        loc = Arrays.binarySearch(validTerms, t, BytesRef.getUTF8SortedAsUTF16Comparator());
      } else {
        // pick valid term
        loc = random.nextInt(validTerms.length);
        t = new BytesRef(validTerms[loc]);
        if (VERBOSE) {
          System.out.println("\nTEST: valid term=" + t.utf8ToString());
        }
      }
      final Term targetTerm = new Term(FIELD, t.utf8ToString());

      if (VERBOSE) {
        System.out.println(" seek term=" + targetTerm);
      }

      final TermEnum te = r.terms(targetTerm);
      try {
        Term actualTerm = te.term();
        if (VERBOSE) {
          System.out.println(" got " + actualTerm);
        }

        if (loc >= 0) {
          // exact match expected
          assertEquals(targetTerm, actualTerm);
        } else if (loc == END_LOC) {
          // target sorts past the last term: nothing left in this field
          assertTrue(actualTerm == null || !FIELD.equals(actualTerm.field()));
          continue;
        } else {
          // no exact match: enum must sit on the insertion-point term
          assert loc >= -validTerms.length;
          assertTrue(actualTerm != null && FIELD.equals(actualTerm.field()));
          loc = -loc-1;
          assertEquals(new Term(FIELD, validTerms[loc].utf8ToString()), actualTerm);
        }

        // Do a bunch of next's after the seek
        final int numNext = random.nextInt(validTerms.length);

        if (VERBOSE) {
          System.out.println("\nTEST: numNext=" + numNext);
        }

        for(int nextCount=0;nextCount<numNext;nextCount++) {
          if (VERBOSE) {
            System.out.println("\nTEST: next loc=" + loc + " of " + validTerms.length);
          }
          boolean result = te.next();
          actualTerm = te.term();
          loc++;

          if (loc == validTerms.length) {
            if (VERBOSE) {
              System.out.println(" actual=null");
            }
            assertFalse(result);
            assertTrue(actualTerm == null || !FIELD.equals(actualTerm.field()));
            break;
          } else {
            // Assert non-null before the verbose print dereferences it.
            assertTrue(result);
            assertTrue(actualTerm != null && FIELD.equals(actualTerm.field()));
            if (VERBOSE) {
              System.out.println(" actual=" + new BytesRef(actualTerm.text()));
            }
            assertEquals(validTerms[loc], new BytesRef(actualTerm.text()));
          }
        }
      } finally {
        // one enum is opened per seek; release it whatever path was taken
        te.close();
      }
    }
  }
}