package org.apache.lucene.index; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Arrays; import java.util.List; import java.util.ArrayList; import java.util.HashSet; import java.util.Set; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; import org.apache.lucene.index.DocTermOrds.TermOrdsIterator; import org.apache.lucene.search.FieldCache; import org.apache.lucene.store.Directory; import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util._TestUtil; // TODO: // - test w/ del docs // - test prefix // - test w/ cutoff // - crank docs way up so we get some merging sometimes public class TestDocTermOrds extends LuceneTestCase { public void testSimple() throws Exception { Directory dir = newDirectory(); final RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy())); Document doc = new Document(); Field field = newTextField("field", "", Field.Store.NO); doc.add(field); field.setStringValue("a b c"); w.addDocument(doc); field.setStringValue("d e f"); w.addDocument(doc); field.setStringValue("a f"); w.addDocument(doc); final IndexReader r = w.getReader(); w.close(); final DocTermOrds dto = new DocTermOrds(SlowCompositeReaderWrapper.wrap(r), "field"); TermOrdsIterator iter = dto.lookup(0, null); final int[] buffer = new int[5]; assertEquals(3, iter.read(buffer)); assertEquals(0, buffer[0]); assertEquals(1, buffer[1]); assertEquals(2, buffer[2]); iter = dto.lookup(1, iter); assertEquals(3, iter.read(buffer)); assertEquals(3, buffer[0]); assertEquals(4, buffer[1]); assertEquals(5, buffer[2]); iter = dto.lookup(2, iter); assertEquals(2, iter.read(buffer)); assertEquals(0, buffer[0]); assertEquals(5, buffer[1]); r.close(); dir.close(); } public void testRandom() throws Exception { Directory dir = newDirectory(); final int NUM_TERMS = atLeast(20); final Set<BytesRef> terms = new HashSet<BytesRef>(); while(terms.size() < NUM_TERMS) { final String s = _TestUtil.randomRealisticUnicodeString(random()); //final String s = _TestUtil.randomSimpleString(random); if (s.length() > 0) { terms.add(new BytesRef(s)); } } final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]); Arrays.sort(termsArray); final int NUM_DOCS = atLeast(100); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); // Sometimes swap in codec that impls ord(): if (random().nextInt(10) == 7) { // Make sure terms index has ords: Codec codec = _TestUtil.alwaysPostingsFormat(PostingsFormat.forName("Lucene40WithOrds")); conf.setCodec(codec); } final RandomIndexWriter w = new RandomIndexWriter(random(), dir, conf); final int[][] idToOrds = new int[NUM_DOCS][]; final Set<Integer> ordsForDocSet = new HashSet<Integer>(); for(int id=0;id<NUM_DOCS;id++) { Document doc = new Document(); doc.add(new IntField("id", id, Field.Store.NO)); final int termCount = _TestUtil.nextInt(random(), 0, 20*RANDOM_MULTIPLIER); while(ordsForDocSet.size() < termCount) { ordsForDocSet.add(random().nextInt(termsArray.length)); } final int[] ordsForDoc = new int[termCount]; int upto = 0; if (VERBOSE) { System.out.println("TEST: doc id=" + id); } for(int ord : ordsForDocSet) { ordsForDoc[upto++] = ord; Field field = newStringField("field", termsArray[ord].utf8ToString(), Field.Store.NO); if (VERBOSE) { System.out.println(" f=" + termsArray[ord].utf8ToString()); } doc.add(field); } ordsForDocSet.clear(); Arrays.sort(ordsForDoc); idToOrds[id] = ordsForDoc; w.addDocument(doc); } final DirectoryReader r = w.getReader(); w.close(); if (VERBOSE) { System.out.println("TEST: reader=" + r); } for(AtomicReaderContext ctx : r.leaves()) { if (VERBOSE) { System.out.println("\nTEST: sub=" + ctx.reader()); } verify(ctx.reader(), idToOrds, termsArray, null); } // Also test top-level reader: its enum does not support // ord, so this forces the OrdWrapper to run: if (VERBOSE) { System.out.println("TEST: top reader"); } AtomicReader slowR = SlowCompositeReaderWrapper.wrap(r); verify(slowR, idToOrds, termsArray, null); FieldCache.DEFAULT.purge(slowR); r.close(); dir.close(); } public void testRandomWithPrefix() throws Exception { Directory dir = newDirectory(); final Set<String> prefixes = new HashSet<String>(); final int numPrefix = _TestUtil.nextInt(random(), 2, 7); if (VERBOSE) { System.out.println("TEST: use " + numPrefix + " prefixes"); } while(prefixes.size() < numPrefix) { prefixes.add(_TestUtil.randomRealisticUnicodeString(random())); //prefixes.add(_TestUtil.randomSimpleString(random)); } final String[] prefixesArray = prefixes.toArray(new String[prefixes.size()]); final int NUM_TERMS = atLeast(20); final Set<BytesRef> terms = new HashSet<BytesRef>(); while(terms.size() < NUM_TERMS) { final String s = prefixesArray[random().nextInt(prefixesArray.length)] + _TestUtil.randomRealisticUnicodeString(random()); //final String s = prefixesArray[random.nextInt(prefixesArray.length)] + _TestUtil.randomSimpleString(random); if (s.length() > 0) { terms.add(new BytesRef(s)); } } final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]); Arrays.sort(termsArray); final int NUM_DOCS = atLeast(100); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); // Sometimes swap in codec that impls ord(): if (random().nextInt(10) == 7) { Codec codec = _TestUtil.alwaysPostingsFormat(PostingsFormat.forName("Lucene40WithOrds")); conf.setCodec(codec); } final RandomIndexWriter w = new RandomIndexWriter(random(), dir, conf); final int[][] idToOrds = new int[NUM_DOCS][]; final Set<Integer> ordsForDocSet = new HashSet<Integer>(); for(int id=0;id<NUM_DOCS;id++) { Document doc = new Document(); doc.add(new IntField("id", id, Field.Store.NO)); final int termCount = _TestUtil.nextInt(random(), 0, 20*RANDOM_MULTIPLIER); while(ordsForDocSet.size() < termCount) { ordsForDocSet.add(random().nextInt(termsArray.length)); } final int[] ordsForDoc = new int[termCount]; int upto = 0; if (VERBOSE) { System.out.println("TEST: doc id=" + id); } for(int ord : ordsForDocSet) { ordsForDoc[upto++] = ord; Field field = newStringField("field", termsArray[ord].utf8ToString(), Field.Store.NO); if (VERBOSE) { System.out.println(" f=" + termsArray[ord].utf8ToString()); } doc.add(field); } ordsForDocSet.clear(); Arrays.sort(ordsForDoc); idToOrds[id] = ordsForDoc; w.addDocument(doc); } final DirectoryReader r = w.getReader(); w.close(); if (VERBOSE) { System.out.println("TEST: reader=" + r); } AtomicReader slowR = SlowCompositeReaderWrapper.wrap(r); for(String prefix : prefixesArray) { final BytesRef prefixRef = prefix == null ? null : new BytesRef(prefix); final int[][] idToOrdsPrefix = new int[NUM_DOCS][]; for(int id=0;id<NUM_DOCS;id++) { final int[] docOrds = idToOrds[id]; final List<Integer> newOrds = new ArrayList<Integer>(); for(int ord : idToOrds[id]) { if (StringHelper.startsWith(termsArray[ord], prefixRef)) { newOrds.add(ord); } } final int[] newOrdsArray = new int[newOrds.size()]; int upto = 0; for(int ord : newOrds) { newOrdsArray[upto++] = ord; } idToOrdsPrefix[id] = newOrdsArray; } for(AtomicReaderContext ctx : r.leaves()) { if (VERBOSE) { System.out.println("\nTEST: sub=" + ctx.reader()); } verify(ctx.reader(), idToOrdsPrefix, termsArray, prefixRef); } // Also test top-level reader: its enum does not support // ord, so this forces the OrdWrapper to run: if (VERBOSE) { System.out.println("TEST: top reader"); } verify(slowR, idToOrdsPrefix, termsArray, prefixRef); } FieldCache.DEFAULT.purge(slowR); r.close(); dir.close(); } private void verify(AtomicReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef) throws Exception { final DocTermOrds dto = new DocTermOrds(r, "field", prefixRef, Integer.MAX_VALUE, _TestUtil.nextInt(random(), 2, 10)); final int[] docIDToID = FieldCache.DEFAULT.getInts(r, "id", false); /* for(int docID=0;docID<subR.maxDoc();docID++) { System.out.println(" docID=" + docID + " id=" + docIDToID[docID]); } */ if (VERBOSE) { System.out.println("TEST: verify prefix=" + (prefixRef==null ? "null" : prefixRef.utf8ToString())); System.out.println("TEST: all TERMS:"); TermsEnum allTE = MultiFields.getTerms(r, "field").iterator(null); int ord = 0; while(allTE.next() != null) { System.out.println(" ord=" + (ord++) + " term=" + allTE.term().utf8ToString()); } } //final TermsEnum te = subR.fields().terms("field").iterator(); final TermsEnum te = dto.getOrdTermsEnum(r); if (dto.numTerms() == 0) { if (prefixRef == null) { assertNull(MultiFields.getTerms(r, "field")); } else { Terms terms = MultiFields.getTerms(r, "field"); if (terms != null) { TermsEnum termsEnum = terms.iterator(null); TermsEnum.SeekStatus result = termsEnum.seekCeil(prefixRef, false); if (result != TermsEnum.SeekStatus.END) { assertFalse("term=" + termsEnum.term().utf8ToString() + " matches prefix=" + prefixRef.utf8ToString(), StringHelper.startsWith(termsEnum.term(), prefixRef)); } else { // ok } } else { // ok } } return; } if (VERBOSE) { System.out.println("TEST: TERMS:"); te.seekExact(0); while(true) { System.out.println(" ord=" + te.ord() + " term=" + te.term().utf8ToString()); if (te.next() == null) { break; } } } TermOrdsIterator iter = null; final int[] buffer = new int[5]; for(int docID=0;docID<r.maxDoc();docID++) { if (VERBOSE) { System.out.println("TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID[docID] + ")"); } iter = dto.lookup(docID, iter); final int[] answers = idToOrds[docIDToID[docID]]; int upto = 0; while(true) { final int chunk = iter.read(buffer); for(int idx=0;idx<chunk;idx++) { te.seekExact((long) buffer[idx]); final BytesRef expected = termsArray[answers[upto++]]; if (VERBOSE) { System.out.println(" exp=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString()); } assertEquals("expected=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString() + " ord=" + buffer[idx], expected, te.term()); } if (chunk < buffer.length) { assertEquals(answers.length, upto); break; } } } } }