package org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; import java.util.Random; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.search.*; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.store.Directory; import org.apache.lucene.search.Explanation.IDFExplanation; public class TestOmitTf extends LuceneTestCase { private Random random; @Override public void setUp() throws Exception { super.setUp(); random = newRandom(); } public static class SimpleSimilarity extends Similarity { @Override public float lengthNorm(String field, int numTerms) { return 1.0f; } @Override public float queryNorm(float sumOfSquaredWeights) { return 1.0f; } @Override public float tf(float freq) { return freq; } @Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float idf(int docFreq, int numDocs) { return 1.0f; } @Override public float coord(int overlap, int maxOverlap) { return 1.0f; } @Override public IDFExplanation idfExplain(Collection<Term> terms, Searcher searcher) throws IOException { return new IDFExplanation() { @Override public float getIdf() { return 1.0f; } @Override public String explain() { return "Inexplicable"; } }; } } // Tests whether the DocumentWriter correctly enable the // omitTermFreqAndPositions bit in the FieldInfo public void testOmitTermFreqAndPositions() throws Exception { Directory ram = newDirectory(random); Analyzer analyzer = new MockAnalyzer(); IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(random, TEST_VERSION_CURRENT, analyzer)); Document d = new Document(); // this field will have Tf Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED); d.add(f1); // this field will NOT have Tf Field f2 = new Field("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED); f2.setOmitTermFreqAndPositions(true); d.add(f2); writer.addDocument(d); writer.optimize(); // now we add another document which has term freq for field f2 and not for f1 and verify if the SegmentMerger // keep things constant d = new Document(); // Reverse f1.setOmitTermFreqAndPositions(true); d.add(f1); f2.setOmitTermFreqAndPositions(false); d.add(f2); writer.addDocument(d); // force merge writer.optimize(); // flush writer.close(); _TestUtil.checkIndex(ram); SegmentReader reader = SegmentReader.getOnlySegmentReader(ram); FieldInfos fi = reader.fieldInfos(); assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f1").omitTermFreqAndPositions); assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f2").omitTermFreqAndPositions); reader.close(); ram.close(); } // Tests whether merging of docs that have different // omitTermFreqAndPositions for the same field works public void testMixedMerge() throws Exception { Directory ram = newDirectory(random); Analyzer analyzer = new MockAnalyzer(); IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(random, TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(3)); ((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(2); Document d = new Document(); // this field will have Tf Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED); d.add(f1); // this field will NOT have Tf Field f2 = new Field("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED); f2.setOmitTermFreqAndPositions(true); d.add(f2); for(int i=0;i<30;i++) writer.addDocument(d); // now we add another document which has term freq for field f2 and not for f1 and verify if the SegmentMerger // keep things constant d = new Document(); // Reverese f1.setOmitTermFreqAndPositions(true); d.add(f1); f2.setOmitTermFreqAndPositions(false); d.add(f2); for(int i=0;i<30;i++) writer.addDocument(d); // force merge writer.optimize(); // flush writer.close(); _TestUtil.checkIndex(ram); SegmentReader reader = SegmentReader.getOnlySegmentReader(ram); FieldInfos fi = reader.fieldInfos(); assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f1").omitTermFreqAndPositions); assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f2").omitTermFreqAndPositions); reader.close(); ram.close(); } // Make sure first adding docs that do not omitTermFreqAndPositions for // field X, then adding docs that do omitTermFreqAndPositions for that same // field, public void testMixedRAM() throws Exception { Directory ram = newDirectory(random); Analyzer analyzer = new MockAnalyzer(); IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(random, TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(10)); ((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(2); Document d = new Document(); // this field will have Tf Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED); d.add(f1); // this field will NOT have Tf Field f2 = new Field("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.ANALYZED); d.add(f2); for(int i=0;i<5;i++) writer.addDocument(d); f2.setOmitTermFreqAndPositions(true); for(int i=0;i<20;i++) writer.addDocument(d); // force merge writer.optimize(); // flush writer.close(); _TestUtil.checkIndex(ram); SegmentReader reader = SegmentReader.getOnlySegmentReader(ram); FieldInfos fi = reader.fieldInfos(); assertTrue("OmitTermFreqAndPositions field bit should not be set.", !fi.fieldInfo("f1").omitTermFreqAndPositions); assertTrue("OmitTermFreqAndPositions field bit should be set.", fi.fieldInfo("f2").omitTermFreqAndPositions); reader.close(); ram.close(); } private void assertNoPrx(Directory dir) throws Throwable { final String[] files = dir.listAll(); for(int i=0;i<files.length;i++) assertFalse(files[i].endsWith(".prx")); } // Verifies no *.prx exists when all fields omit term freq: public void testNoPrxFile() throws Throwable { Directory ram = newDirectory(random); Analyzer analyzer = new MockAnalyzer(); IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(random, TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(3)); LogMergePolicy lmp = (LogMergePolicy) writer.getConfig().getMergePolicy(); lmp.setMergeFactor(2); lmp.setUseCompoundFile(false); lmp.setUseCompoundDocStore(false); Document d = new Document(); Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED); f1.setOmitTermFreqAndPositions(true); d.add(f1); for(int i=0;i<30;i++) writer.addDocument(d); writer.commit(); assertNoPrx(ram); // force merge writer.optimize(); // flush writer.close(); assertNoPrx(ram); _TestUtil.checkIndex(ram); ram.close(); } // Test scores with one field with Term Freqs and one without, otherwise with equal content public void testBasic() throws Exception { Directory dir = newDirectory(random); Analyzer analyzer = new MockAnalyzer(); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(random, TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(2) .setSimilarity(new SimpleSimilarity())); ((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(2); StringBuilder sb = new StringBuilder(265); String term = "term"; for(int i = 0; i<30; i++){ Document d = new Document(); sb.append(term).append(" "); String content = sb.toString(); Field noTf = new Field("noTf", content + (i%2==0 ? "" : " notf"), Field.Store.NO, Field.Index.ANALYZED); noTf.setOmitTermFreqAndPositions(true); d.add(noTf); Field tf = new Field("tf", content + (i%2==0 ? " tf" : ""), Field.Store.NO, Field.Index.ANALYZED); d.add(tf); writer.addDocument(d); //System.out.println(d); } writer.optimize(); // flush writer.close(); _TestUtil.checkIndex(dir); /* * Verify the index */ Searcher searcher = new IndexSearcher(dir, true); searcher.setSimilarity(new SimpleSimilarity()); Term a = new Term("noTf", term); Term b = new Term("tf", term); Term c = new Term("noTf", "notf"); Term d = new Term("tf", "tf"); TermQuery q1 = new TermQuery(a); TermQuery q2 = new TermQuery(b); TermQuery q3 = new TermQuery(c); TermQuery q4 = new TermQuery(d); PhraseQuery pq = new PhraseQuery(); pq.add(a); pq.add(c); try { searcher.search(pq, 10); fail("did not hit expected exception"); } catch (IllegalStateException ise) { // expected } searcher.search(q1, new CountingHitCollector() { private Scorer scorer; @Override public final void setScorer(Scorer scorer) { this.scorer = scorer; } @Override public final void collect(int doc) throws IOException { //System.out.println("Q1: Doc=" + doc + " score=" + score); float score = scorer.score(); assertTrue(score==1.0f); super.collect(doc); } }); //System.out.println(CountingHitCollector.getCount()); searcher.search(q2, new CountingHitCollector() { private Scorer scorer; @Override public final void setScorer(Scorer scorer) { this.scorer = scorer; } @Override public final void collect(int doc) throws IOException { //System.out.println("Q2: Doc=" + doc + " score=" + score); float score = scorer.score(); assertTrue(score==1.0f+doc); super.collect(doc); } }); //System.out.println(CountingHitCollector.getCount()); searcher.search(q3, new CountingHitCollector() { private Scorer scorer; @Override public final void setScorer(Scorer scorer) { this.scorer = scorer; } @Override public final void collect(int doc) throws IOException { //System.out.println("Q1: Doc=" + doc + " score=" + score); float score = scorer.score(); assertTrue(score==1.0f); assertFalse(doc%2==0); super.collect(doc); } }); //System.out.println(CountingHitCollector.getCount()); searcher.search(q4, new CountingHitCollector() { private Scorer scorer; @Override public final void setScorer(Scorer scorer) { this.scorer = scorer; } @Override public final void collect(int doc) throws IOException { float score = scorer.score(); //System.out.println("Q1: Doc=" + doc + " score=" + score); assertTrue(score==1.0f); assertTrue(doc%2==0); super.collect(doc); } }); //System.out.println(CountingHitCollector.getCount()); BooleanQuery bq = new BooleanQuery(); bq.add(q1,Occur.MUST); bq.add(q4,Occur.MUST); searcher.search(bq, new CountingHitCollector() { @Override public final void collect(int doc) throws IOException { //System.out.println("BQ: Doc=" + doc + " score=" + score); super.collect(doc); } }); assertEquals(15, CountingHitCollector.getCount()); searcher.close(); dir.close(); } public static class CountingHitCollector extends Collector { static int count=0; static int sum=0; private int docBase = -1; CountingHitCollector(){count=0;sum=0;} @Override public void setScorer(Scorer scorer) throws IOException {} @Override public void collect(int doc) throws IOException { count++; sum += doc + docBase; // use it to avoid any possibility of being optimized away } public static int getCount() { return count; } public static int getSum() { return sum; } @Override public void setNextReader(IndexReader reader, int docBase) { this.docBase = docBase; } @Override public boolean acceptsDocsOutOfOrder() { return true; } } }