package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;

/** tests for writing term vectors */
public class TestTermVectorsWriter extends LuceneTestCase {

  // LUCENE-1442
  public void testDoubleOffsetCounting() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd", Field.Store.NO, Field.Index.NOT_ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    Field f2 = newField("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f2);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets =
        ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);

    // Token "" occurred once
    assertEquals(1, termOffsets.length);
    assertEquals(8, termOffsets[0].getStartOffset());
    assertEquals(8, termOffsets[0].getEndOffset());

    // Token "abcd" occurred three times
    termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(1);
    assertEquals(3, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(4, termOffsets[1].getStartOffset());
    assertEquals(8, termOffsets[1].getEndOffset());
    assertEquals(8, termOffsets[2].getStartOffset());
    assertEquals(12, termOffsets[2].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1442
  public void testDoubleOffsetCounting2() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets =
        ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(5, termOffsets[1].getStartOffset());
    assertEquals(9, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionCharAnalyzer() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd   ", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets =
        ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random);
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    TokenStream stream = analyzer.tokenStream("field", new StringReader("abcd   "));
    stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
    stream = new CachingTokenFilter(stream);
    Field f = new Field("field", stream, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets =
        ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
    MockDirectoryWrapper dir = newDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    TeeSinkTokenFilter tee =
        new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd   ")));
    TokenStream sink = tee.newSinkTokenStream();
    Field f1 = new Field("field", tee, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = new Field("field", sink, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f1);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets =
        ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStopFilter() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
    Document doc = new Document();
    Field f = newField("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets =
        ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(9, termOffsets[1].getStartOffset());
    assertEquals(13, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandard() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd the  ", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = newField("field", "crunch man", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(11, termOffsets[0].getStartOffset());
    assertEquals(17, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(2);
    assertEquals(18, termOffsets[0].getStartOffset());
    assertEquals(21, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandardEmptyField() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = newField("field", "crunch man", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(1, termOffsets[0].getStartOffset());
    assertEquals(7, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(8, termOffsets[0].getStartOffset());
    assertEquals(11, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandardEmptyField2() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(newField("field", "", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS));
    Field f2 = newField("field", "crunch", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(6, termOffsets[0].getStartOffset());
    assertEquals(12, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1168
  public void testTermVectorCorruption() throws IOException {
    Directory dir = newDirectory();
    for (int iter = 0; iter < 2; iter++) {
      IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
          TEST_VERSION_CURRENT, new MockAnalyzer(random))
          .setMaxBufferedDocs(2)
          .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setMergeScheduler(new SerialMergeScheduler())
          .setMergePolicy(new LogDocMergePolicy()));

      Document document = new Document();
      Field storedField = newField("stored", "stored", Field.Store.YES,
          Field.Index.NO);
      document.add(storedField);
      writer.addDocument(document);
      writer.addDocument(document);

      document = new Document();
      document.add(storedField);
      Field termVectorField = newField("termVector", "termVector",
          Field.Store.NO, Field.Index.NOT_ANALYZED,
          Field.TermVector.WITH_POSITIONS_OFFSETS);
      document.add(termVectorField);
      writer.addDocument(document);
      writer.optimize();
      writer.close();

      // Verify stored fields and term vectors can be read back for every doc
      IndexReader reader = IndexReader.open(dir, true);
      for (int i = 0; i < reader.numDocs(); i++) {
        reader.document(i);
        reader.getTermFreqVectors(i);
      }
      reader.close();

      writer = new IndexWriter(dir, newIndexWriterConfig(
          TEST_VERSION_CURRENT, new MockAnalyzer(random))
          .setMaxBufferedDocs(2)
          .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setMergeScheduler(new SerialMergeScheduler())
          .setMergePolicy(new LogDocMergePolicy()));

      Directory[] indexDirs =
          {new MockDirectoryWrapper(random, new RAMDirectory(dir))};
      writer.addIndexes(indexDirs);
      writer.optimize();
      writer.close();
    }
    dir.close();
  }

  // LUCENE-1168
  public void testTermVectorCorruption2() throws IOException {
    Directory dir = newDirectory();
    for (int iter = 0; iter < 2; iter++) {
      IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
          TEST_VERSION_CURRENT, new MockAnalyzer(random))
          .setMaxBufferedDocs(2)
          .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setMergeScheduler(new SerialMergeScheduler())
          .setMergePolicy(new LogDocMergePolicy()));

      Document document = new Document();
      Field storedField = newField("stored", "stored", Field.Store.YES,
          Field.Index.NO);
      document.add(storedField);
      writer.addDocument(document);
      writer.addDocument(document);

      document = new Document();
      document.add(storedField);
      Field termVectorField = newField("termVector", "termVector",
          Field.Store.NO, Field.Index.NOT_ANALYZED,
          Field.TermVector.WITH_POSITIONS_OFFSETS);
      document.add(termVectorField);
      writer.addDocument(document);
      writer.optimize();
      writer.close();

      // Only the third document has a term vector field
      IndexReader reader = IndexReader.open(dir, true);
      assertTrue(reader.getTermFreqVectors(0) == null);
      assertTrue(reader.getTermFreqVectors(1) == null);
      assertTrue(reader.getTermFreqVectors(2) != null);
      reader.close();
    }
    dir.close();
  }

  // LUCENE-1168
  public void testTermVectorCorruption3() throws IOException {
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random))
        .setMaxBufferedDocs(2)
        .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .setMergeScheduler(new SerialMergeScheduler())
        .setMergePolicy(new LogDocMergePolicy()));

    Document document = new Document();

    document = new Document();
    Field storedField = newField("stored", "stored", Field.Store.YES,
        Field.Index.NO);
    document.add(storedField);
    Field termVectorField = newField("termVector", "termVector",
        Field.Store.NO, Field.Index.NOT_ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS);
    document.add(termVectorField);
    for (int i = 0; i < 10; i++)
      writer.addDocument(document);
    writer.close();

    writer = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random))
        .setMaxBufferedDocs(2)
        .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .setMergeScheduler(new SerialMergeScheduler())
        .setMergePolicy(new LogDocMergePolicy()));
    for (int i = 0; i < 6; i++)
      writer.addDocument(document);

    writer.optimize();
    writer.close();

    IndexReader reader = IndexReader.open(dir, true);
    for (int i = 0; i < 10; i++) {
      reader.getTermFreqVectors(i);
      reader.document(i);
    }
    reader.close();
    dir.close();
  }

  // LUCENE-1008
  public void testNoTermVectorAfterTermVector() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document document = new Document();
    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);
    document = new Document();
    document.add(newField("tvtest", "x y z", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.NO));
    iw.addDocument(document);
    // Make first segment
    iw.commit();

    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);
    // Make 2nd segment
    iw.commit();
    iw.optimize();

    iw.close();
    dir.close();
  }

  // LUCENE-1010
  public void testNoTermVectorAfterTermVectorMerge() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document document = new Document();
    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);
    iw.commit();

    document = new Document();
    document.add(newField("tvtest", "x y z", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.NO));
    iw.addDocument(document);
    // Make first segment
    iw.commit();
    iw.optimize();

    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);
    // Make 2nd segment
    iw.commit();
    iw.optimize();

    iw.close();
    dir.close();
  }
}