/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

/** tests for writing term vectors */
public class TestTermVectorsWriter extends LuceneTestCase {
  // LUCENE-1442
  public void testDoubleOffsetCounting() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    Document doc = new Document();
    FieldType customType = new FieldType(StringField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    Field f = newField("field", "abcd", customType);
    doc.add(f);
    doc.add(f);
    Field f2 = newField("field", "", customType);
    doc.add(f2);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    Terms vector = r.getTermVectors(0).terms("field");
    assertNotNull(vector);
    TermsEnum termsEnum = vector.iterator();
    assertNotNull(termsEnum.next());
    assertEquals("", termsEnum.term().utf8ToString());

    // Token "" occurred once
    assertEquals(1, termsEnum.totalTermFreq());

    PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(8, dpEnum.startOffset());
    assertEquals(8, dpEnum.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

    // Token "abcd" occurred three times
    assertEquals(new BytesRef("abcd"), termsEnum.next());
    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertEquals(3, termsEnum.totalTermFreq());

    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(0, dpEnum.startOffset());
    assertEquals(4, dpEnum.endOffset());

    dpEnum.nextPosition();
    assertEquals(4, dpEnum.startOffset());
    assertEquals(8, dpEnum.endOffset());

    dpEnum.nextPosition();
    assertEquals(8, dpEnum.startOffset());
    assertEquals(12, dpEnum.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
    assertNull(termsEnum.next());

    r.close();
    dir.close();
  }

  // LUCENE-1442
  public void testDoubleOffsetCounting2() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    Field f = newField("field", "abcd", customType);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
    assertNotNull(termsEnum.next());
    PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);
    assertEquals(2, termsEnum.totalTermFreq());

    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(0, dpEnum.startOffset());
    assertEquals(4, dpEnum.endOffset());

    dpEnum.nextPosition();
    assertEquals(5, dpEnum.startOffset());
    assertEquals(9, dpEnum.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionCharAnalyzer() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    Field f = newField("field", "abcd   ", customType);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
    assertNotNull(termsEnum.next());
    PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);
    assertEquals(2, termsEnum.totalTermFreq());

    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(0, dpEnum.startOffset());
    assertEquals(4, dpEnum.endOffset());

    dpEnum.nextPosition();
    assertEquals(8, dpEnum.startOffset());
    assertEquals(12, dpEnum.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random());
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    Document doc = new Document();
    try (TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("field", "abcd   "))) {
      FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setStoreTermVectors(true);
      customType.setStoreTermVectorPositions(true);
      customType.setStoreTermVectorOffsets(true);
      Field f = new Field("field", stream, customType);
      doc.add(f);
      doc.add(f);
      w.addDocument(doc);
    }
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
    assertNotNull(termsEnum.next());
    PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);
    assertEquals(2, termsEnum.totalTermFreq());

    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(0, dpEnum.startOffset());
    assertEquals(4, dpEnum.endOffset());

    dpEnum.nextPosition();
    assertEquals(8, dpEnum.startOffset());
    assertEquals(12, dpEnum.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStopFilter() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    Field f = newField("field", "abcd the", customType);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
    assertNotNull(termsEnum.next());
    PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);
    assertEquals(2, termsEnum.totalTermFreq());

    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(0, dpEnum.startOffset());
    assertEquals(4, dpEnum.endOffset());

    dpEnum.nextPosition();
    assertEquals(9, dpEnum.startOffset());
    assertEquals(13, dpEnum.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandard() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    Field f = newField("field", "abcd the  ", customType);
    Field f2 = newField("field", "crunch man", customType);
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
    assertNotNull(termsEnum.next());
    PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);

    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(0, dpEnum.startOffset());
    assertEquals(4, dpEnum.endOffset());

    assertNotNull(termsEnum.next());
    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(11, dpEnum.startOffset());
    assertEquals(17, dpEnum.endOffset());

    assertNotNull(termsEnum.next());
    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(18, dpEnum.startOffset());
    assertEquals(21, dpEnum.endOffset());

    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandardEmptyField() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    Field f = newField("field", "", customType);
    Field f2 = newField("field", "crunch man", customType);
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
    assertNotNull(termsEnum.next());
    PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);

    assertEquals(1, (int) termsEnum.totalTermFreq());
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(1, dpEnum.startOffset());
    assertEquals(7, dpEnum.endOffset());

    assertNotNull(termsEnum.next());
    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(8, dpEnum.startOffset());
    assertEquals(11, dpEnum.endOffset());

    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandardEmptyField2() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);

    Field f = newField("field", "abcd", customType);
    doc.add(f);
    doc.add(newField("field", "", customType));

    Field f2 = newField("field", "crunch", customType);
    doc.add(f2);

    w.addDocument(doc);
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
    assertNotNull(termsEnum.next());
    PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);

    assertEquals(1, (int) termsEnum.totalTermFreq());
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(0, dpEnum.startOffset());
    assertEquals(4, dpEnum.endOffset());

    assertNotNull(termsEnum.next());
    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(6, dpEnum.startOffset());
    assertEquals(12, dpEnum.endOffset());

    r.close();
    dir.close();
  }

  // LUCENE-1168
  public void testTermVectorCorruption() throws IOException {
    Directory dir = newDirectory();
    for(int iter=0;iter<2;iter++) {
      IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
          .setMaxBufferedDocs(2)
          .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setMergeScheduler(new SerialMergeScheduler())
          .setMergePolicy(new LogDocMergePolicy()));

      Document document = new Document();
      FieldType customType = new FieldType();
      customType.setStored(true);

      Field storedField = newField("stored", "stored", customType);
      document.add(storedField);
      writer.addDocument(document);
      writer.addDocument(document);

      document = new Document();
      document.add(storedField);
      FieldType customType2 = new FieldType(StringField.TYPE_NOT_STORED);
      customType2.setStoreTermVectors(true);
      customType2.setStoreTermVectorPositions(true);
      customType2.setStoreTermVectorOffsets(true);
      Field termVectorField = newField("termVector", "termVector", customType2);

      document.add(termVectorField);
      writer.addDocument(document);
      writer.forceMerge(1);
      writer.close();

      IndexReader reader = DirectoryReader.open(dir);
      for(int i=0;i<reader.numDocs();i++) {
        reader.document(i);
        reader.getTermVectors(i);
      }
      reader.close();

      writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
          .setMaxBufferedDocs(2)
          .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setMergeScheduler(new SerialMergeScheduler())
          .setMergePolicy(new LogDocMergePolicy()));

      Directory[] indexDirs = {new MockDirectoryWrapper(random(), TestUtil.ramCopyOf(dir))};
      writer.addIndexes(indexDirs);
      writer.forceMerge(1);
      writer.close();
    }
    dir.close();
  }

  // LUCENE-1168
  public void testTermVectorCorruption2() throws IOException {
    Directory dir = newDirectory();
    for(int iter=0;iter<2;iter++) {
      IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
          .setMaxBufferedDocs(2)
          .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setMergeScheduler(new SerialMergeScheduler())
          .setMergePolicy(new LogDocMergePolicy()));

      Document document = new Document();
      FieldType customType = new FieldType();
      customType.setStored(true);

      Field storedField = newField("stored", "stored", customType);
      document.add(storedField);
      writer.addDocument(document);
      writer.addDocument(document);

      document = new Document();
      document.add(storedField);
      FieldType customType2 = new FieldType(StringField.TYPE_NOT_STORED);
      customType2.setStoreTermVectors(true);
      customType2.setStoreTermVectorPositions(true);
      customType2.setStoreTermVectorOffsets(true);
      Field termVectorField = newField("termVector", "termVector", customType2);
      document.add(termVectorField);
      writer.addDocument(document);
      writer.forceMerge(1);
      writer.close();

      IndexReader reader = DirectoryReader.open(dir);
      assertNull(reader.getTermVectors(0));
      assertNull(reader.getTermVectors(1));
      assertNotNull(reader.getTermVectors(2));
      reader.close();
    }
    dir.close();
  }

  // LUCENE-1168
  public void testTermVectorCorruption3() throws IOException {
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
        .setMaxBufferedDocs(2)
        .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .setMergeScheduler(new SerialMergeScheduler())
        .setMergePolicy(new LogDocMergePolicy()));

    Document document = new Document();
    FieldType customType = new FieldType();
    customType.setStored(true);

    Field storedField = newField("stored", "stored", customType);
    document.add(storedField);

    FieldType customType2 = new FieldType(StringField.TYPE_NOT_STORED);
    customType2.setStoreTermVectors(true);
    customType2.setStoreTermVectorPositions(true);
    customType2.setStoreTermVectorOffsets(true);
    Field termVectorField = newField("termVector", "termVector", customType2);
    document.add(termVectorField);
    for(int i=0;i<10;i++)
      writer.addDocument(document);
    writer.close();

    writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
        .setMaxBufferedDocs(2)
        .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .setMergeScheduler(new SerialMergeScheduler())
        .setMergePolicy(new LogDocMergePolicy()));
    for(int i=0;i<6;i++)
      writer.addDocument(document);

    writer.forceMerge(1);
    writer.close();

    IndexReader reader = DirectoryReader.open(dir);
    for(int i=0;i<10;i++) {
      reader.getTermVectors(i);
      reader.document(i);
    }
    reader.close();
    dir.close();
  }

  // LUCENE-1008
  public void testNoTermVectorAfterTermVector() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    Document document = new Document();
    FieldType customType2 = new FieldType(StringField.TYPE_NOT_STORED);
    customType2.setStoreTermVectors(true);
    customType2.setStoreTermVectorPositions(true);
    customType2.setStoreTermVectorOffsets(true);
    document.add(newField("tvtest", "a b c", customType2));
    iw.addDocument(document);

    document = new Document();
    document.add(newTextField("tvtest", "x y z", Field.Store.NO));
    iw.addDocument(document);
    // Make first segment
    iw.commit();

    FieldType customType = new FieldType(StringField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    document = new Document();
    document.add(newField("tvtest", "a b c", customType));
    iw.addDocument(document);
    // Make 2nd segment
    iw.commit();

    iw.forceMerge(1);
    iw.close();
    dir.close();
  }

  // LUCENE-1010
  public void testNoTermVectorAfterTermVectorMerge() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    Document document = new Document();
    FieldType customType = new FieldType(StringField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    document.add(newField("tvtest", "a b c", customType));
    iw.addDocument(document);
    iw.commit();

    document = new Document();
    document.add(newTextField("tvtest", "x y z", Field.Store.NO));
    iw.addDocument(document);
    // Make first segment
    iw.commit();

    iw.forceMerge(1);

    FieldType customType2 = new FieldType(StringField.TYPE_NOT_STORED);
    customType2.setStoreTermVectors(true);
    document.add(newField("tvtest", "a b c", customType2));
    document = new Document();
    iw.addDocument(document);
    // Make 2nd segment
    iw.commit();
    iw.forceMerge(1);

    iw.close();
    dir.close();
  }

  /**
   * In a single doc, for the same field, mix the term vectors up
   */
  public void testInconsistentTermVectorOptions() throws IOException {
    FieldType a, b;

    // no vectors + vectors
    a = new FieldType(TextField.TYPE_NOT_STORED);
    b = new FieldType(TextField.TYPE_NOT_STORED);
    b.setStoreTermVectors(true);
    doTestMixup(a, b);

    // vectors + vectors with pos
    a = new FieldType(TextField.TYPE_NOT_STORED);
    a.setStoreTermVectors(true);
    b = new FieldType(TextField.TYPE_NOT_STORED);
    b.setStoreTermVectors(true);
    b.setStoreTermVectorPositions(true);
    doTestMixup(a, b);

    // vectors + vectors with off
    a = new FieldType(TextField.TYPE_NOT_STORED);
    a.setStoreTermVectors(true);
    b = new FieldType(TextField.TYPE_NOT_STORED);
    b.setStoreTermVectors(true);
    b.setStoreTermVectorOffsets(true);
    doTestMixup(a, b);

    // vectors with pos + vectors with pos + off
    a = new FieldType(TextField.TYPE_NOT_STORED);
    a.setStoreTermVectors(true);
    a.setStoreTermVectorPositions(true);
    b = new FieldType(TextField.TYPE_NOT_STORED);
    b.setStoreTermVectors(true);
    b.setStoreTermVectorPositions(true);
    b.setStoreTermVectorOffsets(true);
    doTestMixup(a, b);

    // vectors with pos + vectors with pos + pay
    a = new FieldType(TextField.TYPE_NOT_STORED);
    a.setStoreTermVectors(true);
    a.setStoreTermVectorPositions(true);
    b = new FieldType(TextField.TYPE_NOT_STORED);
    b.setStoreTermVectors(true);
    b.setStoreTermVectorPositions(true);
    b.setStoreTermVectorPayloads(true);
    doTestMixup(a, b);
  }

  private void doTestMixup(FieldType ft1, FieldType ft2) throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);

    // add 3 good docs
    for (int i = 0; i < 3; i++) {
      Document doc = new Document();
      doc.add(new StringField("id", Integer.toString(i), Field.Store.NO));
      iw.addDocument(doc);
    }

    // add broken doc
    Document doc = new Document();
    doc.add(new Field("field", "value1", ft1));
    doc.add(new Field("field", "value2", ft2));

    // ensure broken doc hits exception
    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
      iw.addDocument(doc);
    });
    assertNotNull(expected.getMessage());
    assertTrue(expected.getMessage().startsWith("all instances of a given field name must have the same term vectors settings"));

    // ensure good docs are still ok
    IndexReader ir = iw.getReader();
    assertEquals(3, ir.numDocs());

    ir.close();
    iw.close();
    dir.close();
  }

  // LUCENE-5611: don't abort segment when term vector settings are wrong
  public void testNoAbortOnBadTVSettings() throws Exception {
    Directory dir = newDirectory();
    // Don't use RandomIndexWriter because we want to be sure both docs go to 1 seg:
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter iw = new IndexWriter(dir, iwc);

    Document doc = new Document();
    iw.addDocument(doc);
    FieldType ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectors(true);
    ft.freeze();
    doc.add(new Field("field", "value", ft));

    expectThrows(IllegalArgumentException.class, () -> {
      iw.addDocument(doc);
    });

    IndexReader r = DirectoryReader.open(iw);

    // Make sure the exc didn't lose our first document:
    assertEquals(1, r.numDocs());
    iw.close();
    r.close();
    dir.close();
  }
}