/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.io.IOException;
import java.util.Arrays;
import java.util.Random;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

public class TestTermVectorsReader extends LuceneTestCase {
  // Must be lexicographically sorted; we sort in setUp() rather than maintain the order by hand.
  private String[] testFields = {"f1", "f2", "f3", "f4"};
  private boolean[] testFieldsStorePos = {true, false, true, false};
  private boolean[] testFieldsStoreOff = {true, false, false, true};
  private String[] testTerms = {"this", "is", "a", "test"};
  private int[][] positions = new int[testTerms.length][];
  private Directory dir;
  private SegmentCommitInfo seg;
  private FieldInfos fieldInfos = new FieldInfos(new FieldInfo[0]);
  private static final int TERM_FREQ = 3;

  private static class TestToken implements Comparable<TestToken> {
    String text;
    int pos;
    int startOffset;
    int endOffset;

    @Override
    public int compareTo(TestToken other) {
      return pos - other.pos;
    }
  }

  TestToken[] tokens = new TestToken[testTerms.length * TERM_FREQ];

  @Override
  public void setUp() throws Exception {
    super.setUp();
    /*
    for (int i = 0; i < testFields.length; i++) {
      fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
    }
    */

    Arrays.sort(testTerms);
    int tokenUpto = 0;
    Random rnd = random();
    for (int i = 0; i < testTerms.length; i++) {
      positions[i] = new int[TERM_FREQ];
      // first position must be 0
      for (int j = 0; j < TERM_FREQ; j++) {
        // positions are always sorted in increasing order
        positions[i][j] = (int) (j * 10 + rnd.nextDouble() * 10);
        TestToken token = tokens[tokenUpto++] = new TestToken();
        token.text = testTerms[i];
        token.pos = positions[i][j];
        token.startOffset = j * 10;
        token.endOffset = j * 10 + testTerms[i].length();
      }
    }
    Arrays.sort(tokens);

    dir = newDirectory();
    IndexWriter writer = new IndexWriter(
        dir,
        newIndexWriterConfig(new MyAnalyzer())
            .setMaxBufferedDocs(-1)
            .setMergePolicy(newLogMergePolicy(false, 10))
            .setUseCompoundFile(false));

    Document doc = new Document();
    for (int i = 0; i < testFields.length; i++) {
      // Every test field stores term vectors; positions and offsets are
      // enabled per field according to the two boolean arrays above.
      FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setStoreTermVectors(true);
      if (testFieldsStorePos[i]) {
        customType.setStoreTermVectorPositions(true);
      }
      if (testFieldsStoreOff[i]) {
        customType.setStoreTermVectorOffsets(true);
      }
      doc.add(new Field(testFields[i], "", customType));
    }

    // Create 5 documents for testing; they all have the same terms.
    for (int j = 0; j < 5; j++) {
      writer.addDocument(doc);
    }
    writer.commit();
    seg = writer.newestSegment();
    writer.close();

    fieldInfos = IndexWriter.readFieldInfos(seg);
  }
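  // At this point the index holds a single freshly flushed segment containing
  // five identical documents, each with term vectors on all four test fields;
  // the tests below read that segment directly through the codec's
  // TermVectorsReader.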
  @Override
  public void tearDown() throws Exception {
    dir.close();
    super.tearDown();
  }

  private class MyTokenizer extends Tokenizer {
    private int tokenUpto;

    private final CharTermAttribute termAtt;
    private final PositionIncrementAttribute posIncrAtt;
    private final OffsetAttribute offsetAtt;

    public MyTokenizer() {
      super();
      termAtt = addAttribute(CharTermAttribute.class);
      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      offsetAtt = addAttribute(OffsetAttribute.class);
    }

    @Override
    public boolean incrementToken() {
      if (tokenUpto >= tokens.length) {
        return false;
      } else {
        final TestToken testToken = tokens[tokenUpto++];
        clearAttributes();
        termAtt.append(testToken.text);
        offsetAtt.setOffset(testToken.startOffset, testToken.endOffset);
        // The tokens carry absolute positions, but the attribute API expects
        // deltas: positions start at -1, so the first token's increment is
        // pos + 1, and each later increment is the gap to the previous token.
        if (tokenUpto > 1) {
          posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto - 2].pos);
        } else {
          posIncrAtt.setPositionIncrement(testToken.pos + 1);
        }
        return true;
      }
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      this.tokenUpto = 0;
    }
  }

  private class MyAnalyzer extends Analyzer {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new MyTokenizer());
    }
  }

  public void test() throws IOException {
    // Check that the files were created properly in setUp().
    DirectoryReader reader = DirectoryReader.open(dir);
    for (LeafReaderContext ctx : reader.leaves()) {
      SegmentReader sr = (SegmentReader) ctx.reader();
      assertTrue(sr.getFieldInfos().hasVectors());
    }
    reader.close();
  }

  public void testReader() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat()
        .vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    for (int j = 0; j < 5; j++) {
      Terms vector = reader.get(j).terms(testFields[0]);
      assertNotNull(vector);
      assertEquals(testTerms.length, vector.size());
      TermsEnum termsEnum = vector.iterator();
      for (int i = 0; i < testTerms.length; i++) {
        final BytesRef text = termsEnum.next();
        assertNotNull(text);
        String term = text.utf8ToString();
        assertEquals(testTerms[i], term);
      }
      assertNull(termsEnum.next());
    }
    reader.close();
  }

  public void testDocsEnum() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat()
        .vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    for (int j = 0; j < 5; j++) {
      Terms vector = reader.get(j).terms(testFields[0]);
      assertNotNull(vector);
      assertEquals(testTerms.length, vector.size());
      TermsEnum termsEnum = vector.iterator();
      PostingsEnum postingsEnum = null;
      for (int i = 0; i < testTerms.length; i++) {
        final BytesRef text = termsEnum.next();
        assertNotNull(text);
        String term = text.utf8ToString();
        assertEquals(testTerms[i], term);
        postingsEnum = TestUtil.docs(random(), termsEnum, postingsEnum, PostingsEnum.NONE);
        assertNotNull(postingsEnum);
        // A term vector is a single-document inverted index: the enum starts
        // before its only document, yields it once, then is exhausted.
        assertEquals(-1, postingsEnum.docID());
        assertTrue(postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, postingsEnum.nextDoc());
      }
      assertNull(termsEnum.next());
    }
    reader.close();
  }
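  // A minimal sketch (not part of the original test) of the equivalent
  // high-level access path: instead of instantiating the codec's
  // TermVectorsReader directly, callers normally fetch a per-document term
  // vector through IndexReader.getTermVector. Assumes the index built in
  // setUp(); the method name below is illustrative.
  public void testHighLevelAccessSketch() throws IOException {
    DirectoryReader ir = DirectoryReader.open(dir);
    Terms vector = ir.getTermVector(0, testFields[0]);
    assertNotNull(vector);
    assertEquals(testTerms.length, vector.size());
    ir.close();
  }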
  public void testPositionReader() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat()
        .vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    Terms vector = reader.get(0).terms(testFields[0]);
    assertNotNull(vector);
    assertEquals(testTerms.length, vector.size());
    TermsEnum termsEnum = vector.iterator();
    PostingsEnum dpEnum = null;
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      assertEquals(testTerms[i], term);

      // First pass: check positions only.
      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      assertNotNull(dpEnum);
      assertEquals(-1, dpEnum.docID());
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(positions[i].length, dpEnum.freq());
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

      // Second pass: check positions plus offsets.
      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      assertNotNull(dpEnum);
      assertEquals(-1, dpEnum.docID());
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(positions[i].length, dpEnum.freq());
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
        assertEquals(j * 10, dpEnum.startOffset());
        assertEquals(j * 10 + testTerms[i].length(), dpEnum.endOffset());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
    }

    Terms freqVector = reader.get(0).terms(testFields[1]); // no pos, no offset
    assertNotNull(freqVector);
    assertEquals(testTerms.length, freqVector.size());
    termsEnum = freqVector.iterator();
    assertNotNull(termsEnum);
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      assertEquals(testTerms[i], term);
      assertNotNull(termsEnum.postings(null));
      assertNotNull(termsEnum.postings(null, PostingsEnum.ALL));
    }
    reader.close();
  }

  public void testOffsetReader() throws IOException {
    TermVectorsReader reader = Codec.getDefault().termVectorsFormat()
        .vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
    Terms vector = reader.get(0).terms(testFields[0]);
    assertNotNull(vector);
    TermsEnum termsEnum = vector.iterator();
    assertNotNull(termsEnum);
    assertEquals(testTerms.length, vector.size());
    PostingsEnum dpEnum = null;
    for (int i = 0; i < testTerms.length; i++) {
      final BytesRef text = termsEnum.next();
      assertNotNull(text);
      String term = text.utf8ToString();
      assertEquals(testTerms[i], term);

      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      assertNotNull(dpEnum);
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(positions[i].length, dpEnum.freq());
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

      dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
      assertNotNull(dpEnum);
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(positions[i].length, dpEnum.freq());
      for (int j = 0; j < positions[i].length; j++) {
        assertEquals(positions[i][j], dpEnum.nextPosition());
        assertEquals(j * 10, dpEnum.startOffset());
        assertEquals(j * 10 + testTerms[i].length(), dpEnum.endOffset());
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
    }
    reader.close();
  }
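  // The eight "illegal combination" tests below all follow one pattern: build
  // a FieldType with an inconsistent term-vector configuration, add a single
  // document, and expect IndexWriter to reject it with a descriptive message.
  // A helper capturing that shared pattern might look like this sketch; the
  // helper name is hypothetical and is not used by the original tests.
  private void checkIllegalFieldType(FieldType ft, String expectedMessage) throws Exception {
    Directory d = newDirectory();
    MockAnalyzer a = new MockAnalyzer(random());
    a.setEnableChecks(false); // let the broken configuration reach IndexWriter
    RandomIndexWriter w = new RandomIndexWriter(random(), d, a);
    Document doc = new Document();
    doc.add(new Field("field", "value", ft));
    IllegalArgumentException expected =
        expectThrows(IllegalArgumentException.class, () -> w.addDocument(doc));
    assertEquals(expectedMessage, expected.getMessage());
    w.close();
    d.close();
  }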
  public void testIllegalPayloadsWithoutPositions() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer a = new MockAnalyzer(random());
    a.setEnableChecks(false);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPayloads(true);
    Document doc = new Document();
    doc.add(new Field("field", "value", ft));

    IllegalArgumentException expected =
        expectThrows(IllegalArgumentException.class, () -> {
          w.addDocument(doc);
        });
    assertEquals(
        "cannot index term vector payloads without term vector positions (field=\"field\")",
        expected.getMessage());

    w.close();
    dir.close();
  }

  public void testIllegalOffsetsWithoutVectors() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer a = new MockAnalyzer(random());
    a.setEnableChecks(false);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(false);
    ft.setStoreTermVectorOffsets(true);
    Document doc = new Document();
    doc.add(new Field("field", "value", ft));

    IllegalArgumentException expected =
        expectThrows(IllegalArgumentException.class, () -> {
          w.addDocument(doc);
        });
    assertEquals(
        "cannot index term vector offsets when term vectors are not indexed (field=\"field\")",
        expected.getMessage());

    w.close();
    dir.close();
  }

  public void testIllegalPositionsWithoutVectors() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer a = new MockAnalyzer(random());
    a.setEnableChecks(false);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(false);
    ft.setStoreTermVectorPositions(true);
    Document doc = new Document();
    doc.add(new Field("field", "value", ft));

    IllegalArgumentException expected =
        expectThrows(IllegalArgumentException.class, () -> {
          w.addDocument(doc);
        });
    assertEquals(
        "cannot index term vector positions when term vectors are not indexed (field=\"field\")",
        expected.getMessage());

    w.close();
    dir.close();
  }

  public void testIllegalVectorPayloadsWithoutVectors() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer a = new MockAnalyzer(random());
    a.setEnableChecks(false);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(false);
    ft.setStoreTermVectorPayloads(true);
    Document doc = new Document();
    doc.add(new Field("field", "value", ft));

    IllegalArgumentException expected =
        expectThrows(IllegalArgumentException.class, () -> {
          w.addDocument(doc);
        });
    assertEquals(
        "cannot index term vector payloads when term vectors are not indexed (field=\"field\")",
        expected.getMessage());

    w.close();
    dir.close();
  }
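  // The remaining four tests repeat the same checks for fields that are
  // stored but not indexed (FieldType based on StoredField.TYPE): term
  // vectors and each of their sub-features require an indexed field.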
  public void testIllegalVectorsWithoutIndexed() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer a = new MockAnalyzer(random());
    a.setEnableChecks(false);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
    FieldType ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectors(true);
    Document doc = new Document();
    doc.add(new Field("field", "value", ft));

    IllegalArgumentException expected =
        expectThrows(IllegalArgumentException.class, () -> {
          w.addDocument(doc);
        });
    assertEquals(
        "cannot store term vectors for a field that is not indexed (field=\"field\")",
        expected.getMessage());

    w.close();
    dir.close();
  }

  public void testIllegalVectorPositionsWithoutIndexed() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer a = new MockAnalyzer(random());
    a.setEnableChecks(false);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
    FieldType ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectorPositions(true);
    Document doc = new Document();
    doc.add(new Field("field", "value", ft));

    IllegalArgumentException expected =
        expectThrows(IllegalArgumentException.class, () -> {
          w.addDocument(doc);
        });
    assertEquals(
        "cannot store term vector positions for a field that is not indexed (field=\"field\")",
        expected.getMessage());

    w.close();
    dir.close();
  }

  public void testIllegalVectorOffsetsWithoutIndexed() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer a = new MockAnalyzer(random());
    a.setEnableChecks(false);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
    FieldType ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectorOffsets(true);
    Document doc = new Document();
    doc.add(new Field("field", "value", ft));

    IllegalArgumentException expected =
        expectThrows(IllegalArgumentException.class, () -> {
          w.addDocument(doc);
        });
    assertEquals(
        "cannot store term vector offsets for a field that is not indexed (field=\"field\")",
        expected.getMessage());

    w.close();
    dir.close();
  }

  public void testIllegalVectorPayloadsWithoutIndexed() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer a = new MockAnalyzer(random());
    a.setEnableChecks(false);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
    FieldType ft = new FieldType(StoredField.TYPE);
    ft.setStoreTermVectorPayloads(true);
    Document doc = new Document();
    doc.add(new Field("field", "value", ft));

    IllegalArgumentException expected =
        expectThrows(IllegalArgumentException.class, () -> {
          w.addDocument(doc);
        });
    assertEquals(
        "cannot store term vector payloads for a field that is not indexed (field=\"field\")",
        expected.getMessage());

    w.close();
    dir.close();
  }
}