TestPostingsOffsets.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;


import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockPayloadAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.English;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

// TODO: we really need to test indexingoffsets, but then getting only docs / docs + freqs.
// not all codecs store prx separate...
// TODO: fix sep codec to index offsets so we can greatly reduce this list!
public class TestPostingsOffsets extends LuceneTestCase {
  IndexWriterConfig iwc;
  
  @Override
  public void setUp() throws Exception {
    super.setUp();
    iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  }

  public void testBasic() throws Exception {
    Directory dir = newDirectory();
    
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    if (random().nextBoolean()) {
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorPositions(random().nextBoolean());
      ft.setStoreTermVectorOffsets(random().nextBoolean());
    }
    Token[] tokens = new Token[] {
      makeToken("a", 1, 0, 6),
      makeToken("b", 1, 8, 9),
      makeToken("a", 1, 9, 17),
      makeToken("c", 1, 19, 50),
    };
    doc.add(new Field("content", new CannedTokenStream(tokens), ft));

    w.addDocument(doc);
    IndexReader r = w.getReader();
    w.close();

    PostingsEnum dp = MultiFields.getTermPositionsEnum(r, "content", new BytesRef("a"));
    assertNotNull(dp);
    assertEquals(0, dp.nextDoc());
    assertEquals(2, dp.freq());
    assertEquals(0, dp.nextPosition());
    assertEquals(0, dp.startOffset());
    assertEquals(6, dp.endOffset());
    assertEquals(2, dp.nextPosition());
    assertEquals(9, dp.startOffset());
    assertEquals(17, dp.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());

    dp = MultiFields.getTermPositionsEnum(r, "content", new BytesRef("b"));
    assertNotNull(dp);
    assertEquals(0, dp.nextDoc());
    assertEquals(1, dp.freq());
    assertEquals(1, dp.nextPosition());
    assertEquals(8, dp.startOffset());
    assertEquals(9, dp.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());

    dp = MultiFields.getTermPositionsEnum(r, "content", new BytesRef("c"));
    assertNotNull(dp);
    assertEquals(0, dp.nextDoc());
    assertEquals(1, dp.freq());
    assertEquals(3, dp.nextPosition());
    assertEquals(19, dp.startOffset());
    assertEquals(50, dp.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());

    r.close();
    dir.close();
  }
  
  public void testSkipping() throws Exception {
    doTestNumbers(false);
  }
  
  public void testPayloads() throws Exception {
    doTestNumbers(true);
  }
  
  public void doTestNumbers(boolean withPayloads) throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random());
    iwc = newIndexWriterConfig(analyzer);
    iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    if (random().nextBoolean()) {
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorOffsets(random().nextBoolean());
      ft.setStoreTermVectorPositions(random().nextBoolean());
    }
    
    int numDocs = atLeast(500);
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      doc.add(new Field("numbers", English.intToEnglish(i), ft));
      doc.add(new Field("oddeven", (i % 2) == 0 ? "even" : "odd", ft));
      doc.add(new StringField("id", "" + i, Field.Store.NO));
      w.addDocument(doc);
    }
    
    IndexReader reader = w.getReader();
    w.close();
    
    String terms[] = { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred" };
    
    for (String term : terms) {
      PostingsEnum dp = MultiFields.getTermPositionsEnum(reader, "numbers", new BytesRef(term));
      int doc;
      while((doc = dp.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        String storedNumbers = reader.document(doc).get("numbers");
        int freq = dp.freq();
        for (int i = 0; i < freq; i++) {
          dp.nextPosition();
          int start = dp.startOffset();
          assert start >= 0;
          int end = dp.endOffset();
          assert end >= 0 && end >= start;
          // check that the offsets correspond to the term in the src text
          assertTrue(storedNumbers.substring(start, end).equals(term));
          if (withPayloads) {
            // check that we have a payload and it starts with "pos"
            assertNotNull(dp.getPayload());
            BytesRef payload = dp.getPayload();
            assertTrue(payload.utf8ToString().startsWith("pos:"));
          } // note: withPayloads=false doesnt necessarily mean we dont have them from MockAnalyzer!
        }
      }
    }
    
    // check we can skip correctly
    int numSkippingTests = atLeast(50);
    
    for (int j = 0; j < numSkippingTests; j++) {
      int num = TestUtil.nextInt(random(), 100, Math.min(numDocs - 1, 999));
      PostingsEnum dp = MultiFields.getTermPositionsEnum(reader, "numbers", new BytesRef("hundred"));
      int doc = dp.advance(num);
      assertEquals(num, doc);
      int freq = dp.freq();
      for (int i = 0; i < freq; i++) {
        String storedNumbers = reader.document(doc).get("numbers");
        dp.nextPosition();
        int start = dp.startOffset();
        assert start >= 0;
        int end = dp.endOffset();
        assert end >= 0 && end >= start;
        // check that the offsets correspond to the term in the src text
        assertTrue(storedNumbers.substring(start, end).equals("hundred"));
        if (withPayloads) {
          // check that we have a payload and it starts with "pos"
          assertNotNull(dp.getPayload());
          BytesRef payload = dp.getPayload();
          assertTrue(payload.utf8ToString().startsWith("pos:"));
        } // note: withPayloads=false doesnt necessarily mean we dont have them from MockAnalyzer!
      }
    }
    
    // check that other fields (without offsets) work correctly
    
    for (int i = 0; i < numDocs; i++) {
      PostingsEnum dp = MultiFields.getTermDocsEnum(reader, "id", new BytesRef("" + i), 0);
      assertEquals(i, dp.nextDoc());
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
    }
    
    reader.close();
    dir.close();
  }

  public void testRandom() throws Exception {
    // token -> docID -> tokens
    final Map<String,Map<Integer,List<Token>>> actualTokens = new HashMap<>();

    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

    final int numDocs = atLeast(20);
    //final int numDocs = atLeast(5);

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);

    // TODO: randomize what IndexOptions we use; also test
    // changing this up in one IW buffered segment...:
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    if (random().nextBoolean()) {
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorOffsets(random().nextBoolean());
      ft.setStoreTermVectorPositions(random().nextBoolean());
    }

    for(int docCount=0;docCount<numDocs;docCount++) {
      Document doc = new Document();
      doc.add(new NumericDocValuesField("id", docCount));
      List<Token> tokens = new ArrayList<>();
      final int numTokens = atLeast(100);
      //final int numTokens = atLeast(20);
      int pos = -1;
      int offset = 0;
      //System.out.println("doc id=" + docCount);
      for(int tokenCount=0;tokenCount<numTokens;tokenCount++) {
        final String text;
        if (random().nextBoolean()) {
          text = "a";
        } else if (random().nextBoolean()) {
          text = "b";
        } else if (random().nextBoolean()) {
          text = "c";
        } else {
          text = "d";
        }       
        
        int posIncr = random().nextBoolean() ? 1 : random().nextInt(5);
        if (tokenCount == 0 && posIncr == 0) {
          posIncr = 1;
        }
        final int offIncr = random().nextBoolean() ? 0 : random().nextInt(5);
        final int tokenOffset = random().nextInt(5);

        final Token token = makeToken(text, posIncr, offset+offIncr, offset+offIncr+tokenOffset);
        if (!actualTokens.containsKey(text)) {
          actualTokens.put(text, new HashMap<Integer,List<Token>>());
        }
        final Map<Integer,List<Token>> postingsByDoc = actualTokens.get(text);
        if (!postingsByDoc.containsKey(docCount)) {
          postingsByDoc.put(docCount, new ArrayList<Token>());
        }
        postingsByDoc.get(docCount).add(token);
        tokens.add(token);
        pos += posIncr;
        // stuff abs position into type:
        token.setType(""+pos);
        offset += offIncr + tokenOffset;
        //System.out.println("  " + token + " posIncr=" + token.getPositionIncrement() + " pos=" + pos + " off=" + token.startOffset() + "/" + token.endOffset() + " (freq=" + postingsByDoc.get(docCount).size() + ")");
      }
      doc.add(new Field("content", new CannedTokenStream(tokens.toArray(new Token[tokens.size()])), ft));
      w.addDocument(doc);
    }
    final DirectoryReader r = w.getReader();
    w.close();

    final String[] terms = new String[] {"a", "b", "c", "d"};
    for(LeafReaderContext ctx : r.leaves()) {
      // TODO: improve this
      LeafReader sub = ctx.reader();
      //System.out.println("\nsub=" + sub);
      final TermsEnum termsEnum = sub.fields().terms("content").iterator();
      PostingsEnum docs = null;
      PostingsEnum docsAndPositions = null;
      PostingsEnum docsAndPositionsAndOffsets = null;
      int[] docIDToID = new int[sub.maxDoc()];
      NumericDocValues values = DocValues.getNumeric(sub, "id");
      for(int i=0;i<sub.maxDoc();i++) {
        assertEquals(i, values.nextDoc());
        docIDToID[i] = (int) values.longValue();
      }
      
      for(String term : terms) {
        //System.out.println("  term=" + term);
        if (termsEnum.seekExact(new BytesRef(term))) {
          docs = termsEnum.postings(docs);
          assertNotNull(docs);
          int doc;
          //System.out.println("    doc/freq");
          while((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
            //System.out.println("      doc=" + docIDToID[doc] + " docID=" + doc + " " + expected.size() + " freq");
            assertNotNull(expected);
            assertEquals(expected.size(), docs.freq());
          }

          // explicitly exclude offsets here
          docsAndPositions = termsEnum.postings(docsAndPositions, PostingsEnum.ALL);
          assertNotNull(docsAndPositions);
          //System.out.println("    doc/freq/pos");
          while((doc = docsAndPositions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
            //System.out.println("      doc=" + docIDToID[doc] + " " + expected.size() + " freq");
            assertNotNull(expected);
            assertEquals(expected.size(), docsAndPositions.freq());
            for(Token token : expected) {
              int pos = Integer.parseInt(token.type());
              //System.out.println("        pos=" + pos);
              assertEquals(pos, docsAndPositions.nextPosition());
            }
          }

          docsAndPositionsAndOffsets = termsEnum.postings(docsAndPositions, PostingsEnum.ALL);
          assertNotNull(docsAndPositionsAndOffsets);
          //System.out.println("    doc/freq/pos/offs");
          while((doc = docsAndPositionsAndOffsets.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            final List<Token> expected = actualTokens.get(term).get(docIDToID[doc]);
            //System.out.println("      doc=" + docIDToID[doc] + " " + expected.size() + " freq");
            assertNotNull(expected);
            assertEquals(expected.size(), docsAndPositionsAndOffsets.freq());
            for(Token token : expected) {
              int pos = Integer.parseInt(token.type());
              //System.out.println("        pos=" + pos);
              assertEquals(pos, docsAndPositionsAndOffsets.nextPosition());
              assertEquals(token.startOffset(), docsAndPositionsAndOffsets.startOffset());
              assertEquals(token.endOffset(), docsAndPositionsAndOffsets.endOffset());
            }
          }
        }
      }        
      // TODO: test advance:
    }
    r.close();
    dir.close();
  }
  
  public void testWithUnindexedFields() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
    for (int i = 0; i < 100; i++) {
      Document doc = new Document();
      // ensure at least one doc is indexed with offsets
      if (i < 99 && random().nextInt(2) == 0) {
        // stored only
        FieldType ft = new FieldType();
        ft.setStored(true);
        doc.add(new Field("foo", "boo!", ft));
      } else {
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        if (random().nextBoolean()) {
          // store some term vectors for the checkindex cross-check
          ft.setStoreTermVectors(true);
          ft.setStoreTermVectorPositions(true);
          ft.setStoreTermVectorOffsets(true);
        }
        doc.add(new Field("foo", "bar", ft));
      }
      riw.addDocument(doc);
    }
    CompositeReader ir = riw.getReader();
    FieldInfos fis = MultiFields.getMergedFieldInfos(ir);
    assertEquals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, fis.fieldInfo("foo").getIndexOptions());
    ir.close();
    ir.close();
    riw.close();
    dir.close();
  }
  
  public void testAddFieldTwice() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    FieldType customType3 = new FieldType(TextField.TYPE_STORED);
    customType3.setStoreTermVectors(true);
    customType3.setStoreTermVectorPositions(true);
    customType3.setStoreTermVectorOffsets(true);    
    customType3.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    doc.add(new Field("content3", "here is more content with aaa aaa aaa", customType3));
    doc.add(new Field("content3", "here is more content with aaa aaa aaa", customType3));
    iw.addDocument(doc);
    iw.close();
    dir.close(); // checkindex
  }
  
  // NOTE: the next two tests aren't that good as we need an EvilToken...
  public void testNegativeOffsets() throws Exception {
    expectThrows(IllegalArgumentException.class, () -> {
      checkTokens(new Token[] { 
          makeToken("foo", 1, -1, -1)
      });
    });
  }
  
  public void testIllegalOffsets() throws Exception {
    expectThrows(IllegalArgumentException.class, () -> {
      checkTokens(new Token[] { 
          makeToken("foo", 1, 1, 0)
      });
    });
  }
  
  public void testIllegalOffsetsAcrossFieldInstances() throws Exception {
    expectThrows(IllegalArgumentException.class, () -> {
      checkTokens(new Token[] { makeToken("use", 1, 150, 160) }, 
                  new Token[] { makeToken("use", 1, 50, 60) });
    });
  }
   
  public void testBackwardsOffsets() throws Exception {
    expectThrows(IllegalArgumentException.class, () -> {
      checkTokens(new Token[] { 
         makeToken("foo", 1, 0, 3),
         makeToken("foo", 1, 4, 7),
         makeToken("foo", 0, 3, 6)
      });
    });
  }
  
  public void testStackedTokens() throws Exception {
    checkTokens(new Token[] { 
        makeToken("foo", 1, 0, 3),
        makeToken("foo", 0, 0, 3),
        makeToken("foo", 0, 0, 3)
      });
  }
  
  public void testCrazyOffsetGap() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new MockTokenizer(MockTokenizer.KEYWORD, false));
      }

      @Override
      public int getOffsetGap(String fieldName) {
        return -10;
      }
    };
    IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(analyzer));
    // add good document
    Document doc = new Document();
    iw.addDocument(doc);
    expectThrows(IllegalArgumentException.class, () -> {
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
      doc.add(new Field("foo", "bar", ft));
      doc.add(new Field("foo", "bar", ft));
      iw.addDocument(doc);
    });
    iw.commit();
    iw.close();

    // make sure we see our good doc
    DirectoryReader r = DirectoryReader.open(dir);   
    assertEquals(1, r.numDocs());
    r.close();
    dir.close();
  }

  public void testLegalbutVeryLargeOffsets() throws Exception {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    Token t1 = new Token("foo", 0, Integer.MAX_VALUE-500);
    if (random().nextBoolean()) {
      t1.setPayload(new BytesRef("test"));
    }
    Token t2 = new Token("foo", Integer.MAX_VALUE-500, Integer.MAX_VALUE);
    TokenStream tokenStream = new CannedTokenStream(
        new Token[] { t1, t2 }
    );
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // store some term vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", tokenStream, ft);
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close();
  }
  // TODO: more tests with other possibilities
  
  private void checkTokens(Token[] field1, Token[] field2) throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
    boolean success = false;
    try {
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
      // store some term vectors for the checkindex cross-check
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorPositions(true);
      ft.setStoreTermVectorOffsets(true);
     
      Document doc = new Document();
      doc.add(new Field("body", new CannedTokenStream(field1), ft));
      doc.add(new Field("body", new CannedTokenStream(field2), ft));
      riw.addDocument(doc);
      riw.close();
      success = true;
    } finally {
      if (success) {
        IOUtils.close(dir);
      } else {
        IOUtils.closeWhileHandlingException(riw, dir);
      }
    }
  }
  
  private void checkTokens(Token[] tokens) throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
    boolean success = false;
    try {
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
      // store some term vectors for the checkindex cross-check
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorPositions(true);
      ft.setStoreTermVectorOffsets(true);
     
      Document doc = new Document();
      doc.add(new Field("body", new CannedTokenStream(tokens), ft));
      riw.addDocument(doc);
      riw.close();
      success = true;
    } finally {
      if (success) {
        IOUtils.close(dir);
      } else {
        IOUtils.closeWhileHandlingException(riw, dir);
      }
    }
  }

  private Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
    final Token t = new Token();
    t.append(text);
    t.setPositionIncrement(posIncr);
    t.setOffset(startOffset, endOffset);
    return t;
  }
}