package org.apache.lucene.search.highlight;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;

/**
 * Tests that the {@link Highlighter} produces the same fragments for phrase and
 * span queries whether the tokens come from a fresh analysis pass or from a
 * stored term vector (via {@link TokenSources#getTokenStream}), including
 * streams with overlapping tokens ("concurrent") and position gaps ("sparse").
 */
public class HighlighterPhraseTest extends LuceneTestCase {

  /** Name of the single indexed field used by every test. */
  private static final String FIELD = "text";

  /**
   * Phrase query "fox jumped" against a token stream that contains the
   * overlapping tokens "jump" and "jumped" at the same position; the term
   * vector replay must highlight identically to re-analysis.
   */
  public void testConcurrentPhrase() throws IOException,
      InvalidTokenOffsetsException {
    final String TEXT = "the fox jumped";
    final Directory directory = newDirectory();
    final IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT,
            new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
    try {
      final Document document = new Document();
      // Term vectors with positions + offsets are required so the highlighter
      // can rebuild a token stream from the stored vector later.
      FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setStoreTermVectorOffsets(true);
      customType.setStoreTermVectorPositions(true);
      customType.setStoreTermVectors(true);
      document.add(new Field(FIELD, new TokenStreamConcurrent(), customType));
      indexWriter.addDocument(document);
    } finally {
      indexWriter.close();
    }
    final IndexReader indexReader = DirectoryReader.open(directory);
    try {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      final PhraseQuery phraseQuery = new PhraseQuery();
      phraseQuery.add(new Term(FIELD, "fox"));
      phraseQuery.add(new Term(FIELD, "jumped"));
      phraseQuery.setSlop(0);
      TopDocs hits = indexSearcher.search(phraseQuery, 1);
      assertEquals(1, hits.totalHits);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));

      // Replay from the stored term vector and compare against re-analysis.
      final TokenStream tokenStream = TokenSources
          .getTokenStream(indexReader.getTermVector(0, FIELD), false);
      assertEquals(
          highlighter.getBestFragment(new TokenStreamConcurrent(), TEXT),
          highlighter.getBestFragment(tokenStream, TEXT));
    } finally {
      indexReader.close();
      directory.close();
    }
  }

  /**
   * Same overlapping-token document as {@link #testConcurrentPhrase()}, but
   * queried with a {@link SpanNearQuery} and collected manually into a
   * {@link FixedBitSet} of matching doc ids.
   */
  public void testConcurrentSpan() throws IOException,
      InvalidTokenOffsetsException {
    final String TEXT = "the fox jumped";
    final Directory directory = newDirectory();
    final IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT,
            new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
    try {
      final Document document = new Document();
      FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setStoreTermVectorOffsets(true);
      customType.setStoreTermVectorPositions(true);
      customType.setStoreTermVectors(true);
      document.add(new Field(FIELD, new TokenStreamConcurrent(), customType));
      indexWriter.addDocument(document);
    } finally {
      indexWriter.close();
    }
    final IndexReader indexReader = DirectoryReader.open(directory);
    try {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
          new SpanTermQuery(new Term(FIELD, "fox")),
          new SpanTermQuery(new Term(FIELD, "jumped")) }, 0, true);
      final FixedBitSet bitset = new FixedBitSet(indexReader.maxDoc());
      indexSearcher.search(phraseQuery, new Collector() {
        private int baseDoc;

        @Override
        public boolean acceptsDocsOutOfOrder() {
          return true;
        }

        @Override
        public void collect(int i) {
          // Record the hit as a segment-relative id rebased to the top level.
          bitset.set(this.baseDoc + i);
        }

        @Override
        public void setNextReader(AtomicReaderContext context) {
          this.baseDoc = context.docBase;
        }

        @Override
        public void setScorer(org.apache.lucene.search.Scorer scorer) {
          // Do Nothing
        }
      });
      assertEquals(1, bitset.cardinality());
      final int maxDoc = indexReader.maxDoc();
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      // BUG FIX: the original loop condition required position < maxDoc - 1,
      // which is never true for this single-document index (maxDoc == 1), so
      // the assertions below were dead code.  Iterate over every set bit
      // instead, guarding the nextSetBit call because FixedBitSet requires the
      // probe index to be strictly less than its length.
      for (int position = bitset.nextSetBit(0); position >= 0;
          position = (position + 1 < maxDoc)
              ? bitset.nextSetBit(position + 1) : -1) {
        assertEquals(0, position);
        final TokenStream tokenStream = TokenSources.getTokenStream(
            indexReader.getTermVector(position, FIELD), false);
        assertEquals(
            highlighter.getBestFragment(new TokenStreamConcurrent(), TEXT),
            highlighter.getBestFragment(tokenStream, TEXT));
      }
    } finally {
      indexReader.close();
      directory.close();
    }
  }

  /**
   * Phrase "did jump" with slop 0 against a stream where "jump" sits two
   * positions after "did" (a position gap) — the query must not match, and
   * term-vector replay must highlight identically to re-analysis.
   */
  public void testSparsePhrase() throws IOException,
      InvalidTokenOffsetsException {
    final String TEXT = "the fox did not jump";
    final Directory directory = newDirectory();
    final IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT,
            new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
    try {
      final Document document = new Document();
      FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setStoreTermVectorOffsets(true);
      customType.setStoreTermVectorPositions(true);
      customType.setStoreTermVectors(true);
      document.add(new Field(FIELD, new TokenStreamSparse(), customType));
      indexWriter.addDocument(document);
    } finally {
      indexWriter.close();
    }
    final IndexReader indexReader = DirectoryReader.open(directory);
    try {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      final PhraseQuery phraseQuery = new PhraseQuery();
      phraseQuery.add(new Term(FIELD, "did"));
      phraseQuery.add(new Term(FIELD, "jump"));
      phraseQuery.setSlop(0);
      TopDocs hits = indexSearcher.search(phraseQuery, 1);
      // The position gap ("not" is skipped) keeps the exact phrase from
      // matching at slop 0.
      assertEquals(0, hits.totalHits);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      final TokenStream tokenStream = TokenSources
          .getTokenStream(indexReader.getTermVector(0, FIELD), false);
      assertEquals(
          highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
          highlighter.getBestFragment(tokenStream, TEXT));
    } finally {
      indexReader.close();
      directory.close();
    }
  }

  /**
   * Term vector stored with offsets but WITHOUT positions: the replayed stream
   * (offset-ordered) must still let the highlighter mark both phrase terms.
   */
  public void testSparsePhraseWithNoPositions() throws IOException,
      InvalidTokenOffsetsException {
    final String TEXT = "the fox did not jump";
    final Directory directory = newDirectory();
    final IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT,
            new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
    try {
      final Document document = new Document();
      // Deliberately no setStoreTermVectorPositions: offsets only.
      FieldType customType = new FieldType(TextField.TYPE_STORED);
      customType.setStoreTermVectorOffsets(true);
      customType.setStoreTermVectors(true);
      document.add(new Field(FIELD, TEXT, customType));
      indexWriter.addDocument(document);
    } finally {
      indexWriter.close();
    }
    final IndexReader indexReader = DirectoryReader.open(directory);
    try {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      final PhraseQuery phraseQuery = new PhraseQuery();
      phraseQuery.add(new Term(FIELD, "did"));
      phraseQuery.add(new Term(FIELD, "jump"));
      phraseQuery.setSlop(1);
      TopDocs hits = indexSearcher.search(phraseQuery, 1);
      assertEquals(1, hits.totalHits);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      // tokenPositionsGuaranteedContiguous=true: synthesize positions from
      // offset order since none were stored.
      final TokenStream tokenStream = TokenSources.getTokenStream(
          indexReader.getTermVector(0, FIELD), true);
      assertEquals("the fox <B>did</B> not <B>jump</B>",
          highlighter.getBestFragment(tokenStream, TEXT));
    } finally {
      indexReader.close();
      directory.close();
    }
  }

  /**
   * Span-near "did jump" with slop 0 against the sparse (gapped) stream — no
   * match, and term-vector replay must highlight identically to re-analysis.
   */
  public void testSparseSpan() throws IOException,
      InvalidTokenOffsetsException {
    final String TEXT = "the fox did not jump";
    final Directory directory = newDirectory();
    final IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT,
            new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
    try {
      final Document document = new Document();
      FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setStoreTermVectorOffsets(true);
      customType.setStoreTermVectorPositions(true);
      customType.setStoreTermVectors(true);
      document.add(new Field(FIELD, new TokenStreamSparse(), customType));
      indexWriter.addDocument(document);
    } finally {
      indexWriter.close();
    }
    final IndexReader indexReader = DirectoryReader.open(directory);
    try {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
          new SpanTermQuery(new Term(FIELD, "did")),
          new SpanTermQuery(new Term(FIELD, "jump")) }, 0, true);
      TopDocs hits = indexSearcher.search(phraseQuery, 1);
      assertEquals(0, hits.totalHits);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      final TokenStream tokenStream = TokenSources
          .getTokenStream(indexReader.getTermVector(0, FIELD), false);
      assertEquals(
          highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
          highlighter.getBestFragment(tokenStream, TEXT));
    } finally {
      indexReader.close();
      directory.close();
    }
  }

  /**
   * Canned stream for "the fox did not jump" that omits "not": "jump" carries a
   * position increment of 2, leaving a gap at the skipped word's position.
   */
  private static final class TokenStreamSparse extends TokenStream {
    private Token[] tokens;

    private int i = -1;

    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);

    public TokenStreamSparse() {
      reset();
    }

    @Override
    public boolean incrementToken() {
      this.i++;
      if (this.i >= this.tokens.length) {
        return false;
      }
      clearAttributes();
      termAttribute.setEmpty().append(this.tokens[i]);
      offsetAttribute.setOffset(this.tokens[i].startOffset(), this.tokens[i]
          .endOffset());
      positionIncrementAttribute.setPositionIncrement(this.tokens[i]
          .getPositionIncrement());
      return true;
    }

    @Override
    public void reset() {
      this.i = -1;
      this.tokens = new Token[] {
          new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
          new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
          new Token(new char[] { 'd', 'i', 'd' }, 0, 3, 8, 11),
          new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 16, 20) };
      // Skip over the position where "not" would be.
      this.tokens[3].setPositionIncrement(2);
    }
  }

  /**
   * Canned stream for "the fox jumped" that emits both "jump" and "jumped" at
   * the same position (the latter with position increment 0), simulating
   * stemmed/original overlapping tokens.
   */
  private static final class TokenStreamConcurrent extends TokenStream {
    private Token[] tokens;

    private int i = -1;

    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);

    public TokenStreamConcurrent() {
      reset();
    }

    @Override
    public boolean incrementToken() {
      this.i++;
      if (this.i >= this.tokens.length) {
        return false;
      }
      clearAttributes();
      termAttribute.setEmpty().append(this.tokens[i]);
      offsetAttribute.setOffset(this.tokens[i].startOffset(), this.tokens[i]
          .endOffset());
      positionIncrementAttribute.setPositionIncrement(this.tokens[i]
          .getPositionIncrement());
      return true;
    }

    @Override
    public void reset() {
      this.i = -1;
      this.tokens = new Token[] {
          new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
          new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
          new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 8, 14),
          new Token(new char[] { 'j', 'u', 'm', 'p', 'e', 'd' }, 0, 6, 8, 14) };
      // "jumped" stacks on the same position as "jump".
      this.tokens[3].setPositionIncrement(0);
    }
  }
}