/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.util;

import java.io.IOException;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.IOUtils;

/** Basic tests for {@link SegmentingTokenizerBase} */
public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
  private Analyzer sentence, sentenceAndWord;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    sentence = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new WholeSentenceTokenizer());
      }
    };
    sentenceAndWord = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new SentenceAndWordTokenizer());
      }
    };
  }

  @Override
  public void tearDown() throws Exception {
    IOUtils.close(sentence, sentenceAndWord);
    super.tearDown();
  }

  /** Some simple examples, just outputting the whole sentence boundaries as "terms" */
  public void testBasics() throws IOException {
    assertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence",
        new String[] { "The acronym for United States is U.S. but this doesn't end a sentence" }
    );
    assertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.",
        new String[] { "He said, \"Are you going?\" ",
                       "John shook his head." }
    );
  }

  /** Test a subclass that sets some custom attribute values */
  public void testCustomAttributes() throws IOException {
    assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.",
        new String[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" },
        new int[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 },
        new int[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 },
        new int[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 }
    );
  }

  /** Tests tokenstream reuse */
  public void testReuse() throws IOException {
    assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\"",
        new String[] { "He", "said", "Are", "you", "going" },
        new int[] { 0, 3, 10, 14, 18 },
        new int[] { 2, 7, 13, 17, 23 },
        new int[] { 1, 1, 1, 1, 1 }
    );
    assertAnalyzesTo(sentenceAndWord, "John shook his head.",
        new String[] { "John", "shook", "his", "head" },
        new int[] { 0, 5, 11, 15 },
        new int[] { 4, 10, 14, 19 },
        new int[] { 1, 1, 1, 1 }
    );
  }

  /** Tests TokenStream.end() */
  public void testEnd() throws IOException {
    // BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
    // we add some junk whitespace to the end just to test it.
    assertAnalyzesTo(sentenceAndWord, "John shook his head ",
        new String[] { "John", "shook", "his", "head" }
    );
    assertAnalyzesTo(sentenceAndWord, "John shook his head. ",
        new String[] { "John", "shook", "his", "head" }
    );
  }
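  // The next two tests exercise inputs larger than the base class's internal
  // character buffer: SegmentingTokenizerBase reads input in fixed-size chunks,
  // so a term that straddles a buffer refill must still be produced intact, and
  // a single run longer than the buffer gets chopped at the buffer size (hence
  // the ten identical 1024-char tokens expected in testHugeTerm below).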
  /** Tests terms which span across boundaries */
  public void testHugeDoc() throws IOException {
    StringBuilder sb = new StringBuilder();
    char whitespace[] = new char[4094];
    Arrays.fill(whitespace, '\n');
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
    assertAnalyzesTo(sentenceAndWord, input, new String[] { "testing", "1234" });
  }

  /** Tests the handling of binary/malformed data */
  public void testHugeTerm() throws IOException {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 10240; i++) {
      sb.append('a');
    }
    String input = sb.toString();
    char token[] = new char[1024];
    Arrays.fill(token, 'a');
    String expectedToken = new String(token);
    String expected[] = {
        expectedToken, expectedToken, expectedToken, expectedToken, expectedToken,
        expectedToken, expectedToken, expectedToken, expectedToken, expectedToken
    };
    assertAnalyzesTo(sentence, input, expected);
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), sentence, 10000 * RANDOM_MULTIPLIER);
    checkRandomData(random(), sentenceAndWord, 10000 * RANDOM_MULTIPLIER);
  }

  // some tokenizers for testing
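  // A SegmentingTokenizerBase subclass implements two hooks: setNextSentence()
  // is invoked once for each segment found by the BreakIterator, and
  // incrementWord() is then called repeatedly to emit tokens from that segment,
  // returning false once the segment is exhausted so the base class advances
  // to the next one.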
  /** silly tokenizer that just returns whole sentences as tokens */
  static class WholeSentenceTokenizer extends SegmentingTokenizerBase {
    int sentenceStart, sentenceEnd;
    boolean hasSentence;

    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    public WholeSentenceTokenizer() {
      super(newAttributeFactory(), BreakIterator.getSentenceInstance(Locale.ROOT));
    }

    @Override
    protected void setNextSentence(int sentenceStart, int sentenceEnd) {
      this.sentenceStart = sentenceStart;
      this.sentenceEnd = sentenceEnd;
      hasSentence = true;
    }

    @Override
    protected boolean incrementWord() {
      if (hasSentence) {
        // emit the entire sentence as a single token, then report exhaustion
        hasSentence = false;
        clearAttributes();
        termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd - sentenceStart);
        offsetAtt.setOffset(correctOffset(offset + sentenceStart), correctOffset(offset + sentenceEnd));
        return true;
      } else {
        return false;
      }
    }
  }

  /**
   * simple tokenizer that bumps posinc by 1 for tokens after a
   * sentence boundary, to inhibit phrase queries without slop.
   */
  static class SentenceAndWordTokenizer extends SegmentingTokenizerBase {
    int sentenceStart, sentenceEnd;
    int wordStart, wordEnd;
    int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost

    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    public SentenceAndWordTokenizer() {
      super(newAttributeFactory(), BreakIterator.getSentenceInstance(Locale.ROOT));
    }

    @Override
    protected void setNextSentence(int sentenceStart, int sentenceEnd) {
      this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
      this.sentenceEnd = sentenceEnd;
      posBoost++;
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      posBoost = -1;
    }

    @Override
    protected boolean incrementWord() {
      // skip over non-word characters to find the start of the next word
      wordStart = wordEnd;
      while (wordStart < sentenceEnd) {
        if (Character.isLetterOrDigit(buffer[wordStart])) {
          break;
        }
        wordStart++;
      }
      if (wordStart == sentenceEnd) {
        return false; // no more words in this sentence
      }
      // extend the token to the end of the run of letters/digits
      wordEnd = wordStart + 1;
      while (wordEnd < sentenceEnd && Character.isLetterOrDigit(buffer[wordEnd])) {
        wordEnd++;
      }
      clearAttributes();
      termAtt.copyBuffer(buffer, wordStart, wordEnd - wordStart);
      offsetAtt.setOffset(correctOffset(offset + wordStart), correctOffset(offset + wordEnd));
      // posinc defaults to 1 after clearAttributes(); add the boost so the first
      // word after each sentence boundary (beyond the first) gets an increment of 2
      posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
      posBoost = 0;
      return true;
    }
  }
}