/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ngram;

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.TestUtil;

import com.carrotsearch.randomizedtesting.generators.RandomStrings;

/**
 * Tests {@link NGramTokenizer} for correctness.
 */
public class NGramTokenizerTest extends BaseTokenStreamTestCase {
  private StringReader input;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    input = new StringReader("abcde");
  }

  public void testInvalidInput() throws Exception {
    expectThrows(IllegalArgumentException.class, () -> {
      new NGramTokenizer(2, 1);
    });
  }

  public void testInvalidInput2() throws Exception {
    expectThrows(IllegalArgumentException.class, () -> {
      NGramTokenizer tok = new NGramTokenizer(0, 1);
      tok.setReader(input);
    });
  }

  public void testUnigrams() throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer(1, 1);
    tokenizer.setReader(input);
    assertTokenStreamContents(tokenizer,
        new String[]{"a", "b", "c", "d", "e"},
        new int[]{0, 1, 2, 3, 4},
        new int[]{1, 2, 3, 4, 5},
        5 /* abcde */);
  }

  public void testBigrams() throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer(2, 2);
    tokenizer.setReader(input);
    assertTokenStreamContents(tokenizer,
        new String[]{"ab", "bc", "cd", "de"},
        new int[]{0, 1, 2, 3},
        new int[]{2, 3, 4, 5},
        5 /* abcde */);
  }

  public void testNgrams() throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer(1, 3);
    tokenizer.setReader(input);
    assertTokenStreamContents(tokenizer,
        new String[]{"a", "ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e"},
        new int[]{0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4},
        new int[]{1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 5},
        null, null, null,
        5 /* abcde */,
        false);
  }

  public void testOversizedNgrams() throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer(6, 7);
    tokenizer.setReader(input);
    assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
  }

  public void testReset() throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer(1, 1);
    tokenizer.setReader(input);
    assertTokenStreamContents(tokenizer,
        new String[]{"a", "b", "c", "d", "e"},
        new int[]{0, 1, 2, 3, 4},
        new int[]{1, 2, 3, 4, 5},
        5 /* abcde */);
    tokenizer.setReader(new StringReader("abcde"));
    assertTokenStreamContents(tokenizer,
        new String[]{"a", "b", "c", "d", "e"},
        new int[]{0, 1, 2, 3, 4},
        new int[]{1, 2, 3, 4, 5},
        5 /* abcde */);
  }
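
  /**
   * A small deterministic sketch of pre-tokenization (the randomized tests below exercise it far
   * more thoroughly): grams are expected not to cross a non-token char, so tokenizing "ab cd"
   * with 1..2-grams and ' ' treated as a non-token char should only produce grams from within
   * "ab" and "cd". This test is not part of the original suite; the expected values mirror what
   * the brute-force checker further down in this class asserts, rather than any separately
   * documented behavior.
   */
  public void testSimplePreTokenization() throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer(1, 2) {
      @Override
      protected boolean isTokenChar(int chr) {
        return chr != ' '; // treat the space as a gram boundary
      }
    };
    tokenizer.setReader(new StringReader("ab cd"));
    assertTokenStreamContents(tokenizer,
        new String[]{"a", "ab", "b", "c", "cd", "d"},
        new int[]{0, 0, 1, 3, 3, 4},
        new int[]{1, 2, 2, 4, 5, 5},
        5 /* ab cd */);
  }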
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    int numIters = TEST_NIGHTLY ? 10 : 1;
    for (int i = 0; i < numIters; i++) {
      final int min = TestUtil.nextInt(random(), 2, 10);
      final int max = TestUtil.nextInt(random(), min, 20);
      Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new NGramTokenizer(min, max);
          return new TokenStreamComponents(tokenizer, tokenizer);
        }
      };
      checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER, 20);
      checkRandomData(random(), a, 10 * RANDOM_MULTIPLIER, 1027);
      a.close();
    }
  }

  private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {
    final String s = RandomStrings.randomAsciiOfLength(random(), length);
    testNGrams(minGram, maxGram, s, nonTokenChars);
  }

  private static void testNGrams(int minGram, int maxGram, String s, String nonTokenChars) throws IOException {
    testNGrams(minGram, maxGram, s, nonTokenChars, false);
  }

  /** Expands a string into its code points, so that supplementary characters count as one unit. */
  static int[] toCodePoints(CharSequence s) {
    final int[] codePoints = new int[Character.codePointCount(s, 0, s.length())];
    for (int i = 0, j = 0; i < s.length(); ++j) {
      codePoints[j] = Character.codePointAt(s, i);
      i += Character.charCount(codePoints[j]);
    }
    return codePoints;
  }

  /** Returns true if {@code codePoint} does not occur in {@code nonTokenChars}. */
  static boolean isTokenChar(String nonTokenChars, int codePoint) {
    for (int i = 0; i < nonTokenChars.length(); ) {
      final int cp = nonTokenChars.codePointAt(i);
      if (cp == codePoint) {
        return false;
      }
      i += Character.charCount(cp);
    }
    return true;
  }

  /**
   * Brute-force reference check: for every start position and every gram length in
   * [minGram, maxGram], a gram is expected iff it contains no non-token char (and, when
   * {@code edgesOnly} is set, iff it starts at the beginning of the input or right after a
   * non-token char). The tokenizer must emit exactly these grams, in this order, with
   * matching offsets.
   */
  static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
    // convert the string to code points
    final int[] codePoints = toCodePoints(s);
    final int[] offsets = new int[codePoints.length + 1];
    for (int i = 0; i < codePoints.length; ++i) {
      offsets[i + 1] = offsets[i] + Character.charCount(codePoints[i]);
    }
    final Tokenizer grams = new NGramTokenizer(minGram, maxGram, edgesOnly) {
      @Override
      protected boolean isTokenChar(int chr) {
        return nonTokenChars.indexOf(chr) < 0;
      }
    };
    grams.setReader(new StringReader(s));
    final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
    grams.reset();
    for (int start = 0; start < codePoints.length; ++start) {
      nextGram:
      for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
        if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
          // not on an edge
          continue nextGram;
        }
        for (int j = start; j < end; ++j) {
          if (!isTokenChar(nonTokenChars, codePoints[j])) {
            continue nextGram;
          }
        }
        assertTrue(grams.incrementToken());
        assertArrayEquals(Arrays.copyOfRange(codePoints, start, end), toCodePoints(termAtt));
        assertEquals(1, posIncAtt.getPositionIncrement());
        assertEquals(1, posLenAtt.getPositionLength());
        assertEquals(offsets[start], offsetAtt.startOffset());
        assertEquals(offsets[end], offsetAtt.endOffset());
      }
    }
    assertFalse(grams.incrementToken());
    grams.end();
    assertEquals(s.length(), offsetAtt.startOffset());
    assertEquals(s.length(), offsetAtt.endOffset());
  }
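
  /**
   * A deterministic companion to the randomized tests below, showing how the brute-force
   * checker above is meant to be invoked directly. Not part of the original suite; the
   * strings and gram sizes here are arbitrary illustrative picks.
   */
  public void testDeterministicNGrams() throws IOException {
    testNGrams(1, 3, "lucene", "");    // plain sliding window, no pre-tokenization
    testNGrams(1, 3, "a bc def", " "); // grams must not cross the spaces
  }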
  public void testLargeInput() throws IOException {
    // test sliding
    final int minGram = TestUtil.nextInt(random(), 1, 100);
    final int maxGram = TestUtil.nextInt(random(), minGram, 100);
    testNGrams(minGram, maxGram, TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
  }

  public void testLargeMaxGram() throws IOException {
    // test sliding with maxGram > 1024
    final int minGram = TestUtil.nextInt(random(), 1290, 1300);
    final int maxGram = TestUtil.nextInt(random(), minGram, 1300);
    testNGrams(minGram, maxGram, TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
  }

  public void testPreTokenization() throws IOException {
    final int minGram = TestUtil.nextInt(random(), 1, 100);
    final int maxGram = TestUtil.nextInt(random(), minGram, 100);
    testNGrams(minGram, maxGram, TestUtil.nextInt(random(), 0, 4 * 1024), "a");
  }

  public void testHeavyPreTokenization() throws IOException {
    final int minGram = TestUtil.nextInt(random(), 1, 100);
    final int maxGram = TestUtil.nextInt(random(), minGram, 100);
    testNGrams(minGram, maxGram, TestUtil.nextInt(random(), 0, 4 * 1024), "abcdef");
  }

  public void testFewTokenChars() throws IOException {
    final char[] chrs = new char[TestUtil.nextInt(random(), 4000, 5000)];
    Arrays.fill(chrs, ' ');
    for (int i = 0; i < chrs.length; ++i) {
      if (random().nextFloat() < 0.1) {
        chrs[i] = 'a';
      }
    }
    final int minGram = TestUtil.nextInt(random(), 1, 2);
    final int maxGram = TestUtil.nextInt(random(), minGram, 2);
    testNGrams(minGram, maxGram, new String(chrs), " ");
  }

  public void testFullUTF8Range() throws IOException {
    final int minGram = TestUtil.nextInt(random(), 1, 100);
    final int maxGram = TestUtil.nextInt(random(), minGram, 100);
    final String s = TestUtil.randomUnicodeString(random(), 4 * 1024);
    testNGrams(minGram, maxGram, s, "");
    testNGrams(minGram, maxGram, s, "abcdef");
  }
}