TestBugInSomething.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.CharBuffer;
import java.util.Arrays;
import java.util.HashSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockCharFilter;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;


@SuppressCodecs("Direct")
public class TestBugInSomething extends BaseTokenStreamTestCase {
  public void test() throws Exception {
    final CharArraySet cas = new CharArraySet(3, false);
    cas.add("jjp");
    cas.add("wlmwoknt");
    cas.add("tcgyreo");
    
    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("mtqlpi", "");
    builder.add("mwoknt", "jjp");
    builder.add("tcgyreo", "zpfpajyws");
    final NormalizeCharMap map = builder.build();
    
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer t = new MockTokenizer(MockTokenFilter.ENGLISH_STOPSET, false, -65);
        TokenFilter f = new CommonGramsFilter(t, cas);
        return new TokenStreamComponents(t, f);
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        reader = new MockCharFilter(reader, 0);
        reader = new MappingCharFilter(map, reader);
        reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
        return reader;
      }
    };
    checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
    a.close();
  }
  
  CharFilter wrappedStream = new CharFilter(new StringReader("bogus")) {

    @Override
    public void mark(int readAheadLimit) {
      throw new UnsupportedOperationException("mark(int)");
    }

    @Override
    public boolean markSupported() {
      throw new UnsupportedOperationException("markSupported()");
    }

    @Override
    public int read() {
      throw new UnsupportedOperationException("read()");
    }

    @Override
    public int read(char[] cbuf) {
      throw new UnsupportedOperationException("read(char[])");
    }

    @Override
    public int read(CharBuffer target) {
      throw new UnsupportedOperationException("read(CharBuffer)");
    }

    @Override
    public boolean ready() {
      throw new UnsupportedOperationException("ready()");
    }

    @Override
    public void reset() {
      throw new UnsupportedOperationException("reset()");
    }

    @Override
    public long skip(long n) {
      throw new UnsupportedOperationException("skip(long)");
    }

    @Override
    public int correct(int currentOff) {
      throw new UnsupportedOperationException("correct(int)");
    }

    @Override
    public void close() {
      throw new UnsupportedOperationException("close()");
    }

    @Override
    public int read(char[] arg0, int arg1, int arg2) {
      throw new UnsupportedOperationException("read(char[], int, int)");
    }
  };
  
  public void testWrapping() throws Exception {
    CharFilter cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);
    Exception expected = expectThrows(Exception.class, () -> {
      cs.mark(1);
    });
    assertEquals("mark(int)", expected.getMessage());
    
    expected = expectThrows(Exception.class, () -> {
      cs.markSupported();
    });
    assertEquals("markSupported()", expected.getMessage());
    
    expected = expectThrows(Exception.class, () -> {
      cs.read();
    });
    assertEquals("read()", expected.getMessage());
    
    expected = expectThrows(Exception.class, () -> {
      cs.read(new char[0]);
    });
    assertEquals("read(char[])", expected.getMessage());
    
    expected = expectThrows(Exception.class, () -> {
      cs.read(CharBuffer.wrap(new char[0]));
    });
    assertEquals("read(CharBuffer)", expected.getMessage());
    
    expected = expectThrows(Exception.class, () -> {
      cs.reset();
    });
    assertEquals("reset()", expected.getMessage());
    
    expected = expectThrows(Exception.class, () -> {
      cs.skip(1);
    });
    assertEquals("skip(long)", expected.getMessage());
    
    expected = expectThrows(Exception.class, () -> {
      cs.correctOffset(1);
    });
    assertEquals("correct(int)", expected.getMessage());
    
    expected = expectThrows(Exception.class, () -> {
      cs.close();
    });
    assertEquals("close()", expected.getMessage());
    
    expected = expectThrows(Exception.class, () -> {
      cs.read(new char[0], 0, 0);
    });
    assertEquals("read(char[], int, int)", expected.getMessage());
  }
  
  // todo: test framework?
  
  static final class SopTokenFilter extends TokenFilter {

    SopTokenFilter(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        if (VERBOSE) System.out.println(input.getClass().getSimpleName() + "->" + this.reflectAsString(false));
        return true;
      } else {
        return false;
      }
    }

    @Override
    public void end() throws IOException {
      super.end();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".end()");
    }

    @Override
    public void close() throws IOException {
      super.close();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".close()");
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".reset()");
    }
  }
  
  // LUCENE-5269
  @Slow
  public void testUnicodeShinglesAndNgrams() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new EdgeNGramTokenizer(2, 94);
        //TokenStream stream = new SopTokenFilter(tokenizer);
        TokenStream stream = new ShingleFilter(tokenizer, 5);
        //stream = new SopTokenFilter(stream);
        stream = new NGramTokenFilter(stream, 55, 83);
        //stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
      }  
    };
    checkRandomData(random(), analyzer, 2000);
    analyzer.close();
  }
  
  public void testCuriousWikipediaString() throws Exception {
    final CharArraySet protWords = new CharArraySet(new HashSet<>(
        Arrays.asList("rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha")), false);
    final byte table[] = new byte[] { 
        -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63, 
        5, 28, 97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106, 
        -22, -51, 65, 5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71, 
        -98, 44, 33, 86, 106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104,
        -24, 106, -16, 126, 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64, 
        -119, 0, 92, 94, -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20
    };
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new WikipediaTokenizer();
        TokenStream stream = new SopTokenFilter(tokenizer);
        stream = new WordDelimiterFilter(stream, table, -50, protWords);
        stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
      }  
    };
    checkAnalysisConsistency(random(), a, false, "B\u28c3\ue0f8[ \ud800\udfc2 </p> jb");
    a.close();
  }
}