/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.CharBuffer;
import java.util.Arrays;
import java.util.HashSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockCharFilter;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;

@SuppressCodecs("Direct")
public class TestBugInSomething extends BaseTokenStreamTestCase {

  // Exercises a pinned-down analysis chain (MockCharFilter -> MappingCharFilter
  // -> reader wrapper, feeding MockTokenizer -> CommonGramsFilter) against a
  // fixed input string and checks the analysis is consistent across passes.
  public void test() throws Exception {
    final CharArraySet cas = new CharArraySet(3, false);
    cas.add("jjp");
    cas.add("wlmwoknt");
    cas.add("tcgyreo");

    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("mtqlpi", "");
    builder.add("mwoknt", "jjp");
    builder.add("tcgyreo", "zpfpajyws");
    final NormalizeCharMap map = builder.build();

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer t = new MockTokenizer(MockTokenFilter.ENGLISH_STOPSET, false, -65);
        TokenFilter f = new CommonGramsFilter(t, cas);
        return new TokenStreamComponents(t, f);
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        reader = new MockCharFilter(reader, 0);
        reader = new MappingCharFilter(map, reader);
        reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
        return reader;
      }
    };
    checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
    a.close();
  }

  // A CharFilter whose every method throws, used by testWrapping() to verify
  // that the wrapper under test delegates each call to its wrapped stream.
  CharFilter wrappedStream = new CharFilter(new StringReader("bogus")) {
    @Override
    public void mark(int readAheadLimit) {
      throw new UnsupportedOperationException("mark(int)");
    }

    @Override
    public boolean markSupported() {
      throw new UnsupportedOperationException("markSupported()");
    }

    @Override
    public int read() {
      throw new UnsupportedOperationException("read()");
    }

    @Override
    public int read(char[] cbuf) {
      throw new UnsupportedOperationException("read(char[])");
    }
UnsupportedOperationException("read(char[])"); } @Override public int read(CharBuffer target) { throw new UnsupportedOperationException("read(CharBuffer)"); } @Override public boolean ready() { throw new UnsupportedOperationException("ready()"); } @Override public void reset() { throw new UnsupportedOperationException("reset()"); } @Override public long skip(long n) { throw new UnsupportedOperationException("skip(long)"); } @Override public int correct(int currentOff) { throw new UnsupportedOperationException("correct(int)"); } @Override public void close() { throw new UnsupportedOperationException("close()"); } @Override public int read(char[] arg0, int arg1, int arg2) { throw new UnsupportedOperationException("read(char[], int, int)"); } }; public void testWrapping() throws Exception { CharFilter cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream); Exception expected = expectThrows(Exception.class, () -> { cs.mark(1); }); assertEquals("mark(int)", expected.getMessage()); expected = expectThrows(Exception.class, () -> { cs.markSupported(); }); assertEquals("markSupported()", expected.getMessage()); expected = expectThrows(Exception.class, () -> { cs.read(); }); assertEquals("read()", expected.getMessage()); expected = expectThrows(Exception.class, () -> { cs.read(new char[0]); }); assertEquals("read(char[])", expected.getMessage()); expected = expectThrows(Exception.class, () -> { cs.read(CharBuffer.wrap(new char[0])); }); assertEquals("read(CharBuffer)", expected.getMessage()); expected = expectThrows(Exception.class, () -> { cs.reset(); }); assertEquals("reset()", expected.getMessage()); expected = expectThrows(Exception.class, () -> { cs.skip(1); }); assertEquals("skip(long)", expected.getMessage()); expected = expectThrows(Exception.class, () -> { cs.correctOffset(1); }); assertEquals("correct(int)", expected.getMessage()); expected = expectThrows(Exception.class, () -> { cs.close(); }); assertEquals("close()", expected.getMessage()); expected = expectThrows(Exception.class, () -> { cs.read(new char[0], 0, 0); }); assertEquals("read(char[], int, int)", expected.getMessage()); } // todo: test framework? 
  static final class SopTokenFilter extends TokenFilter {

    SopTokenFilter(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        if (VERBOSE) System.out.println(input.getClass().getSimpleName() + "->" + this.reflectAsString(false));
        return true;
      } else {
        return false;
      }
    }

    @Override
    public void end() throws IOException {
      super.end();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".end()");
    }

    @Override
    public void close() throws IOException {
      super.close();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".close()");
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".reset()");
    }
  }

  // LUCENE-5269
  @Slow
  public void testUnicodeShinglesAndNgrams() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new EdgeNGramTokenizer(2, 94);
        //TokenStream stream = new SopTokenFilter(tokenizer);
        TokenStream stream = new ShingleFilter(tokenizer, 5);
        //stream = new SopTokenFilter(stream);
        stream = new NGramTokenFilter(stream, 55, 83);
        //stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
    checkRandomData(random(), analyzer, 2000);
    analyzer.close();
  }

  public void testCuriousWikipediaString() throws Exception {
    final CharArraySet protWords = new CharArraySet(new HashSet<>(
        Arrays.asList("rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha")), false);
    final byte[] table = new byte[] {
        -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63,
        5, 28, 97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106,
        -22, -51, 65, 5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71,
        -98, 44, 33, 86, 106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104,
        -24, 106, -16, 126, 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64,
        -119, 0, 92, 94, -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20
    };
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new WikipediaTokenizer();
        TokenStream stream = new SopTokenFilter(tokenizer);
        stream = new WordDelimiterFilter(stream, table, -50, protWords);
        stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
    checkAnalysisConsistency(random(), a, false, "B\u28c3\ue0f8[ \ud800\udfc2 </p> jb");
    a.close();
  }
}