/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.pattern;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;

public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase {

  public void testGreedy() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("(foo)+");
    t.setReader(new StringReader("bar foofoo baz"));
    // the pattern matches greedily, so the run "foofoo" is consumed as a single delimiter
    assertTokenStreamContents(t,
        new String[] {"bar ", " baz"},
        new int[] {0, 10},
        new int[] {4, 14});
  }

  public void testBackToBack() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("foo");
    t.setReader(new StringReader("bar foofoo baz"));
    // back-to-back matches produce no empty token between them
    assertTokenStreamContents(t,
        new String[] {"bar ", " baz"},
        new int[] {0, 10},
        new int[] {4, 14});
  }

  public void testBigLookahead() throws Exception {
    StringBuilder b = new StringBuilder();
    for(int i=0;i<100;i++) {
      b.append('a');
    }
    b.append('b');
    Tokenizer t = new SimplePatternSplitTokenizer(b.toString());
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

    b = new StringBuilder();
    for(int i=0;i<200;i++) {
      b.append('a');
    }
    t.setReader(new StringReader(b.toString()));
    t.reset();
    // the pattern requires 100 a's followed by 'b'; the input contains no 'b', so the
    // long lookahead never completes a match and the whole input comes back as one token
    assertTrue(t.incrementToken());
    assertEquals(b.toString(), termAtt.toString());
    assertFalse(t.incrementToken());
  }

  public void testNoTokens() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer(".*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    String s;
    while (true) {
      s = TestUtil.randomUnicodeString(random());
      if (s.length() > 0) {
        break;
      }
    }
    t.setReader(new StringReader(s));
    t.reset();
    // ".*" consumes the entire input, so nothing is left to tokenize
    assertFalse(t.incrementToken());
  }

  public void testEmptyStringPatternNoMatch() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("a*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("bbb"));
    t.reset();
    // "a*" only ever matches the empty string here, which never splits
    assertTrue(t.incrementToken());
    assertEquals("bbb", termAtt.toString());
    assertFalse(t.incrementToken());
  }

  public void testSplitSingleCharWhitespace() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("a \tb   c"));
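    // "a \tb   c": whitespace at offsets 1-2 and 4-6, so tokens start at 0, 3 and 7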
    assertTokenStreamContents(t,
        new String[] {"a", "b", "c"},
        new int[] {0, 3, 7},
        new int[] {1, 4, 8});
  }

  public void testSplitMultiCharWhitespace() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("a \tb   c"));
    assertTokenStreamContents(t,
        new String[] {"a", "b", "c"},
        new int[] {0, 3, 7},
        new int[] {1, 4, 8});
  }

  public void testLeadingNonToken() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("    a c"));
    assertTokenStreamContents(t,
        new String[] {"a", "c"},
        new int[] {4, 6},
        new int[] {5, 7});
  }

  public void testTrailingNonToken() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("a c "));
    assertTokenStreamContents(t,
        new String[] {"a", "c"},
        new int[] {0, 2},
        new int[] {1, 3});
  }

  public void testEmptyStringPatternOneMatch() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("a*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("bbab"));
    assertTokenStreamContents(t,
        new String[] {"bb", "b"},
        new int[] {0, 3},
        new int[] {2, 4});
  }

  public void testEndOffset() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("a+");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
    t.setReader(new StringReader("aaabbb"));
    t.reset();
    assertTrue(t.incrementToken());
    assertEquals("bbb", termAtt.toString());
    assertFalse(t.incrementToken());
    t.end();
    assertEquals(6, offsetAtt.endOffset());
  }

  public void testFixedToken() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("aaaa");
    t.setReader(new StringReader("aaaaaaaaaaaaaaa"));
    // "aaaa" matches at 0, 4 and 8, leaving the final "aaa" as the only token
    assertTokenStreamContents(t,
        new String[] {"aaa"},
        new int[] {12},
        new int[] {15});
  }

  public void testBasic() throws Exception {
    String[][] tests = {
      // pattern        input                    output
      { "--",           "aaa--bbb--ccc",         "aaa bbb ccc" },
      { ":",            "aaa:bbb:ccc",           "aaa bbb ccc" },
      { ":",            "boo:and:foo",           "boo and foo" },
      { "o",            "boo:and:foo",           "b :and:f" },
    };

    for(String[] test : tests) {
      TokenStream stream = new SimplePatternSplitTokenizer(test[0]);
      ((Tokenizer) stream).setReader(new StringReader(test[1]));
      String out = tsToString(stream);
      assertEquals("pattern: "+test[0]+" with input: "+test[1], test[2], out);
    }
  }

  public void testNotDeterminized() throws Exception {
    // two distinct paths accept the same strings, so this automaton is
    // non-deterministic and the tokenizer must reject it
    Automaton a = new Automaton();
    int start = a.createState();
    int mid1 = a.createState();
    int mid2 = a.createState();
    int end = a.createState();
    a.setAccept(end, true);
    a.addTransition(start, mid1, 'a', 'z');
    a.addTransition(start, mid2, 'a', 'z');
    a.addTransition(mid1, end, 'b');
    a.addTransition(mid2, end, 'b');
    expectThrows(IllegalArgumentException.class, () -> {new SimplePatternSplitTokenizer(a);});
  }

  public void testOffsetCorrection() throws Exception {
    final String INPUT = "G&uuml;nther G&uuml;nther is here";

    // create MappingCharFilter
    List<String> mappingRules = new ArrayList<>();
    mappingRules.add( "\"&uuml;\" => \"ü\"" );
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("&uuml;", "ü");
    NormalizeCharMap normMap = builder.build();
    CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));
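
    // the tokenizer sees the filtered text ("Günther Günther is here") and splits
    // on the literal "Günther"; the CharFilter corrects the resulting offsets back
    // into the raw INPUT, hence the final offset of 33 == INPUT.length()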
SimplePatternSplitTokenizer("Günther"); stream.setReader(charStream); assertTokenStreamContents(stream, new String[] { " ", " is here" }, new int[] { 12, 25 }, new int[] { 13, 33 }, INPUT.length()); } /** * TODO: rewrite tests not to use string comparison. */ private static String tsToString(TokenStream in) throws IOException { StringBuilder out = new StringBuilder(); CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); // extra safety to enforce, that the state is not preserved and also // assign bogus values in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); in.reset(); while (in.incrementToken()) { if (out.length() > 0) { out.append(' '); } out.append(termAtt.toString()); in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); } in.close(); return out.toString(); } /** blast some random strings through the analyzer */ public void testRandomStrings() throws Exception { Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new SimplePatternSplitTokenizer("a"); return new TokenStreamComponents(tokenizer); } }; checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); a.close(); Analyzer b = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new SimplePatternSplitTokenizer("a"); return new TokenStreamComponents(tokenizer); } }; checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER); b.close(); } public void testEndLookahead() throws Exception { Tokenizer t = new SimplePatternSplitTokenizer("(ab)+"); t.setReader(new StringReader("aba")); assertTokenStreamContents(t, new String[] { "a" }, new int[] { 2 }, new int[] { 3 }, 3); } }