/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.pattern;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TestPatternTokenizer extends BaseTokenStreamTestCase {

  public void testSplitting() throws Exception {
    String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
    String[][] tests = {
      // group  pattern        input                  output
      { "-1",   "--",          "aaa--bbb--ccc",       "aaa bbb ccc" },
      { "-1",   ":",           "aaa:bbb:ccc",         "aaa bbb ccc" },
      { "-1",   "\\p{Space}",  "aaa bbb \t\tccc ",    "aaa bbb ccc" },
      { "-1",   ":",           "boo:and:foo",         "boo and foo" },
      { "-1",   "o",           "boo:and:foo",         "b :and:f" },
      { "0",    ":",           "boo:and:foo",         ": :" },
      { "0",    qpattern,      "aaa 'bbb' 'ccc'",     "'bbb' 'ccc'" },
      { "1",    qpattern,      "aaa 'bbb' 'ccc'",     "bbb ccc" }
    };

    for (String[] test : tests) {
      TokenStream stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile(test[1]), Integer.parseInt(test[0]));
      ((Tokenizer) stream).setReader(new StringReader(test[2]));
      String out = tsToString(stream);
      // System.out.println( test[2] + " ==> " + out );

      assertEquals("pattern: " + test[1] + " with input: " + test[2], test[3], out);

      // Make sure it is the same as if we called 'split'
      // test disabled, as we remove empty tokens
      /*if( "-1".equals( test[0] ) ) {
        String[] split = test[2].split( test[1] );
        stream = tokenizer.create( new StringReader( test[2] ) );
        int i=0;
        for( Token t = stream.next(); null != t; t = stream.next() ) {
          assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
        }
      }*/
    }
  }

  public void testOffsetCorrection() throws Exception {
    final String INPUT = "G&uuml;nther G&uuml;nther is here";

    // create a MappingCharFilter that rewrites the HTML entity to the umlaut
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("&uuml;", "ü");
    NormalizeCharMap normMap = builder.build();
    CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

    // create PatternTokenizer; offsets must be corrected back into the
    // original (unmapped) input
    Tokenizer stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("[,;/\\s]+"), -1);
    stream.setReader(charStream);
    assertTokenStreamContents(stream,
        new String[] { "Günther", "Günther", "is", "here" },
        new int[] { 0, 13, 26, 29 },
        new int[] { 12, 25, 28, 33 },
        INPUT.length());
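
    // Tokenize the same input again with group 0 of a literal pattern, so
    // only the "Günther" matches themselves become tokens; the offsets must
    // still point into the original, pre-mapping input.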
    charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
    stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("Günther"), 0);
    stream.setReader(charStream);
    assertTokenStreamContents(stream,
        new String[] { "Günther", "Günther" },
        new int[] { 0, 13 },
        new int[] { 12, 25 },
        INPUT.length());
  }

  /**
   * TODO: rewrite tests not to use string comparison.
   */
  private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety: enforce that no state is preserved between tokens by
    // assigning bogus values before each increment
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    in.reset();
    while (in.incrementToken()) {
      if (out.length() > 0) out.append(' ');
      out.append(termAtt.toString());
      in.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
    }

    in.close();
    return out.toString();
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), -1);
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    a.close();

    Analyzer b = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), 0);
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
    b.close();
  }

  // LUCENE-6814
  public void testHeapFreedAfterClose() throws Exception {
    // TODO: can we move this to BaseTSTC to catch other "hangs onto heap"ers?

    // Build a 1MB string:
    StringBuilder b = new StringBuilder();
    for (int i = 0; i < 1024; i++) {
      // 1023 spaces, then an x
      for (int j = 0; j < 1023; j++) {
        b.append(' ');
      }
      b.append('x');
    }

    String big = b.toString();

    Pattern x = Pattern.compile("x");

    List<Tokenizer> tokenizers = new ArrayList<>();
    for (int i = 0; i < 512; i++) {
      Tokenizer stream = new PatternTokenizer(x, -1);
      tokenizers.add(stream);
      stream.setReader(new StringReader(big));
      stream.reset();
      for (int j = 0; j < 1024; j++) {
        assertTrue(stream.incrementToken());
      }
      assertFalse(stream.incrementToken());
      stream.end();
      stream.close();
    }
  }
}