/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/**
 * Tests for the basic core analyzers: {@link SimpleAnalyzer},
 * {@link WhitespaceAnalyzer}, {@link StopAnalyzer}, and the
 * lower/upper-case filters, including full-Unicode (supplementary
 * character and unpaired surrogate) behavior.
 */
public class TestAnalyzers extends BaseTokenStreamTestCase {

  /**
   * SimpleAnalyzer: splits on non-letters and lowercases, so digits and
   * punctuation are dropped entirely.
   */
  public void testSimple() throws Exception {
    Analyzer a = new SimpleAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo bar . FOO <> BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo.bar.FOO.BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "U.S.A.",
                     new String[] { "u", "s", "a" });
    assertAnalyzesTo(a, "C++",
                     new String[] { "c" });
    assertAnalyzesTo(a, "B2B",
                     new String[] { "b", "b" });
    assertAnalyzesTo(a, "2B",
                     new String[] { "b" });
    assertAnalyzesTo(a, "\"QUOTED\" word",
                     new String[] { "quoted", "word" });
    // normalize() lowercases but keeps punctuation/digits untouched
    assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
    a.close();
  }

  /**
   * WhitespaceAnalyzer: splits only on whitespace; case, punctuation and
   * digits are preserved, and normalize() is a no-op.
   */
  public void testNull() throws Exception {
    Analyzer a = new WhitespaceAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "FOO", "BAR" });
    assertAnalyzesTo(a, "foo bar . FOO <> BAR",
                     new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
    assertAnalyzesTo(a, "foo.bar.FOO.BAR",
                     new String[] { "foo.bar.FOO.BAR" });
    assertAnalyzesTo(a, "U.S.A.",
                     new String[] { "U.S.A." });
    assertAnalyzesTo(a, "C++",
                     new String[] { "C++" });
    assertAnalyzesTo(a, "B2B",
                     new String[] { "B2B" });
    assertAnalyzesTo(a, "2B",
                     new String[] { "2B" });
    assertAnalyzesTo(a, "\"QUOTED\" word",
                     new String[] { "\"QUOTED\"", "word" });
    assertEquals(new BytesRef("\"\\À3[]()! Cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
    a.close();
  }

  /**
   * StopAnalyzer: lowercases and removes stopwords ("a", "such", "these")
   * at tokenization time; normalize() lowercases but does NOT remove
   * stopwords (hence "the" survives).
   */
  public void testStop() throws Exception {
    Analyzer a = new StopAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
    // stopwords are only filtered during analysis, not normalization
    assertEquals(new BytesRef("the"), a.normalize("dummy", "the"));
    a.close();
  }

  /**
   * Consumes the stream and checks that each token carries the payload
   * {@link PayloadSetter} assigned: a single byte counting up from 1.
   * This verifies that the payload byte[] set on one token is not
   * clobbered when the filter mutates its shared buffer for the next.
   */
  void verifyPayload(TokenStream ts) throws IOException {
    PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
    ts.reset();
    for(byte b=1;;b++) {
      boolean hasNext = ts.incrementToken();
      if (!hasNext) break;
      // System.out.println("id="+System.identityHashCode(nextToken) + " " + t);
      // System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]);
      assertEquals(b, payloadAtt.getPayload().bytes[0]);
    }
  }

  // Make sure old style next() calls result in a new copy of payloads
  public void testPayloadCopy() throws IOException {
    String s = "how now brown cow";
    TokenStream ts;
    ts = new WhitespaceTokenizer();
    ((Tokenizer)ts).setReader(new StringReader(s));
    ts = new PayloadSetter(ts);
    verifyPayload(ts);

    // run the same pipeline a second time on a fresh tokenizer
    ts = new WhitespaceTokenizer();
    ((Tokenizer)ts).setReader(new StringReader(s));
    ts = new PayloadSetter(ts);
    verifyPayload(ts);
  }

  // LUCENE-1150: Just a compile time test, to ensure the
  // StandardAnalyzer constants remain publicly accessible
  @SuppressWarnings("unused")
  public void _testStandardConstants() {
    int x = StandardTokenizer.ALPHANUM;
    x = StandardTokenizer.APOSTROPHE;
    x = StandardTokenizer.ACRONYM;
    x = StandardTokenizer.COMPANY;
    x = StandardTokenizer.EMAIL;
    x = StandardTokenizer.HOST;
    x = StandardTokenizer.NUM;
    x = StandardTokenizer.CJ;
    String[] y = StandardTokenizer.TOKEN_TYPES;
  }

  /** Whitespace tokenization (randomly classic or Unicode) + LowerCaseFilter. */
  private static class LowerCaseWhitespaceAnalyzer extends Analyzer {

    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      // randomly pick one of the two whitespace tokenizers; both must behave
      // identically for the inputs used by these tests
      Tokenizer tokenizer = random().nextBoolean() ? new WhitespaceTokenizer()
                                                   : new UnicodeWhitespaceTokenizer();
      return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
    }
  }

  /** Whitespace tokenization (randomly classic or Unicode) + UpperCaseFilter. */
  private static class UpperCaseWhitespaceAnalyzer extends Analyzer {

    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = random().nextBoolean() ? new WhitespaceTokenizer()
                                                   : new UnicodeWhitespaceTokenizer();
      return new TokenStreamComponents(tokenizer, new UpperCaseFilter(tokenizer));
    }
  }

  /**
   * Test that LowercaseFilter handles entire unicode range correctly
   */
  public void testLowerCaseFilter() throws IOException {
    Analyzer a = new LowerCaseWhitespaceAnalyzer();
    // BMP
    assertAnalyzesTo(a, "AbaCaDabA", new String[] { "abacadaba" });
    // supplementary: U+10416 (Deseret capital) lowercases to U+1043E
    assertAnalyzesTo(a, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
        new String[] {"\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e"});
    assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA",
        new String[] { "abaca\ud801\udc3edaba" });
    // unpaired lead surrogate: passed through unchanged
    assertAnalyzesTo(a, "AbaC\uD801AdaBa",
        new String [] { "abac\uD801adaba" });
    // unpaired trail surrogate: passed through unchanged
    assertAnalyzesTo(a, "AbaC\uDC16AdaBa",
        new String [] { "abac\uDC16adaba" });
    a.close();
  }

  /**
   * Test that LowercaseFilter handles entire unicode range correctly
   */
  public void testUpperCaseFilter() throws IOException {
    Analyzer a = new UpperCaseWhitespaceAnalyzer();
    // BMP
    assertAnalyzesTo(a, "AbaCaDabA", new String[] { "ABACADABA" });
    // supplementary: U+1043E (Deseret small) uppercases to U+10416
    assertAnalyzesTo(a, "\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e",
        new String[] {"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16"});
    assertAnalyzesTo(a, "AbaCa\ud801\udc3eDabA",
        new String[] { "ABACA\ud801\udc16DABA" });
    // unpaired lead surrogate: passed through unchanged
    assertAnalyzesTo(a, "AbaC\uD801AdaBa",
        new String [] { "ABAC\uD801ADABA" });
    // unpaired trail surrogate: passed through unchanged
    assertAnalyzesTo(a, "AbaC\uDC16AdaBa",
        new String [] { "ABAC\uDC16ADABA" });
    a.close();
  }

  /**
   * Test that LowercaseFilter handles the lowercasing correctly if the term
   * buffer has a trailing surrogate character leftover and the current term in
   * the buffer ends with a corresponding leading surrogate.
   */
  public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
    // test if the limit of the termbuffer is correctly used with supplementary
    // chars
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("BogustermBogusterm\udc16"));
    LowerCaseFilter filter = new LowerCaseFilter(tokenizer);
    assertTokenStreamContents(filter, new String[] {"bogustermbogusterm\udc16"});
    filter.reset();
    // second pass reuses the same term buffer, which still holds the trail
    // surrogate from the previous run just past the new term's end
    String highSurEndingUpper = "BogustermBoguster\ud801";
    String highSurEndingLower = "bogustermboguster\ud801";
    tokenizer.setReader(new StringReader(highSurEndingUpper));
    assertTokenStreamContents(filter, new String[] {highSurEndingLower});
    assertTrue(filter.hasAttribute(CharTermAttribute.class));
    char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
    int length = highSurEndingLower.length();
    // the trailing lead surrogate must NOT have been combined with the stale
    // trail surrogate left over in the buffer beyond the term's length
    assertEquals('\ud801', termBuffer[length - 1]);
  }

  /** LowerCaseTokenizer lowercases supplementary characters as full code points. */
  public void testLowerCaseTokenizer() throws IOException {
    StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
    LowerCaseTokenizer tokenizer = new LowerCaseTokenizer();
    tokenizer.setReader(reader);
    assertTokenStreamContents(tokenizer, new String[] { "tokenizer",
        "\ud801\udc44test" });
  }

  /** WhitespaceTokenizer keeps supplementary characters intact and unchanged. */
  public void testWhitespaceTokenizer() throws IOException {
    StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(reader);
    assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
        "\ud801\udc1ctest" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer analyzers[] = new Analyzer[] {
        new WhitespaceAnalyzer(),
        new SimpleAnalyzer(),
        new StopAnalyzer(),
        new UnicodeWhitespaceAnalyzer()
    };
    for (Analyzer analyzer : analyzers) {
      checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
    }
    IOUtils.close(analyzers);
  }

  /** blast some random large strings through the analyzer */
  public void testRandomHugeStrings() throws Exception {
    Analyzer analyzers[] = new Analyzer[] {
        new WhitespaceAnalyzer(),
        new SimpleAnalyzer(),
        new StopAnalyzer(),
        new UnicodeWhitespaceAnalyzer()
    };
    for (Analyzer analyzer : analyzers) {
      checkRandomData(random(), analyzer, 100*RANDOM_MULTIPLIER, 8192);
    }
    IOUtils.close(analyzers);
  }
}

/**
 * Test filter that stamps each token with a one-byte payload that counts up
 * from 1.  The byte[] backing the payload is deliberately REUSED and mutated
 * in place across tokens, so consumers that hold onto a payload without
 * copying it will observe the mutation — see
 * {@link TestAnalyzers#testPayloadCopy()}.
 */
final class PayloadSetter extends TokenFilter {
  PayloadAttribute payloadAtt;

  public PayloadSetter(TokenStream input) {
    super(input);
    payloadAtt = addAttribute(PayloadAttribute.class);
  }

  // shared, mutated-in-place payload storage (intentional for the test)
  byte[] data = new byte[1];
  BytesRef p = new BytesRef(data,0,1);

  @Override
  public boolean incrementToken() throws IOException {
    boolean hasNext = input.incrementToken();
    if (!hasNext) return false;
    payloadAtt.setPayload(p);  // reuse the payload / byte[]
    data[0]++;
    return true;
  }
}