package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.Version;

public class TestAnalyzers extends BaseTokenStreamTestCase {

  public TestAnalyzers(String name) {
    super(name);
  }

  // SimpleAnalyzer splits on non-letters and lower-cases each token
  public void testSimple() throws Exception {
    Analyzer a = new SimpleAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo bar . FOO <> BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo.bar.FOO.BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "U.S.A.",
                     new String[] { "u", "s", "a" });
    assertAnalyzesTo(a, "C++",
                     new String[] { "c" });
    assertAnalyzesTo(a, "B2B",
                     new String[] { "b", "b" });
    assertAnalyzesTo(a, "2B",
                     new String[] { "b" });
    assertAnalyzesTo(a, "\"QUOTED\" word",
                     new String[] { "quoted", "word" });
  }

  // WhitespaceAnalyzer splits only on whitespace and leaves tokens unchanged
  public void testNull() throws Exception {
    Analyzer a = new WhitespaceAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "FOO", "BAR" });
    assertAnalyzesTo(a, "foo bar . FOO <> BAR",
                     new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
    assertAnalyzesTo(a, "foo.bar.FOO.BAR",
                     new String[] { "foo.bar.FOO.BAR" });
    assertAnalyzesTo(a, "U.S.A.",
                     new String[] { "U.S.A." });
    assertAnalyzesTo(a, "C++",
                     new String[] { "C++" });
    assertAnalyzesTo(a, "B2B",
                     new String[] { "B2B" });
    assertAnalyzesTo(a, "2B",
                     new String[] { "2B" });
    assertAnalyzesTo(a, "\"QUOTED\" word",
                     new String[] { "\"QUOTED\"", "word" });
  }

  // StopAnalyzer lower-cases and removes English stop words
  public void testStop() throws Exception {
    Analyzer a = new StopAnalyzer(Version.LUCENE_CURRENT);
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
  }

  void verifyPayload(TokenStream ts) throws IOException {
    PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
    for (byte b = 1; ; b++) {
      boolean hasNext = ts.incrementToken();
      if (!hasNext) break;
      // System.out.println("id="+System.identityHashCode(nextToken) + " " + t);
      // System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]);
      assertEquals(b, payloadAtt.getPayload().toByteArray()[0]);
    }
  }

  // Make sure old style next() calls result in a new copy of payloads
  public void testPayloadCopy() throws IOException {
    String s = "how now brown cow";
    TokenStream ts;
    ts = new WhitespaceTokenizer(new StringReader(s));
    ts = new PayloadSetter(ts);
    verifyPayload(ts);

    ts = new WhitespaceTokenizer(new StringReader(s));
    ts = new PayloadSetter(ts);
    verifyPayload(ts);
  }

  // LUCENE-1150: Just a compile time test, to ensure the
  // StandardAnalyzer constants remain publicly accessible
  public void _testStandardConstants() {
    int x = StandardTokenizer.ALPHANUM;
    x = StandardTokenizer.APOSTROPHE;
    x = StandardTokenizer.ACRONYM;
    x = StandardTokenizer.COMPANY;
    x = StandardTokenizer.EMAIL;
    x = StandardTokenizer.HOST;
    x = StandardTokenizer.NUM;
    x = StandardTokenizer.CJ;
    String[] y = StandardTokenizer.TOKEN_TYPES;
  }

  private static class MyStandardAnalyzer extends StandardAnalyzer {
    public MyStandardAnalyzer() {
      super(org.apache.lucene.util.Version.LUCENE_CURRENT);
    }

    @Override
    public TokenStream tokenStream(String field, Reader reader) {
      return new WhitespaceAnalyzer().tokenStream(field, reader);
    }
  }

  public void testSubclassOverridingOnlyTokenStream() throws Throwable {
    Analyzer a = new MyStandardAnalyzer();
    TokenStream ts = a.reusableTokenStream("field", new StringReader("the"));
    // StandardAnalyzer will discard "the" (it's a
    // stopword), but my subclass will not:
    assertTrue(ts.incrementToken());
    assertFalse(ts.incrementToken());
  }
}

// Attaches a single-byte payload to every token, incrementing the byte value
// each time, so verifyPayload() can check that payloads are copied per token.
class PayloadSetter extends TokenFilter {
  PayloadAttribute payloadAtt;

  public PayloadSetter(TokenStream input) {
    super(input);
    payloadAtt = addAttribute(PayloadAttribute.class);
  }

  byte[] data = new byte[1];
  Payload p = new Payload(data, 0, 1);

  @Override
  public boolean incrementToken() throws IOException {
    boolean hasNext = input.incrementToken();
    if (!hasNext) return false;
    payloadAtt.setPayload(p);  // reuse the payload / byte[]
    data[0]++;
    return true;
  }
}