package org.apache.lucene.analysis;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Tests for TeeTokenFilter and SinkTokenizer.
 */
public class TestTeeTokenFilter extends LuceneTestCase {
  protected StringBuffer buffer1;
  protected StringBuffer buffer2;
  protected String[] tokens1;
  protected String[] tokens2;

  public TestTeeTokenFilter(String s) {
    super(s);
  }

  protected void setUp() throws Exception {
    super.setUp();
    tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
    tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
    buffer1 = new StringBuffer();
    for (int i = 0; i < tokens1.length; i++) {
      buffer1.append(tokens1[i]).append(' ');
    }
    buffer2 = new StringBuffer();
    for (int i = 0; i < tokens2.length; i++) {
      buffer2.append(tokens2[i]).append(' ');
    }
  }

  public void test() throws IOException {
    // Sink that keeps only tokens whose term is "The" (case-insensitive).
    SinkTokenizer sink1 = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.term().equalsIgnoreCase("The")) {
          super.add(t);
        }
      }
    };
    TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);
    int i = 0;
    final Token reusableToken = new Token();
    // The tee must pass every token through unchanged.
    for (Token nextToken = source.next(reusableToken); nextToken != null; nextToken = source.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
    assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);
    i = 0;
    for (Token token = sink1.next(reusableToken); token != null; token = sink1.next(reusableToken)) {
      assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The"));
      i++;
    }
    assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size());
  }

  public void testMultipleSources() throws Exception {
    SinkTokenizer theDetector = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.term().equalsIgnoreCase("The")) {
          super.add(t);
        }
      }
    };
    SinkTokenizer dogDetector = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.term().equalsIgnoreCase("Dogs")) {
          super.add(t);
        }
      }
    };
    // source1 is cached so it can be reset and replayed below.
    TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector));
    TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);
    int i = 0;
    final Token reusableToken = new Token();
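    // Consume source1; both sinks collect their matching tokens as a side effect of the tee.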
    for (Token nextToken = source1.next(reusableToken); nextToken != null; nextToken = source1.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);
    i = 0;
    for (Token nextToken = source2.next(reusableToken); nextToken != null; nextToken = source2.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + tokens2[i], nextToken.term().equals(tokens2[i]));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length);
    // The sinks keep accumulating across both sources.
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);
    i = 0;
    for (Token nextToken = theDetector.next(reusableToken); nextToken != null; nextToken = theDetector.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + "The", nextToken.term().equalsIgnoreCase("The"));
      i++;
    }
    assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());
    i = 0;
    for (Token nextToken = dogDetector.next(reusableToken); nextToken != null; nextToken = dogDetector.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + "Dogs", nextToken.term().equalsIgnoreCase("Dogs"));
      i++;
    }
    assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size());
    // Replay the cached tokens through an additional filter.
    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
    for (Token nextToken = lowerCasing.next(reusableToken); nextToken != null; nextToken = lowerCasing.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + tokens1[i].toLowerCase(), nextToken.term().equals(tokens1[i].toLowerCase()));
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
  }

  /**
   * Not an explicit test, just useful for printing out some info on performance.
   *
   * @throws Exception
   */
  public void performance() throws Exception {
    int[] tokCount = {100, 500, 1000, 2000, 5000, 10000};
    int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500};
    for (int k = 0; k < tokCount.length; k++) {
      StringBuffer buffer = new StringBuffer();
      System.out.println("-----Tokens: " + tokCount[k] + "-----");
      for (int i = 0; i < tokCount[k]; i++) {
        buffer.append(English.intToEnglish(i).toUpperCase()).append(' ');
      }
      // make sure we produce the same tokens
      ModuloSinkTokenizer sink = new ModuloSinkTokenizer(tokCount[k], 100);
      final Token reusableToken = new Token();
      TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
      // Exhaust the stream so the sink gets populated.
      while (stream.next(reusableToken) != null) {
      }
      stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100);
      List tmp = new ArrayList();
      for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
        tmp.add(nextToken.clone());
      }
      List sinkList = sink.getTokens();
      assertTrue("tmp Size: " + tmp.size() + " is not: " + sinkList.size(), tmp.size() == sinkList.size());
      for (int i = 0; i < tmp.size(); i++) {
        Token tfTok = (Token) tmp.get(i);
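        // The token kept by ModuloTokenFilter should match the one captured by the sink at the same position.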
        Token sinkTok = (Token) sinkList.get(i);
        assertTrue(tfTok.term() + " is not equal to " + sinkTok.term() + " at token: " + i, tfTok.term().equals(sinkTok.term()));
      }
      // simulate two fields, each being analyzed once, for 20 documents
      for (int j = 0; j < modCounts.length; j++) {
        int tfPos = 0;
        long start = System.currentTimeMillis();
        for (int i = 0; i < 20; i++) {
          stream = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString())));
          for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
            tfPos += nextToken.getPositionIncrement();
          }
          stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]);
          for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
            tfPos += nextToken.getPositionIncrement();
          }
        }
        long finish = System.currentTimeMillis();
        System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
        int sinkPos = 0;
        // simulate one field with one sink
        start = System.currentTimeMillis();
        for (int i = 0; i < 20; i++) {
          sink = new ModuloSinkTokenizer(tokCount[k], modCounts[j]);
          stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
          for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
            sinkPos += nextToken.getPositionIncrement();
          }
          //System.out.println("Modulo--------");
          stream = sink;
          for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
            sinkPos += nextToken.getPositionIncrement();
          }
        }
        finish = System.currentTimeMillis();
        System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
        assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
      }
      System.out.println("- End Tokens: " + tokCount[k] + "-----");
    }
  }

  class ModuloTokenFilter extends TokenFilter {
    int modCount;
    int count = 0;

    ModuloTokenFilter(TokenStream input, int mc) {
      super(input);
      modCount = mc;
    }

    // return only every modCount-th token
    public Token next(final Token reusableToken) throws IOException {
      Token nextToken = null;
      for (nextToken = input.next(reusableToken);
           nextToken != null && count % modCount != 0;
           nextToken = input.next(reusableToken)) {
        count++;
      }
      count++;
      return nextToken;
    }
  }

  class ModuloSinkTokenizer extends SinkTokenizer {
    int count = 0;
    int modCount;

    ModuloSinkTokenizer(int numToks, int mc) {
      modCount = mc;
      // initial capacity: roughly one kept token per modCount input tokens
      lst = new ArrayList(numToks / mc);
    }

    public void add(Token t) {
      if (t != null && count % modCount == 0) {
        super.add(t);
      }
      count++;
    }
  }
}