/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.core; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; public class TestFlattenGraphFilter extends BaseTokenStreamTestCase { private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) { final Token t = new Token(term, startOffset, endOffset); t.setPositionIncrement(posInc); t.setPositionLength(posLength); return t; } public void testSimpleMock() throws Exception { Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); TokenStream ts = new FlattenGraphFilter(tokenizer); return new TokenStreamComponents(tokenizer, ts); } }; assertAnalyzesTo(a, "wtf happened", new String[] {"wtf", "happened"}, new int[] { 0, 4}, new int[] { 3, 12}, null, new int[] { 1, 1}, new int[] { 1, 1}, true); } // Make sure graph is unchanged if it's already flat public void testAlreadyFlatten() throws Exception { TokenStream in = new CannedTokenStream(0, 12, new Token[] { token("wtf", 1, 1, 0, 3), token("what", 0, 1, 0, 3), token("wow", 0, 1, 0, 3), token("the", 1, 1, 0, 3), token("that's", 0, 1, 0, 3), token("fudge", 1, 1, 0, 3), token("funny", 0, 1, 0, 3), token("happened", 1, 1, 4, 12) }); TokenStream out = new FlattenGraphFilter(in); // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: assertTokenStreamContents(out, new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"}, new int[] {0, 0, 0, 0, 0, 0, 0, 4}, new int[] {3, 3, 3, 3, 3, 3, 3, 12}, new int[] {1, 0, 0, 1, 0, 1, 0, 1}, new int[] {1, 1, 1, 1, 1, 1, 1, 1}, 12); } public void testWTF1() throws Exception { // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input: TokenStream in = new CannedTokenStream(0, 12, new Token[] { token("wtf", 1, 5, 0, 3), token("what", 0, 1, 0, 3), token("wow", 0, 3, 0, 3), token("the", 1, 1, 0, 3), token("fudge", 1, 3, 0, 3), token("that's", 1, 1, 0, 3), token("funny", 1, 1, 0, 3), token("happened", 1, 1, 4, 12) }); TokenStream out = new FlattenGraphFilter(in); // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: assertTokenStreamContents(out, new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"}, new int[] {0, 0, 0, 0, 0, 0, 0, 4}, new int[] {3, 3, 3, 3, 3, 3, 3, 12}, new int[] {1, 0, 0, 1, 0, 1, 0, 1}, new int[] {3, 1, 1, 1, 1, 1, 1, 1}, 12); } /** Same as testWTF1 except the "wtf" token comes out later */ public void testWTF2() throws Exception { // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input: TokenStream in = new CannedTokenStream(0, 12, new Token[] { token("what", 1, 1, 0, 3), token("wow", 0, 3, 0, 3), token("wtf", 0, 5, 0, 3), token("the", 1, 1, 0, 3), token("fudge", 1, 3, 0, 3), token("that's", 1, 1, 0, 3), token("funny", 1, 1, 0, 3), token("happened", 1, 1, 4, 12) }); TokenStream out = new FlattenGraphFilter(in); // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: assertTokenStreamContents(out, new String[] {"what", "wow", "wtf", "the", "that's", "fudge", "funny", "happened"}, new int[] {0, 0, 0, 0, 0, 0, 0, 4}, new int[] {3, 3, 3, 3, 3, 3, 3, 12}, new int[] {1, 0, 0, 1, 0, 1, 0, 1}, new int[] {1, 1, 3, 1, 1, 1, 1, 1}, 12); } public void testNonGreedySynonyms() throws Exception { // This is just "hypothetical" for Lucene today, because SynFilter is // greedy: when two syn rules match on overlapping tokens, only one // (greedily) wins. This test pretends all syn matches could match: TokenStream in = new CannedTokenStream(0, 20, new Token[] { token("wizard", 1, 1, 0, 6), token("wizard_of_oz", 0, 3, 0, 12), token("of", 1, 1, 7, 9), token("oz", 1, 1, 10, 12), token("oz_screams", 0, 2, 10, 20), token("screams", 1, 1, 13, 20), }); TokenStream out = new FlattenGraphFilter(in); // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: assertTokenStreamContents(out, new String[] {"wizard", "wizard_of_oz", "of", "oz", "oz_screams", "screams"}, new int[] {0, 0, 7, 10, 10, 13}, new int[] {6, 12, 9, 12, 20, 20}, new int[] {1, 0, 1, 1, 0, 1}, new int[] {1, 3, 1, 1, 2, 1}, 20); } public void testNonGraph() throws Exception { TokenStream in = new CannedTokenStream(0, 22, new Token[] { token("hello", 1, 1, 0, 5), token("pseudo", 1, 1, 6, 12), token("world", 1, 1, 13, 18), token("fun", 1, 1, 19, 22), }); TokenStream out = new FlattenGraphFilter(in); // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: assertTokenStreamContents(out, new String[] {"hello", "pseudo", "world", "fun"}, new int[] {0, 6, 13, 19}, new int[] {5, 12, 18, 22}, new int[] {1, 1, 1, 1}, new int[] {1, 1, 1, 1}, 22); } public void testSimpleHole() throws Exception { TokenStream in = new CannedTokenStream(0, 13, new Token[] { token("hello", 1, 1, 0, 5), token("hole", 2, 1, 6, 10), token("fun", 1, 1, 11, 13), }); TokenStream out = new FlattenGraphFilter(in); // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: assertTokenStreamContents(out, new String[] {"hello", "hole", "fun"}, new int[] {0, 6, 11}, new int[] {5, 10, 13}, new int[] {1, 2, 1}, new int[] {1, 1, 1}, 13); } public void testHoleUnderSyn() throws Exception { // Tests a StopFilter after SynFilter where a stopword in a syn is removed // // wizard of oz -> woz syn, but then "of" becomes a hole TokenStream in = new CannedTokenStream(0, 12, new Token[] { token("wizard", 1, 1, 0, 6), token("woz", 0, 3, 0, 12), token("oz", 2, 1, 10, 12), }); TokenStream out = new FlattenGraphFilter(in); assertTokenStreamContents(out, new String[] {"wizard", "woz", "oz"}, new int[] {0, 0, 10}, new int[] {6, 12, 12}, new int[] {1, 0, 2}, new int[] {1, 3, 1}, 12); } public void testStrangelyNumberedNodes() throws Exception { // Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!) TokenStream in = new CannedTokenStream(0, 27, new Token[] { token("dog", 1, 3, 0, 5), token("puppy", 0, 3, 0, 5), token("flies", 3, 1, 6, 11), }); TokenStream out = new FlattenGraphFilter(in); assertTokenStreamContents(out, new String[] {"dog", "puppy", "flies"}, new int[] {0, 0, 6}, new int[] {5, 5, 11}, new int[] {1, 0, 1}, new int[] {1, 1, 1}, 27); } public void testTwoLongParallelPaths() throws Exception { // "a a a a a a" in parallel with "b b b b b b" TokenStream in = new CannedTokenStream(0, 11, new Token[] { token("a", 1, 1, 0, 1), token("b", 0, 2, 0, 1), token("a", 1, 2, 2, 3), token("b", 1, 2, 2, 3), token("a", 1, 2, 4, 5), token("b", 1, 2, 4, 5), token("a", 1, 2, 6, 7), token("b", 1, 2, 6, 7), token("a", 1, 2, 8, 9), token("b", 1, 2, 8, 9), token("a", 1, 2, 10, 11), token("b", 1, 2, 10, 11), }); TokenStream out = new FlattenGraphFilter(in); // ... becomes flattened to a single path with overlapping a/b token between each node: assertTokenStreamContents(out, new String[] {"a", "b", "a", "b", "a", "b", "a", "b", "a", "b", "a", "b"}, new int[] {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10}, new int[] {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11}, new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}, new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, 11); } // NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter }