/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.synonym;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.TestUtil;

public class TestSynonymMapFilter extends BaseTokenStreamTestCase {

  private SynonymMap.Builder b;
  private Tokenizer tokensIn;
  private SynonymFilter tokensOut;
  private CharTermAttribute termAtt;
  private PositionIncrementAttribute posIncrAtt;
  private PositionLengthAttribute posLenAtt;
  private OffsetAttribute offsetAtt;

  private void add(String input, String output, boolean keepOrig) {
    if (VERBOSE) {
      System.out.println(" add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
    }
    CharsRefBuilder inputCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(input.split(" +"), inputCharsRef);

    CharsRefBuilder outputCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(output.split(" +"), outputCharsRef);

    b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig);
  }

  private void assertEquals(CharTermAttribute term, String expected) {
    assertEquals(expected.length(), term.length());
    final char[] buffer = term.buffer();
    for(int chIDX=0;chIDX<expected.length();chIDX++) {
      assertEquals(expected.charAt(chIDX), buffer[chIDX]);
    }
  }

  // For the output string: separate positions with a space,
  // and separate multiple tokens at each position with a /.
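  // For example, "a/foo b" means position 0 holds the token 'a'
  // plus the stacked synonym 'foo' (posIncr 0), and position 1
  // holds 'b'.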
  // If a token should have an end offset != the input
  // token's end offset then add :X to it; if it should
  // have a position length != 1, add _Y:

  // TODO: we should probably refactor this to use/take an
  // analyzer; the tests are a little messy
  private void verify(String input, String output) throws Exception {
    if (VERBOSE) {
      System.out.println("TEST: verify input=" + input + " expectedOutput=" + output);
    }

    tokensIn.setReader(new StringReader(input));
    tokensOut.reset();
    final String[] expected = output.split(" ");
    int expectedUpto = 0;
    while(tokensOut.incrementToken()) {

      if (VERBOSE) {
        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
      }

      assertTrue(expectedUpto < expected.length);
      final int startOffset = offsetAtt.startOffset();
      final int endOffset = offsetAtt.endOffset();

      // Each expected position may hold multiple tokens, separated by /:
      final String[] expectedAtPos = expected[expectedUpto++].split("/");
      for(int atPos=0;atPos<expectedAtPos.length;atPos++) {
        if (atPos > 0) {
          assertTrue(tokensOut.incrementToken());
          if (VERBOSE) {
            System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
          }
        }
        // Parse the optional ":endOffset" and "_posLen" suffixes:
        final int colonIndex = expectedAtPos[atPos].indexOf(':');
        final int underbarIndex = expectedAtPos[atPos].indexOf('_');
        final String expectedToken;
        final int expectedEndOffset;
        final int expectedPosLen;
        if (colonIndex != -1) {
          expectedToken = expectedAtPos[atPos].substring(0, colonIndex);
          if (underbarIndex != -1) {
            expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex, underbarIndex));
            expectedPosLen = Integer.parseInt(expectedAtPos[atPos].substring(1+underbarIndex));
          } else {
            expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex));
            expectedPosLen = 1;
          }
        } else {
          expectedToken = expectedAtPos[atPos];
          expectedEndOffset = endOffset;
          expectedPosLen = 1;
        }
        assertEquals(expectedToken, termAtt.toString());
        assertEquals(atPos == 0 ?
                     1 : 0, posIncrAtt.getPositionIncrement());
        // start/end offset of all tokens at same pos should
        // be the same:
        assertEquals(startOffset, offsetAtt.startOffset());
        assertEquals(expectedEndOffset, offsetAtt.endOffset());
        assertEquals(expectedPosLen, posLenAtt.getPositionLength());
      }
    }
    tokensOut.end();
    tokensOut.close();
    if (VERBOSE) {
      System.out.println(" incr: END");
    }
    assertEquals(expectedUpto, expected.length);
  }

  public void testDontKeepOrig() throws Exception {
    b = new SynonymMap.Builder(true);
    add("a b", "foo", false);

    final SynonymMap map = b.build();

    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
      }
    };

    assertAnalyzesTo(analyzer, "a b c",
                     new String[] {"foo", "c"},
                     new int[] {0, 4},
                     new int[] {3, 5},
                     null,
                     new int[] {1, 1},
                     new int[] {1, 1},
                     true);
    checkAnalysisConsistency(random(), analyzer, false, "a b c");
    analyzer.close();
  }

  public void testDoKeepOrig() throws Exception {
    b = new SynonymMap.Builder(true);
    add("a b", "foo", true);

    final SynonymMap map = b.build();

    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
      }
    };

    assertAnalyzesTo(analyzer, "a b c",
                     new String[] {"a", "foo", "b", "c"},
                     new int[] {0, 0, 2, 4},
                     new int[] {1, 3, 3, 5},
                     null,
                     new int[] {1, 0, 1, 1},
                     new int[] {1, 2, 1, 1},
                     true);
    checkAnalysisConsistency(random(), analyzer, false, "a b c");
    analyzer.close();
  }

  public void testBasic() throws Exception {
    b = new SynonymMap.Builder(true);
    add("a", "foo", true);
    add("a b", "bar fee", true);
    add("b c", "dog collar", true);
    add("c d", "dog harness holder extras", true);
    add("m c e", "dog barks loudly", false);
    add("i j k", "feep", true);

    add("e f", "foo bar", false);
    add("e f", "baz bee", false);

    add("z", "boo", false);
    add("y", "bee", true);

    tokensIn = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokensIn.setReader(new StringReader("a"));
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(CharTermAttribute.class);
    posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
    posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
    offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);

    verify("a b c", "a/bar b/fee c");

    // syn output extends beyond input tokens
    verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

    verify("a b a", "a/bar b/fee a/foo");

    // outputs that add to one another:
    verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

    // two outputs for same input
    verify("e f", "foo/baz bar/bee");

    // verify multi-word / single-output offsets:
    verify("g i j k g", "g i/feep:7_3 j k g");

    // mixed keepOrig true/false:
    verify("a m c e x", "a/foo dog barks loudly x");
    verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
    assertTrue(tokensOut.getCaptureCount() > 0);

    // no captureStates when no syns matched
    verify("p q r s t", "p q r s t");
    assertEquals(0, tokensOut.getCaptureCount());

    // no captureStates when only single-input syns, w/ no
    // lookahead needed, matched
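    // ("z" -> "boo" and "y" -> "bee" are single-token rules, so the
    // filter can emit them in place without buffering lookahead):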
verify("p q z y t", "p q boo y/bee t"); assertEquals(0, tokensOut.getCaptureCount()); } private String getRandomString(char start, int alphabetSize, int length) { assert alphabetSize <= 26; char[] s = new char[2*length]; for(int charIDX=0;charIDX<length;charIDX++) { s[2*charIDX] = (char) (start + random().nextInt(alphabetSize)); s[2*charIDX+1] = ' '; } return new String(s); } private static class OneSyn { String in; List<String> out; boolean keepOrig; } public String slowSynMatcher(String doc, List<OneSyn> syns, int maxOutputLength) { assertTrue(doc.length() % 2 == 0); final int numInputs = doc.length()/2; boolean[] keepOrigs = new boolean[numInputs]; boolean[] hasMatch = new boolean[numInputs]; Arrays.fill(keepOrigs, false); String[] outputs = new String[numInputs + maxOutputLength]; OneSyn[] matches = new OneSyn[numInputs]; for(OneSyn syn : syns) { int idx = -1; while(true) { idx = doc.indexOf(syn.in, 1+idx); if (idx == -1) { break; } assertTrue(idx % 2 == 0); final int matchIDX = idx/2; assertTrue(syn.in.length() % 2 == 1); if (matches[matchIDX] == null) { matches[matchIDX] = syn; } else if (syn.in.length() > matches[matchIDX].in.length()) { // Greedy conflict resolution: longer match wins: matches[matchIDX] = syn; } else { assertTrue(syn.in.length() < matches[matchIDX].in.length()); } } } // Greedy conflict resolution: if syn matches a range of inputs, // it prevents other syns from matching that range for(int inputIDX=0;inputIDX<numInputs;inputIDX++) { final OneSyn match = matches[inputIDX]; if (match != null) { final int synInLength = (1+match.in.length())/2; for(int nextInputIDX=inputIDX+1;nextInputIDX<numInputs && nextInputIDX<(inputIDX+synInLength);nextInputIDX++) { matches[nextInputIDX] = null; } } } // Fill overlapping outputs: for(int inputIDX=0;inputIDX<numInputs;inputIDX++) { final OneSyn syn = matches[inputIDX]; if (syn == null) { continue; } for(int idx=0;idx<(1+syn.in.length())/2;idx++) { hasMatch[inputIDX+idx] = true; keepOrigs[inputIDX+idx] |= syn.keepOrig; } for(String synOut : syn.out) { final String[] synOutputs = synOut.split(" "); assertEquals(synOutputs.length, (1+synOut.length())/2); final int matchEnd = inputIDX + synOutputs.length; int synUpto = 0; for(int matchIDX=inputIDX;matchIDX<matchEnd;matchIDX++) { if (outputs[matchIDX] == null) { outputs[matchIDX] = synOutputs[synUpto++]; } else { outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++]; } final int endOffset; if (matchIDX < numInputs) { final int posLen; if (synOutputs.length == 1) { // Add full endOffset endOffset = (inputIDX*2) + syn.in.length(); posLen = syn.keepOrig ? 
                  (1+syn.in.length())/2 : 1;
            } else {
              // Add endOffset matching input token's
              endOffset = (matchIDX*2) + 1;
              posLen = 1;
            }
            outputs[matchIDX] = outputs[matchIDX] + ":" + endOffset + "_" + posLen;
          }
        }
      }
    }

    StringBuilder sb = new StringBuilder();
    String[] inputTokens = doc.split(" ");
    final int limit = inputTokens.length + maxOutputLength;
    for(int inputIDX=0;inputIDX<limit;inputIDX++) {
      boolean posHasOutput = false;
      if (inputIDX >= numInputs && outputs[inputIDX] == null) {
        break;
      }
      if (inputIDX < numInputs && (!hasMatch[inputIDX] || keepOrigs[inputIDX])) {
        assertTrue(inputTokens[inputIDX].length() != 0);
        sb.append(inputTokens[inputIDX]);
        posHasOutput = true;
      }
      if (outputs[inputIDX] != null) {
        if (posHasOutput) {
          sb.append('/');
        }
        sb.append(outputs[inputIDX]);
      } else if (!posHasOutput) {
        continue;
      }
      if (inputIDX < limit-1) {
        sb.append(' ');
      }
    }

    return sb.toString();
  }

  public void testRandom() throws Exception {

    final int alphabetSize = TestUtil.nextInt(random(), 2, 7);

    final int docLen = atLeast(3000);
    //final int docLen = 50;
    final String document = getRandomString('a', alphabetSize, docLen);

    if (VERBOSE) {
      System.out.println("TEST: doc=" + document);
    }

    final int numSyn = atLeast(5);
    //final int numSyn = 2;

    final Map<String,OneSyn> synMap = new HashMap<>();
    final List<OneSyn> syns = new ArrayList<>();
    final boolean dedup = random().nextBoolean();
    if (VERBOSE) {
      System.out.println(" dedup=" + dedup);
    }
    b = new SynonymMap.Builder(dedup);
    for(int synIDX=0;synIDX<numSyn;synIDX++) {
      final String synIn = getRandomString('a', alphabetSize, TestUtil.nextInt(random(), 1, 5)).trim();
      OneSyn s = synMap.get(synIn);
      if (s == null) {
        s = new OneSyn();
        s.in = synIn;
        syns.add(s);
        s.out = new ArrayList<>();
        synMap.put(synIn, s);
        s.keepOrig = random().nextBoolean();
      }
      final String synOut = getRandomString('0', 10, TestUtil.nextInt(random(), 1, 5)).trim();
      s.out.add(synOut);
      add(synIn, synOut, s.keepOrig);
      if (VERBOSE) {
        System.out.println(" syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
      }
    }

    tokensIn = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokensIn.setReader(new StringReader("a"));
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(CharTermAttribute.class);
    posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
    posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
    offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);

    if (dedup) {
      pruneDups(syns);
    }

    final String expected = slowSynMatcher(document, syns, 5);

    if (VERBOSE) {
      System.out.println("TEST: expected=" + expected);
    }

    verify(document, expected);
  }

  private void pruneDups(List<OneSyn> syns) {
    Set<String> seen = new HashSet<>();
    for(OneSyn syn : syns) {
      int idx = 0;
      while(idx < syn.out.size()) {
        String out = syn.out.get(idx);
        if (!seen.contains(out)) {
          seen.add(out);
          idx++;
        } else {
          syn.out.remove(idx);
        }
      }
      seen.clear();
    }
  }

  private String randomNonEmptyString() {
    while(true) {
      final String s = TestUtil.randomUnicodeString(random()).trim();
      if (s.length() != 0 && s.indexOf('\u0000') == -1) {
        return s;
      }
    }
  }

  /** simple random test, doesn't verify correctness.
   *  does verify it doesn't throw exceptions and that the stream doesn't misbehave */
  public void testRandom2() throws Exception {
    final int numIters = atLeast(3);
    for (int i = 0; i < numIters; i++) {
      b = new SynonymMap.Builder(random().nextBoolean());
      final int numEntries = atLeast(10);
      for (int j = 0; j < numEntries; j++) {
        add(randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
      }
      final SynonymMap map = b.build();
      final boolean ignoreCase = random().nextBoolean();

      final Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
          return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
        }
      };

      checkRandomData(random(), analyzer, 100);
      analyzer.close();
    }
  }

  // NOTE: this is an invalid test... SynFilter today can't
  // properly consume a graph... we can re-enable this once
  // we fix that...
  /*
  // Adds MockGraphTokenFilter before SynFilter:
  public void testRandom2GraphBefore() throws Exception {
    final int numIters = atLeast(10);
    Random random = random();
    for (int i = 0; i < numIters; i++) {
      b = new SynonymMap.Builder(random.nextBoolean());
      final int numEntries = atLeast(10);
      for (int j = 0; j < numEntries; j++) {
        add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
      }
      final SynonymMap map = b.build();
      final boolean ignoreCase = random.nextBoolean();

      final Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
          TokenStream graph = new MockGraphTokenFilter(random(), tokenizer);
          return new TokenStreamComponents(tokenizer, new SynonymFilter(graph, map, ignoreCase));
        }
      };

      checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
    }
  }
  */

  // Adds MockGraphTokenFilter after SynFilter:
  public void testRandom2GraphAfter() throws Exception {
    final int numIters = atLeast(3);
    Random random = random();
    for (int i = 0; i < numIters; i++) {
      b = new SynonymMap.Builder(random.nextBoolean());
      final int numEntries = atLeast(10);
      for (int j = 0; j < numEntries; j++) {
        add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
      }
      final SynonymMap map = b.build();
      final boolean ignoreCase = random.nextBoolean();

      final Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
          TokenStream syns = new SynonymFilter(tokenizer, map, ignoreCase);
          TokenStream graph = new MockGraphTokenFilter(random(), syns);
          return new TokenStreamComponents(tokenizer, graph);
        }
      };

      checkRandomData(random, analyzer, 100);
      analyzer.close();
    }
  }

  public void testEmptyTerm() throws IOException {
    Random random = random();
    final int numIters = atLeast(10);
    for (int i = 0; i < numIters; i++) {
      b = new SynonymMap.Builder(random.nextBoolean());
      final int numEntries = atLeast(10);
      for (int j = 0; j < numEntries; j++) {
        add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
      }
      final SynonymMap map = b.build();
      final boolean ignoreCase = random.nextBoolean();

      final Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new KeywordTokenizer();
          return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
        }
      };

      checkAnalysisConsistency(random, analyzer,
random.nextBoolean(), ""); analyzer.close(); } } /** simple random test like testRandom2, but for larger docs */ public void testRandomHuge() throws Exception { Random random = random(); final int numIters = atLeast(3); for (int i = 0; i < numIters; i++) { b = new SynonymMap.Builder(random.nextBoolean()); final int numEntries = atLeast(10); if (VERBOSE) { System.out.println("TEST: iter=" + i + " numEntries=" + numEntries); } for (int j = 0; j < numEntries; j++) { add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean()); } final SynonymMap map = b.build(); final boolean ignoreCase = random.nextBoolean(); final Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase)); } }; checkRandomData(random, analyzer, 100, 1024); analyzer.close(); } } // LUCENE-3375 public void testVanishingTerms() throws Exception { String testFile = "aaa => aaaa1 aaaa2 aaaa3\n" + "bbb => bbbb1 bbbb2\n"; Analyzer synAnalyzer = new MockAnalyzer(random()); SolrSynonymParser parser = new SolrSynonymParser(true, true, synAnalyzer); parser.parse(new StringReader(testFile)); final SynonymMap map = parser.build(); synAnalyzer.close(); Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true)); } }; // where did my pot go?! assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold", new String[] { "xyzzy", "bbbb1", "pot", "bbbb2", "of", "gold" }); // this one nukes 'pot' and 'of' // xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold", new String[] { "xyzzy", "aaaa1", "pot", "aaaa2", "of", "aaaa3", "gold" }); analyzer.close(); } public void testBasic2() throws Exception { b = new SynonymMap.Builder(true); final boolean keepOrig = false; add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig); add("bbb", "bbbb1 bbbb2", keepOrig); tokensIn = new MockTokenizer(MockTokenizer.WHITESPACE, true); tokensIn.setReader(new StringReader("a")); tokensIn.reset(); assertTrue(tokensIn.incrementToken()); assertFalse(tokensIn.incrementToken()); tokensIn.end(); tokensIn.close(); tokensOut = new SynonymFilter(tokensIn, b.build(), true); termAtt = tokensOut.addAttribute(CharTermAttribute.class); posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class); posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class); offsetAtt = tokensOut.addAttribute(OffsetAttribute.class); if (keepOrig) { verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold"); verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold"); } else { verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold"); verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold"); } } public void testMatching() throws Exception { b = new SynonymMap.Builder(true); final boolean keepOrig = false; add("a b", "ab", keepOrig); add("a c", "ac", keepOrig); add("a", "aa", keepOrig); add("b", "bb", keepOrig); add("z x c v", "zxcv", keepOrig); add("x c", "xc", keepOrig); final SynonymMap map = b.build(); Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new 
            MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };

    checkOneTerm(a, "$", "$");
    checkOneTerm(a, "a", "aa");
    checkOneTerm(a, "b", "bb");

    assertAnalyzesTo(a, "a $",
        new String[] { "aa", "$" },
        new int[] { 1, 1 });

    assertAnalyzesTo(a, "$ a",
        new String[] { "$", "aa" },
        new int[] { 1, 1 });

    assertAnalyzesTo(a, "a a",
        new String[] { "aa", "aa" },
        new int[] { 1, 1 });

    assertAnalyzesTo(a, "z x c v",
        new String[] { "zxcv" },
        new int[] { 1 });

    assertAnalyzesTo(a, "z x c $",
        new String[] { "z", "xc", "$" },
        new int[] { 1, 1, 1 });
    a.close();
  }

  public void testRepeatsOff() throws Exception {
    b = new SynonymMap.Builder(true);
    final boolean keepOrig = false;
    add("a b", "ab", keepOrig);
    add("a b", "ab", keepOrig);
    add("a b", "ab", keepOrig);
    final SynonymMap map = b.build();

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };

    assertAnalyzesTo(a, "a b",
        new String[] { "ab" },
        new int[] { 1 });
    a.close();
  }

  public void testRepeatsOn() throws Exception {
    b = new SynonymMap.Builder(false);
    final boolean keepOrig = false;
    add("a b", "ab", keepOrig);
    add("a b", "ab", keepOrig);
    add("a b", "ab", keepOrig);
    final SynonymMap map = b.build();

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };

    assertAnalyzesTo(a, "a b",
        new String[] { "ab", "ab", "ab" },
        new int[] { 1, 0, 0 });
    a.close();
  }

  public void testRecursion() throws Exception {
    b = new SynonymMap.Builder(true);
    final boolean keepOrig = false;
    add("zoo", "zoo", keepOrig);
    final SynonymMap map = b.build();

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };

    assertAnalyzesTo(a, "zoo zoo $ zoo",
        new String[] { "zoo", "zoo", "$", "zoo" },
        new int[] { 1, 1, 1, 1 });
    a.close();
  }

  public void testRecursion2() throws Exception {
    b = new SynonymMap.Builder(true);
    final boolean keepOrig = false;
    add("zoo", "zoo", keepOrig);
    add("zoo", "zoo zoo", keepOrig);
    final SynonymMap map = b.build();

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };

    // verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo");
    assertAnalyzesTo(a, "zoo zoo $ zoo",
        new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo", "zoo" },
        new int[] { 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 });
    a.close();
  }

  public void testOutputHangsOffEnd() throws Exception {
    b = new SynonymMap.Builder(true);
    final boolean keepOrig = false;
    // b hangs off the end (no input token under it):
    add("a", "a b", keepOrig);
    tokensIn = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokensIn.setReader(new StringReader("a"));
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();
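    // (verify() below rewinds this exhausted tokenizer with
    // setReader() before feeding it the real input.)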
    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(CharTermAttribute.class);
    posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
    posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);

    // Make sure endOffset inherits from previous input token:
    verify("a", "a b:1");
  }

  public void testIncludeOrig() throws Exception {
    b = new SynonymMap.Builder(true);
    final boolean keepOrig = true;
    add("a b", "ab", keepOrig);
    add("a c", "ac", keepOrig);
    add("a", "aa", keepOrig);
    add("b", "bb", keepOrig);
    add("z x c v", "zxcv", keepOrig);
    add("x c", "xc", keepOrig);
    final SynonymMap map = b.build();

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };

    assertAnalyzesTo(a, "$",
        new String[] { "$" },
        new int[] { 1 });
    assertAnalyzesTo(a, "a",
        new String[] { "a", "aa" },
        new int[] { 1, 0 });
    // same input again, so the reused stream must produce the same result:
    assertAnalyzesTo(a, "a",
        new String[] { "a", "aa" },
        new int[] { 1, 0 });
    assertAnalyzesTo(a, "$ a",
        new String[] { "$", "a", "aa" },
        new int[] { 1, 1, 0 });
    assertAnalyzesTo(a, "a $",
        new String[] { "a", "aa", "$" },
        new int[] { 1, 0, 1 });
    assertAnalyzesTo(a, "$ a !",
        new String[] { "$", "a", "aa", "!" },
        new int[] { 1, 1, 0, 1 });
    assertAnalyzesTo(a, "a a",
        new String[] { "a", "aa", "a", "aa" },
        new int[] { 1, 0, 1, 0 });
    assertAnalyzesTo(a, "b",
        new String[] { "b", "bb" },
        new int[] { 1, 0 });
    assertAnalyzesTo(a, "z x c v",
        new String[] { "z", "zxcv", "x", "c", "v" },
        new int[] { 1, 0, 1, 1, 1 });
    assertAnalyzesTo(a, "z x c $",
        new String[] { "z", "x", "xc", "c", "$" },
        new int[] { 1, 1, 0, 1, 1 });
    a.close();
  }

  public void testRecursion3() throws Exception {
    b = new SynonymMap.Builder(true);
    final boolean keepOrig = true;
    add("zoo zoo", "zoo", keepOrig);
    final SynonymMap map = b.build();

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };

    assertAnalyzesTo(a, "zoo zoo $ zoo",
        new String[] { "zoo", "zoo", "zoo", "$", "zoo" },
        new int[] { 1, 0, 1, 1, 1 });
    a.close();
  }

  public void testRecursion4() throws Exception {
    b = new SynonymMap.Builder(true);
    final boolean keepOrig = true;
    add("zoo zoo", "zoo", keepOrig);
    add("zoo", "zoo zoo", keepOrig);
    final SynonymMap map = b.build();

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };

    assertAnalyzesTo(a, "zoo zoo $ zoo",
        new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
        new int[] { 1, 0, 1, 1, 1, 0, 1 });
    a.close();
  }

  public void testMultiwordOffsets() throws Exception {
    b = new SynonymMap.Builder(true);
    final boolean keepOrig = true;
    add("national hockey league", "nhl", keepOrig);
    final SynonymMap map = b.build();

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };
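    // "nhl" is stacked onto "national" (posIncr 0) but its offsets
    // span the entire original phrase (0-22):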
assertAnalyzesTo(a, "national hockey league", new String[] { "national", "nhl", "hockey", "league" }, new int[] { 0, 0, 9, 16 }, new int[] { 8, 22, 15, 22 }, new int[] { 1, 0, 1, 1 }); a.close(); } public void testEmpty() throws Exception { Tokenizer tokenizer = new MockTokenizer(); tokenizer.setReader(new StringReader("aa bb")); IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> { new SynonymFilter(tokenizer, new SynonymMap.Builder(true).build(), true); }); assertEquals("fst must be non-null", expected.getMessage()); } }