/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
/**
 * Tests {@link SynonymFilter}: synonym matching, inclusion of original tokens,
 * map merging, overlapping synonyms, position increments, and offsets.
 * @version $Id: TestSynonymFilter.java 950008 2010-06-01 10:35:13Z rmuir $
 */
public class TestSynonymFilter extends BaseTokenStreamTestCase {
static List<String> strings(String str) {
String[] arr = str.split(" ");
return Arrays.asList(arr);
}
static void assertTokenizesTo(SynonymMap dict, String input,
String expected[]) throws IOException {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected);
}
static void assertTokenizesTo(SynonymMap dict, String input,
String expected[], int posIncs[]) throws IOException {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
}
static void assertTokenizesTo(SynonymMap dict, List<Token> input,
String expected[], int posIncs[])
throws IOException {
TokenStream tokenizer = new IterTokenStream(input);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
}
static void assertTokenizesTo(SynonymMap dict, List<Token> input,
String expected[], int startOffsets[], int endOffsets[], int posIncs[])
throws IOException {
TokenStream tokenizer = new IterTokenStream(input);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, startOffsets, endOffsets,
posIncs);
}
public void testMatching() throws IOException {
SynonymMap map = new SynonymMap();
boolean orig = false;
boolean merge = true;
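// orig=false: matched input tokens are replaced (originals are not kept);
// merge=true: replacements for the same match are merged with existing entries
// (see the "repeats" case below and testMapMerge).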
map.add(strings("a b"), tokens("ab"), orig, merge);
map.add(strings("a c"), tokens("ac"), orig, merge);
map.add(strings("a"), tokens("aa"), orig, merge);
map.add(strings("b"), tokens("bb"), orig, merge);
map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
map.add(strings("x c"), tokens("xc"), orig, merge);
assertTokenizesTo(map, "$", new String[] { "$" });
assertTokenizesTo(map, "a", new String[] { "aa" });
assertTokenizesTo(map, "a $", new String[] { "aa", "$" });
assertTokenizesTo(map, "$ a", new String[] { "$", "aa" });
assertTokenizesTo(map, "a a", new String[] { "aa", "aa" });
assertTokenizesTo(map, "b", new String[] { "bb" });
assertTokenizesTo(map, "z x c v", new String[] { "zxcv" });
assertTokenizesTo(map, "z x c $", new String[] { "z", "xc", "$" });
// repeats
map.add(strings("a b"), tokens("ab"), orig, merge);
map.add(strings("a b"), tokens("ab"), orig, merge);
// FIXME: the below test intended to be { "ab" }
assertTokenizesTo(map, "a b", new String[] { "ab", "ab", "ab" });
// check for lack of recursion
map.add(strings("zoo"), tokens("zoo"), orig, merge);
assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "$", "zoo" });
map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
// FIXME: the test below was intended to expect { "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo" };
// perhaps this was just a typo in the old test?
assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" });
}
public void testIncludeOrig() throws IOException {
SynonymMap map = new SynonymMap();
boolean orig = true;
boolean merge = true;
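// orig=true: the original token is kept and each synonym is stacked at the
// same position (posInc=0), as the expected posIncs below show.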
map.add(strings("a b"), tokens("ab"), orig, merge);
map.add(strings("a c"), tokens("ac"), orig, merge);
map.add(strings("a"), tokens("aa"), orig, merge);
map.add(strings("b"), tokens("bb"), orig, merge);
map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
map.add(strings("x c"), tokens("xc"), orig, merge);
assertTokenizesTo(map, "$",
new String[] { "$" },
new int[] { 1 });
assertTokenizesTo(map, "a",
new String[] { "a", "aa" },
new int[] { 1, 0 });
assertTokenizesTo(map, "a",
new String[] { "a", "aa" },
new int[] { 1, 0 });
assertTokenizesTo(map, "$ a",
new String[] { "$", "a", "aa" },
new int[] { 1, 1, 0 });
assertTokenizesTo(map, "a $",
new String[] { "a", "aa", "$" },
new int[] { 1, 0, 1 });
assertTokenizesTo(map, "$ a !",
new String[] { "$", "a", "aa", "!" },
new int[] { 1, 1, 0, 1 });
assertTokenizesTo(map, "a a",
new String[] { "a", "aa", "a", "aa" },
new int[] { 1, 0, 1, 0 });
assertTokenizesTo(map, "b",
new String[] { "b", "bb" },
new int[] { 1, 0 });
assertTokenizesTo(map, "z x c v",
new String[] { "z", "zxcv", "x", "c", "v" },
new int[] { 1, 0, 1, 1, 1 });
assertTokenizesTo(map, "z x c $",
new String[] { "z", "x", "xc", "c", "$" },
new int[] { 1, 1, 0, 1, 1 });
// check for lack of recursion
map.add(strings("zoo zoo"), tokens("zoo"), orig, merge);
// CHECKME: the previous test (with 4 zoos) was probably just a typo.
assertTokenizesTo(map, "zoo zoo $ zoo",
new String[] { "zoo", "zoo", "zoo", "$", "zoo" },
new int[] { 1, 0, 1, 1, 1 });
map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
assertTokenizesTo(map, "zoo zoo $ zoo",
new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
new int[] { 1, 0, 1, 1, 1, 0, 1 });
}
public void testMapMerge() throws IOException {
SynonymMap map = new SynonymMap();
boolean orig = false;
boolean merge = true;
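// "a5,5" parses as text "a5" with posInc=5 (see tokens() below);
// merged entries should come out interleaved in position order.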
map.add(strings("a"), tokens("a5,5"), orig, merge);
map.add(strings("a"), tokens("a3,3"), orig, merge);
assertTokenizesTo(map, "a",
new String[] { "a3", "a5" },
new int[] { 1, 2 });
map.add(strings("b"), tokens("b3,3"), orig, merge);
map.add(strings("b"), tokens("b5,5"), orig, merge);
assertTokenizesTo(map, "b",
new String[] { "b3", "b5" },
new int[] { 1, 2 });
map.add(strings("a"), tokens("A3,3"), orig, merge);
map.add(strings("a"), tokens("A5,5"), orig, merge);
assertTokenizesTo(map, "a",
new String[] { "a3", "A3", "a5", "A5" },
new int[] { 1, 0, 2, 0 });
map.add(strings("a"), tokens("a1"), orig, merge);
assertTokenizesTo(map, "a",
new String[] { "a1", "a3", "A3", "a5", "A5" },
new int[] { 1, 2, 0, 2, 0 });
map.add(strings("a"), tokens("a2,2"), orig, merge);
map.add(strings("a"), tokens("a4,4 a6,2"), orig, merge);
assertTokenizesTo(map, "a",
new String[] { "a1", "a2", "a3", "A3", "a4", "a5", "A5", "a6" },
new int[] { 1, 1, 1, 0, 1, 1, 0, 1 });
}
public void testOverlap() throws IOException {
SynonymMap map = new SynonymMap();
boolean orig = false;
boolean merge = true;
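// "qq/ww/ee" parses as three tokens sharing one position (posInc 1, 0, 0);
// the later single-token synonyms stack onto that same position.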
map.add(strings("qwe"), tokens("qq/ww/ee"), orig, merge);
map.add(strings("qwe"), tokens("xx"), orig, merge);
map.add(strings("qwe"), tokens("yy"), orig, merge);
map.add(strings("qwe"), tokens("zz"), orig, merge);
assertTokenizesTo(map, "$", new String[] { "$" });
assertTokenizesTo(map, "qwe",
new String[] { "qq", "ww", "ee", "xx", "yy", "zz" },
new int[] { 1, 0, 0, 0, 0, 0 });
// test merging within the map
map.add(strings("a"), tokens("a5,5 a8,3 a10,2"), orig, merge);
map.add(strings("a"), tokens("a3,3 a7,4 a9,2 a11,2 a111,100"), orig, merge);
assertTokenizesTo(map, "a",
new String[] { "a3", "a5", "a7", "a8", "a9", "a10", "a11", "a111" },
new int[] { 1, 2, 2, 1, 1, 1, 1, 100 });
}
public void testPositionIncrements() throws IOException {
SynonymMap map = new SynonymMap();
boolean orig = false;
boolean merge = true;
// test that a generated token takes the posInc of the original token
map.add(strings("a"), tokens("aa"), orig, merge);
assertTokenizesTo(map, tokens("a,5"),
new String[] { "aa" },
new int[] { 5 });
assertTokenizesTo(map, tokens("a,0"),
new String[] { "aa" },
new int[] { 0 });
// test that the posInc of the first replacement token is ignored (the original's posInc is always used)
map.add(strings("b"), tokens("bb,100"), orig, merge);
assertTokenizesTo(map, tokens("b,5"),
new String[] { "bb" },
new int[] { 5 });
assertTokenizesTo(map, tokens("b,0"),
new String[] { "bb" },
new int[] { 0 });
// test that subsequent tokens are adjusted accordingly
map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
assertTokenizesTo(map, tokens("c,5"),
new String[] { "cc", "c2" },
new int[] { 5, 2 });
assertTokenizesTo(map, tokens("c,0"),
new String[] { "cc", "c2" },
new int[] { 0, 2 });
}
public void testPositionIncrementsWithOrig() throws IOException {
SynonymMap map = new SynonymMap();
boolean orig = true;
boolean merge = true;
// test that a generated token takes the posInc of the original token
map.add(strings("a"), tokens("aa"), orig, merge);
assertTokenizesTo(map, tokens("a,5"),
new String[] { "a", "aa" },
new int[] { 5, 0 });
assertTokenizesTo(map, tokens("a,0"),
new String[] { "a", "aa" },
new int[] { 0, 0 });
// test that the posInc of the first replacement token is ignored (the original's posInc is always used)
map.add(strings("b"), tokens("bb,100"), orig, merge);
assertTokenizesTo(map, tokens("b,5"),
new String[] { "b", "bb" },
new int[] { 5, 0 });
assertTokenizesTo(map, tokens("b,0"),
new String[] { "b", "bb" },
new int[] { 0, 0 });
// test that subsequent tokens are adjusted accordingly
map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
assertTokenizesTo(map, tokens("c,5"),
new String[] { "c", "cc", "c2" },
new int[] { 5, 0, 2 });
assertTokenizesTo(map, tokens("c,0"),
new String[] { "c", "cc", "c2" },
new int[] { 0, 0, 2 });
}
public void testOffsetBug() throws IOException {
// With the following rules:
// a a=>b
// x=>y
// analysing "a x" causes "y" to have a bad offset (end less than start)
// SOLR-167
SynonymMap map = new SynonymMap();
boolean orig = false;
boolean merge = true;
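// token spec "a,1,0,1" = text "a", posInc=1, startOffset=0, endOffset=1
// (see tokens() below)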
map.add(strings("a a"), tokens("b"), orig, merge);
map.add(strings("x"), tokens("y"), orig, merge);
// "a a x" => "b y"
assertTokenizesTo(map, tokens("a,1,0,1 a,1,2,3 x,1,4,5"),
new String[] { "b", "y" },
new int[] { 0, 4 },
new int[] { 3, 5 },
new int[] { 1, 1 });
}
/**
 * Returns a list of tokens according to a test string format:
 * {@code a b c}     => returns {@code List<Token>} [a,b,c]
 * {@code a/b}       => tokens a and b share the same position (b.positionIncrement=0)
 * {@code a,3/b/c}   => a, b and c all share the same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
 * {@code a,1,10,11} => "a" with positionIncrement=1, startOffset=10, endOffset=11
 * @deprecated does not support the attributes API
 */
@Deprecated
private List<Token> tokens(String str) {
String[] arr = str.split(" ");
List<Token> result = new ArrayList<Token>();
for (int i=0; i<arr.length; i++) {
String[] toks = arr[i].split("/");
String[] params = toks[0].split(",");
int posInc;
int start;
int end;
if (params.length > 1) {
posInc = Integer.parseInt(params[1]);
} else {
posInc = 1;
}
if (params.length > 2) {
start = Integer.parseInt(params[2]);
} else {
start = 0;
}
if (params.length > 3) {
end = Integer.parseInt(params[3]);
} else {
end = start + params[0].length();
}
Token t = new Token(params[0],start,end,"TEST");
t.setPositionIncrement(posInc);
result.add(t);
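// any tokens after a '/' are stacked on the previous token's position
// (posInc=0) with placeholder offsets of 0,0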
for (int j=1; j<toks.length; j++) {
t = new Token(toks[j],0,0,"TEST");
t.setPositionIncrement(0);
result.add(t);
}
}
return result;
}
/**
 * A TokenStream that replays a fixed list of Tokens.
 * @deprecated does not support custom attributes
 */
@Deprecated
private static class IterTokenStream extends TokenStream {
final Token tokens[];
int index = 0;
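// attribute views populated from each Token in incrementToken()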
CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
public IterTokenStream(Token... tokens) {
super();
this.tokens = tokens;
}
public IterTokenStream(Collection<Token> tokens) {
this(tokens.toArray(new Token[tokens.size()]));
}
@Override
public boolean incrementToken() throws IOException {
if (index >= tokens.length)
return false;
else {
clearAttributes();
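// copy the current Token's state into this stream's attributes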
Token token = tokens[index++];
termAtt.setEmpty().append(token);
offsetAtt.setOffset(token.startOffset(), token.endOffset());
posIncAtt.setPositionIncrement(token.getPositionIncrement());
flagsAtt.setFlags(token.getFlags());
typeAtt.setType(token.type());
payloadAtt.setPayload(token.getPayload());
return true;
}
}
}
}