package org.apache.lucene.analysis.synonym;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.NewSynonymFilterFactory.SynonymParser;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.CharsRef;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
/**
 * Tests for {@link NewSynonymFilterFactory}: single-token rules whose entries contain
 * commas and spaces, and multi-word rules combined with the different
 * {@code builderFactory} implementations referenced below.
 *
 * <p>Every test loads its rules from an in-memory {@link StringMockResourceLoader}, wraps a
 * mock tokenizer with the configured filter and compares the emitted terms, offsets, token
 * types and position increments against hand-written expectations. In each
 * {@code assertTokenStreamContents} call the four parallel arrays have exactly one entry per
 * expected term.
 */
public class TestNewMultiWordSynonyms extends BaseTokenStreamTestCase {

  /** Shorthand for {@link TypeAttribute#DEFAULT_TYPE} in the expected-type arrays. */
  String O = TypeAttribute.DEFAULT_TYPE;

  /** Shorthand for {@link SynonymFilter#TYPE_SYNONYM} in the expected-type arrays. */
  String S = SynonymFilter.TYPE_SYNONYM;

  /**
   * Multi-word rules in the comma-separated format. Some multi-token entries join their
   * words with NUL characters ({@code \0}), others with plain spaces.
   */
  private StringMockResourceLoader getSyn() {
    return new StringMockResourceLoader(
        "hubble\0space\0telescope,HST,hs telescope\n" +
        "foo\0bar,foo ba,fu ba,foobar\n" +
        "foo\0baz,fu ba");
  }

  /**
   * Rules for the {@code semicolon} format: alternatives are separated by {@code ;}, so the
   * commas and spaces inside an entry are written unescaped.
   */
  private StringMockResourceLoader getSemicolonSingleSyn() {
    return new StringMockResourceLoader(
        "žščřdťň, á;zscrdtn, a\n" +
        "fůů, bar => foo, bar; fuu, bar\n" +
        "ADAMŠuk, m; ADAMGuk, m;ADAMČuk, m\n"
    );
  }

  /**
   * Rules in the comma-separated format where the commas and spaces that belong to an entry
   * are backslash-escaped.
   */
  private StringMockResourceLoader getSolrSingleSyn() {
    return new StringMockResourceLoader(
        "žščřdťň\\,\\ á,zscrdtn\\,\\ a\n" +
        "fůů\\,\\ bar => foo\\,\\ bar, fuu\\,\\ bar\n"
    );
  }

  /**
   * A keyword-tokenized input containing a comma and a space must match the escaped rule
   * from {@link #getSolrSingleSyn()} and yield both alternatives at the same position.
   */
  public void testSingleWordSolrSynonyms() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("tokenizerFactory", KeywordTokenizerFactory.class.getName());
    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSolrSingleSyn());

    TokenStream ts = factory.create(keywordMockTokenizer(new StringReader("žščřdťň, á")));
    assertTokenStreamContents(ts, new String[] { "žščřdťň, á", "zscrdtn, a" },
        new int[] {0, 0}, //startOffset
        new int[] {10,10}, //endOffset
        new String[] {S, S}, //type
        new int[] {1, 0} //posIncr
    );
  }

  /**
   * Same expectation as {@link #testSingleWordSolrSynonyms()}, but with
   * {@code format=semicolon} and the rules from {@link #getSemicolonSingleSyn()}.
   */
  public void testSingleWordSemicolonSynonyms() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("format", "semicolon");
    args.put("tokenizerFactory", KeywordTokenizerFactory.class.getName());
    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSemicolonSingleSyn());

    TokenStream ts = factory.create(keywordMockTokenizer(new StringReader("žščřdťň, á")));
    assertTokenStreamContents(ts, new String[] { "žščřdťň, á", "zscrdtn, a" },
        new int[] {0, 0}, //startOffset
        new int[] {10,10}, //endOffset
        new String[] {S, S}, //type
        new int[] {1, 0} //posIncr
    );

    // A second stream from the same factory must give the same result.
    // NOTE(review): input and expectation are identical to the check above - confirm
    // whether a different rule was meant to be exercised here.
    ts = factory.create(keywordMockTokenizer(new StringReader("žščřdťň, á")));
    assertTokenStreamContents(ts, new String[] { "žščřdťň, á", "zscrdtn, a" },
        new int[] {0, 0}, //startOffset
        new int[] {10,10}, //endOffset
        new String[] {S, S}, //type
        new int[] {1, 0} //posIncr
    );
  }

  /**
   * This parser is useful if you want to index multi-token synonyms (as one token)
   * as well as their components. I.e. "hubble space telescope was..." will be
   * indexed as
   * <pre>
   * 0: hubble|hubble space telescope|HST
   * 1: space
   * 2: telescope
   * </pre>
   */
  public static class TestParserReplaceNullsInclOrig extends NewSynonymFilterFactory.SynonymBuilderFactory {

    public TestParserReplaceNullsInclOrig(Map<String,String> args) {
      super(args);
    }

    /**
     * Returns a {@link NewSolrSynonymParser} whose {@code add} passes the output through
     * {@link NewSynonymFilterFactory#replaceNulls} and always asks for the original to be
     * included, regardless of the {@code includeOrig} value it receives.
     */
    protected SynonymParser getParser(Analyzer analyzer) {
      // NOTE(review): the two boolean flags are presumably dedup/expand - confirm against
      // the NewSolrSynonymParser constructor.
      return new NewSolrSynonymParser(true, true, analyzer) {
        @Override
        public void add(CharsRef input, CharsRef output, boolean includeOrig) {
          super.add(input, NewSynonymFilterFactory.replaceNulls(output), true);
        }
      };
    }
  }

  /**
   * A partial match of a multi-word rule ("a" out of "a b c") must leave the input
   * untouched.
   *
   * @since solr 1.4
   */
  public void testMultiWordSynonyms() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(new StringMockResourceLoader("a b c,d"));
    TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("a e")));
    // This fails because ["e","e"] is the value of the token stream
    assertTokenStreamContents(ts, new String[] { "a", "e" });
  }

  /**
   * With {@code NewSynonymFilterFactory.BestEffortSearchLowercase} as builder factory the
   * original tokens are expected next to the multi-word synonyms, and lower-case input is
   * expected to match the upper-case rule entry.
   */
  public void testMultiWordSynonymsReplaceNullsCustomInclOrigAnalyzer() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("tokenizerFactory", "org.apache.lucene.analysis.core.KeywordTokenizerFactory");
    args.put("builderFactory", NewSynonymFilterFactory.BestEffortSearchLowercase.class.getName());
    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSyn());

    TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("foo hubble space telescope")));
    assertTokenStreamContents(ts, new String[] { "foo", "hubble", "hubble space telescope", "HST", "hs telescope", "space", "telescope" },
        new int[] {0, 4, 4, 4, 4,11,17}, //startOffset
        new int[] {3,10,26,26,26,16,26}, //endOffset
        new String[] {O, O, S, S, S, O, O}, //type
        new int[] {1, 1, 0, 0, 0, 1, 1} //posIncr
    );

    // test ignoreCase=true (lower-case "hst" must still hit the "HST" rule)
    ts = factory.create(whitespaceMockTokenizer(new StringReader("hst")));
    assertTokenStreamContents(ts, new String[] { "hubble space telescope", "HST", "hs telescope"},
        new int[] {0, 0, 0},
        new int[] {3, 3, 3},
        new String[] {S, S, S},
        new int[] {1, 0, 0}
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foo bar")));
    assertTokenStreamContents(ts, new String[] { "some", "foo", "foo bar", "foo ba", "fu ba", "foobar", "bar" },
        new int[] {0, 5, 5, 5, 5, 5, 9}, //startOffset
        new int[] {4, 8,12,12,12,12,12}, //endOffset
        new String[] {O, O, S, S, S, S, O}, //type
        new int[] {1, 1, 0, 0, 0, 0, 1} //posIncr
    );

    // Five expected terms, so five entries per array (a stale sixth entry was removed).
    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foobar")));
    assertTokenStreamContents(ts, new String[] { "some", "foo bar", "foo ba", "fu ba", "foobar"},
        new int[] {0, 5, 5, 5, 5}, //startOffset
        new int[] {4,11,11,11,11}, //endOffset
        new String[] {O, S, S, S, S}, //type
        new int[] {1, 1, 0, 0, 0} //posIncr
    );
  }

  /**
   * With {@link TestParserReplaceNullsInclOrig} and {@code ignoreCase=true} the original
   * tokens are expected in addition to the (lower-cased) multi-word synonyms.
   */
  public void testMultiWordSynonymsReplaceNullsInclOrig() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("ignoreCase", "true");
    args.put("tokenizerFactory", "org.apache.lucene.analysis.core.KeywordTokenizerFactory");
    args.put("builderFactory", TestParserReplaceNullsInclOrig.class.getName());
    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSyn());

    TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("foo hubble space telescope")));
    assertTokenStreamContents(ts, new String[] { "foo", "hubble", "hubble space telescope", "hst", "hs telescope", "space", "telescope" },
        new int[] {0, 4, 4, 4, 4,11,17}, //startOffset
        new int[] {3,10,26,26,26,16,26}, //endOffset
        new String[] {O, O, S, S, S, O, O}, //type
        new int[] {1, 1, 0, 0, 0, 1, 1} //posIncr
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("hst")));
    assertTokenStreamContents(ts, new String[] { "hst", "hubble space telescope", "hst", "hs telescope"},
        new int[] {0, 0, 0, 0},
        new int[] {3, 3, 3, 3},
        new String[] {O, S, S, S},
        new int[] {1, 0, 0, 0}
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foo bar")));
    assertTokenStreamContents(ts, new String[] { "some", "foo", "foo bar", "foo ba", "fu ba", "foobar", "bar" },
        new int[] {0, 5, 5, 5, 5, 5, 9}, //startOffset
        new int[] {4, 8,12,12,12,12,12}, //endOffset
        new String[] {O, O, S, S, S, S, O}, //type
        new int[] {1, 1, 0, 0, 0, 0, 1} //posIncr
    );

    // Six expected terms, so six entries per array (a stale seventh entry was removed).
    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foobar")));
    assertTokenStreamContents(ts, new String[] { "some", "foobar", "foo bar", "foo ba", "fu ba", "foobar"},
        new int[] {0, 5, 5, 5, 5, 5}, //startOffset
        new int[] {4,11,11,11,11,11}, //endOffset
        new String[] {O, O, S, S, S, S}, //type
        new int[] {1, 1, 0, 0, 0, 0} //posIncr
    );
  }

  /**
   * With {@code NewSynonymFilterFactory.MultiTokenReplaceNulls} and {@code ignoreCase=false}
   * a matched multi-word phrase is expected to be replaced by the synonym group, without the
   * component tokens.
   */
  public void testMultiWordSynonymsNullReplaced() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("ignoreCase", "false");
    args.put("tokenizerFactory", "org.apache.lucene.analysis.core.KeywordTokenizerFactory");
    args.put("builderFactory", NewSynonymFilterFactory.MultiTokenReplaceNulls.class.getName());
    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSyn());

    TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("foo hubble space telescope")));
    assertTokenStreamContents(ts, new String[] { "foo", "hubble space telescope", "HST", "hs telescope" },
        new int[] {0, 4, 4, 4}, //startOffset
        new int[] {3,26,26,26}, //endOffset
        new String[] {O, S, S, S}, //type
        new int[] {1, 1, 0, 0} //posIncr
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("HST")));
    assertTokenStreamContents(ts, new String[] { "hubble space telescope", "HST", "hs telescope"},
        new int[] {0, 0, 0},
        new int[] {3, 3, 3},
        new String[] {S, S, S},
        new int[] {1, 0, 0}
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foo bar")));
    assertTokenStreamContents(ts, new String[] { "some", "foo bar", "foo ba", "fu ba", "foobar" },
        new int[] {0, 5, 5, 5, 5}, //startOffset
        new int[] {4,12,12,12,12}, //endOffset
        new String[] {O, S, S, S, S}, //type
        new int[] {1, 1, 0, 0, 0} //posIncr
    );

    // Five expected terms, so five entries per array (a stale sixth entry was removed).
    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foobar")));
    assertTokenStreamContents(ts, new String[] { "some", "foo bar", "foo ba", "fu ba", "foobar"},
        new int[] {0, 5, 5, 5, 5}, //startOffset
        new int[] {4,11,11,11,11}, //endOffset
        new String[] {O, S, S, S, S}, //type
        new int[] {1, 1, 0, 0, 0} //posIncr
    );
  }

  /**
   * Expectations when no {@code builderFactory} is configured: the first rule entry is
   * emitted as separate tokens and the other alternatives are stacked on the first position.
   */
  public void testMultiWordSynonymsDefault() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("tokenizerFactory", "org.apache.lucene.analysis.core.KeywordTokenizerFactory");
    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSyn());

    TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("foo hubble space telescope")));
    assertTokenStreamContents(ts, new String[] { "foo", "hubble", "HST", "hs telescope", "space", "telescope" },
        new int[] {0, 4, 4, 4,11,17}, //startOffset
        new int[] {3,10,26,26,16,26}, //endOffset
        new String[] {O, S, S, S, S, S}, //type
        new int[] {1, 1, 0, 0, 1, 1} //posIncr
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("HST")));
    assertTokenStreamContents(ts, new String[] { "hubble", "HST", "hs telescope", "space", "telescope" },
        new int[] {0, 0, 0, 0, 0},
        new int[] {3, 3, 3, 3, 3},
        new String[] {S, S, S, S, S},
        new int[] {1, 0, 0, 1, 1}
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foo bar")));
    assertTokenStreamContents(ts, new String[] { "some", "foo", "foo ba", "fu ba", "foobar", "bar" },
        new int[] {0, 5, 5, 5, 5, 9}, //startOffset
        new int[] {4, 8,12,12,12,12}, //endOffset
        new String[] {O, S, S, S, S, S}, //type
        new int[] {1, 1, 0, 0, 0, 1} //posIncr
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foobar")));
    assertTokenStreamContents(ts, new String[] { "some", "foo", "foo ba", "fu ba", "foobar", "bar"},
        new int[] {0, 5, 5, 5, 5, 5}, //startOffset
        new int[] {4,11,11,11,11,11}, //endOffset
        new String[] {O, S, S, S, S, S}, //type
        new int[] {1, 1, 0, 0, 0, 1} //posIncr
    );
  }

  /**
   * The default behaviour but the original tokens are emitted
   * before the synonyms (builder factory
   * {@code NewSynonymFilterFactory.AlwaysIncludeOriginal}, {@code ignoreCase=true}).
   */
  public void testMultiWordSynonymsInclOrig() throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("synonyms", "synonyms.txt");
    args.put("ignoreCase", "true");
    args.put("tokenizerFactory", "org.apache.lucene.analysis.core.KeywordTokenizerFactory");
    args.put("builderFactory", NewSynonymFilterFactory.AlwaysIncludeOriginal.class.getName());
    NewSynonymFilterFactory factory = new NewSynonymFilterFactory(args);
    factory.inform(getSyn());

    TokenStream ts = factory.create(whitespaceMockTokenizer(new StringReader("foo hubble space telescope")));
    assertTokenStreamContents(ts, new String[] { "foo", "hubble", "hubble", "hst", "hs telescope", "space", "space", "telescope", "telescope" },
        new int[] {0, 4, 4, 4, 4,11,11,17,17}, //startOffset
        new int[] {3,10,10,26,26,16,16,26,26}, //endOffset
        new String[] {O, O, S, S, S, O, S, O, S}, //type
        new int[] {1, 1, 0, 0, 0, 1, 0, 1, 0} //posIncr
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("hst")));
    assertTokenStreamContents(ts, new String[] { "hst", "hubble", "hst", "hs telescope", "space", "telescope" },
        new int[] {0, 0, 0, 0, 0, 0},
        new int[] {3, 3, 3, 3, 3, 3},
        new String[] {O, S, S, S, S, S},
        new int[] {1, 0, 0, 0, 1, 1}
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foo bar")));
    assertTokenStreamContents(ts, new String[] { "some", "foo", "foo", "foo ba", "fu ba", "foobar", "bar", "bar" },
        new int[] {0, 5, 5, 5, 5, 5, 9, 9}, //startOffset
        new int[] {4, 8, 8,12,12,12,12,12}, //endOffset
        new String[] {O, O, S, S, S, S, O, S}, //type
        new int[] {1, 1, 0, 0, 0, 0, 1, 0} //posIncr
    );

    ts = factory.create(whitespaceMockTokenizer(new StringReader("some foobar")));
    assertTokenStreamContents(ts, new String[] { "some", "foobar", "foo", "foo ba", "fu ba", "foobar", "bar"},
        new int[] {0, 5, 5, 5, 5, 5, 5}, //startOffset
        new int[] {4,11,11,11,11,11,11}, //endOffset
        new String[] {O, O, S, S, S, S, S}, //type
        new int[] {1, 1, 0, 0, 0, 0, 1} //posIncr
    );
  }
}
/**
 * Minimal {@link ResourceLoader} for tests: every resource name resolves to the same
 * in-memory string, and classes are resolved through the current thread's context class
 * loader.
 */
class StringMockResourceLoader implements ResourceLoader {

  /** Content handed out, UTF-8 encoded, for every requested resource. */
  final String text;

  /**
   * @param text the content that {@link #openResource(String)} returns for any name
   */
  public StringMockResourceLoader(String text) {
    this.text = text;
  }

  /**
   * Instantiates {@code cname} through its no-argument constructor.
   *
   * <p>The class is looked up with {@link #findClass(String, Class)} so that both methods
   * resolve classes through the same class loader.
   *
   * @param cname fully qualified name of the class to instantiate
   * @param expectedType type the class must be assignable to
   * @return a new instance of the class
   * @throws RuntimeException if the class cannot be loaded or instantiated
   */
  @Override
  public <T> T newInstance(String cname, Class<T> expectedType) {
    Class<? extends T> clazz = findClass(cname, expectedType);
    try {
      // Class.newInstance() is deprecated; going through the constructor is the
      // recommended replacement.
      return clazz.getDeclaredConstructor().newInstance();
    } catch (Exception e) {
      throw new RuntimeException("Cannot create instance: " + cname, e);
    }
  }

  /**
   * Returns a fresh stream over the UTF-8 bytes of the configured text; the
   * {@code resource} name is ignored.
   */
  @Override
  public InputStream openResource(String resource) throws IOException {
    return new ByteArrayInputStream(text.getBytes("UTF-8"));
  }

  /**
   * Loads {@code cname} with the current thread's context class loader and narrows it to
   * {@code expectedType}.
   *
   * @throws RuntimeException if the class cannot be loaded or is not a subtype of
   *         {@code expectedType}
   */
  @Override
  public <T> Class<? extends T> findClass(String cname, Class<T> expectedType) {
    try {
      return Class.forName(cname, true, Thread.currentThread().getContextClassLoader()).asSubclass(expectedType);
    } catch (Exception e) {
      throw new RuntimeException("Cannot load class: " + cname, e);
    }
  }
}