/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Factory for {@link SynonymFilter} that builds its {@link SynonymMap} from
 * synonym rule files.
 *
 * <p>Arguments read from {@code args} by {@link #inform(ResourceLoader)}:
 * <ul>
 *   <li>{@code synonyms} - name of the rule file, or a list of file names</li>
 *   <li>{@code ignoreCase} - handed to the {@link SynonymMap} constructor
 *       (default {@code false})</li>
 *   <li>{@code expand} - for rules without a {@code =>} mapping: when
 *       {@code true} every synonym maps to all synonyms of the rule, when
 *       {@code false} every synonym maps to the first one only
 *       (default {@code true})</li>
 *   <li>{@code tokenizerFactory} - optional class name of a
 *       {@code TokenizerFactory} used to split each synonym into tokens;
 *       when absent, synonyms are split on whitespace</li>
 * </ul>
 *
 * @version $Id: SynonymFilterFactory.java 942827 2010-05-10 17:37:45Z rmuir $
 */
public class SynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {

  /** Synonym map built by {@link #inform(ResourceLoader)}; stays null when no "synonyms" argument is configured. */
  private SynonymMap synMap;

  /**
   * Reads the factory arguments, loads the synonym rule lines through the
   * given loader and parses them into a new {@link SynonymMap}.
   *
   * <p>Nothing is built when the {@code synonyms} argument is missing.
   *
   * @param loader used to read the rule files and to instantiate the
   *               optional tokenizer factory
   * @throws RuntimeException wrapping any {@link IOException} raised while
   *                          reading the rule files, or when a rule is invalid
   */
  public void inform(ResourceLoader loader) {
    String synonyms = args.get("synonyms");
    boolean ignoreCase = getBoolean("ignoreCase", false);
    boolean expand = getBoolean("expand", true);

    String tf = args.get("tokenizerFactory");
    TokenizerFactory tokFactory = null;
    if (tf != null) {
      tokFactory = loadTokenizerFactory(loader, tf, args);
    }

    if (synonyms != null) {
      List<String> wlist = null;
      try {
        File synonymFile = new File(synonyms);
        if (synonymFile.exists()) {
          // The whole value names an existing file: load it as one resource.
          wlist = loader.getLines(synonyms);
        } else {
          // Otherwise treat the value as a list of file names and
          // concatenate the lines of every file, in order.
          List<String> files = StrUtils.splitFileNames(synonyms);
          wlist = new ArrayList<String>();
          for (String file : files) {
            List<String> lines = loader.getLines(file.trim());
            wlist.addAll(lines);
          }
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      synMap = new SynonymMap(ignoreCase);
      parseRules(wlist, synMap, "=>", ",", expand, tokFactory);
    }
  }

  /**
   * Parses synonym rules and adds the resulting mappings to {@code map}.
   *
   * <p>A rule is either {@code source => target} (explicit mapping) or a plain
   * list of synonyms. Both sides are lists of synonyms separated by
   * {@code synSep}; each synonym may consist of several tokens.
   *
   * @param rules      the rule lines to parse
   * @param map        the map that receives the parsed mappings
   * @param mappingSep separator between the source and target side of a rule
   * @param synSep     separator between the synonyms of one side
   * @param expansion  for rules without {@code mappingSep}: {@code true} maps
   *                   every synonym to all synonyms of the rule, {@code false}
   *                   maps every synonym to the first one
   * @param tokFactory tokenizer factory used to split a synonym into tokens,
   *                   or null to split on whitespace
   * @throws RuntimeException if a rule has no content, contains more than one
   *                          {@code mappingSep}, or has no synonym to reduce to
   *                          when {@code expansion} is false
   */
  static void parseRules(List<String> rules, SynonymMap map, String mappingSep,
                         String synSep, boolean expansion, TokenizerFactory tokFactory) {
    for (String rule : rules) {
      // To use regexes, we need an expression that specifies an odd number of chars.
      // This can't really be done with string.split(), and since we need to
      // do unescaping at some point anyway, we wouldn't be saving any effort
      // by using regexes.
      List<String> mapping = StrUtils.splitSmart(rule, mappingSep, false);

      // Reject rules that split to nothing as well as rules with several
      // mapping separators, instead of failing later with a bare
      // IndexOutOfBoundsException that does not name the offending rule.
      if (mapping.isEmpty() || mapping.size() > 2) {
        throw new RuntimeException("Invalid Synonym Rule:" + rule);
      }

      List<List<String>> source = getSynList(mapping.get(0), synSep, tokFactory);
      List<List<String>> target;
      if (mapping.size() == 2) {
        target = getSynList(mapping.get(1), synSep, tokFactory);
      } else if (expansion) {
        // expand to all arguments
        target = source;
      } else {
        // reduce to first argument
        if (source.isEmpty()) {
          throw new RuntimeException("Invalid Synonym Rule:" + rule);
        }
        target = new ArrayList<List<String>>(1);
        target.add(source.get(0));
      }

      boolean includeOrig = false;
      for (List<String> fromToks : source) {
        for (List<String> toToks : target) {
          map.add(fromToks,
                  SynonymMap.makeTokens(toToks),
                  includeOrig,
                  true
          );
        }
      }
    }
  }

  /**
   * Splits one side of a rule into synonyms and each synonym into tokens.
   *
   * <p>Example with whitespace splitting and "," as separator:
   * {@code a , b c , d e f => [[a],[b,c],[d,e,f]]}
   *
   * @param str        one side of a synonym rule
   * @param separator  separator between synonyms
   * @param tokFactory tokenizer factory used to split a synonym into tokens,
   *                   or null to split on whitespace
   * @return one token list per synonym, in rule order
   */
  private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
    List<String> strList = StrUtils.splitSmart(str, separator, false);
    // now split on whitespace (or with the tokenizer) to get a list of token strings
    List<List<String>> synList = new ArrayList<List<String>>();
    for (String toks : strList) {
      List<String> tokList = tokFactory == null
          ? StrUtils.splitWS(toks, true)
          : splitByTokenizer(toks, tokFactory);
      synList.add(tokList);
    }
    return synList;
  }

  /**
   * Runs {@code source} through a tokenizer created by {@code tokFactory} and
   * collects the text of every non-empty term.
   *
   * @param source     the text to tokenize
   * @param tokFactory factory that creates the tokenizer
   * @return the non-empty terms, in the order the tokenizer produced them
   * @throws RuntimeException wrapping any {@link IOException} raised while
   *                          tokenizing
   */
  private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
    StringReader reader = new StringReader(source);
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
      // NOTE(review): the stream is consumed without reset()/end()/close();
      // confirm whether the Lucene version in use requires that lifecycle.
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
          tokList.add(termAtt.toString());
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    } finally {
      reader.close();
    }
    return tokList;
  }

  /**
   * Instantiates the named tokenizer factory through the loader and
   * initializes it with the given arguments.
   *
   * @param loader used to create the instance
   * @param cname  class name of the tokenizer factory
   * @param args   arguments passed to the factory's {@code init}
   * @return the initialized tokenizer factory
   */
  private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String,String> args) {
    TokenizerFactory tokFactory = (TokenizerFactory) loader.newInstance(cname);
    tokFactory.init(args);
    return tokFactory;
  }

  /** Creates a tokenizer over {@code reader} using {@code tokFactory}. */
  private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader) {
    return tokFactory.create(reader);
  }

  /**
   * Returns the synonym map built by {@link #inform(ResourceLoader)}.
   *
   * @return the synonym map, or null if none has been built
   */
  public SynonymMap getSynonymMap() {
    return synMap;
  }

  /**
   * Wraps {@code input} in a {@link SynonymFilter} backed by this factory's
   * synonym map.
   *
   * @param input the token stream to filter
   * @return a new synonym filter over {@code input}
   */
  public SynonymFilter create(TokenStream input) {
    return new SynonymFilter(input, synMap);
  }
}