package org.apache.lucene.analysis.synonym;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
/**
* Factory for {@link SlowSynonymFilter} (only used with luceneMatchVersion &lt; 3.4).
* <pre class="prettyprint">
* &lt;fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
*   &lt;analyzer&gt;
*     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
*     &lt;filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
*             expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/&gt;
*   &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* @deprecated (3.4) Use {@link SynonymFilterFactory} instead. This factory is kept only for
*             precise index backwards compatibility and will be removed in Lucene 5.0.
*/
@Deprecated
final class SlowSynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
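/**
* Reads the factory arguments ({@code synonyms}, {@code ignoreCase}, {@code expand},
* {@code tokenizerFactory}), loads the rule file(s) via the given loader, and builds
* the {@link SlowSynonymMap} used by {@link #create(TokenStream)}.
*/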
public void inform(ResourceLoader loader) throws IOException {
String synonyms = args.get("synonyms");
if (synonyms == null)
throw new IllegalArgumentException("Missing required argument 'synonyms'.");
boolean ignoreCase = getBoolean("ignoreCase", false);
boolean expand = getBoolean("expand", true);
String tf = args.get("tokenizerFactory");
TokenizerFactory tokFactory = null;
if (tf != null) {
tokFactory = loadTokenizerFactory(loader, tf);
}
Iterable<String> wlist = loadRules(synonyms, loader);
synMap = new SlowSynonymMap(ignoreCase);
parseRules(wlist, synMap, "=>", ",", expand, tokFactory);
}
/**
* Loads the synonym rules from the named resource; {@code synonyms} may be a single file
* name or a comma-separated list of file names, each loaded via the {@link ResourceLoader}.
* @return a list of all rules
*/
protected Iterable<String> loadRules( String synonyms, ResourceLoader loader ) throws IOException {
List<String> wlist=null;
File synonymFile = new File(synonyms);
if (synonymFile.exists()) {
wlist = getLines(loader, synonyms);
} else {
List<String> files = splitFileNames(synonyms);
wlist = new ArrayList<String>();
for (String file : files) {
List<String> lines = getLines(loader, file.trim());
wlist.addAll(lines);
}
}
return wlist;
}
private SlowSynonymMap synMap;
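/**
* Parses synonym rules into the map. With the separators used by this factory, a rule is either
* a bidirectional group such as {@code "i-pod, ipod, i pod"} (split on {@code synSep}) or an
* explicit mapping such as {@code "foo => bar, baz"} (split on {@code mappingSep}); when
* {@code expansion} is false, a group without {@code =>} is reduced to its first entry.
*/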
static void parseRules(Iterable<String> rules, SlowSynonymMap map, String mappingSep,
String synSep, boolean expansion, TokenizerFactory tokFactory) throws IOException {
int count=0;
for (String rule : rules) {
// Splitting on an unescaped separator with a regex would require an expression that
// counts preceding escape characters, which can't really be done with String.split().
// Since we need to do unescaping at some point anyway, we wouldn't be saving any
// effort by using regexes.
List<String> mapping = splitSmart(rule, mappingSep, false);
List<List<String>> source;
List<List<String>> target;
if (mapping.size() > 2) {
throw new IllegalArgumentException("Invalid Synonym Rule:" + rule);
} else if (mapping.size()==2) {
source = getSynList(mapping.get(0), synSep, tokFactory);
target = getSynList(mapping.get(1), synSep, tokFactory);
} else {
source = getSynList(mapping.get(0), synSep, tokFactory);
if (expansion) {
// expand to all arguments
target = source;
} else {
// reduce to first argument
target = new ArrayList<List<String>>(1);
target.add(source.get(0));
}
}
boolean includeOrig=false;
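// Add every (source token sequence -> target token sequence) pair to the map.
// Originals are never kept since includeOrig is always false here; the final
// boolean passed to map.add(...) appears to merge the new entry with any existing
// entry for the same source tokens (assumption based on the SlowSynonymMap.add signature).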
for (List<String> fromToks : source) {
count++;
for (List<String> toToks : target) {
map.add(fromToks,
SlowSynonymMap.makeTokens(toToks),
includeOrig,
true
);
}
}
}
}
// a , b c , d e f => [[a],[b,c],[d,e,f]]
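// i.e. split the rule side on the synonym separator, then tokenize each alternative
// into its own list of token strings.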
private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) throws IOException {
List<String> strList = splitSmart(str, separator, false);
// now split on whitespace to get a list of token strings
List<List<String>> synList = new ArrayList<List<String>>();
for (String toks : strList) {
List<String> tokList = tokFactory == null ?
splitWS(toks, true) : splitByTokenizer(toks, tokFactory);
synList.add(tokList);
}
return synList;
}
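/** Tokenizes one side of a rule with the configured tokenizer factory and returns the token texts. */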
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) throws IOException{
StringReader reader = new StringReader( source );
TokenStream ts = loadTokenizer(tokFactory, reader);
List<String> tokList = new ArrayList<String>();
try {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset(); // TokenStream contract: reset() must be called before incrementToken()
while (ts.incrementToken()){
if( termAtt.length() > 0 )
tokList.add( termAtt.toString() );
}
} finally{
reader.close();
}
return tokList;
}
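/**
* Instantiates the tokenizer factory named by the {@code tokenizerFactory} argument,
* initializing it with this factory's args and Lucene match version.
*/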
private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname) throws IOException {
TokenizerFactory tokFactory = loader.newInstance(cname, TokenizerFactory.class);
tokFactory.setLuceneMatchVersion(luceneMatchVersion);
tokFactory.init( args );
if (tokFactory instanceof ResourceLoaderAware) {
((ResourceLoaderAware) tokFactory).inform(loader);
}
return tokFactory;
}
private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader){
return tokFactory.create( reader );
}
public SlowSynonymMap getSynonymMap() {
return synMap;
}
public SlowSynonymFilter create(TokenStream input) {
return new SlowSynonymFilter(input, synMap);
}
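/**
* Splits on whitespace, honoring backslash escapes when {@code decode} is true.
* For example (illustrative): {@code splitWS("foo\\ bar baz", true)} yields
* {@code ["foo bar", "baz"]} because the escaped space is kept literal.
*/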
public static List<String> splitWS(String s, boolean decode) {
ArrayList<String> lst = new ArrayList<String>(2);
StringBuilder sb = new StringBuilder();
int pos=0, end=s.length();
while (pos < end) {
char ch = s.charAt(pos++);
if (Character.isWhitespace(ch)) {
if (sb.length() > 0) {
lst.add(sb.toString());
sb=new StringBuilder();
}
continue;
}
if (ch=='\\') {
if (!decode) sb.append(ch);
if (pos>=end) break; // ERROR, or let it go?
ch = s.charAt(pos++);
if (decode) {
switch(ch) {
case 'n' : ch='\n'; break;
case 't' : ch='\t'; break;
case 'r' : ch='\r'; break;
case 'b' : ch='\b'; break;
case 'f' : ch='\f'; break;
}
}
}
sb.append(ch);
}
if (sb.length() > 0) {
lst.add(sb.toString());
}
return lst;
}
/** Splits a backslash escaped string on the separator.
* <p>
* Current backslash escaping supported:
* <br> \n \t \r \b \f are escaped the same as a Java String
* <br> Other characters following a backslash are produced verbatim (\c => c)
*
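* <br> For example (illustrative): {@code splitSmart("a\\,b,c", ",", true)} yields
* {@code ["a,b", "c"]}, since the escaped comma is kept literal and empty segments are dropped.
*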
* @param s the string to split
* @param separator the separator to split on
* @param decode decode backslash escaping
*/
public static List<String> splitSmart(String s, String separator, boolean decode) {
ArrayList<String> lst = new ArrayList<String>(2);
StringBuilder sb = new StringBuilder();
int pos=0, end=s.length();
while (pos < end) {
if (s.startsWith(separator,pos)) {
if (sb.length() > 0) {
lst.add(sb.toString());
sb=new StringBuilder();
}
pos+=separator.length();
continue;
}
char ch = s.charAt(pos++);
if (ch=='\\') {
if (!decode) sb.append(ch);
if (pos>=end) break; // ERROR, or let it go?
ch = s.charAt(pos++);
if (decode) {
switch(ch) {
case 'n' : ch='\n'; break;
case 't' : ch='\t'; break;
case 'r' : ch='\r'; break;
case 'b' : ch='\b'; break;
case 'f' : ch='\f'; break;
}
}
}
sb.append(ch);
}
if (sb.length() > 0) {
lst.add(sb.toString());
}
return lst;
}
}