package org.apache.lucene.analysis.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
* Abstract parent class for analysis factories {@link TokenizerFactory},
* {@link TokenFilterFactory} and {@link CharFilterFactory}.
* <p>
* The typical lifecycle for a factory consumer is:
* <ol>
* <li>Create factory via its a no-arg constructor
* <li>Set version emulation by calling {@link #setLuceneMatchVersion(Version)}
* <li>Calls {@link #init(Map)} passing arguments as key-value mappings.
* <li>(Optional) If the factory uses resources such as files, {@link ResourceLoaderAware#inform(ResourceLoader)} is called to initialize those resources.
* <li>Consumer calls create() to obtain instances.
* </ol>
*/
public abstract class AbstractAnalysisFactory {
/** The init args */
protected Map<String,String> args;
/** the luceneVersion arg */
protected Version luceneMatchVersion = null;
/**
* Initialize this factory via a set of key-value pairs.
*/
public void init(Map<String,String> args) {
this.args = args;
}
public Map<String,String> getArgs() {
return args;
}
/** this method can be called in the {@link org.apache.lucene.analysis.util.TokenizerFactory#create(java.io.Reader)}
* or {@link org.apache.lucene.analysis.util.TokenFilterFactory#create(org.apache.lucene.analysis.TokenStream)} methods,
* to inform user, that for this factory a {@link #luceneMatchVersion} is required */
protected final void assureMatchVersion() {
if (luceneMatchVersion == null) {
throw new IllegalArgumentException("Configuration Error: Factory '" + this.getClass().getName() +
"' needs a 'luceneMatchVersion' parameter");
}
}
public void setLuceneMatchVersion(Version luceneMatchVersion) {
this.luceneMatchVersion = luceneMatchVersion;
}
public Version getLuceneMatchVersion() {
return this.luceneMatchVersion;
}
protected int getInt(String name) {
return getInt(name, -1, false);
}
protected int getInt(String name, int defaultVal) {
return getInt(name, defaultVal, true);
}
protected int getInt(String name, int defaultVal, boolean useDefault) {
String s = args.get(name);
if (s == null) {
if (useDefault) {
return defaultVal;
}
throw new IllegalArgumentException("Configuration Error: missing parameter '" + name + "'");
}
return Integer.parseInt(s);
}
protected boolean getBoolean(String name, boolean defaultVal) {
return getBoolean(name, defaultVal, true);
}
protected boolean getBoolean(String name, boolean defaultVal, boolean useDefault) {
String s = args.get(name);
if (s==null) {
if (useDefault) return defaultVal;
throw new IllegalArgumentException("Configuration Error: missing parameter '" + name + "'");
}
return Boolean.parseBoolean(s);
}
/**
* Compiles a pattern for the value of the specified argument key <code>name</code>
*/
protected Pattern getPattern(String name) {
try {
String pat = args.get(name);
if (null == pat) {
throw new IllegalArgumentException("Configuration Error: missing parameter '" + name + "'");
}
return Pattern.compile(args.get(name));
} catch (PatternSyntaxException e) {
throw new IllegalArgumentException
("Configuration Error: '" + name + "' can not be parsed in " +
this.getClass().getSimpleName(), e);
}
}
/**
* Returns as {@link CharArraySet} from wordFiles, which
* can be a comma-separated list of filenames
*/
protected CharArraySet getWordSet(ResourceLoader loader,
String wordFiles, boolean ignoreCase) throws IOException {
assureMatchVersion();
List<String> files = splitFileNames(wordFiles);
CharArraySet words = null;
if (files.size() > 0) {
// default stopwords list has 35 or so words, but maybe don't make it that
// big to start
words = new CharArraySet(luceneMatchVersion,
files.size() * 10, ignoreCase);
for (String file : files) {
List<String> wlist = getLines(loader, file.trim());
words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist,
ignoreCase));
}
}
return words;
}
/**
* Returns the resource's lines (with content treated as UTF-8)
*/
protected List<String> getLines(ResourceLoader loader, String resource) throws IOException {
return WordlistLoader.getLines(loader.openResource(resource), IOUtils.CHARSET_UTF_8);
}
/** same as {@link #getWordSet(ResourceLoader, String, boolean)},
* except the input is in snowball format. */
protected CharArraySet getSnowballWordSet(ResourceLoader loader,
String wordFiles, boolean ignoreCase) throws IOException {
assureMatchVersion();
List<String> files = splitFileNames(wordFiles);
CharArraySet words = null;
if (files.size() > 0) {
// default stopwords list has 35 or so words, but maybe don't make it that
// big to start
words = new CharArraySet(luceneMatchVersion,
files.size() * 10, ignoreCase);
for (String file : files) {
InputStream stream = null;
Reader reader = null;
try {
stream = loader.openResource(file.trim());
CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
reader = new InputStreamReader(stream, decoder);
WordlistLoader.getSnowballWordSet(reader, words);
} finally {
IOUtils.closeWhileHandlingException(reader, stream);
}
}
}
return words;
}
/**
* Splits file names separated by comma character.
* File names can contain comma characters escaped by backslash '\'
*
* @param fileNames the string containing file names
* @return a list of file names with the escaping backslashed removed
*/
protected List<String> splitFileNames(String fileNames) {
if (fileNames == null)
return Collections.<String>emptyList();
List<String> result = new ArrayList<String>();
for (String file : fileNames.split("(?<!\\\\),")) {
result.add(file.replaceAll("\\\\(?=,)", ""));
}
return result;
}
}