package org.apache.lucene.analysis.jate;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;
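/**
* This class wraps OpenNLPTokenizer to support PoS pattern sequence
* matching based candidate extraction at index-time.
* <p>
* It is a filter factory: each call to {@code create} returns an
* {@link OpenNLPRegexChunker} configured with the patterns read from the
* resource named by the mandatory 'patterns' argument.
* <p>
* Created by zqz on 28/09/2015.
*/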
public class OpenNLPRegexChunkerFactory extends MWEFilterFactory {
    /** Compiled patterns, grouped by the label in the first column of the pattern resource. */
    private final Map<String, Pattern[]> patterns = new HashMap<>();

    /** Value of the mandatory 'patterns' argument; loaded through the ResourceLoader in {@link #inform}. */
    private final String patternFile;

    /**
     * Initialize this factory via a set of key-value pairs.
     *
     * @param args arguments for the filter,
     *             see example of jate_text_2_terms field type in JATE 2.0 solr schema.xml
     * @throws IllegalArgumentException if the 'patterns' argument is absent
     */
    public OpenNLPRegexChunkerFactory(Map<String, String> args) {
        super(args);
        patternFile = args.get("patterns");
        if (patternFile == null) {
            throw new IllegalArgumentException("Parameter 'patterns' for chunker is missing.");
        }
    }

    /**
     * Parses pattern definition lines and stores the compiled patterns in {@code patterns}.
     * <p>
     * Each data line has the form {@code <label>TAB<regex>}. Blank lines and lines whose first
     * non-whitespace character is '#' are skipped. Several lines may share a label; their
     * patterns are collected, in input order, into one array for that label.
     *
     * @param rawData  lines of the pattern resource
     * @param patterns destination map from label to compiled patterns; an existing entry with
     *                 the same label is replaced
     * @throws IllegalArgumentException if a data line contains no tab separator, or if its regex
     *                                  does not compile (PatternSyntaxException)
     */
    private void initPatterns(List<String> rawData, Map<String, Pattern[]> patterns) throws IOException {
        Map<String, List<Pattern>> grouped = new HashMap<>();
        for (String lineStr : rawData) {
            // Test the trimmed text so that indented comments are skipped as well.
            String trimmed = lineStr.trim();
            if (trimmed.isEmpty() || trimmed.startsWith("#")) {
                continue;
            }
            // Limit 2: only the first tab separates label and regex, the regex may contain tabs.
            String[] parts = lineStr.split("\t", 2);
            if (parts.length < 2) {
                // Fail with the offending line instead of an ArrayIndexOutOfBoundsException.
                throw new IllegalArgumentException(
                        "Malformed pattern line, expected '<label><TAB><regex>' but got: '" + lineStr + "'");
            }
            List<Pattern> pats = grouped.get(parts[0]);
            if (pats == null) {
                pats = new ArrayList<>();
                grouped.put(parts[0], pats);
            }
            pats.add(Pattern.compile(parts[1]));
        }
        for (Map.Entry<String, List<Pattern>> en : grouped.entrySet()) {
            patterns.put(en.getKey(), en.getValue().toArray(new Pattern[0]));
        }
    }

    /**
     * Creates a chunker over {@code input}, passing it the loaded patterns together with the
     * token/character limits and stop-word/symbol settings held by this factory.
     *
     * @param input the upstream token stream
     * @return a new {@link OpenNLPRegexChunker} wrapping {@code input}
     */
    @Override
    public TokenStream create(TokenStream input) {
        return new OpenNLPRegexChunker(input, patterns, maxTokens,
                minTokens,
                maxCharLength, minCharLength,
                removeLeadingStopwords, removeTrailingStopwords,
                removeLeadingSymbolicTokens, removeTrailingSymbolicTokens,
                stripLeadingSymbolChars,
                stripTrailingSymbolChars,
                stripAnySymbolChars,
                stopWords, stopWordsIgnoreCase);
    }

    /**
     * Loads and compiles the pattern resource after the superclass has been informed.
     *
     * @param loader resource loader used to read the pattern resource
     * @throws IOException              if thrown by the superclass
     * @throws IllegalArgumentException if the pattern resource cannot be read (the underlying
     *                                  IOException is attached as the cause) or contains a
     *                                  malformed line
     */
    @Override
    public void inform(ResourceLoader loader) throws IOException {
        super.inform(loader);
        // patternFile is never null here: the constructor rejects a missing 'patterns' argument.
        try {
            List<String> lines = getLines(loader, patternFile.trim());
            initPatterns(lines, patterns);
        } catch (IOException ioe) {
            // Chain the cause so the original stack trace is kept without being inlined in the message.
            throw new IllegalArgumentException("Initiating " + this.getClass().getName()
                    + " failed due to patterns (resource '" + patternFile + "'). Details: "
                    + ioe.getMessage(), ioe);
        }
    }
}