package org.apache.lucene.analysis.jate;

import opennlp.tools.namefind.RegexNameFinder;
import opennlp.tools.util.Span;
import org.apache.lucene.analysis.TokenStream;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Generates candidate multi-word terms by matching sequences of PoS tags
 * against the supplied regular-expression patterns.
 *
 * <p>All tokens of the underlying stream are consumed up front (via the
 * superclass {@code walkTokens()}), the PoS-tag sequence is chunked with a
 * {@link RegexNameFinder}, and matched spans are then emitted one per call to
 * {@link #incrementToken()} alongside the ordinary tokens.</p>
 */
public final class OpenNLPRegexChunker extends OpenNLPMWEFilter {

    /** Matches PoS-tag sequences against the configured patterns; never reassigned. */
    private final RegexNameFinder regexChunker;

    /**
     * @param input               the upstream token stream to consume
     * @param patterns            PoS-tag regex patterns, keyed by chunk type
     * @param maxTokens           maximum tokens allowed in a candidate chunk
     * @param minTokens           minimum tokens required in a candidate chunk
     * @param maxCharLength       maximum character length of a candidate
     * @param minCharLength       minimum character length of a candidate
     * (remaining flags configure stop-word/symbol trimming; they are passed
     *  straight through to {@link OpenNLPMWEFilter}).
     */
    public OpenNLPRegexChunker(
            TokenStream input,
            Map<String, Pattern[]> patterns,
            int maxTokens, int minTokens,
            int maxCharLength, int minCharLength,
            boolean removeLeadingStopWords, boolean removeTrailingStopwords,
            boolean removeLeadingSymbolicTokens, boolean removeTrailingSymbolicTokens,
            boolean stripLeadingSymbolChars, boolean stripTrailingSymbolChars,
            boolean stripAllSymbolChars,
            Set<String> stopWords, boolean stopWordsIgnoreCase) {
        super(input, minTokens, maxTokens, minCharLength, maxCharLength,
                removeLeadingStopWords, removeTrailingStopwords,
                removeLeadingSymbolicTokens, removeTrailingSymbolicTokens,
                stripLeadingSymbolChars, stripTrailingSymbolChars, stripAllSymbolChars,
                stopWords, stopWordsIgnoreCase);
        regexChunker = new RegexNameFinder(patterns);
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        if (first) {
            // Gather all tokens (and their PoS tags) from the whole document.
            String[][] wordsAndPOS = walkTokens();
            String[] words = wordsAndPOS[0];
            if (words.length == 0) {
                return false;
            }
            String[] pos = wordsAndPOS[1];
            // Chunking: find candidate spans over the PoS-tag sequence,
            // then prune them against the configured length/token limits.
            Span[] chunks = regexChunker.find(pos);
            chunks = prune(chunks, words);
            for (Span sp : chunks) {
                // A start offset may open several chunks, so record every end
                // offset; create and register the list only on first sight.
                List<Integer> ends = chunkSpans.get(sp.getStart());
                if (ends == null) {
                    ends = new ArrayList<>();
                    chunkSpans.put(sp.getStart(), ends);
                }
                ends.add(sp.getEnd());
                chunkTypes.put(sp.getStart(), sp.getType());
            }
            first = false;
            tokenIdx = 0;
        }
        if (tokenIdx == tokenAttrs.size()) {
            // All buffered tokens emitted: reset state for stream reuse.
            resetParams();
            return false;
        }
        if (chunkStart != -1 && chunkEnds.contains(tokenIdx)) {
            // Already inside a chunk and the current index is one of its ends.
            // Do NOT advance tokenIdx here: a span's end offset is exclusive,
            // so the token at this index still needs to be emitted.
            addMWE(tokenIdx);
            return true;
        }
        if (chunkSpans.containsKey(tokenIdx)) {
            // The current token index begins a new chunk.
            chunkStart = tokenIdx;
            chunkEnds = chunkSpans.get(tokenIdx);
            // Jump straight to the first end offset recorded for this start.
            tokenIdx = chunkEnds.get(0);
            return true;
        }
        // A plain token that is not part of any chunk.
        tokenIdx++;
        return true;
    }
}