/*
* Copyright 2007 T-Rank AS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package no.trank.openpipe.opennlp.step;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import opennlp.tools.namefind.NameFinder;
import opennlp.tools.namefind.NameFinderME;
import no.trank.openpipe.api.MultiInputFieldPipelineStep;
import no.trank.openpipe.api.PipelineException;
import no.trank.openpipe.api.document.AnnotatedField;
import no.trank.openpipe.api.document.Annotation;
import no.trank.openpipe.api.document.BaseAnnotation;
import no.trank.openpipe.api.document.Document;
import no.trank.openpipe.api.document.ResolvedAnnotation;
import no.trank.openpipe.config.annotation.NotEmpty;
/**
 * Pipeline step that runs the configured OpenNLP {@link NameFinder}s over each input field and stores the names they
 * find as annotations on that field.
 * <p>
 * For every field value the step walks the annotations of type {@link ONLPSentenceDetector#TYPE_SENTENCE} and
 * {@link ONLPTokenizer#TYPE_TOKENIZE}, feeds the token values of each sentence to every name finder, and adds the
 * resulting annotations under the type {@link #TYPE_NE}<tt> + &lt;key of the name finder&gt;</tt>.
 *
 * @version $Revision$
 */
public class ONLPNEDetector extends MultiInputFieldPipelineStep {
   /** Prefix of the annotation type; the key of the name finder in {@link #getNameFinders()} is appended to it. */
   public static final String TYPE_NE = "opennlp.ne.";

   /** Name finders to run, keyed by the suffix used to build the annotation type. */
   @NotEmpty
   private Map<String, NameFinder> nameFinders = null;

   /**
    * Runs name detection on every value of the field.
    *
    * @param doc the document being processed (not used by this step directly).
    * @param fieldName the name of the field being processed (not used by this step directly).
    * @param fieldValues the values to annotate.
    * @throws PipelineException declared by the overridden signature; the code in this class does not throw it itself.
    */
   @Override
   protected void process(Document doc, String fieldName, List<AnnotatedField> fieldValues) throws PipelineException {
      for (AnnotatedField fieldValue : fieldValues) {
         processField(fieldValue);
      }
   }

   /**
    * Detects names in a single field value and adds one annotation list per name finder that found something.
    *
    * @param field the field value to read sentence/token annotations from and to add name annotations to.
    */
   private void processField(AnnotatedField field) {
      // Fresh holders per field value: found annotations and previous tags are not shared between values.
      final Map<String, NameFinderHolder> holders = buildHolders();
      final Iterator<ResolvedAnnotation> sentIt = field.iterator(ONLPSentenceDetector.TYPE_SENTENCE);
      final ListIterator<ResolvedAnnotation> tokIt = field.iterator(ONLPTokenizer.TYPE_TOKENIZE);
      // Reused for every sentence; buildWordsTokens() clears them.
      final List<String> words = new ArrayList<String>();
      final List<ResolvedAnnotation> tokens = new ArrayList<ResolvedAnnotation>();

      while (sentIt.hasNext()) {
         if (buildWordsTokens(sentIt.next(), tokIt, words, tokens)) {
            for (NameFinderHolder holder : holders.values()) {
               findNE(words, tokens, holder);
            }
         }
      }

      for (Map.Entry<String, NameFinderHolder> entry : holders.entrySet()) {
         final List<Annotation> list = entry.getValue().getAnnotations();
         if (!list.isEmpty()) {
            field.add(TYPE_NE + entry.getKey(), list);
         }
      }
   }

   /**
    * Runs one name finder over the words of a sentence and appends an annotation for every name found to the holder.
    * <p>
    * A name begins at a token tagged {@link NameFinderME#START} and extends up to, but not including, the next token
    * tagged {@link NameFinderME#START} or {@link NameFinderME#OTHER}, or to the last tagged token of the sentence.
    *
    * @param words the token values of the sentence, in order.
    * @param tokens the token annotations of the sentence; <tt>tokens.get(i)</tt> corresponds to
    * <tt>words.get(i)</tt>.
    * @param holder the name finder to run together with its accumulated annotations and previous tags.
    */
   private static void findNE(List<String> words, List<ResolvedAnnotation> tokens, NameFinderHolder holder) {
      final NameFinder nameFinder = holder.getNameFinder();
      final Map<String, String> prevTags = holder.getPreviousTags();
      final List<Annotation> found = holder.getAnnotations();
      final List<?> tags = nameFinder.find(words, prevTags);

      Annotation startToken = null;
      int idx = 0;
      for (Object rawTag : tags) {
         final String tag = (String) rawTag;
         // Remember the tag given to this word; the map is handed back to the name finder for later sentences.
         prevTags.put(words.get(idx), tag);

         final boolean startTag = NameFinderME.START.equals(tag);
         if (startToken != null && (startTag || NameFinderME.OTHER.equals(tag))) {
            // The open name ended on the previous token.
            found.add(new BaseAnnotation(startToken.getStartPos(), tokens.get(idx - 1).getEndPos()));
            startToken = null;
         }
         if (startTag) {
            // A new name starts on the current token (also when it directly follows another name).
            startToken = tokens.get(idx);
         }
         idx++;
      }

      if (startToken != null) {
         // The name runs to the end of the sentence; close it on the last tagged token instead of dropping it.
         found.add(new BaseAnnotation(startToken.getStartPos(), tokens.get(idx - 1).getEndPos()));
      }
   }

   /**
    * Collects the tokens that start before the end of the given sentence.
    * <p>
    * Tokens are consumed from <tt>tokIt</tt>. The first token that starts at or after the end of the sentence is
    * pushed back, so that it is the first token seen for the next sentence. If the iterator is already exhausted the
    * lists are simply left empty.
    *
    * @param sentence the sentence to collect tokens for.
    * @param tokIt iterator over the tokens of the field, positioned at the first token not yet consumed.
    * @param words receives the values of the collected tokens; cleared first.
    * @param tokens receives the collected tokens; cleared first.
    * @return <tt>true</tt> if at least one token was collected.
    */
   private static boolean buildWordsTokens(ResolvedAnnotation sentence, ListIterator<ResolvedAnnotation> tokIt,
         List<String> words, List<ResolvedAnnotation> tokens) {
      words.clear();
      tokens.clear();
      while (tokIt.hasNext()) {
         final ResolvedAnnotation tok = tokIt.next();
         if (tok.getStartPos() >= sentence.getEndPos()) {
            // Belongs to a later sentence: un-read it, even when it is the very last token.
            tokIt.previous();
            break;
         }
         words.add(tok.getValue());
         tokens.add(tok);
      }
      return !words.isEmpty();
   }

   /**
    * Creates a new holder for every configured name finder.
    *
    * @return a map with the same keys as {@link #getNameFinders()}, each mapped to a fresh holder.
    */
   private Map<String, NameFinderHolder> buildHolders() {
      final Map<String, NameFinderHolder> holderMap = new HashMap<String, NameFinderHolder>();
      for (Map.Entry<String, NameFinder> e : nameFinders.entrySet()) {
         holderMap.put(e.getKey(), new NameFinderHolder(e.getValue()));
      }
      return holderMap;
   }

   @Override
   public String getRevision() {
      return "$Revision$";
   }

   /**
    * Gets the name finders used by this step.
    *
    * @return the name finders, keyed by the suffix appended to {@link #TYPE_NE} to form the annotation type.
    */
   public Map<String, NameFinder> getNameFinders() {
      return nameFinders;
   }

   /**
    * Sets the name finders used by this step.
    *
    * @param nameFinders the name finders, keyed by the suffix appended to {@link #TYPE_NE} to form the annotation
    * type.
    */
   public void setNameFinders(Map<String, NameFinder> nameFinders) {
      this.nameFinders = nameFinders;
   }

   /**
    * Working state for one name finder while a single field value is processed: the finder itself, the annotations
    * found so far and the word-to-tag map passed to {@link NameFinder#find}.
    */
   private static final class NameFinderHolder {
      private final NameFinder nameFinder;
      private final List<Annotation> annotations = new ArrayList<Annotation>();
      private final Map<String, String> previousTags = new HashMap<String, String>();

      public NameFinderHolder(NameFinder nameFinder) {
         this.nameFinder = nameFinder;
      }

      public NameFinder getNameFinder() {
         return nameFinder;
      }

      /** @return the mutable list that found annotations are appended to. */
      public List<Annotation> getAnnotations() {
         return annotations;
      }

      /** @return the mutable map from word to the tag last assigned to it. */
      public Map<String, String> getPreviousTags() {
         return previousTags;
      }
   }
}