/**
* Copyright (C) 2013 Alexander Reelsen <alr@spinscale.de>
*
* This file is part of elasticsearch-plugin-opennlp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elasticsearch.service.opennlp;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.common.StopWatch;
import org.elasticsearch.common.base.Joiner;
import org.elasticsearch.common.collect.Maps;
import org.elasticsearch.common.collect.Sets;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.service.opennlp.models.PooledTokenNameFinderModel;
import org.elasticsearch.service.opennlp.models.TextAnnotation;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
/**
 * Lifecycle component that loads OpenNLP name-finder models on start-up and
 * uses them to extract named entities from text.
 *
 * Three models are looked up through the settings keys
 * {@code opennlp.models.name.file}, {@code opennlp.models.date.file} and
 * {@code opennlp.models.location.file}; each one is loaded on its own thread
 * and registered under the entity type {@code name}, {@code date} or
 * {@code location}.
 */
public class OpenNlpService extends AbstractLifecycleComponent<OpenNlpService> {

    /**
     * Loaded models keyed by entity type. The map is written by the loader
     * threads and iterated by {@link #tokenize(String)}, so it has to be a
     * concurrent map rather than a plain hash map.
     */
    private static final Map<String, TokenNameFinderModel> finders = Maps.newConcurrentMap();

    @Inject public OpenNlpService(Settings settings) {
        super(settings);
    }

    /**
     * Starts one loader thread per model and waits up to ten seconds for all
     * of them to finish. Start-up continues even when the wait times out; the
     * models that are still loading become available once their loader is done.
     */
    @Override
    protected void doStart() throws ElasticSearchException {
        CountDownLatch countDownLatch = new CountDownLatch(3);
        new Thread(new LoaderRunnable(settings, "opennlp.models.name.file", "name", countDownLatch)).start();
        new Thread(new LoaderRunnable(settings, "opennlp.models.date.file", "date", countDownLatch)).start();
        new Thread(new LoaderRunnable(settings, "opennlp.models.location.file", "location", countDownLatch)).start();
        try {
            boolean allLoadersFinished = countDownLatch.await(10, TimeUnit.SECONDS);
            if (!allLoadersFinished) {
                logger.warn("Timed out waiting for OpenNLP models to load, continuing startup");
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of swallowing it.
            Thread.currentThread().interrupt();
        }
    }

    @Override
    protected void doStop() throws ElasticSearchException {}

    @Override
    protected void doClose() throws ElasticSearchException {}

    /**
     * Loads a single model file and registers it in {@code finders}.
     * The latch is counted down exactly once per run, whatever the outcome.
     */
    class LoaderRunnable implements Runnable {
        private final Settings settings;
        private final String configParameter;
        private final String type;
        private final CountDownLatch countDownLatch;

        /**
         * @param settings        settings to read the model path from
         * @param configParameter settings key holding the model file path
         * @param type            entity type the model is registered under
         * @param countDownLatch  latch to release once this loader has finished
         */
        public LoaderRunnable(Settings settings, String configParameter, String type, CountDownLatch countDownLatch) {
            this.settings = settings;
            this.configParameter = configParameter;
            this.type = type;
            this.countDownLatch = countDownLatch;
        }

        @Override
        public void run() {
            try {
                loadModel();
            } finally {
                // Release the latch on every path (missing setting, unreadable
                // file, load failure) so start-up never waits for a loader
                // that has already given up.
                countDownLatch.countDown();
            }
        }

        /** Reads the configured model file and stores the model under {@code type}. */
        private void loadModel() {
            String filePath = settings.get(configParameter, "");
            if (filePath.length() == 0) {
                logger.error("OpenNLP property [{}] is not set.", configParameter);
                return;
            }

            File modelFile = new File(filePath);
            if (!modelFile.exists() || !modelFile.canRead()) {
                logger.error("Model file {} does not exist.", modelFile);
                return;
            }

            StopWatch sw = new StopWatch("Loading model " + filePath).start();
            FileInputStream modelStream = null;
            boolean loaded = false;
            try {
                modelStream = new FileInputStream(modelFile);
                finders.put(type, new PooledTokenNameFinderModel(modelStream));
                loaded = true;
            } catch (IOException e) {
                logger.error("Error loading model file {}: {}", e, modelFile, e.getMessage());
            } finally {
                sw.stop();
                // Assumes the model constructor consumes the stream eagerly, as
                // the OpenNLP manual's load-then-close example suggests.
                if (modelStream != null) {
                    try {
                        modelStream.close();
                    } catch (IOException e) {
                        logger.warn("Failed to close model file {}", e, modelFile);
                    }
                }
            }

            // Only report success when the model really was registered.
            if (loaded) {
                logger.info("Loaded file {} in {}", modelFile, sw.totalTime());
            }
        }
    }

    /**
     * Runs every loaded name finder over the given text and collects the
     * entities that were found.
     *
     * @param content text to analyse; {@code null} yields an empty result
     * @return entity strings grouped by entity type; types without a match
     *         have no entry
     */
    public Map<String, Set<String>> tokenize(String content) {
        Map<String, Set<String>> namedEntities = Maps.newHashMap();
        if (content == null) {
            return namedEntities;
        }

        List<TextAnnotation> allTextAnnotations = new ArrayList<TextAnnotation>();
        String[] tokens = SimpleTokenizer.INSTANCE.tokenize(content);

        for (Map.Entry<String, TokenNameFinderModel> finderEntry : finders.entrySet()) {
            String type = finderEntry.getKey();
            // A fresh finder is created per call and per model.
            NameFinderME finder = new NameFinderME(finderEntry.getValue());
            Span[] spans = finder.find(tokens);
            double[] probs = finder.probs(spans);
            for (int ni = 0; ni < spans.length; ni++) {
                allTextAnnotations.add(new TextAnnotation(type, spans[ni], probs[ni]));
            }
        }

        // removeConflicts reads the first element, so it must not see an empty list.
        if (!allTextAnnotations.isEmpty()) {
            removeConflicts(allTextAnnotations);
        }
        convertTextAnnotationsToNamedEntities(tokens, allTextAnnotations, namedEntities);
        return namedEntities;
    }

    /**
     * Turns annotations into entity strings and adds them to
     * {@code namedEntities}, grouped by annotation type.
     *
     * @param tokens          tokens the annotation spans refer to
     * @param TextAnnotations annotations to convert
     * @param namedEntities   target map, modified in place; a set is created
     *                        for each type that has none yet
     */
    public void convertTextAnnotationsToNamedEntities(String[] tokens, List<TextAnnotation> TextAnnotations, Map<String, Set<String>> namedEntities) {
        for (TextAnnotation annotation : TextAnnotations) {
            int start = annotation.getSpan().getStart();
            int end = annotation.getSpan().getEnd();
            // The entity text is the span's tokens joined by single spaces.
            String[] entityTokens = Arrays.copyOfRange(tokens, start, end);
            String content = Joiner.on(" ").join(entityTokens);

            String type = annotation.getType();
            Set<String> entitiesOfType = namedEntities.get(type);
            if (entitiesOfType == null) {
                entitiesOfType = Sets.newHashSet();
                namedEntities.put(type, entitiesOfType);
            }
            entitiesOfType.add(content);
        }
    }

    /* Copied from https://github.com/tamingtext/book/blob/master/src/test/java/com/tamingtext/opennlp/NameFinderTest.java */
    /**
     * Removes annotations whose spans clash with a more probable annotation.
     * The list is sorted and then pruned in place.
     *
     * The list must not be empty: the first element is read unconditionally.
     * The sort order comes from TextAnnotation's natural ordering, which is
     * defined outside this file.
     *
     * @param allTextAnnotations annotations from all finders; modified in place
     */
    private void removeConflicts(List<TextAnnotation> allTextAnnotations) {
        java.util.Collections.sort(allTextAnnotations);
        // Annotations kept so far that may still clash with later ones.
        List<TextAnnotation> stack = new ArrayList<TextAnnotation>();
        stack.add(allTextAnnotations.get(0));
        for (int ai = 1; ai < allTextAnnotations.size(); ai++) {
            TextAnnotation curr = allTextAnnotations.get(ai);
            boolean deleteCurr = false;
            // Compare against the kept annotations, most recent first.
            for (int ki = stack.size() - 1; ki >= 0; ki--) {
                TextAnnotation prev = stack.get(ki);
                if (prev.getSpan().equals(curr.getSpan())) {
                    // Same span: the strictly more probable one wins, ties favour curr.
                    if (prev.getProb() > curr.getProb()) {
                        deleteCurr = true;
                        break;
                    } else {
                        // Drop prev from both lists; ai shifts because the list shrank.
                        allTextAnnotations.remove(stack.remove(ki));
                        ai--;
                    }
                } else if (prev.getSpan().intersects(curr.getSpan())) {
                    // Overlapping spans are resolved the same way as equal spans.
                    if (prev.getProb() > curr.getProb()) {
                        deleteCurr = true;
                        break;
                    } else {
                        allTextAnnotations.remove(stack.remove(ki));
                        ai--;
                    }
                } else if (prev.getSpan().contains(curr.getSpan())) {
                    // NOTE(review): OpenNLP's Span.intersects appears to cover
                    // containment already, so this branch may be unreachable - confirm.
                    break;
                } else {
                    // No overlap: prev is only dropped from the stack, it stays in the list.
                    stack.remove(ki);
                }
            }
            if (deleteCurr) {
                // Remove curr by index and revisit the element that moved into its slot.
                allTextAnnotations.remove(ai);
                ai--;
                deleteCurr = false;
            } else {
                stack.add(curr);
            }
        }
    }
}