package io.lumify.opennlpDictionary;
import com.google.inject.Inject;
import io.lumify.core.ingest.graphProperty.GraphPropertyWorkData;
import io.lumify.core.ingest.graphProperty.GraphPropertyWorker;
import io.lumify.core.ingest.graphProperty.GraphPropertyWorkerPrepareData;
import io.lumify.core.model.properties.LumifyProperties;
import io.lumify.core.model.termMention.TermMentionBuilder;
import io.lumify.core.util.LumifyLogger;
import io.lumify.core.util.LumifyLoggerFactory;
import io.lumify.opennlpDictionary.model.DictionaryEntry;
import io.lumify.opennlpDictionary.model.DictionaryEntryRepository;
import io.lumify.web.clientapi.model.VisibilityJson;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.namefind.DictionaryNameFinder;
import opennlp.tools.namefind.TokenNameFinder;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringList;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.securegraph.Element;
import org.securegraph.Property;
import org.securegraph.Vertex;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Graph property worker that extracts term mentions from text properties using
 * OpenNLP {@link DictionaryNameFinder}s. One finder is built per concept found
 * in the {@link DictionaryEntryRepository}; matches are saved as term mention
 * vertices whose concept IRI is resolved through the "location",
 * "organization", and "person" ontology intents (anything else falls back to
 * the generic "thing" concept).
 */
public class OpenNLPDictionaryExtractorGraphPropertyWorker extends GraphPropertyWorker {
    private static final LumifyLogger LOGGER = LumifyLoggerFactory.getLogger(OpenNLPDictionaryExtractorGraphPropertyWorker.class);
    public static final String PATH_PREFIX_CONFIG = "termextraction.opennlp.pathPrefix";
    private static final String DEFAULT_PATH_PREFIX = "/lumify/config/opennlp/";
    // Offsets are advanced by line length plus one separator character per line.
    // NOTE(review): assumes "\n" line endings; "\r\n" input would skew offsets by
    // one per line — confirm upstream text normalization.
    private static final int NEW_LINE_CHARACTER_LENGTH = 1;

    private List<TokenNameFinder> finders;
    private DictionaryEntryRepository dictionaryEntryRepository; // injected via setter
    private Tokenizer tokenizer;
    private String locationIri;
    private String organizationIri;
    private String personIri;

    /**
     * Resolves the intent-mapped concept IRIs, initializes the dictionary
     * table, and loads the tokenizer model and dictionary name finders.
     *
     * @param workerPrepareData supplies the user, configuration, and HDFS file system
     * @throws Exception if a required intent IRI is missing or a model cannot be read
     */
    @Override
    public void prepare(GraphPropertyWorkerPrepareData workerPrepareData) throws Exception {
        super.prepare(workerPrepareData);

        this.locationIri = getOntologyRepository().getRequiredConceptIRIByIntent("location");
        this.organizationIri = getOntologyRepository().getRequiredConceptIRIByIntent("organization");
        this.personIri = getOntologyRepository().getRequiredConceptIRIByIntent("person");

        dictionaryEntryRepository.initializeTable(workerPrepareData.getUser());

        String pathPrefix = (String) workerPrepareData.getConfiguration().get(PATH_PREFIX_CONFIG);
        if (pathPrefix == null) {
            pathPrefix = DEFAULT_PATH_PREFIX;
        }
        this.tokenizer = loadTokenizer(pathPrefix, workerPrepareData.getHdfsFileSystem());
        this.finders = loadFinders();
    }

    /**
     * Streams the property text line by line, collecting a term mention vertex
     * for every dictionary match, then applies term mention filters and
     * notifies listeners that the text was updated.
     *
     * @param in   raw text content of the property being processed
     * @param data work data carrying the source vertex and the property key
     * @throws Exception if reading the stream or saving a term mention fails
     */
    @Override
    public void execute(InputStream in, GraphPropertyWorkData data) throws Exception {
        LOGGER.debug("Processing artifact content stream");
        Vertex sourceVertex = (Vertex) data.getElement();
        String propertyKey = data.getProperty().getKey();
        // Loop-invariant: fetch the visibility JSON once instead of per line.
        VisibilityJson visibilityJson = LumifyProperties.VISIBILITY_JSON.getPropertyValue(sourceVertex);
        List<Vertex> termMentions = new ArrayList<>();

        // UTF-8 is specified explicitly so decoding does not depend on the
        // platform default charset.
        ObjectStream<String> untokenizedLineStream =
                new PlainTextByLineStream(new InputStreamReader(in, StandardCharsets.UTF_8));
        try {
            String line;
            int charOffset = 0;
            while ((line = untokenizedLineStream.read()) != null) {
                termMentions.addAll(processLine(sourceVertex, propertyKey, line, charOffset, visibilityJson));
                getGraph().flush();
                charOffset += line.length() + NEW_LINE_CHARACTER_LENGTH;
            }
        } finally {
            // Previously the stream was only closed on the happy path and
            // leaked if processLine or read threw.
            untokenizedLineStream.close();
        }

        applyTermMentionFilters(sourceVertex, termMentions);
        pushTextUpdated(data);
        LOGGER.debug("Stream processing completed");
    }

    /**
     * Runs every dictionary finder over one tokenized line and creates a term
     * mention for each span found.
     *
     * @param sourceVertex   vertex the mentions are attached to
     * @param propertyKey    key of the text property being processed
     * @param line           one line of text
     * @param charOffset     offset of the line's first character within the document
     * @param visibilityJson visibility applied to the created mentions
     * @return the term mention vertices created for this line
     */
    private List<Vertex> processLine(Vertex sourceVertex, String propertyKey, String line, int charOffset, VisibilityJson visibilityJson) {
        List<Vertex> termMentions = new ArrayList<>();
        String[] tokenList = tokenizer.tokenize(line);
        Span[] tokenListPositions = tokenizer.tokenizePos(line);
        for (TokenNameFinder finder : finders) {
            Span[] foundSpans = finder.find(tokenList);
            for (Span span : foundSpans) {
                termMentions.add(createTermMention(sourceVertex, propertyKey, charOffset, span, tokenList, tokenListPositions, visibilityJson));
            }
            // Reset any document-scoped adaptive state between lines.
            finder.clearAdaptiveData();
        }
        return termMentions;
    }

    /**
     * Converts a token-index span back to character offsets and saves a term
     * mention vertex for it.
     *
     * @param foundName          span in token indices, with a type used to pick the concept IRI
     * @param tokens             tokens of the current line
     * @param tokenListPositions character spans of each token within the line
     * @return the saved term mention vertex
     */
    private Vertex createTermMention(Vertex sourceVertex, String propertyKey, int charOffset, Span foundName, String[] tokens, Span[] tokenListPositions, VisibilityJson visibilityJson) {
        String name = Span.spansToStrings(new Span[]{foundName}, tokens)[0];
        // Span end is exclusive in token indices, hence getEnd() - 1 for the last token.
        int start = charOffset + tokenListPositions[foundName.getStart()].getStart();
        int end = charOffset + tokenListPositions[foundName.getEnd() - 1].getEnd();
        String ontologyClassUri = mapToOntologyIri(foundName.getType());
        return new TermMentionBuilder()
                .sourceVertex(sourceVertex)
                .propertyKey(propertyKey)
                .start(start)
                .end(end)
                .title(name)
                .conceptIri(ontologyClassUri)
                .visibilityJson(visibilityJson)
                .process(getClass().getName())
                .save(getGraph(), getVisibilityTranslator(), getAuthorizations());
    }

    /**
     * Maps a finder type name to the corresponding ontology concept IRI,
     * falling back to the generic "thing" concept for unknown types.
     *
     * @param type finder type name (e.g. "location"), may be null
     * @return the concept IRI to use for the mention
     */
    protected String mapToOntologyIri(String type) {
        if ("location".equals(type)) {
            return this.locationIri;
        }
        if ("organization".equals(type)) {
            return this.organizationIri;
        }
        if ("person".equals(type)) {
            return this.personIri;
        }
        return LumifyProperties.CONCEPT_TYPE_THING;
    }

    /**
     * Handles only non-raw properties whose MIME type metadata starts with
     * "text".
     */
    @Override
    public boolean isHandled(Element element, Property property) {
        if (property == null) {
            return false;
        }
        // The raw binary property is skipped; this worker operates on extracted text.
        if (property.getName().equals(LumifyProperties.RAW.getPropertyName())) {
            return false;
        }
        String mimeType = LumifyProperties.MIME_TYPE.getMetadataValue(property.getMetadata(), null);
        return mimeType != null && mimeType.startsWith("text");
    }

    /**
     * Builds one {@link DictionaryNameFinder} per concept, each backed by the
     * dictionary of tokens stored for that concept.
     *
     * @return the finders to run over each line
     * @throws IOException if the dictionaries cannot be loaded
     */
    protected List<TokenNameFinder> loadFinders() throws IOException {
        List<TokenNameFinder> finders = new ArrayList<>();
        for (Map.Entry<String, Dictionary> dictionaryEntry : getDictionaries().entrySet()) {
            finders.add(new DictionaryNameFinder(dictionaryEntry.getValue(), dictionaryEntry.getKey()));
        }
        return finders;
    }

    /**
     * Loads the OpenNLP tokenizer model from HDFS.
     *
     * @param pathPrefix directory containing {@code en-token.bin}
     * @param fs         HDFS file system to read from
     * @return a tokenizer backed by the loaded model
     * @throws IOException if the model file cannot be read
     */
    protected Tokenizer loadTokenizer(String pathPrefix, FileSystem fs) throws IOException {
        Path tokenizerHdfsPath = new Path(pathPrefix + "/en-token.bin");
        TokenizerModel tokenizerModel;
        try (InputStream tokenizerModelInputStream = fs.open(tokenizerHdfsPath)) {
            tokenizerModel = new TokenizerModel(tokenizerModelInputStream);
        }
        return new TokenizerME(tokenizerModel);
    }

    /**
     * Groups all dictionary entries by concept into OpenNLP dictionaries.
     *
     * @return a map from concept name to the dictionary of its token lists
     */
    private Map<String, Dictionary> getDictionaries() {
        Map<String, Dictionary> dictionaries = new HashMap<>();
        Iterable<DictionaryEntry> entries = dictionaryEntryRepository.findAll(getUser().getModelUserContext());
        for (DictionaryEntry entry : entries) {
            String concept = entry.getMetadata().getConcept();
            // Single lookup instead of containsKey + get on every entry.
            Dictionary dictionary = dictionaries.get(concept);
            if (dictionary == null) {
                dictionary = new Dictionary();
                dictionaries.put(concept, dictionary);
            }
            dictionary.put(tokensToStringList(entry.getMetadata().getTokens()));
        }
        return dictionaries;
    }

    /**
     * Splits a space-separated token string into an OpenNLP {@link StringList}.
     */
    private StringList tokensToStringList(String tokens) {
        return new StringList(tokens.split(" "));
    }

    @Inject
    public void setDictionaryEntryRepository(DictionaryEntryRepository dictionaryEntryRepository) {
        this.dictionaryEntryRepository = dictionaryEntryRepository;
    }
}