package io.lumify.knownEntity;
import io.lumify.core.ingest.graphProperty.GraphPropertyWorkData;
import io.lumify.core.ingest.graphProperty.GraphPropertyWorker;
import io.lumify.core.ingest.graphProperty.GraphPropertyWorkerPrepareData;
import io.lumify.core.model.audit.AuditAction;
import io.lumify.core.model.properties.LumifyProperties;
import io.lumify.core.model.termMention.TermMentionBuilder;
import io.lumify.core.util.LumifyLogger;
import io.lumify.core.util.LumifyLoggerFactory;
import io.lumify.web.clientapi.model.VisibilityJson;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.arabidopsis.ahocorasick.AhoCorasick;
import org.arabidopsis.ahocorasick.SearchResult;
import org.securegraph.*;
import org.supercsv.io.CsvListReader;
import org.supercsv.prefs.CsvPreference;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import static org.securegraph.util.IterableUtils.singleOrDefault;
public class KnownEntityExtractorGraphPropertyWorker extends GraphPropertyWorker {
/** Logger used to report which dictionary files are being loaded. */
private static final LumifyLogger LOGGER = LumifyLoggerFactory.getLogger(KnownEntityExtractorGraphPropertyWorker.class);
/** Configuration key that, when present, overrides the path prefix under which the "dictionaries" directory is looked up. */
public static final String PATH_PREFIX_CONFIG = "termextraction.knownEntities.pathPrefix";
/** Path prefix used when {@link #PATH_PREFIX_CONFIG} is absent from the worker configuration. */
public static final String DEFAULT_PATH_PREFIX = "/lumify/config/knownEntities/";
/** Fully-qualified name of this class; passed as the process identifier to term mentions and relationship audits. */
private static final String PROCESS = KnownEntityExtractorGraphPropertyWorker.class.getName();
/** Search tree populated from the dictionary files during prepare and searched against each text in execute. */
private AhoCorasick tree;
/** Relationship IRI resolved from the "artifactHasEntity" intent; used as the label of edges created from a source vertex to a resolved entity. */
private String artifactHasEntityIri;
/** Concept IRI resolved from the "location" intent. */
private String locationIri;
/** Concept IRI resolved from the "organization" intent. */
private String organizationIri;
/** Concept IRI resolved from the "person" intent. */
private String personIri;
/**
 * Initializes the worker: resolves the ontology IRIs this worker maps
 * dictionary concepts to, then builds the search tree from the dictionary
 * files found under the configured (or default) path prefix.
 *
 * @param workerPrepareData supplies the configuration map and the HDFS file system
 * @throws Exception if the superclass preparation, an ontology lookup, or dictionary loading fails
 */
@Override
public void prepare(GraphPropertyWorkerPrepareData workerPrepareData) throws Exception {
    super.prepare(workerPrepareData);

    this.locationIri = getOntologyRepository().getRequiredConceptIRIByIntent("location");
    this.organizationIri = getOntologyRepository().getRequiredConceptIRIByIntent("organization");
    this.personIri = getOntologyRepository().getRequiredConceptIRIByIntent("person");
    this.artifactHasEntityIri = getOntologyRepository().getRequiredRelationshipIRIByIntent("artifactHasEntity");

    // Fall back to the built-in location when no prefix has been configured.
    String configuredPrefix = (String) workerPrepareData.getConfiguration().get(PATH_PREFIX_CONFIG);
    String pathPrefix = configuredPrefix == null ? DEFAULT_PATH_PREFIX : configuredPrefix;

    this.tree = loadDictionaries(workerPrepareData.getHdfsFileSystem(), pathPrefix);
}
/**
 * Reads the whole stream as UTF-8 text, searches it against the dictionary
 * tree, and creates a term mention (plus the resolved entity vertex and edge
 * when missing) for every dictionary match. The collected term mentions are
 * then passed to {@code applyTermMentionFilters} and {@code pushTextUpdated}
 * is called for the work item.
 *
 * @param in   text content of the property being processed
 * @param data work item describing the element, property and visibility to use
 * @throws Exception if reading the stream or any graph operation fails
 */
@Override
public void execute(InputStream in, GraphPropertyWorkData data) throws Exception {
    String text = IOUtils.toString(in, "UTF-8"); // TODO convert AhoCorasick to use InputStream
    Iterator<SearchResult<Match>> searchResults = tree.search(text.toCharArray());
    Vertex sourceVertex = (Vertex) data.getElement();

    // These values do not change between matches, so look them up once
    // instead of once per search result.
    VisibilityJson visibilityJson = data.getVisibilitySourceJson();
    String propertyKey = data.getProperty().getKey();
    Visibility visibility = data.getVisibility();

    List<Vertex> termMentions = new ArrayList<>();
    while (searchResults.hasNext()) {
        // Keep the type argument: a raw SearchResult would force an unchecked
        // conversion when handed to outputResultToTermMention.
        SearchResult<Match> searchResult = searchResults.next();
        termMentions.addAll(outputResultToTermMention(sourceVertex, searchResult, propertyKey, visibilityJson, visibility));
        // Flush after each result so elements saved for this match are
        // written out before the next match queries the graph.
        getGraph().flush();
    }

    applyTermMentionFilters(sourceVertex, termMentions);
    pushTextUpdated(data);
}
/**
 * Turns every dictionary match carried by one search result into a saved
 * term mention on the source vertex, resolving each match to an entity
 * vertex and an edge from the source vertex (both created when absent).
 *
 * @param sourceVertex   vertex whose text produced the search result
 * @param searchResult   search hit; its last index marks the end offset of each match
 * @param propertyKey    key of the property the text came from
 * @param visibilityJson visibility JSON stored on created edges and term mentions
 * @param visibility     visibility applied to created vertices and edges
 * @return the term mention vertices saved for this search result
 */
private List<Vertex> outputResultToTermMention(Vertex sourceVertex, SearchResult<Match> searchResult, String propertyKey, VisibilityJson visibilityJson, Visibility visibility) {
    List<Vertex> savedMentions = new ArrayList<>();
    for (Match hit : searchResult.getOutputs()) {
        // The search reports where the match ends; step back by the matched
        // text's length to find where it starts.
        int startOffset = searchResult.getLastIndex() - hit.getMatchText().length();
        int endOffset = searchResult.getLastIndex();
        String entityTitle = hit.getEntityTitle();
        String conceptIri = mapToOntologyIri(hit.getConceptTitle());

        Vertex entityVertex = findOrAddEntity(entityTitle, conceptIri, visibility);
        Edge entityEdge = findOrAddEdge(sourceVertex, entityVertex, visibilityJson, visibility);

        savedMentions.add(new TermMentionBuilder()
                .sourceVertex(sourceVertex)
                .propertyKey(propertyKey)
                .start(startOffset)
                .end(endOffset)
                .title(entityTitle)
                .conceptIri(conceptIri)
                .visibilityJson(visibilityJson)
                .process(PROCESS)
                .resolvedTo(entityVertex, entityEdge)
                .save(getGraph(), getVisibilityTranslator(), getAuthorizations()));
    }
    return savedMentions;
}
/**
 * Maps a dictionary concept name to the concept IRI resolved in prepare.
 *
 * @param type concept name; "location", "organization" and "person" are recognized
 * @return the matching concept IRI, or {@code LumifyProperties.CONCEPT_TYPE_THING}
 *         for any other value (including {@code null})
 */
protected String mapToOntologyIri(String type) {
    if ("location".equals(type)) {
        return this.locationIri;
    }
    if ("organization".equals(type)) {
        return this.organizationIri;
    }
    if ("person".equals(type)) {
        return this.personIri;
    }
    // Unrecognized concept names fall back to the generic concept type.
    return LumifyProperties.CONCEPT_TYPE_THING;
}
/**
 * Returns the edge connecting the source vertex and the resolved entity,
 * creating, saving and auditing a new one when none exists.
 *
 * @param sourceVertex     vertex the text belongs to
 * @param resolvedToVertex entity vertex the match resolved to
 * @param visibilityJson   visibility JSON stored on a newly created edge
 * @param visibility       visibility of a newly created edge and its property
 * @return the existing or newly created edge
 */
private Edge findOrAddEdge(Vertex sourceVertex, Vertex resolvedToVertex, VisibilityJson visibilityJson, Visibility visibility) {
    // NOTE(review): the lookup considers edges in both directions and is not
    // restricted to artifactHasEntityIri; how singleOrDefault reacts to more
    // than one edge between the two vertices should be confirmed.
    Edge existingEdge = singleOrDefault(sourceVertex.getEdges(resolvedToVertex, Direction.BOTH, getAuthorizations()), null);
    if (existingEdge != null) {
        return existingEdge;
    }

    EdgeBuilder edgeBuilder = getGraph().prepareEdge(sourceVertex, resolvedToVertex, artifactHasEntityIri, visibility);
    LumifyProperties.VISIBILITY_JSON.setProperty(edgeBuilder, visibilityJson, visibility);
    Edge createdEdge = edgeBuilder.save(getAuthorizations());
    getAuditRepository().auditRelationship(AuditAction.CREATE, sourceVertex, resolvedToVertex, createdEdge, PROCESS, "", getUser(), visibility);
    return createdEdge;
}
/**
 * Looks up the entity vertex with the given title and concept type, creating
 * and flushing a new vertex when the query finds none.
 *
 * @param title            entity title to match or to assign
 * @param ontologyClassUri concept type IRI to match or to assign
 * @param visibility       visibility of a newly created vertex and its properties
 * @return the existing or newly created entity vertex
 */
private Vertex findOrAddEntity(String title, String ontologyClassUri, Visibility visibility) {
    Vertex entity = singleOrDefault(getGraph().query(getAuthorizations())
            .has(LumifyProperties.TITLE.getPropertyName(), title)
            .has(LumifyProperties.CONCEPT_TYPE.getPropertyName(), ontologyClassUri)
            .vertices(), null);

    if (entity == null) {
        VertexBuilder entityBuilder = getGraph().prepareVertex(visibility);
        LumifyProperties.TITLE.setProperty(entityBuilder, title, visibility);
        LumifyProperties.CONCEPT_TYPE.setProperty(entityBuilder, ontologyClassUri, visibility);
        entity = entityBuilder.save(getAuthorizations());
        // Flush right away so the new vertex is written out before later lookups.
        getGraph().flush();
    }
    return entity;
}
/**
 * Decides whether this worker processes the given property.
 *
 * @param element  element owning the property (not inspected here)
 * @param property property under consideration; may be {@code null}
 * @return {@code true} only when the property is non-null, is not the raw
 *         property, and carries a mime type metadata value starting with "text"
 */
@Override
public boolean isHandled(Element element, Property property) {
    if (property == null || property.getName().equals(LumifyProperties.RAW.getPropertyName())) {
        return false;
    }

    String mimeType = LumifyProperties.MIME_TYPE.getMetadataValue(property.getMetadata(), null);
    return mimeType != null && mimeType.startsWith("text");
}
/**
 * Builds the search tree from every dictionary file in the "dictionaries"
 * directory under the given path prefix. The directory is created when it
 * does not exist. Files whose name starts with "." or does not end with
 * ".dict" are skipped; for the rest, the URL-decoded base name of the file
 * is used as the concept name of its entries.
 *
 * @param fs         file system holding the dictionaries
 * @param pathPrefix parent path of the "dictionaries" directory
 * @return the prepared search tree
 * @throws IOException if listing, opening or reading a dictionary fails
 */
private static AhoCorasick loadDictionaries(FileSystem fs, String pathPrefix) throws IOException {
    Path dictionaryDir = new Path(pathPrefix, "dictionaries");
    if (!fs.exists(dictionaryDir)) {
        fs.mkdirs(dictionaryDir);
    }

    AhoCorasick tree = new AhoCorasick();
    for (FileStatus status : fs.listStatus(dictionaryDir)) {
        Path dictionaryPath = status.getPath();
        String fileName = dictionaryPath.getName();
        boolean isDictionaryFile = !fileName.startsWith(".") && fileName.endsWith(".dict");
        if (!isDictionaryFile) {
            continue;
        }

        LOGGER.info("Loading known entity dictionary %s", dictionaryPath.toString());
        String conceptName = URLDecoder.decode(FilenameUtils.getBaseName(fileName), "UTF-8");
        try (InputStream dictionaryStream = fs.open(dictionaryPath)) {
            addDictionaryEntriesToTree(tree, conceptName, dictionaryStream);
        }
    }
    tree.prepare();
    return tree;
}
/**
 * Reads a two-column CSV dictionary (text to match, entity title) and adds
 * one entry per row to the search tree, tagged with the given concept name.
 * The stream is not closed here; the caller remains responsible for it.
 *
 * @param tree                  search tree receiving the entries
 * @param type                  concept name recorded on every entry of this dictionary
 * @param dictionaryInputStream dictionary content, decoded as UTF-8
 * @throws IOException      if the stream cannot be read
 * @throws RuntimeException if a row does not contain exactly two columns
 */
private static void addDictionaryEntriesToTree(AhoCorasick tree, String type, InputStream dictionaryInputStream) throws IOException {
    CsvPreference csvPrefs = CsvPreference.EXCEL_PREFERENCE;
    // Decode explicitly as UTF-8 rather than with the platform default charset:
    // the text searched in execute is decoded as UTF-8, so dictionary entries
    // with non-ASCII characters must be decoded the same way to match.
    CsvListReader csvReader = new CsvListReader(new InputStreamReader(dictionaryInputStream, "UTF-8"), csvPrefs);
    List<String> line;
    while ((line = csvReader.read()) != null) {
        if (line.size() != 2) {
            throw new RuntimeException("Invalid number of entries on a line. Expected 2 found " + line.size());
        }
        // Column 0 is the text to search for, column 1 the entity title it resolves to.
        tree.add(line.get(0), new Match(type, line.get(0), line.get(1)));
    }
}
/**
 * Immutable payload stored in the search tree for one dictionary entry:
 * the text that was matched, the title of the entity it refers to, and the
 * concept name of the dictionary it came from.
 */
private static class Match {
    private final String conceptTitle;
    private final String entityTitle;
    private final String matchText;

    /**
     * @param type        concept name of the dictionary the entry belongs to
     * @param matchText   text that is searched for
     * @param entityTitle title of the entity the matched text resolves to
     */
    public Match(String type, String matchText, String entityTitle) {
        this.matchText = matchText;
        this.entityTitle = entityTitle;
        this.conceptTitle = type;
    }

    /** @return the concept name given at construction */
    private String getConceptTitle() {
        return this.conceptTitle;
    }

    /** @return the title of the entity this entry resolves to */
    private String getEntityTitle() {
        return this.entityTitle;
    }

    /** @return the text that is searched for */
    private String getMatchText() {
        return this.matchText;
    }

    /** @return the matched text, identical to {@link #getMatchText()} */
    @Override
    public String toString() {
        return this.matchText;
    }
}
}