package io.lumify.core.ingest.graphProperty; import com.google.common.base.Charsets; import com.google.common.io.CharStreams; import io.lumify.core.model.audit.AuditAction; import io.lumify.core.model.ontology.Concept; import io.lumify.core.model.properties.LumifyProperties; import io.lumify.core.model.termMention.TermMentionBuilder; import io.lumify.core.util.LumifyLogger; import io.lumify.core.util.LumifyLoggerFactory; import org.securegraph.Element; import org.securegraph.Property; import org.securegraph.Vertex; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public abstract class RegexGraphPropertyWorker extends GraphPropertyWorker { private static final LumifyLogger LOGGER = LumifyLoggerFactory.getLogger(RegexGraphPropertyWorker.class); private final Pattern pattern; public RegexGraphPropertyWorker(String regEx) { this.pattern = Pattern.compile(regEx, Pattern.MULTILINE); } protected abstract Concept getConcept(); @Override public void prepare(GraphPropertyWorkerPrepareData workerPrepareData) throws Exception { super.prepare(workerPrepareData); LOGGER.debug("Extractor prepared for entity type [%s] with regular expression: %s", getConcept().getIRI(), this.pattern.toString()); } @Override public void execute(InputStream in, GraphPropertyWorkData data) throws Exception { LOGGER.debug("Extracting pattern [%s] from provided text", pattern); final String text = CharStreams.toString(new InputStreamReader(in, Charsets.UTF_8)); final Matcher matcher = pattern.matcher(text); Vertex sourceVertex = (Vertex) data.getElement(); List<Vertex> termMentions = new ArrayList<>(); while (matcher.find()) { final String patternGroup = matcher.group(); int start = matcher.start(); int end = matcher.end(); Vertex termMention = new TermMentionBuilder() .sourceVertex(sourceVertex) .propertyKey(data.getProperty().getKey()) .start(start) .end(end) .title(patternGroup) .conceptIri(getConcept().getIRI()) .visibilityJson(data.getVisibilityJson()) .process(getClass().getName()) .save(getGraph(), getVisibilityTranslator(), getAuthorizations()); termMentions.add(termMention); } applyTermMentionFilters(sourceVertex, termMentions); getAuditRepository().auditAnalyzedBy(AuditAction.ANALYZED_BY, sourceVertex, getClass().getSimpleName(), getUser(), sourceVertex.getVisibility()); pushTextUpdated(data); } @Override public boolean isHandled(Element element, Property property) { if (property == null) { return false; } if (property.getName().equals(LumifyProperties.RAW.getPropertyName())) { return false; } String mimeType = (String) property.getMetadata().getValue(LumifyProperties.MIME_TYPE.getPropertyName()); return !(mimeType == null || !mimeType.startsWith("text")); } }