//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.gazetteer.helpers;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.ahocorasick.trie.Trie.TrieBuilder;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CommonArrayFS;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.atteo.evo.inflector.English;
import org.bson.Document;
import com.google.common.base.Strings;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.resources.gazetteer.IGazetteer;
import uk.gov.dstl.baleen.types.BaleenAnnotation;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.ReferenceTarget;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton;
import uk.gov.dstl.baleen.uima.utils.TypeUtils;
/**
* Abstract class implementing the a gazetteer using the Aho-Corasick algorithm.
*
* Reflection is used to try and identify entity properties and set them based on additional data
* fields in the gazetteer. This means that this annotator can be used for any entity type, though
* there is a risk that a malformed gazetteer could corrupt the entities.
*
* @baleen.javadoc
*/
public abstract class AbstractAhoCorasickAnnotator extends BaleenTextAwareAnnotator {
/**
* Should comparisons be done case sensitively?
*
* @baleen.config false
*/
public static final String PARAM_CASE_SENSITIVE = "caseSensitive";
@ConfigurationParameter(name = PARAM_CASE_SENSITIVE, defaultValue = "false")
protected boolean caseSensitive;
/**
* Should whitespace in document be preserved?
*
* If set to false, the document text is normalized prior to comparison, so that any sequence of
* whitespace characters is translated to a single space character before matching against the
* gazetteer. The document text in the CAS is not modified, and any annotations created will
* cover the correct span (including any ignored whitespace) of surface text.
*
* @baleen.config true
*/
public static final String PARAM_EXACT_WHITESPACE = "exactWhitespace";
@ConfigurationParameter(name = PARAM_EXACT_WHITESPACE, defaultValue = "true")
protected boolean exactWhitespace;
/**
* The type to use for extracted entities
*
* @baleen.config Entity
*/
public static final String PARAM_TYPE = "type";
@ConfigurationParameter(name = PARAM_TYPE, defaultValue = "Entity")
protected String type;
/**
* The subtype to use for extracted entities
*
* @baleen.config
*/
public static final String PARAM_SUBTYPE = "subtype";
@ConfigurationParameter(name = PARAM_SUBTYPE, defaultValue = "")
protected String subtype;
/**
* Should additional data (e.g. GeoJSON) be added to entities?
*
* If false, then only the value of the entity will be set.
*
* @baleen.config true
*/
public static final String PARAM_ADDITIONAL_DATA = "useAdditionalData";
@ConfigurationParameter(name = PARAM_ADDITIONAL_DATA, defaultValue = "true")
protected boolean useAdditionalData;
/**
* Should plurals be accepted (i.e. if 'fox' is in the gazetteer,
* should we also accept foxes). Only regular plurals are considered,
* irregular plurals should be added separately to the gazetteer.
*
* @baleen.config false
*/
public static final String PARAM_PLURALS = "plural";
@ConfigurationParameter(name = PARAM_PLURALS, defaultValue = "false")
protected boolean plurals;
protected IGazetteer gazetteer;
protected Class<? extends Annotation> entityType;
protected Trie trie;
private static final String ERROR_CANT_ASSIGN_ENTITY_PROPERTY = "Unable to assign property on entity - property will be skipped";
/**
* Constructor
*
* @param logger
* The Logger to use for errors, etc.
*/
public AbstractAhoCorasickAnnotator() {
}
/**
* Configure a gazetteer object and initialise it. Remember that the caseSensitive and type
* properties may also need to be passed to the gazetteer, dependent on the gazetteer.
*
* @return A initialised gazetteer implementing IGazetteer
*/
public abstract IGazetteer configureGazetteer() throws BaleenException;
@SuppressWarnings("unchecked")
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
try {
gazetteer = configureGazetteer();
} catch (BaleenException be) {
throw new ResourceInitializationException(be);
}
buildTrie();
try {
entityType = (Class<? extends Annotation>) TypeUtils.getType(type, JCasFactory.createJCas(TypeSystemSingleton.getTypeSystemDescriptionInstance()));
if (entityType == null) {
getMonitor().warn("Type {} not found, Entity will be used instead", type);
entityType = Entity.class;
}
} catch (UIMAException e) {
throw new ResourceInitializationException(e);
}
}
/**
* Build the Trie and set the <em>trie</em> variable. This method can be overridden if you want
* to modify the gazetteer before parsing it.
*/
protected void buildTrie() {
TrieBuilder builder = Trie.builder().onlyWholeWords();
if (!caseSensitive) {
builder = builder.ignoreCase();
}
for (String s : gazetteer.getValues()) {
builder = builder.addKeyword(s);
if(plurals){
builder = builder.addKeyword(pluraliseWord(s));
}
}
trie = builder.build();
}
@Override
protected final void doProcess(JCas jCas) throws AnalysisEngineProcessException {
// Final so as to prevent other implementations being non text aware
super.doProcess(jCas);
}
@Override
public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
Map<String, List<BaleenAnnotation>> entities = exactWhitespace ? processExactWhitespace(block)
: processNormalisedWhitespace(block);
createReferenceTargets(block, entities.values());
}
private Map<String, List<BaleenAnnotation>> processExactWhitespace(TextBlock block) {
Map<String, List<BaleenAnnotation>> entities = new HashMap<>();
String text = block.getCoveredText();
Collection<Emit> emits = trie.parseText(text);
for (Emit emit : emits) {
try {
String match = text.substring(emit.getStart(), emit.getEnd() + 1);
createEntityAndAliases(block, emit.getStart(), emit.getEnd() + 1, match, match, entities);
} catch (BaleenException be) {
getMonitor().error("Unable to create entity of type {} for value '{}'", entityType.getName(),
emit.getKeyword(), be);
continue;
}
}
return entities;
}
private Map<String, List<BaleenAnnotation>> processNormalisedWhitespace(TextBlock block) {
Map<String, List<BaleenAnnotation>> entities = new HashMap<>();
TransformedString norm = normaliseString(block.getCoveredText());
Collection<Emit> emits = trie.parseText(norm.getTransformedString());
for (Emit emit : emits) {
try {
Integer start = norm.getMapping().get(emit.getStart());
Integer end = norm.getMapping().get(emit.getEnd() + 1);
String match = norm.getOriginalString().substring(start, end);
createEntityAndAliases(block, start, end, match, match, entities);
} catch (BaleenException be) {
getMonitor().error("Unable to create entity of type {} for value '{}'", entityType.getName(),
emit.getKeyword(), be);
continue;
}
}
return entities;
}
protected void createEntityAndAliases(TextBlock block, Integer start, Integer end, String value, String aliasKey,
Map<String, List<BaleenAnnotation>> entities) throws BaleenException {
BaleenAnnotation ent = createEntity(block, start, end, value, aliasKey);
List<String> aliases = new ArrayList<>(Arrays.asList(gazetteer.getAliases(aliasKey)));
aliases.add(aliasKey);
String key = generateKey(aliases);
List<BaleenAnnotation> groupEntities = entities.containsKey(key) ? entities.get(key) : new ArrayList<>();
groupEntities.add(ent);
entities.put(key, groupEntities);
}
/**
* Generate a key for an alias set by ordering and joining them
*
* @param aliases
* @return
*/
protected String generateKey(List<String> aliases) {
List<String> correctCaseAliases;
if (!caseSensitive) {
correctCaseAliases = aliases.stream().map(String::toLowerCase).collect(Collectors.toList());
} else {
correctCaseAliases = aliases;
}
Collections.sort(correctCaseAliases);
return StringUtils.join(correctCaseAliases, "|");
}
/**
* Create a new entity of the configured type
*
* @param block
* JCas object in which to create the entity
* @param begin
* The beginning of the entity in the text
* @param end
* The end of the entity in the text
* @param value
* The value of the entity
* @param gazetteerKey
* The key as it appears in the gazetteer
* @throws Exception
*/
protected BaleenAnnotation createEntity(TextBlock block, int begin, int end, String value, String gazetteerKey)
throws BaleenException {
BaleenAnnotation ent;
try {
ent = (BaleenAnnotation) block.newAnnotation(entityType, begin, end);
} catch (Exception e) {
throw new BaleenException("Could not create new entity", e);
}
if (ent instanceof Entity) {
((Entity) ent).setValue(value);
((Entity) ent).setConfidence(1.0);
if(!Strings.isNullOrEmpty(subtype))
((Entity) ent).setSubType(subtype);
}
if(useAdditionalData){
Map<String, Object> additionalData = gazetteer.getAdditionalData(gazetteerKey);
if (additionalData != null && !additionalData.isEmpty()) {
for (Method m : entityType.getMethods()) {
setProperty(ent, m, additionalData);
}
}
}
addToJCasIndex(ent);
return ent;
}
/**
* Create reference targets for entities with the same keys
*
* @param jCas
* UIMA JCas Object
* @param entities
* A collection of lists of entities to coreference
*/
protected void createReferenceTargets(TextBlock block, Collection<List<BaleenAnnotation>> entities) {
int begin = block.toDocumentOffset(0);
int end = block.toDocumentOffset(block.getCoveredText().length());
for (List<BaleenAnnotation> group : entities) {
if (group.size() <= 1) {
continue;
}
ReferenceTarget rt = new ReferenceTarget(block.getJCas());
rt.setBegin(begin);
rt.setEnd(end);
addToJCasIndex(rt);
for (BaleenAnnotation e : group) {
if (e instanceof Entity) {
((Entity) e).setReferent(rt);
}
}
}
}
@SuppressWarnings("unchecked")
private void setProperty(BaleenAnnotation entity, Method method, Map<String, Object> additionalData) {
if (method.getName().startsWith("set") && method.getName().substring(3, 4).matches("[A-Z]")
&& method.getParameterCount() == 1) {
String property = method.getName().substring(3);
property = property.substring(0, 1).toLowerCase() + property.substring(1);
Object obj = additionalData.get(property);
if (obj == null) {
return;
}
if (method.getParameterTypes()[0].isAssignableFrom(obj.getClass())) {
setPropertyObject(entity, method, obj);
} else if (method.getParameterTypes()[0].isAssignableFrom(String.class)) {
getMonitor().debug("Converting gazetteer object of type {} to String", obj.getClass().getName());
if(obj instanceof Document){
//Special case for Mongo Document objects, where the toString function
//doesn't convert to JSON as expected (e.g. for GeoJSON)
setPropertyString(entity, method, ((Document)obj).toJson());
}else{
setPropertyString(entity, method, obj.toString());
}
} else if (List.class.isAssignableFrom(obj.getClass())
&& CommonArrayFS.class.isAssignableFrom(method.getParameterTypes()[0])) {
setPropertyArray(entity, method, (List<Object>) obj);
}
}
}
private void setPropertyObject(BaleenAnnotation entity, Method method, Object obj) {
try {
method.invoke(entity, obj);
} catch (Exception e) {
getMonitor().error(ERROR_CANT_ASSIGN_ENTITY_PROPERTY, e);
}
}
private void setPropertyString(BaleenAnnotation entity, Method method, String string) {
try {
method.invoke(entity, string);
} catch (Exception e) {
getMonitor().error(ERROR_CANT_ASSIGN_ENTITY_PROPERTY, e);
}
}
private void setPropertyArray(BaleenAnnotation entity, Method method, List<Object> obj) {
if (StringArray.class.isAssignableFrom(method.getParameterTypes()[0])) {
try {
StringArray sa = listToStringArray(entity.getCAS().getJCas(), obj);
method.invoke(entity, sa);
} catch (Exception e) {
getMonitor().error(ERROR_CANT_ASSIGN_ENTITY_PROPERTY, e);
}
} else {
getMonitor().error("Unsupported array type {} - property will be skipped",
method.getParameterTypes()[0].getName());
}
}
/**
* Replace repeated horizontal whitespace characters with a single space character, and return a
* TransformedString that maps between the original and normalised string
*
* @param s
* The string to normalise
* @return A TransformedString mapping between the original and normalised text
*/
public static TransformedString normaliseString(String s) {
String remaining = s;
StringBuilder builder = new StringBuilder();
String previousChar = "";
Map<Integer, Integer> indexMap = new HashMap<>();
Integer index = 0;
while (!remaining.isEmpty()) {
indexMap.put(builder.length(), index);
index++;
String character = remaining.substring(0, 1);
remaining = remaining.substring(1);
if (!(character.matches("\\h") && previousChar.matches("\\h"))) {
if (character.matches("\\h")) {
character = " ";
}
builder.append(character);
}
previousChar = character;
}
indexMap.put(builder.length(), index);
return new TransformedString(s, builder.toString(), indexMap);
}
private StringArray listToStringArray(JCas jCas, List<Object> l) {
StringArray sa = new StringArray(jCas, l.size());
int index = 0;
for (Object o : l) {
sa.set(index, o.toString());
index++;
}
return sa;
}
@Override
public void doDestroy() {
gazetteer.destroy();
gazetteer = null;
entityType = null;
trie = null;
}
private String pluraliseWord(String s){
return English.plural(s);
}
}