//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.cleaners; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.semantic.Entity; import uk.gov.dstl.baleen.types.semantic.Relation; import uk.gov.dstl.baleen.uima.BaleenAnnotator; /** * Removes multiple copies of the same relation within a document. * * This is a naive and simple approach which can hide many issues - it is effectively performing * relationship coreference and deduplication based solely at a relationship level. The algorithm * works by looking is the relationship types are the same, and if the entities are the same (here * as well is difficult, this is based on entities having the same type and value which may be * incorrect for multiple John Smiths). * * This only really useful if you want to ensure that from a single document you get only a single * relationship of the same type, subtype between the same two entities because you want to * (naively) push data into database and not have to consider this in future algorithms (focusing on * counting the same relations appearing in different documents). * */ public class NaiveMergeRelations extends BaleenAnnotator { /** * Symmetric relations (x ~ y and y ~ x are considered the same) if true * * @baleen.config true */ public static final String KEY_SYMMETRIC = "symmetric"; @ConfigurationParameter(name = KEY_SYMMETRIC, defaultValue = "true") private Boolean symmetric; @Override protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException { final List<Relation> relations = new ArrayList<>(JCasUtil.select(jCas, Relation.class)); final Set<Relation> toRemove = new HashSet<>(); for (int i = 0; i < relations.size(); i++) { final Relation a = relations.get(i); if (!toRemove.contains(a)) { toRemove.addAll(findSameRelations(a, relations.subList(i + 1, relations.size()))); } } removeFromJCasIndex(toRemove); } /** * Finds any relations from the list <em>relations</em> that is the same as <em>a</em> */ private List<Relation> findSameRelations(Relation a, List<Relation> relations){ return relations.stream().filter(b -> isSame(a, b)).collect(Collectors.toList()); } /** * Checks if relations are the same. * * @param a * the first relation * @param b * the second relation * @return true, if is same */ private boolean isSame(final Relation a, final Relation b) { boolean sameSourceTarget = false; if(isSame(a.getSource(), b.getSource()) && isSame(a.getTarget(), b.getTarget())){ sameSourceTarget = true; }else if(symmetric && isSame(a.getSource(), b.getTarget()) && isSame(a.getTarget(), b.getSource())){ //Symmetric, so source and target could be switched sameSourceTarget = true; } return sameSourceTarget && isSame(a.getRelationshipType(), b.getRelationshipType()) && isSame(a.getRelationSubType(), b.getRelationSubType()); } /** * Checks if entity is the same * * @param a * the first entity * @param b * the second entity * @return true, if is same */ private boolean isSame(final Entity a, final Entity b) { if (a == null && b == null) { return true; } if (a == null || b == null) { // implies b != null (as a != b) return false; } return a.getType().equals(b.getType()) && isSame(a.getValue(), b.getValue()); } /** * Checks if two strings are the same. * * @param a * first string * @param b * second string * @return true, if is same */ private boolean isSame(final String a, final String b) { if (a == null && b == null) { return true; } else if (a == null || b == null) { return false; } else { return a.equalsIgnoreCase(b); } } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(ImmutableSet.of(Relation.class), ImmutableSet.of(Relation.class)); } }