//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.grammatical; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.common.Quantity; import uk.gov.dstl.baleen.types.language.PhraseChunk; import uk.gov.dstl.baleen.types.semantic.Entity; import uk.gov.dstl.baleen.types.semantic.Relation; import uk.gov.dstl.baleen.uima.BaleenAnnotator; /** * Annotate generic entities by looking for unmarked up NPs after '[[Quantity]] of', * and create relationships between entities. * * <p>We find quantities that are followed by the word of, and then check to see if the following POS is a NP. * If it is, and it is unannotated, we can annotate it as an entity (although we can't determine the type of entity).</p> * <p>For example, in the phrase '7kg of blue powder was found hidden in the building', we should be able to identify blue powder as an entity.</p> * <p>A relationship is created between the quantity annotation and the entity annotation (or an existing entity annotation if one exists).</p> * <p>This annotator should be run towards the end of the pipeline, once language features and quantities have been annotated.</p> * * */ public class QuantityNPEntity extends BaleenAnnotator { private Pattern ofPattern = Pattern.compile("([\\h]*of[\\h]*).*", Pattern.CASE_INSENSITIVE); @Override public void doProcess(JCas jCas) throws AnalysisEngineProcessException { Collection<PhraseChunk> phraseChunks = JCasUtil.select(jCas, PhraseChunk.class); Map<Integer, PhraseChunk> nounPhrases = new HashMap<>(); for(PhraseChunk pc : phraseChunks){ if("NP".equals(pc.getChunkType())){ nounPhrases.put(pc.getBegin(), pc); } } Collection<Quantity> quantities = JCasUtil.select(jCas, Quantity.class); String text = jCas.getDocumentText(); for(Quantity q : quantities){ String followingText = text.substring(q.getEnd()); Matcher m = ofPattern.matcher(followingText); if(m.matches()){ int start = q.getEnd() + m.end(1); if(nounPhrases.containsKey(start)){ PhraseChunk pc = nounPhrases.get(start); processNounPhrase(jCas, q, pc, text); } } } } private void processNounPhrase(JCas jCas, Quantity q, PhraseChunk pc, String text){ List<Entity> coveredEntities = new ArrayList<>(JCasUtil.selectCovered(jCas, Entity.class, pc)); if(coveredEntities.isEmpty()){ Entity e = new Entity(jCas); e.setConfidence(pc.getConfidence()); e.setBegin(pc.getBegin()); e.setEnd(pc.getEnd()); e.setValue(text.substring(pc.getBegin(), pc.getEnd())); addToJCasIndex(e); coveredEntities.add(e); } for(Entity e : coveredEntities){ Relation r = new Relation(jCas); r.setBegin(q.getBegin()); r.setEnd(e.getEnd()); r.setConfidence(1.0); r.setSource(q); r.setTarget(e); r.setRelationshipType("QUANTITY"); addToJCasIndex(r); } } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(ImmutableSet.of(Quantity.class, PhraseChunk.class), ImmutableSet.of(Entity.class, Relation.class)); } }