//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.grammatical; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.geo.Coordinate; import uk.gov.dstl.baleen.types.language.PhraseChunk; import uk.gov.dstl.baleen.types.semantic.Location; import uk.gov.dstl.baleen.types.semantic.ReferenceTarget; import uk.gov.dstl.baleen.uima.BaleenAnnotator; /** * Identify new locations and coreferences where text has the pattern [NP/Location] at [Coordinate]. * * For example, in the statement "the former school house at GR 1234 5678", we would identify * "the former school house" as a location and a coreference between it and "GR 1234 5678". * * If an existing Location exists, that is used and NP are ignored. * Otherwise a new Location is created to match the NP. * * @baleen.javadoc */ public class NPAtCoordinate extends BaleenAnnotator { private static final Pattern AT = Pattern.compile("\\sat\\s", Pattern.CASE_INSENSITIVE); private static final Pattern IS_AT = Pattern.compile("\\sis\\sat\\s", Pattern.CASE_INSENSITIVE); @Override protected void doProcess(JCas jCas) throws AnalysisEngineProcessException { for(Coordinate coord : JCasUtil.select(jCas, Coordinate.class)){ Integer substringStartAt = Math.max(0, coord.getBegin() - 4); Integer substringStartIsAt = Math.max(0, coord.getBegin() - 7); String precedingTextAt = jCas.getDocumentText().substring(substringStartAt, coord.getBegin()); String precedingTextIsAt = jCas.getDocumentText().substring(substringStartIsAt, coord.getBegin()); final int substringStart; if(IS_AT.matcher(precedingTextIsAt).matches()){ substringStart = substringStartIsAt; }else if(AT.matcher(precedingTextAt).matches()){ substringStart = substringStartAt; }else{ substringStart = -1; } if(substringStart >= 0){ //Get NP or Location at this location boolean locFound = false; for(Location l : JCasUtil.select(jCas, Location.class).stream().filter(l -> substringStart == l.getEnd()).collect(Collectors.toList())){ locFound = true; setReferent(jCas, l, coord); } if(locFound) continue; //Get NP and create a Location for(PhraseChunk pc : JCasUtil.select(jCas, PhraseChunk.class).stream().filter(pc -> "NP".equalsIgnoreCase(pc.getChunkType())).filter(pc -> substringStart == pc.getEnd()).collect(Collectors.toList())){ createNewLocation(jCas, pc, coord); } } } } private void setReferent(JCas jCas, Location l, Coordinate c){ if(l.getReferent() == null && c.getReferent() == null){ ReferenceTarget rt = new ReferenceTarget(jCas); rt.addToIndexes(); l.setReferent(rt); c.setReferent(rt); }else if(l.getReferent() != null && c.getReferent() == null){ c.setReferent(l.getReferent()); }else if(l.getReferent() == null && c.getReferent() != null){ l.setReferent(c.getReferent()); }else{ //Merge all references for(Location lRt : JCasUtil.select(jCas, Location.class).stream().filter(l2 -> l2.getReferent().equals(l.getReferent())).collect(Collectors.toList())){ lRt.setReferent(c.getReferent()); } } } private void createNewLocation(JCas jCas, PhraseChunk pc, Coordinate c){ Location l = new Location(jCas, pc.getBegin(), pc.getEnd()); if(c.getReferent() != null){ l.setReferent(c.getReferent()); }else{ ReferenceTarget rt = new ReferenceTarget(jCas); rt.addToIndexes(); c.setReferent(rt); l.setReferent(rt); } l.addToIndexes(); } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(ImmutableSet.of(Coordinate.class, Location.class, PhraseChunk.class), ImmutableSet.of(Location.class)); } }