//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;

import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;

import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.Quantity;
import uk.gov.dstl.baleen.types.semantic.Location;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;

/**
 * Where a Location is preceded by text such as "20km east of", or "100m north west of",
 * expand the Location to cover the description.
 *
 * The distance is optional, but if it is included it must be a Quantity annotation
 * with subtype 'distance'. Optionally, the Quantity annotation can be removed.
 *
 * Text such as 'the area of' or 'near to' will also be detected,
 * and distances aren't required for these.
 *
 * Any GeoJSON associated with the location is removed, as it is unlikely to be correct.
 *
 * @baleen.javadoc
 */
public class ExpandLocationToDescription extends BaleenTextAwareAnnotator {

    /**
     * Should we remove the distance quantity from the JCas?
     *
     * @baleen.config false
     */
    public static final String PARAM_REMOVE_QUANTITY = "removeQuantity";
    @ConfigurationParameter(name = PARAM_REMOVE_QUANTITY, defaultValue = "false")
    private boolean removeQuantity = false;

    /*
     * Both patterns are applied with Matcher.matches() to ALL of the text that precedes a
     * Location inside its block, and capture group 1 is the description to absorb.
     *  - DOTALL lets the lazy ".*?" prefix run across line breaks, so Locations that are not
     *    on the first line of the block can still be matched.
     *  - "\\b" stops a keyword matching the tail of a longer word (e.g. "town of" being read
     *    as the compass point "N of", or "counterparts of" as "parts of").
     */
    private static final Pattern DIRECTION_DESCRIPTION = Pattern.compile(
            ".*?\\b(north([- ]?(east|west))?|south([- ]?(east|west))?|east|west|N|E|S|W|NE|SE|SW|NW|NNE|ENE|ESE|SSE|SSW|WSW|WNW|NNW) of$",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    private static final Pattern AREA_DESCRIPTION = Pattern.compile(
            ".*?\\b(the (area|region|territory|vicinity|outskirts) (of|surrounding)|(close|near) to|parts of|(northern|eastern|southern|western)( part of)?)$",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Expands every Location in the block whose preceding text ends with an area description
     * ("the vicinity of", "near to", ...) or a direction description ("north west of", ...).
     *
     * For direction descriptions, an immediately preceding 'distance' Quantity is also absorbed
     * into the Location; those quantities are removed from the index afterwards if
     * {@link #PARAM_REMOVE_QUANTITY} is set. The GeoJSON of every expanded Location is cleared.
     *
     * @param block the text block to process
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    protected void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
        Collection<Quantity> quantities = block.select(Quantity.class);
        Set<Quantity> quantitiesToRemove = new HashSet<>();
        String blockText = block.getCoveredText();

        for (Location l : block.select(Location.class)) {
            int locationBlockOffset = block.toBlockOffset(l.getBegin());

            // Only trailing whitespace is removed: stripping leading whitespace as well would
            // shift every match index and put the new begin in the wrong place.
            String precedingText = stripTrailingWhitespace(blockText.substring(0, locationBlockOffset));

            Matcher ma = AREA_DESCRIPTION.matcher(precedingText);
            if (ma.matches()) {
                moveBeginBack(l, locationBlockOffset, ma.start(1));
                l.setGeoJson(null);
                continue;
            }

            Matcher md = DIRECTION_DESCRIPTION.matcher(precedingText);
            if (md.matches()) {
                moveBeginBack(l, locationBlockOffset, md.start(1));
                l.setGeoJson(null);

                Quantity q = findQuantity(quantities, l);
                if (q != null) {
                    quantitiesToRemove.add(q);
                }
            }
        }

        if (removeQuantity) {
            removeFromJCasIndex(quantitiesToRemove);
        }
    }

    /**
     * Moves the begin of a Location back to where a matched description starts.
     *
     * The match index is relative to the block text, whereas annotation offsets are relative
     * to the document, so the begin is moved back by the distance (in characters) between the
     * match and the Location rather than being set to the match index directly.
     *
     * @param l the Location to expand
     * @param locationBlockOffset the current begin of the Location, as a block offset
     * @param matchBlockOffset the start of the matched description, as a block offset
     */
    private static void moveBeginBack(Location l, int locationBlockOffset, int matchBlockOffset) {
        l.setBegin(l.getBegin() - (locationBlockOffset - matchBlockOffset));
    }

    /**
     * Removes trailing whitespace, using the same definition of whitespace as
     * {@link String#trim()} (any character up to and including the space character),
     * but leaves the start of the string untouched so that indices into it stay valid.
     *
     * @param text the text to strip
     * @return the text without trailing whitespace
     */
    private static String stripTrailingWhitespace(String text) {
        int end = text.length();
        while (end > 0 && text.charAt(end - 1) <= ' ') {
            end--;
        }
        return text.substring(0, end);
    }

    /**
     * Finds a 'distance' quantity immediately preceding a location (ending at the location's
     * begin, or one character before it), and expands the location to include the quantity
     * if found.
     *
     * @param quantities the candidate quantities
     * @param l the location, which has its begin moved to the start of the quantity on a match
     * @return the matching quantity, or null if there is none
     */
    private Quantity findQuantity(Collection<Quantity> quantities, Location l) {
        for (Quantity q : quantities) {
            if (("distance".equalsIgnoreCase(q.getSubType()))
                    && (q.getEnd() == l.getBegin() || q.getEnd() == l.getBegin() - 1)) {
                l.setBegin(q.getBegin());
                return q;
            }
        }

        return null;
    }

    /**
     * Describes the annotation types used by this annotator: the first set passed to
     * {@link AnalysisEngineAction} is Location and Quantity, and the second set is empty.
     */
    @Override
    public AnalysisEngineAction getAction() {
        return new AnalysisEngineAction(ImmutableSet.of(Location.class, Quantity.class), Collections.emptySet());
    }
}