//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;

import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.core.utils.ConfigUtils;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.Temporal;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

/**
 * This cleaner performs the following actions on all Temporal entities:
 * <ul>
 * <li>For any Temporal entity with timestamps set, checks to see if either timestamp refers to a
 * date more than x years in the past/future and if so removes it.</li>
 * <li>Removes any Temporal entity that begins with a currency sign (this seems to be a common
 * error in the OpenNLP NER model for dates).</li>
 * <li>Optionally removes any Temporal entity with a timestamp value of 0L (generally this means
 * the timestamp has not been set).</li>
 * </ul>
 *
 * @baleen.javadoc
 */
public class CleanTemporal extends BaleenAnnotator {
	/** Earliest date considered valid; computed in {@link #doInitialize(UimaContext)}. */
	LocalDateTime start = null;

	/** Latest date considered valid; computed in {@link #doInitialize(UimaContext)}. */
	LocalDateTime end = null;

	/**
	 * The number of years before or after the current date that we consider to be valid
	 *
	 * @baleen.config 50
	 */
	public static final String PARAM_YEARS = "years";
	@ConfigurationParameter(name = PARAM_YEARS, defaultValue = "50")
	private String yearsString;

	//Parse the years config parameter into this variable to avoid issues with parameter types
	private int years;

	/**
	 * Remove entities with a timestamp set to 0L.
	 *
	 * Generally, a timestamp will be 0L if it hasn't been set,
	 * but it will also be 0L if the timestamp represents 1970-01-01 00:00:00
	 * and therefore valid data may be removed.
	 *
	 * @baleen.config false
	 */
	public static final String PARAM_REMOVE_ZERO_TIMESTAMP = "removeZeroTimestamp";
	@ConfigurationParameter(name = PARAM_REMOVE_ZERO_TIMESTAMP, defaultValue = "false")
	private Boolean removeZeroTimestamp;

	/**
	 * Parses the configured number of years and derives the valid date range from the current
	 * time. The range is fixed at initialisation and is not refreshed while the annotator runs.
	 *
	 * @param aContext the UIMA context (not read here)
	 * @throws ResourceInitializationException declared by the overridden method
	 */
	@Override
	public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
		// 50 is passed as the second argument to match the parameter's declared default
		// NOTE(review): presumably used as the fallback when the string cannot be parsed - confirm
		years = ConfigUtils.stringToInteger(yearsString, 50);

		// Read the clock once so both bounds are derived from the same instant
		LocalDateTime now = LocalDateTime.now();
		start = now.minusYears(years);
		end = now.plusYears(years);
	}

	/**
	 * Collects every Temporal annotation that fails one of the checks (zero timestamp, currency
	 * prefix, outside the valid range) and removes them from the JCas index.
	 *
	 * @param jCas the document being processed
	 * @throws AnalysisEngineProcessException declared by the overridden method
	 */
	@Override
	public void doProcess(JCas jCas) throws AnalysisEngineProcessException {
		Collection<Temporal> temporals = JCasUtil.select(jCas, Temporal.class);

		// Gather first and remove afterwards, so the index is not modified while iterating over it
		List<Entity> toRemove = new ArrayList<>();
		for (Temporal t : temporals) {
			if (zeroTimestamp(t) || looksLikeMoney(t) || isOutsideRange(t)) {
				toRemove.add(t);
			}
		}

		removeFromJCasIndex(toRemove);
	}

	/**
	 * Returns true if zero-timestamp removal is enabled and either timestamp of the entity is 0L.
	 */
	private boolean zeroTimestamp(Temporal t) {
		return removeZeroTimestamp && (t.getTimestampStart() == 0L || t.getTimestampStop() == 0L);
	}

	/**
	 * Returns true if either the entity's value or its covered text starts with a currency sign.
	 */
	private boolean looksLikeMoney(Temporal t) {
		return isMoney(t.getValue()) || isMoney(t.getCoveredText());
	}

	/**
	 * Returns true if any timestamp that has been set (i.e. is not 0L) falls before the start or
	 * after the end of the valid range. Timestamps are compared against the range bounds
	 * expressed as epoch seconds in UTC.
	 */
	private boolean isOutsideRange(Temporal t) {
		long earliest = start.toEpochSecond(ZoneOffset.UTC);
		long latest = end.toEpochSecond(ZoneOffset.UTC);

		// Check both timestamps against both bounds, so that an entity with only one timestamp
		// set (e.g. a far-future start and no stop) is still caught
		return isSetAndOutside(t.getTimestampStart(), earliest, latest)
				|| isSetAndOutside(t.getTimestampStop(), earliest, latest);
	}

	/**
	 * Returns true if the timestamp is set (not 0L) and lies outside [earliest, latest].
	 */
	private static boolean isSetAndOutside(long timestamp, long earliest, long latest) {
		return timestamp != 0L && (timestamp < earliest || timestamp > latest);
	}

	/**
	 * Returns true if the text starts with a pound, dollar or euro sign; null text is never money.
	 */
	private boolean isMoney(String text) {
		if (text == null) {
			return false;
		}

		return text.startsWith("£") || text.startsWith("$") || text.startsWith("€");
	}

	@Override
	public AnalysisEngineAction getAction() {
		return new AnalysisEngineAction(ImmutableSet.of(Temporal.class), Collections.emptySet());
	}
}