//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.regex; import java.util.Arrays; import java.util.Collections; import java.util.TimeZone; import java.util.regex.Matcher; import java.util.stream.Collectors; import org.apache.commons.lang.StringUtils; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.semantic.Temporal; /** * Annotate times within a document using regular expressions * * <p>The document content is run through a regular expression matcher looking for things that match the following time regular expression, * where UTC is being used to represent all time zone acronyms defined in Java:</p> * <pre>\\b(((0?[0-9])|([0-9]{2}))[:][0-9]{2}\\h*((UTC)([ ]?[+-][ ]?((0?[0-9])|(1[0-2])))?)?\\h*(pm|am)?)\\b|\\b(((1[0-2])|([1-9]))(pm|am))\\b|\\b(midnight)\\b|\\b(midday)\\b|\\b((12\\h)?noon)\\b|\\b([0-2][0-9][0-5][0-9][ ]?(hr(s)?)?[ ]?((UTC)([ ]?[+-][ ]?((0?[0-9])|(1[0-2])))?)?)\\b</pre> * <p>This will only capture times that match the regular expression, and will miss times expressed in a different format.</p> * * By default, only times that contain alphabetical characters or colons will be accepted to minimise false positives. */ public class Time extends AbstractRegexAnnotator<Temporal> { /** * Do we require that there are alphabetical characters in the time? * This helps avoid picking out things like 2015 as a time when it should be a year, * as it forces the time to be written like 2015hrs or 8:15pm. * * For the purposes of the TimeRegex annotator, colons are treated as alphabetical characters, * such that times such as 20:15 are captured. Other punctuation isn't, as 20.15 is more like to * be an amount than a time. * * @baleen.config true */ public static final String PARAM_REQUIRE_ALPHA = "requireAlpha"; @ConfigurationParameter(name = PARAM_REQUIRE_ALPHA, defaultValue = "true") private Boolean requireAlpha; private static final String TIME_ZONES = StringUtils.join( Arrays.asList(TimeZone.getAvailableIDs()) .stream().filter(s -> StringUtils.isAllUpperCase(s) && s.length() <= 3) .collect(Collectors.toList()), "|"); private static final String TIME_REGEX = "\\b(([0-1]?[0-9]|2[0-4])[:\\.][0-5][0-9]\\h*(("+TIME_ZONES+")([ ]?[+-][ ]?((0?[0-9])|(1[0-2])))?)?\\h*(pm|am)?)\\b|\\b(((1[0-2])|([1-9]))(pm|am))\\b|\\b(midnight)\\b|\\b(midday)\\b|\\b((12\\h)?noon)\\b|\\b([0-1][0-9]|2[0-4])[0-5][0-9][ ]?(((hr(s)?)?[ ]?(("+TIME_ZONES+")([ ]?[+-][ ]?((0?[0-9])|(1[0-2])))?)?)|hours|h)\\b"; /** New instance. * */ public Time() { super(TIME_REGEX, false, 1.0); } @Override protected Temporal create(JCas jCas, Matcher matcher) { if(requireAlpha){ String time = matcher.group(); if(!time.matches(".*[a-zA-Z:].*")){ return null; } } Temporal dtg = new Temporal(jCas); dtg.setPrecision("UNQUALIFIED"); dtg.setScope("SINGLE"); dtg.setTemporalType("TIME"); return dtg; } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Temporal.class)); } }