//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.regex; import java.time.ZoneId; import java.time.ZoneOffset; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeParseException; import java.util.Arrays; import java.util.Collections; import java.util.TimeZone; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.commons.lang.StringUtils; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.annotators.helpers.DateTimeUtils; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.semantic.Temporal; import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator; import uk.gov.dstl.baleen.uima.data.TextBlock; /** * Annotate date time strings as Temporal entities. The following examples show the types of date times that are detected. * <ul> * <li>ISO8601 Format</li> * <li>0725hrs on 9 Sept 15</li> * <li>22 Apr 2014 1529 UTC</li> * </ul> * * @baleen.javadoc */ public class DateTime extends BaleenTextAwareAnnotator { private static final String DAYS = "(?:(?:Mon|Monday|Tue|Tues|Tuesday|Wed|Wednesday|Thu|Thurs|Thursday|Fri|Friday|Sat|Saturday|Sun|Sunday)\\s+)?"; //Non-capturing as we don't use this information private static final String MONTHS = "(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(t)?(ember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)"; private static final String DATE_SUFFIXES = "(st|nd|rd|th)"; private static final String TIME_ZONES = StringUtils.join( Arrays.asList(TimeZone.getAvailableIDs()) .stream().filter(s -> StringUtils.isAllUpperCase(s) && s.length() <= 3) .collect(Collectors.toList()), "|"); @Override protected void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException { processIso(block); processTimeOnDate(block); processDayMonthTime(block); processMonthDayTime(block); } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Temporal.class)); } private void processIso(TextBlock block){ Pattern iso8601 = Pattern.compile("\\b(\\d{4})-?(\\d{2})-?(\\d{2})[T ](\\d{2}):?(\\d{2}):?(\\d{2})(\\.\\d{3})?\\s?(Z|[-+]\\d{2}:\\d{2})?\\b"); Matcher m = iso8601.matcher(block.getCoveredText()); while(m.find()){ try{ ZonedDateTime zdt; if(m.group(8) == null){ zdt = ZonedDateTime.parse(m.group().replaceAll(" ", "T")+"Z", DateTimeFormatter.ISO_DATE_TIME); }else{ zdt = ZonedDateTime.parse(m.group().replaceAll(" ", "T"), DateTimeFormatter.ISO_DATE_TIME); } createDateTime(block, m.start(), m.end(), zdt); }catch(DateTimeParseException dtpe){ getMonitor().debug("Unable to parse date time {}", m.group(), dtpe); } } } private void processTimeOnDate(TextBlock block){ Pattern timeOnDate = Pattern.compile("\\b([01][0-9]|2[0-3]):?([0-5][0-9]):?([0-5][0-9])?(hrs)? on ([0-2]?[0-9]|3[01]) "+MONTHS+" (\\d{4}|'?\\d{2})\\b", Pattern.CASE_INSENSITIVE); Matcher m = timeOnDate.matcher(block.getCoveredText()); while(m.find()){ if(m.group(3) != null){ ZonedDateTime zdt = ZonedDateTime.of( DateTimeUtils.asYear(m.group(19)).getValue(), DateTimeUtils.asMonth(m.group(6)).getValue(), Integer.parseInt(m.group(5)), Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2)), Integer.parseInt(m.group(3)), 0, ZoneOffset.UTC); createDateTime(block, m.start(), m.end(), zdt); }else{ ZonedDateTime zdtStart = ZonedDateTime.of( DateTimeUtils.asYear(m.group(19)).getValue(), DateTimeUtils.asMonth(m.group(6)).getValue(), Integer.parseInt(m.group(5)), Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2)), 0, 0, ZoneOffset.UTC); ZonedDateTime zdtEnd = zdtStart.plusMinutes(1); createDateTime(block, m.start(), m.end(), zdtStart, zdtEnd); } } } private void processDayMonthTime(TextBlock block){ Pattern dayMonthTime = Pattern.compile("\\b"+DAYS+"([0-2]?[0-9]|3[01])\\s*"+DATE_SUFFIXES+"?\\s+"+MONTHS+",?\\s+(\\d{4}|'?\\d{2})\\s+([01][0-9]|2[0-3]):?([0-5][0-9]):?([0-5][0-9])?\\s*(Z|"+TIME_ZONES+")?\\b", Pattern.CASE_INSENSITIVE); Matcher m = dayMonthTime.matcher(block.getCoveredText()); while(m.find()){ ZoneId zone; if(m.group(20) == null){ zone = ZoneId.of("Z"); }else{ zone = TimeZone.getTimeZone(m.group(20)).toZoneId(); } if(m.group(19) != null){ ZonedDateTime zdt = ZonedDateTime.of( DateTimeUtils.asYear(m.group(16)).getValue(), DateTimeUtils.asMonth(m.group(3)).getValue(), Integer.parseInt(m.group(1)), Integer.parseInt(m.group(17)), Integer.parseInt(m.group(18)), Integer.parseInt(m.group(19)), 0, zone); createDateTime(block, m.start(), m.end(), zdt); }else{ ZonedDateTime zdtStart = ZonedDateTime.of( DateTimeUtils.asYear(m.group(16)).getValue(), DateTimeUtils.asMonth(m.group(3)).getValue(), Integer.parseInt(m.group(1)), Integer.parseInt(m.group(17)), Integer.parseInt(m.group(18)), 0, 0, zone); ZonedDateTime zdtEnd = zdtStart.plusMinutes(1); createDateTime(block, m.start(), m.end(), zdtStart, zdtEnd); } } } private void processMonthDayTime(TextBlock block){ Pattern monthDayTime = Pattern.compile("\\b"+MONTHS+"\\s+([0-2]?[0-9]|3[01])\\s*"+DATE_SUFFIXES+"?,?\\s+(\\d{4}|'?\\d{2})\\s+([01][0-9]|2[0-3]):?([0-5][0-9]):?([0-5][0-9])?\\s*(Z|"+TIME_ZONES+")?\\b", Pattern.CASE_INSENSITIVE); Matcher m = monthDayTime.matcher(block.getCoveredText()); while(m.find()){ ZoneId zone; if(m.group(20) == null){ zone = ZoneId.of("Z"); }else{ zone = TimeZone.getTimeZone(m.group(20)).toZoneId(); } if(m.group(19) != null){ ZonedDateTime zdt = ZonedDateTime.of( DateTimeUtils.asYear(m.group(16)).getValue(), DateTimeUtils.asMonth(m.group(1)).getValue(), Integer.parseInt(m.group(14)), Integer.parseInt(m.group(17)), Integer.parseInt(m.group(18)), Integer.parseInt(m.group(19)), 0, zone); createDateTime(block, m.start(), m.end(), zdt); }else{ ZonedDateTime zdtStart = ZonedDateTime.of( DateTimeUtils.asYear(m.group(16)).getValue(), DateTimeUtils.asMonth(m.group(1)).getValue(), Integer.parseInt(m.group(14)), Integer.parseInt(m.group(17)), Integer.parseInt(m.group(18)), 0, 0, zone); ZonedDateTime zdtEnd = zdtStart.plusMinutes(1); createDateTime(block, m.start(), m.end(), zdtStart, zdtEnd); } } } private void createDateTime(TextBlock block, Integer charBegin, Integer charEnd, ZonedDateTime zdt){ Temporal dt = block.newAnnotation(Temporal.class, charBegin, charEnd); dt.setConfidence(1.0); dt.setPrecision("EXACT"); dt.setScope("SINGLE"); dt.setTemporalType("DATETIME"); dt.setTimestampStart(zdt.toEpochSecond()); dt.setTimestampStop(zdt.plusSeconds(1).toEpochSecond()); addToJCasIndex(dt); } private void createDateTime(TextBlock block, Integer charBegin, Integer charEnd, ZonedDateTime zdtStart, ZonedDateTime zdtEnd){ Temporal dt = block.newAnnotation(Temporal.class, charBegin, charEnd); dt.setConfidence(1.0); dt.setPrecision("EXACT"); dt.setScope("SINGLE"); dt.setTemporalType("DATETIME"); dt.setTimestampStart(zdtStart.toEpochSecond()); dt.setTimestampStop(zdtEnd.toEpochSecond()); addToJCasIndex(dt); } }