//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;
import java.time.DateTimeException;
import java.time.LocalDate;
import java.time.Year;
import java.time.YearMonth;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.annotators.helpers.DateTimeUtils;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.semantic.Temporal;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
/**
* Annotate dates and date ranges as Temporal entities. The following examples show the types of dates and ranges that are detected.
*
* <ul>
* <li>1 December 2016</li>
* <li>December 1 2016</li>
* <li>2016-12-01</li>
* <li>1/12/2016</li>
* <li>2011-14</li>
* <li>2011-2016</li>
* <li>March 2015</li>
* <li>late August 2016</li>
* <li>June-September 2015</li>
* <li>June 2015 - September 2016</li>
* <li>10-15 Jan 2015</li>
* <li>10/11 Jan 2015</li>
* <li>27th September - 4th October 2016</li>
* <li>23 December 2016 - 2nd January 2017</li>
* </ul>
*
* The word 'to' is supported in place of a hyphen, as is the word 'and' if the expression is preceded by 'between'.
*
* Years on their own will only be extracted for the range 1970-2099 to reduce false positives. Two-digit years on their own will not be extracted.
*
* @baleen.javadoc
*/
public class Date extends BaleenTextAwareAnnotator{
/**
* Should we use American dates where applicable (i.e. mm-dd-yy)
*
* @baleen.config false
*/
public static final String PARAM_AMERICAN_FORMAT = "americanDates";
// Value of the americanDates configuration parameter. It is consulted when both numbers of a
// numeric date (e.g. 1/2/2016) could be either a day or a month.
@ConfigurationParameter(name = PARAM_AMERICAN_FORMAT, defaultValue="false")
private boolean americanDates;
// Regex fragment: optional day-of-week name followed by whitespace.
private static final String DAYS = "(?:(?:Mon|Monday|Tue|Tues|Tuesday|Wed|Wednesday|Thu|Thurs|Thursday|Fri|Friday|Sat|Saturday|Sun|Sunday)\\s+)?"; //Non-capturing as we don't use this information
// Regex fragment: month name, in full or abbreviated (with or without a trailing full stop).
// It contributes 13 capturing groups (the whole month plus one per optional ending), which is
// why group indices after each use of this fragment jump by 13.
private static final String MONTHS = "(Jan(\\.|uary)?|Feb(\\.|ruary)?|Mar(\\.|ch)?|Apr(\\.|il)?|May|Jun(\\.|e)?|Jul(\\.|y)?|Aug(\\.|ust)?|Sep(\\.|t(\\.|ember)?)?|Oct(\\.|ober)?|Nov(\\.|ember)?|Dec(\\.|ember)?)";
// Regex fragment: day of month 1-31 without a leading zero (one capturing group), then optional whitespace.
private static final String DATES = "([1-9]|[12][0-9]|3[01])\\s*";
// Regex fragment: ordinal suffix (one capturing group); callers append '?' to make it optional.
private static final String DATE_SUFFIXES = "(st|nd|rd|th)";
// Values written to the precision, scope and temporalType features of the Temporal annotations created here.
private static final String EXACT = "EXACT";
private static final String RANGE = "RANGE";
private static final String SINGLE = "SINGLE";
private static final String DATE_TYPE = "DATE";
// Message logged when a matched expression is not a real calendar date (e.g. 31 February).
private static final String INVALID_DATE_FOUND = "Invalid date found";
// Temporals created so far for the text block being processed. It is reset at the start of
// doProcessTextBlock and read by alreadyExtracted to avoid annotating text a wider annotation already covers.
private List<Temporal> extracted;
/**
 * Annotate every supported date expression found in a single text block.
 *
 * The identification passes run from the widest expressions (ranges) down to the narrowest
 * (single years). The later passes that create single dates, months and years check
 * {@link #extracted} so that text already covered by an earlier annotation is not annotated again.
 *
 * @param block the text block to search
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
protected void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
  //Forget anything recorded for a previously processed block
  this.extracted = new ArrayList<>();

  //Ranges first...
  identifyYearRanges(block);
  identifyMonthYearRanges(block);
  identifyDayMonthYearRanges(block);

  //...then progressively less specific single expressions
  identifyDates(block);
  identifyMonths(block);
  identifyYears(block);
}
/**
* Describe this annotator for the pipeline orderer.
*
* The action is built from an empty set and a set containing only {@link Temporal}.
* NOTE(review): presumably these are the annotation types required as input and the types
* produced as output, respectively - confirm against AnalysisEngineAction.
*
* @return a new AnalysisEngineAction for this annotator
*/
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Temporal.class));
}
/** Four-digit year, a hyphen, then a two-digit year (e.g. 2011-14). Groups: 1 = century digits, 2 = first year digits, 3 = second year digits. */
private static final Pattern YEAR_RANGE_LONG_SHORT = Pattern.compile("\\b(\\d{2})(\\d{2})-(\\d{2})\\b", Pattern.CASE_INSENSITIVE);
/** Two four-digit years joined by '-', 'to' or 'and' (e.g. 2011-2016). Groups: 1 = first year, 2 = separator, 3 = second year. */
private static final Pattern YEAR_RANGE_LONG_LONG = Pattern.compile("\\b(\\d{4})\\s*(-|to|and)\\s*(\\d{4})\\b", Pattern.CASE_INSENSITIVE);

/**
 * Annotate ranges of whole years, such as 2011-14, 2011-2016, 2011 to 2016 or
 * between 2011 and 2016.
 *
 * The patterns are compiled once as class constants rather than on every call, as this
 * method runs for every text block.
 *
 * @param block the text block to search
 */
private void identifyYearRanges(TextBlock block){
  String text = block.getCoveredText();

  Matcher m = YEAR_RANGE_LONG_SHORT.matcher(text);
  while(m.find()){
    //A separator straight after the match means this is the start of something longer (e.g. 2016-12-01), not a year range
    if(dateSeparatorSuffix(text, m.end())){
      continue;
    }

    //The abbreviated second year takes its century digits from the first year
    Year y1 = Year.parse(m.group(1)+m.group(2));
    Year y2 = Year.parse(m.group(1)+m.group(3));

    createYearTimeRange(block, m.start(), m.end(), y1, y2);
  }

  m = YEAR_RANGE_LONG_LONG.matcher(text);
  while(m.find()){
    //'and' only denotes a range when the expression is introduced by 'between'
    if("and".equalsIgnoreCase(m.group(2)) && !betweenPrefix(text, m.start())){
      continue;
    }

    Year y1 = Year.parse(m.group(1));
    Year y2 = Year.parse(m.group(3));

    createYearTimeRange(block, m.start(), m.end(), y1, y2);
  }
}
/**
 * Create, index and record a range annotation spanning whole years.
 *
 * The start timestamp is midnight UTC on 1 January of y1, and the stop timestamp is midnight
 * UTC on 1 January of the year after y2. Both are expressed in epoch seconds.
 *
 * @param block the text block containing the expression
 * @param charBegin offset of the expression's first character within the block
 * @param charEnd offset just past the expression's last character within the block
 * @param y1 first year of the range
 * @param y2 last year of the range
 */
private void createYearTimeRange(TextBlock block, Integer charBegin, Integer charEnd, Year y1, Year y2){
  Temporal range = createExactRangeDate(block, charBegin, charEnd);

  LocalDate firstDay = y1.atDay(1);
  LocalDate dayAfterLast = y2.plusYears(1).atDay(1);

  long startSeconds = firstDay.atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  range.setTimestampStart(startSeconds);
  long stopSeconds = dayAfterLast.atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  range.setTimestampStop(stopSeconds);

  addToJCasIndex(range);
  extracted.add(range);
}
/** Two months sharing one year (e.g. June-September 2015). Groups: 1 = first month, 14 = separator, 15 = second month, 28 = year. */
private static final Pattern MONTH_RANGE_SAME_YEAR = Pattern.compile("\\b"+MONTHS+"\\s*(-|to|and)\\s*"+MONTHS+"\\s+(\\d{4}|'?\\d{2})\\b", Pattern.CASE_INSENSITIVE);
/** Two month/year pairs (e.g. June 2015 - September 2016). Groups: 1 = first month, 14 = first year, 15 = separator, 16 = second month, 29 = second year. */
private static final Pattern MONTH_RANGE_DIFF_YEAR = Pattern.compile("\\b"+MONTHS+"\\s+(\\d{4}|'?\\d{2})\\s*(-|to|and)\\s*"+MONTHS+"\\s+(\\d{4}|'?\\d{2})\\b", Pattern.CASE_INSENSITIVE);

/**
 * Annotate ranges of whole months, either within one year (June-September 2015) or
 * across years (June 2015 - September 2016).
 *
 * The patterns are compiled once as class constants rather than on every call, as this
 * method runs for every text block.
 *
 * @param block the text block to search
 */
private void identifyMonthYearRanges(TextBlock block){
  String text = block.getCoveredText();

  Matcher m = MONTH_RANGE_SAME_YEAR.matcher(text);
  while(m.find()){
    //'and' only denotes a range when the expression is introduced by 'between'
    if("and".equalsIgnoreCase(m.group(14)) && !betweenPrefix(text, m.start())){
      continue;
    }

    Year y = DateTimeUtils.asYear(m.group(28));

    //Abbreviated months may carry a trailing full stop, which is dropped before conversion
    String m1 = m.group(1);
    if(m1.endsWith(".")){
      m1 = m1.substring(0, m1.length() - 1);
    }

    String m2 = m.group(15);
    if(m2.endsWith(".")){
      m2 = m2.substring(0, m2.length() - 1);
    }

    YearMonth ym1 = y.atMonth(DateTimeUtils.asMonth(m1));
    YearMonth ym2 = y.atMonth(DateTimeUtils.asMonth(m2));

    createMonthYearTimeRange(block, m.start(), m.end(), ym1, ym2);
  }

  m = MONTH_RANGE_DIFF_YEAR.matcher(text);
  while(m.find()){
    if("and".equalsIgnoreCase(m.group(15)) && !betweenPrefix(text, m.start())){
      continue;
    }

    String m1 = m.group(1);
    if(m1.endsWith(".")){
      m1 = m1.substring(0, m1.length() - 1);
    }

    String m2 = m.group(16);
    if(m2.endsWith(".")){
      m2 = m2.substring(0, m2.length() - 1);
    }

    Year y1 = DateTimeUtils.asYear(m.group(14));
    YearMonth ym1 = y1.atMonth(DateTimeUtils.asMonth(m1));

    Year y2 = DateTimeUtils.asYear(m.group(29));
    YearMonth ym2 = y2.atMonth(DateTimeUtils.asMonth(m2));

    createMonthYearTimeRange(block, m.start(), m.end(), ym1, ym2);
  }
}
/**
 * Create, index and record a range annotation spanning whole months.
 *
 * The start timestamp is midnight UTC on the first day of ym1, and the stop timestamp is
 * midnight UTC on the first day of the month after ym2. Both are expressed in epoch seconds.
 *
 * @param block the text block containing the expression
 * @param charBegin offset of the expression's first character within the block
 * @param charEnd offset just past the expression's last character within the block
 * @param ym1 first month of the range
 * @param ym2 last month of the range
 */
private void createMonthYearTimeRange(TextBlock block, Integer charBegin, Integer charEnd, YearMonth ym1, YearMonth ym2){
  Temporal range = createExactRangeDate(block, charBegin, charEnd);

  LocalDate firstDay = ym1.atDay(1);
  LocalDate dayAfterLast = ym2.plusMonths(1).atDay(1);

  long startSeconds = firstDay.atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  range.setTimestampStart(startSeconds);
  long stopSeconds = dayAfterLast.atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  range.setTimestampStop(stopSeconds);

  addToJCasIndex(range);
  extracted.add(range);
}
/** Two days sharing a month and year (e.g. 10-15 Jan 2015, 10/11 Jan 2015). Groups: 1 = first day, 2 = its suffix, 3 = separator, 4 = second day, 5 = its suffix, 6 = month, 19 = year. */
private static final Pattern DAY_RANGE_SAME_MONTH = Pattern.compile("\\b"+DAYS+"([0-2]?[0-9]|3[01])\\s*"+DATE_SUFFIXES+"?\\s*(-|to|and|\\\\|/)\\s*"+DAYS+"([0-2]?[0-9]|3[01])\\s*"+DATE_SUFFIXES+"?\\s+"+MONTHS+"\\s+(\\d{4}|'?\\d{2})\\b", Pattern.CASE_INSENSITIVE);
/** Two day/month pairs sharing a year (e.g. 27th September - 4th October 2016). Groups: 1 = first day, 2 = its suffix, 3 = first month, 16 = separator, 17 = second day, 18 = its suffix, 19 = second month, 32 = year. */
private static final Pattern DAY_RANGE_SAME_YEAR = Pattern.compile("\\b"+DAYS+DATES+DATE_SUFFIXES+"?\\s+"+MONTHS+"\\s*(-|to|and)\\s*"+DAYS+DATES+DATE_SUFFIXES+"?\\s+"+MONTHS+"\\s+(\\d{4}|'?\\d{2})\\b", Pattern.CASE_INSENSITIVE);
/** Two complete dates (e.g. 23 December 2016 - 2nd January 2017). Groups: 1 = first day, 2 = its suffix, 3 = first month, 16 = first year, 17 = separator, 18 = second day, 19 = its suffix, 20 = second month, 33 = second year. */
private static final Pattern DAY_RANGE_FULL_DATES = Pattern.compile("\\b"+DAYS+"([0-2]?[0-9]|3[01])\\s*"+DATE_SUFFIXES+"?\\s+"+MONTHS+"\\s+(\\d{4}|'?\\d{2})\\s*(-|to|and)\\s*"+DAYS+"([0-2]?[0-9]|3[01])\\s*"+DATE_SUFFIXES+"?\\s+"+MONTHS+"\\s+(\\d{4}|'?\\d{2})\\b", Pattern.CASE_INSENSITIVE);

/**
 * Annotate ranges expressed with day precision: two days in the same month, two day/month
 * pairs in the same year, or two complete dates.
 *
 * Matches whose ordinal suffix does not agree with the day number, or which do not form a
 * real calendar date, are skipped; the latter are logged as warnings.
 *
 * The patterns are compiled once as class constants rather than on every call, as this
 * method runs for every text block.
 *
 * @param block the text block to search
 */
private void identifyDayMonthYearRanges(TextBlock block){
  String text = block.getCoveredText();

  Matcher m = DAY_RANGE_SAME_MONTH.matcher(text);
  while(m.find()){
    if(!DateTimeUtils.suffixCorrect(Integer.parseInt(m.group(1)), m.group(2)) || !DateTimeUtils.suffixCorrect(Integer.parseInt(m.group(4)), m.group(5))){
      continue;
    }

    Year y = DateTimeUtils.asYear(m.group(19));

    //Abbreviated months may carry a trailing full stop, which is dropped before conversion
    String month = m.group(6);
    if(month.endsWith(".")){
      month = month.substring(0, month.length() - 1);
    }

    YearMonth ym = y.atMonth(DateTimeUtils.asMonth(month));

    LocalDate ld1;
    LocalDate ld2;
    try{
      ld1 = ym.atDay(Integer.parseInt(m.group(1)));
      ld2 = ym.atDay(Integer.parseInt(m.group(4)));
    }catch(DateTimeException dte){
      getMonitor().warn(INVALID_DATE_FOUND, dte);
      continue;
    }

    //A slash, a backslash, or an 'and' that isn't introduced by 'between' lists two dates rather than naming a range
    if(("and".equalsIgnoreCase(m.group(3)) && !betweenPrefix(text, m.start())) || "/".equals(m.group(3)) || "\\".equals(m.group(3))){
      if(ld2.equals(ld1.plusDays(1))){
        //Adjacent days are treated as a two day range
        createDayMonthYearRange(block, m.start(), m.end(), ld1, ld2);
      }else{
        //Create separate dates as they're not adjacent
        createDate(block, m.start(4), m.end(), ld2);

        //The first date has to span the whole match to include the month and year, so its value is rebuilt without the second day
        Temporal t = createDate(block, m.start(), m.end(), ld1);
        if(t != null){
          t.setValue(text.substring(m.start(), m.start(3)).trim() + " " + text.substring(m.start(6), m.end()).trim());
        }
      }
    }else{
      //Create time range
      createDayMonthYearRange(block, m.start(), m.end(), ld1, ld2);
    }
  }

  m = DAY_RANGE_SAME_YEAR.matcher(text);
  while(m.find()){
    boolean suffixesCorrect = DateTimeUtils.suffixCorrect(Integer.parseInt(m.group(1)), m.group(2)) && DateTimeUtils.suffixCorrect(Integer.parseInt(m.group(17)), m.group(18));
    boolean andNotBetween = "and".equalsIgnoreCase(m.group(16)) && !betweenPrefix(text, m.start());
    if(!suffixesCorrect || andNotBetween){
      continue;
    }

    String m1 = m.group(3);
    if(m1.endsWith(".")){
      m1 = m1.substring(0, m1.length() - 1);
    }

    String m2 = m.group(19);
    if(m2.endsWith(".")){
      m2 = m2.substring(0, m2.length() - 1);
    }

    Year y = DateTimeUtils.asYear(m.group(32));
    YearMonth ym1 = y.atMonth(DateTimeUtils.asMonth(m1));
    YearMonth ym2 = y.atMonth(DateTimeUtils.asMonth(m2));

    try{
      createDayMonthYearRange(block, m.start(), m.end(),
          ym1.atDay(Integer.parseInt(m.group(1))),
          ym2.atDay(Integer.parseInt(m.group(17))));
    }catch(DateTimeException dte){
      getMonitor().warn(INVALID_DATE_FOUND, dte);
    }
  }

  m = DAY_RANGE_FULL_DATES.matcher(text);
  while(m.find()){
    boolean suffixesCorrect = DateTimeUtils.suffixCorrect(Integer.parseInt(m.group(1)), m.group(2)) && DateTimeUtils.suffixCorrect(Integer.parseInt(m.group(18)), m.group(19));
    boolean andNotBetween = "and".equalsIgnoreCase(m.group(17)) && !betweenPrefix(text, m.start());
    if(!suffixesCorrect || andNotBetween){
      continue;
    }

    String m1 = m.group(3);
    if(m1.endsWith(".")){
      m1 = m1.substring(0, m1.length() - 1);
    }

    String m2 = m.group(20);
    if(m2.endsWith(".")){
      m2 = m2.substring(0, m2.length() - 1);
    }

    Year y1 = DateTimeUtils.asYear(m.group(16));
    YearMonth ym1 = y1.atMonth(DateTimeUtils.asMonth(m1));

    Year y2 = DateTimeUtils.asYear(m.group(33));
    YearMonth ym2 = y2.atMonth(DateTimeUtils.asMonth(m2));

    try{
      createDayMonthYearRange(block, m.start(), m.end(),
          ym1.atDay(Integer.parseInt(m.group(1))),
          ym2.atDay(Integer.parseInt(m.group(18))));
    }catch(DateTimeException dte){
      getMonitor().warn(INVALID_DATE_FOUND, dte);
    }
  }
}
/**
 * Create, index and record a range annotation between two days.
 *
 * The start timestamp is midnight UTC at the start of ld1, and the stop timestamp is
 * midnight UTC at the start of the day after ld2. Both are expressed in epoch seconds.
 *
 * @param block the text block containing the expression
 * @param charBegin offset of the expression's first character within the block
 * @param charEnd offset just past the expression's last character within the block
 * @param ld1 first day of the range
 * @param ld2 last day of the range
 */
private void createDayMonthYearRange(TextBlock block, Integer charBegin, Integer charEnd, LocalDate ld1, LocalDate ld2){
  Temporal range = createExactRangeDate(block, charBegin, charEnd);

  long startSeconds = ld1.atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  range.setTimestampStart(startSeconds);

  LocalDate dayAfterLast = ld2.plusDays(1);
  long stopSeconds = dayAfterLast.atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  range.setTimestampStop(stopSeconds);

  addToJCasIndex(range);
  extracted.add(range);
}
/** Day, month name, year (e.g. 1 December 2016). Groups: 1 = day, 2 = suffix, 3 = month, 16 = year. */
private static final Pattern FULL_DATE_DAY_MONTH = Pattern.compile("\\b"+DAYS+DATES+DATE_SUFFIXES+"?\\s+"+MONTHS+",?\\s+(\\d{4}|'?\\d{2}\\b)", Pattern.CASE_INSENSITIVE);
/** Month name, day, year (e.g. December 1 2016). Groups: 1 = month, 14 = day, 15 = suffix, 16 = year. */
private static final Pattern FULL_DATE_MONTH_DAY = Pattern.compile("\\b"+MONTHS+"\\s+([0-2]?[0-9]|3[01])\\s*"+DATE_SUFFIXES+"?,?\\s+(\\d{4}|'?\\d{2}\\b)", Pattern.CASE_INSENSITIVE);
/** Numeric date with the year first (e.g. 2016-12-01). Groups: 1 = year, 2 = month, 3 = day. */
private static final Pattern SHORT_DATE_YEAR_FIRST = Pattern.compile("\\b(\\d{4})[-\\\\/\\.](0?[1-9]|1[0-2])[-\\\\/\\.]([0-2]?[0-9]|3[01])\\b", Pattern.CASE_INSENSITIVE);
/** Numeric date with the year last (e.g. 1/12/2016). Groups: 1 and 2 = day and month in either order, 3 = year. */
private static final Pattern SHORT_DATE = Pattern.compile("\\b([0-2]?[0-9]|3[01])[-\\\\/\\.]([0-2]?[0-9]|3[01])[-\\\\/\\.](\\d{4}|\\d{2})\\b", Pattern.CASE_INSENSITIVE);

/**
 * Annotate single dates written with a month name (1 December 2016, December 1 2016) or
 * purely numerically (2016-12-01, 1/12/2016).
 *
 * For numeric dates with the year last, a number greater than 12 can only be the day. When
 * both numbers are 12 or less the order is ambiguous and is decided by the americanDates
 * setting: month first if it is set, day first otherwise.
 *
 * The patterns are compiled once as class constants rather than on every call, as this
 * method runs for every text block.
 *
 * @param block the text block to search
 */
private void identifyDates(TextBlock block){
  String text = block.getCoveredText();

  Matcher m = FULL_DATE_DAY_MONTH.matcher(text);
  while(m.find()){
    createDateFromMatcher(block, m, 16, 3, 1);
  }

  m = FULL_DATE_MONTH_DAY.matcher(text);
  while(m.find()){
    createDateFromMatcher(block, m, 16, 1, 14);
  }

  m = SHORT_DATE_YEAR_FIRST.matcher(text);
  while(m.find()){
    createDateFromMatcher(block, m, 1, 2, 3);
  }

  m = SHORT_DATE.matcher(text);
  while(m.find()){
    Year y = DateTimeUtils.asYear(m.group(3));

    int n1 = Integer.parseInt(m.group(1));
    int n2 = Integer.parseInt(m.group(2));

    int day;
    int month;

    if(n1 >= 1 && n1 <= 12){
      //n1 could be a month or a day
      if(n2 > 12 && n2 <= 31){
        //n2 must be a day. 12 is excluded here because it is itself a valid month,
        //so a date such as 1/12/2016 is ambiguous and must honour americanDates
        month = n1;
        day = n2;
      }else if(n2 >= 1 && n2 <= 12){
        //Both could be either, so fall back on the configured convention
        if(americanDates){
          day = n2;
          month = n1;
        }else{
          day = n1;
          month = n2;
        }
      }else{
        //invalid combination of n1 and n2
        continue;
      }
    }else if(n1 >= 1 && n1 <= 31){
      //n1 must be a day
      day = n1;
      if(n2 >= 1 && n2 <= 12){
        //n2 must be a month
        month = n2;
      }else{
        //invalid combination of n1 and n2
        continue;
      }
    }else{
      //n1 can't be a month or a day
      continue;
    }

    YearMonth ym = y.atMonth(month);

    LocalDate ld;
    try{
      ld = ym.atDay(day);
    }catch(DateTimeException dte){
      //The day doesn't exist in that month, e.g. 31/02/2016
      getMonitor().warn(INVALID_DATE_FOUND, dte);
      continue;
    }

    createDate(block, m.start(), m.end(), ld);
  }
}
/**
 * Create, index and record an annotation for a single day, unless the span is already
 * covered by a previously extracted annotation.
 *
 * The start timestamp is midnight UTC at the start of the day, and the stop timestamp is
 * midnight UTC at the start of the following day. Both are expressed in epoch seconds.
 *
 * @param block the text block containing the expression
 * @param charBegin offset of the expression's first character within the block
 * @param charEnd offset just past the expression's last character within the block
 * @param ld the day the expression refers to
 * @return the new annotation, or null if the span was already covered and nothing was created
 */
private Temporal createDate(TextBlock block, Integer charBegin, Integer charEnd, LocalDate ld){
  boolean coveredByEarlierAnnotation = alreadyExtracted(block, charBegin, charEnd);
  if(coveredByEarlierAnnotation){
    return null;
  }

  Temporal single = createExactSingleDate(block, charBegin, charEnd);

  long startSeconds = ld.atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  single.setTimestampStart(startSeconds);

  LocalDate nextDay = ld.plusDays(1);
  long stopSeconds = nextDay.atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  single.setTimestampStop(stopSeconds);

  addToJCasIndex(single);
  extracted.add(single);

  return single;
}
/**
 * Check whether a span lies entirely inside an annotation already extracted from the
 * current text block.
 *
 * The block-relative offsets are converted to document offsets before being compared with
 * the begin and end of each recorded annotation.
 *
 * @param block the text block the offsets are relative to
 * @param blockBegin offset of the span's first character within the block
 * @param blockEnd offset just past the span's last character within the block
 * @return true if a recorded annotation begins at or before the span and ends at or after it
 */
private boolean alreadyExtracted(TextBlock block, Integer blockBegin, Integer blockEnd) {
  final int spanBegin = block.toDocumentOffset(blockBegin);
  final int spanEnd = block.toDocumentOffset(blockEnd);

  return extracted.stream()
      .anyMatch(existing -> existing.getBegin() <= spanBegin && existing.getEnd() >= spanEnd);
}
/** Month and year with an optional qualifier (e.g. March 2015, late August 2016). Groups: 2 = qualifier, 3 = month, 16 = year. */
private static final Pattern MONTH_YEAR = Pattern.compile("\\b((beginning of|start of|early|mid|late|end of)[- ])?"+MONTHS+"\\s+(\\d{4}|'?\\d{2}\\b)", Pattern.CASE_INSENSITIVE);

/**
 * Annotate months, optionally qualified by a part of the month.
 *
 * An unqualified month (March 2015) becomes a single annotation covering the whole month,
 * unless it is already covered by an earlier annotation. A qualified month becomes a range:
 * beginning of / start of = days 1-5, early = days 1-10, mid = days 11-20,
 * late = day 21 to the end of the month, end of = the last day of the month and the five
 * days before it.
 *
 * The pattern is compiled once as a class constant rather than on every call, as this
 * method runs for every text block.
 *
 * @param block the text block to search
 */
private void identifyMonths(TextBlock block){
  String text = block.getCoveredText();

  Matcher m = MONTH_YEAR.matcher(text);
  while(m.find()){
    Year y = DateTimeUtils.asYear(m.group(16));

    //Abbreviated months may carry a trailing full stop, which is dropped before conversion
    String month = m.group(3);
    if(month.endsWith(".")){
      month = month.substring(0, month.length() - 1);
    }

    YearMonth ym = y.atMonth(DateTimeUtils.asMonth(month));

    if(m.group(2) != null){
      LocalDate ld1;
      LocalDate ld2;

      //The match is case insensitive, so normalise before comparing. Locale.ROOT keeps this
      //independent of the JVM's default locale (a Turkish locale, for example, would lower-case
      //'I' to a dotless i and no label would match)
      switch(m.group(2).toLowerCase(Locale.ROOT)){
      case "beginning of":
      case "start of":
        ld1 = ym.atDay(1);
        ld2 = ym.atDay(5);
        break;
      case "early":
        ld1 = ym.atDay(1);
        ld2 = ym.atDay(10);
        break;
      case "mid":
        ld1 = ym.atDay(11);
        ld2 = ym.atDay(20);
        break;
      case "late":
        ld1 = ym.atDay(21);
        ld2 = ym.atEndOfMonth();
        break;
      case "end of":
        ld1 = ym.atEndOfMonth().minusDays(5);
        ld2 = ym.atEndOfMonth();
        break;
      default:
        //Not expected, as the pattern only admits the qualifiers above
        continue;
      }

      createDayMonthYearRange(block, m.start(), m.end(), ld1, ld2);
    }else{
      createMonth(block, m.start(), m.end(), ym);
    }
  }
}
/**
 * Create, index and record an annotation for a whole month, unless the span is already
 * covered by a previously extracted annotation.
 *
 * The start timestamp is midnight UTC on the first day of the month, and the stop timestamp
 * is midnight UTC on the day after the last day of the month. Both are expressed in epoch
 * seconds.
 *
 * @param block the text block containing the expression
 * @param charBegin offset of the expression's first character within the block
 * @param charEnd offset just past the expression's last character within the block
 * @param ym the month the expression refers to
 */
private void createMonth(TextBlock block, Integer charBegin, Integer charEnd, YearMonth ym){
  boolean coveredByEarlierAnnotation = alreadyExtracted(block, charBegin, charEnd);
  if(coveredByEarlierAnnotation){
    return;
  }

  Temporal wholeMonth = createExactSingleDate(block, charBegin, charEnd);

  LocalDate firstDay = ym.atDay(1);
  LocalDate lastDay = ym.atEndOfMonth();

  long startSeconds = firstDay.atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  wholeMonth.setTimestampStart(startSeconds);
  long stopSeconds = lastDay.plusDays(1).atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  wholeMonth.setTimestampStop(stopSeconds);

  addToJCasIndex(wholeMonth);
  extracted.add(wholeMonth);
}
/** A four-digit year on its own, restricted to 1970-2099. Group 1 = year. */
private static final Pattern STANDALONE_YEAR = Pattern.compile("\\b(19[789][0-9]|20[0-9][0-9])\\b", Pattern.CASE_INSENSITIVE);

/**
 * Annotate years that appear on their own, limited to 1970-2099. Years already covered by
 * an earlier annotation are left alone by createYear.
 *
 * The pattern is compiled once as a class constant rather than on every call, as this
 * method runs for every text block.
 *
 * @param block the text block to search
 */
private void identifyYears(TextBlock block){
  String text = block.getCoveredText();

  Matcher m = STANDALONE_YEAR.matcher(text);
  while(m.find()){
    Year y = DateTimeUtils.asYear(m.group(1));
    createYear(block, m.start(), m.end(), y);
  }
}
/**
 * Create, index and record an annotation for a whole year, unless the span is already
 * covered by a previously extracted annotation.
 *
 * The start timestamp is midnight UTC on the first day of the year, and the stop timestamp
 * is midnight UTC on the day after the last day of the year. Both are expressed in epoch
 * seconds.
 *
 * @param block the text block containing the expression
 * @param charBegin offset of the expression's first character within the block
 * @param charEnd offset just past the expression's last character within the block
 * @param y the year the expression refers to
 */
private void createYear(TextBlock block, Integer charBegin, Integer charEnd, Year y){
  boolean coveredByEarlierAnnotation = alreadyExtracted(block, charBegin, charEnd);
  if(coveredByEarlierAnnotation){
    return;
  }

  Temporal wholeYear = createExactSingleDate(block, charBegin, charEnd);

  LocalDate firstDay = y.atDay(1);
  //length() is 366 in a leap year and 365 otherwise, so this is always the last day of the year
  LocalDate lastDay = y.atDay(y.length());

  long startSeconds = firstDay.atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  wholeYear.setTimestampStart(startSeconds);
  long stopSeconds = lastDay.plusDays(1).atStartOfDay(ZoneOffset.UTC).toEpochSecond();
  wholeYear.setTimestampStop(stopSeconds);

  addToJCasIndex(wholeYear);
  extracted.add(wholeYear);
}
/**
 * Check whether the text immediately before a match ends with the word 'between',
 * ignoring case and any whitespace separating it from the match.
 *
 * @param text the text being searched
 * @param matchStart offset at which the match begins
 * @return true if the trimmed, lower-cased text before matchStart ends with "between"
 */
private static boolean betweenPrefix (String text, Integer matchStart){
  String preceding = text.substring(0, matchStart);
  String normalised = preceding.trim().toLowerCase();
  return normalised.endsWith("between");
}
/**
 * Check whether the character directly after a match is a date separator
 * (hyphen, forward slash or backslash).
 *
 * @param text the text being searched
 * @param matchEnd offset just past the end of the match
 * @return true if a separator follows the match, false if not or if the match ends the text
 */
private static boolean dateSeparatorSuffix (String text, Integer matchEnd){
  //Nothing follows a match that runs to the end of the text
  if(matchEnd >= text.length()){
    return false;
  }

  char following = text.charAt(matchEnd);
  return following == '-' || following == '/' || following == '\\';
}
/**
 * Build a single date from the year, month and day groups of a match and annotate it.
 *
 * A trailing full stop on the month text is removed before conversion. If the day does not
 * exist in that month, a warning is logged and nothing is created.
 *
 * @param block the text block containing the match
 * @param m the matcher positioned on the match
 * @param yearGroup index of the group holding the year
 * @param monthGroup index of the group holding the month
 * @param dayGroup index of the group holding the day number
 */
private void createDateFromMatcher(TextBlock block, Matcher m, Integer yearGroup, Integer monthGroup, Integer dayGroup){
  Year year = DateTimeUtils.asYear(m.group(yearGroup));

  String monthText = m.group(monthGroup);
  if(monthText.endsWith(".")){
    int withoutStop = monthText.length() - 1;
    monthText = monthText.substring(0, withoutStop);
  }

  YearMonth yearMonth = year.atMonth(DateTimeUtils.asMonth(monthText));

  LocalDate resolved;
  try{
    int dayOfMonth = Integer.parseInt(m.group(dayGroup));
    resolved = yearMonth.atDay(dayOfMonth);
  }catch(DateTimeException dte){
    getMonitor().warn(INVALID_DATE_FOUND, dte);
    return;
  }

  createDate(block, m.start(), m.end(), resolved);
}
/**
 * Create a Temporal annotation over the given span, marked as an exact, single date with
 * full confidence. Timestamps are not set, and the annotation is neither indexed nor
 * recorded; that is left to the caller.
 *
 * @param block the text block containing the expression
 * @param charBegin offset of the expression's first character within the block
 * @param charEnd offset just past the expression's last character within the block
 * @return the new annotation
 */
private Temporal createExactSingleDate(TextBlock block, Integer charBegin, Integer charEnd){
  Temporal single = block.newAnnotation(Temporal.class, charBegin, charEnd);

  single.setConfidence(1.0);
  single.setPrecision(EXACT);
  single.setScope(SINGLE);
  single.setTemporalType(DATE_TYPE);

  return single;
}
/**
 * Create a Temporal annotation over the given span, marked as an exact date range with
 * full confidence. Timestamps are not set, and the annotation is neither indexed nor
 * recorded; that is left to the caller.
 *
 * @param block the text block containing the expression
 * @param charBegin offset of the expression's first character within the block
 * @param charEnd offset just past the expression's last character within the block
 * @return the new annotation
 */
private Temporal createExactRangeDate(TextBlock block, Integer charBegin, Integer charEnd){
  Temporal range = block.newAnnotation(Temporal.class, charBegin, charEnd);

  range.setConfidence(1.0);
  range.setPrecision(EXACT);
  range.setScope(RANGE);
  range.setTemporalType(DATE_TYPE);

  return range;
}
}