//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;
import java.util.Collections;
import java.util.Locale;
import java.util.regex.Matcher;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.semantic.Temporal;
/**
 * Extracts unqualified dates from text and annotates them as Temporal entities.
 * We take an unqualified date to be any date without a year for the purposes of this annotator.
 *
 * <p>A match is discarded when it is followed by a year (four digits, or two digits with an
 * optional leading apostrophe). Unless {@link #PARAM_ALLOW_LOWERCASE} is enabled, a match is also
 * discarded when a day or month name in it does not start with a capital letter.
 *
 * @baleen.javadoc
 */
public class UnqualifiedDate extends AbstractRegexAnnotator<Temporal> {
    // Day names, full or abbreviated. Contributes 11 capturing groups to any pattern embedding it.
    private static final String DAYS = "(Mon(day)?+|Tue(s(day)?+)?+|Wed(nesday)?+|Thu(r(s(day)?+)?+)?+|Fri(day)?+|Sat(urday)?+|Sun(day)?+)";
    // Ordinal suffixes. Contributes 1 capturing group.
    private static final String SUFFIXES = "(st|nd|rd|th)";
    // Month names, full or abbreviated. Contributes 13 capturing groups.
    private static final String MONTHS = "(Jan(uary)?+|Feb(ruary)?+|Mar(ch)?+|Apr(il)?+|May|Jun(e)?+|Jul(y)?+|Aug(ust)?+|Sep(t(ember)?+)?+|Oct(ober)?+|Nov(ember)?+|Dec(ember)?+)";
    // Either an optional day name followed by one or more of
    //   "<day number>[suffix] [of ]<month>", "<month> <day number>[suffix]", "<month>", "<day number><suffix>"
    // or a day name on its own; in both cases optionally followed by a year.
    // The group index constants below depend on the exact group layout of this string.
    private static final String PATTERN = "\\b(("+DAYS+" )?((([1-9]|[12][0-9]|3[01])"+SUFFIXES+"?+ (?:of )?"+MONTHS+"|"+MONTHS+" ([1-9]|[12][0-9]|3[01])"+SUFFIXES+"?+|"+MONTHS+"|([1-9]|[12][0-9]|3[01])"+SUFFIXES+")+)|"+DAYS+" ?)\\b(\\s*(\\d{4}|'?\\d{2}))?";

    // Capturing group indices within PATTERN, counted by opening parenthesis from the left.
    /** Optional leading day name together with its trailing space. */
    private static final int GROUP_LEADING_DAY = 2;
    /** Month name in the "day number [of] month" alternative. */
    private static final int GROUP_MONTH_AFTER_NUMBER = 18;
    /** Month name in the "month day number" alternative. */
    private static final int GROUP_MONTH_BEFORE_NUMBER = 31;
    /** Month name in the month-only alternative. */
    private static final int GROUP_MONTH_ALONE = 46;
    /** Day name in the day-name-only alternative. */
    private static final int GROUP_DAY_ALONE = 61;
    /** Optional trailing year, including any whitespace before it. */
    private static final int GROUP_YEAR = 72;

    /** Groups holding a day or month name, i.e. those subject to the capitalisation check. */
    private static final int[] NAME_GROUPS = {
        GROUP_LEADING_DAY,
        GROUP_MONTH_AFTER_NUMBER,
        GROUP_MONTH_BEFORE_NUMBER,
        GROUP_MONTH_ALONE,
        GROUP_DAY_ALONE
    };

    /**
     * Allow lower case letters for months and days?
     *
     * @baleen.config false
     */
    public static final String PARAM_ALLOW_LOWERCASE = "allowLowercase";
    @ConfigurationParameter(name = PARAM_ALLOW_LOWERCASE, defaultValue="false")
    private boolean allowLowercase;

    /**
     * New instance
     */
    public UnqualifiedDate(){
        // NOTE(review): the second and third arguments are presumably "case sensitive" and
        // "confidence" - confirm against AbstractRegexAnnotator.
        super(PATTERN, false, 1.0);
    }

    /**
     * Builds the Temporal annotation for a single regex match.
     *
     * @param jCas the CAS in which the annotation is created
     * @param matcher the matcher positioned on the current match of the pattern
     * @return a Temporal with precision UNQUALIFIED, scope SINGLE and type DATE, or null when the
     *         match has a trailing year, or when lower case names are not allowed and one of the
     *         day or month names does not start with a capital letter
     */
    @Override
    protected Temporal create(JCas jCas, Matcher matcher) {
        // A trailing year means the date is qualified, so it is not ours to annotate.
        if (matcher.group(GROUP_YEAR) != null) {
            return null;
        }

        if (!allowLowercase && !namesStartWithCapital(matcher)) {
            return null;
        }

        Temporal t = new Temporal(jCas);
        t.setConfidence(1.0);
        t.setPrecision("UNQUALIFIED");
        t.setScope("SINGLE");
        t.setTemporalType("DATE");
        return t;
    }

    /**
     * Returns true if every day or month name group of the match starts with a capital letter.
     * Groups that did not take part in the match are null and count as passing.
     */
    private static boolean namesStartWithCapital(Matcher matcher) {
        for (int group : NAME_GROUPS) {
            if (!startsWithCapital(matcher.group(group))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true if the String s starts with a capital letter.
     *
     * <p>The check is that upper-casing the first character leaves it unchanged, so a string
     * starting with a character that has no case (a digit, for example) also returns true, as do
     * null and the empty string. The comparison is locale independent and treats a
     * supplementary-plane character (a surrogate pair) as a single character.
     *
     * @param s the string to test, may be null
     * @return false only if the first character of s changes when converted to upper case
     */
    public static boolean startsWithCapital(String s){
        if (s == null || s.isEmpty()) {
            return true;
        }

        // Take the whole first code point so a surrogate pair is not cut in half.
        String first = s.substring(0, s.offsetByCodePoints(0, 1));
        return first.toUpperCase(Locale.ROOT).equals(first);
    }

    /**
     * Describes this annotator for pipeline ordering.
     *
     * @return an action built from an empty set and a set containing only Temporal
     *         (NOTE(review): presumably required inputs and produced outputs respectively -
     *         confirm against AnalysisEngineAction)
     */
    @Override
    public AnalysisEngineAction getAction() {
        return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Temporal.class));
    }
}