package edu.stanford.nlp.time;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.logging.Redwood;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.Set;
/**
* Annotate temporal expressions in text with {@link SUTime}.
* The expressions recognized by SUTime are loosely based on GUTIME.
*
* After annotation, the {@link TimeAnnotations.TimexAnnotations} annotation
* will be populated with a {@code List<CoreMap>}, each of which
* will represent one temporal expression.
*
* If a reference time is set (via {@link edu.stanford.nlp.ling.CoreAnnotations.DocDateAnnotation}),
* then temporal expressions are resolved with respect to the document date. You set it on an
* Annotation as follows:
* <blockquote>{@code annotation.set(CoreAnnotations.DocDateAnnotation.class, "2013-07-14");}</blockquote>
* <p>
* <br>
* <b>Input annotations</b>
* <table border="1">
* <tr>
* <th>Annotation</th>
* <th>Type</th>
* <th>Description</th>
* <th>Required?</th>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.DocDateAnnotation}</td>
* <td>{@code String}</td>
* <td>If present, then the string is interpreted as a date/time and
* used as the reference document date with respect to which other
* temporal expressions are resolved</td>
* <td>Optional</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation}</td>
* <td>{@code List<CoreMap>}</td>
* <td>If present, time expressions will be extracted from each sentence
* and each sentence will be annotated individually.</td>
* <td>Optional (good to have)</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation}</td>
* <td>{@code List<CoreLabel>}</td>
* <td>Tokens (for each sentence or for entire annotation if no sentences)</td>
* <td>Required</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation}</td>
* <td>{@code String}</td>
* <td>Text (for each sentence or for entire annotation if no sentences)</td>
* <td>Optional</td>
* </tr>
* <tr><td colspan="4"><center><b>Per token annotations</b></center></td></tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation}</td>
* <td>{@code String}</td>
* <td>Token text (normalized)</td>
* <td>Required</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.OriginalTextAnnotation}</td>
* <td>{@code String}</td>
* <td>Token text (original)</td>
* <td>Required</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation}</td>
* <td>{@code Integer}</td>
* <td>The index of the first character of this token
* (0-based wrt to TextAnnotation of the annotation containing the TokensAnnotation).</td>
* <td>Required</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation}</td>
* <td>{@code Integer}</td>
* <td>The index of the first character after this token
* (0-based wrt to TextAnnotation of the annotation containing the TokensAnnotation).</td>
* <td>Required</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation}</td>
* <td>{@code String}</td>
* <td>Token part of speech</td>
* <td>Optional</td>
* </tr>
* </table>
*
* <p>
* <br>
* <b>Output annotations</b>
* <table border="1">
* <tr>
* <th>Annotation</th>
* <th>Type</th>
* <th>Description</th>
* </tr>
* <tr>
* <td>{@link TimeAnnotations.TimexAnnotations}</td>
* <td>{@code List<CoreMap>}</td>
* <td>List of temporal expressions (on the entire annotation and also for each sentence)</td>
* </tr>
* <tr><td colspan="3"><center><b>Per each temporal expression</b></center></td></tr>
* <tr>
* <td>{@link TimeAnnotations.TimexAnnotation}</td>
* <td>{@link Timex}</td>
* <td>Timex object with TIMEX3 XML attributes, use for exporting TIMEX3 information</td>
* </tr>
* <tr>
* <td>{@link TimeExpression.Annotation}</td>
* <td>{@link TimeExpression}</td>
* <td>TimeExpression object. Use {@code getTemporal()} to get internal temporal representation.</td>
* </tr>
* <tr>
* <td>{@link TimeExpression.ChildrenAnnotation}</td>
* <td>{@code List<CoreMap>}</td>
* <td>List of chunks forming this time expression (inner chunks can be tokens, nested time expressions,
* numeric expressions, etc)</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation}</td>
* <td>{@code String}</td>
* <td>Text of this time expression</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation}</td>
* <td>{@code List<CoreLabel>}</td>
* <td>Tokens that make up this time expression</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation}</td>
* <td>{@code Integer}</td>
* <td>The index of the first character of this token (0-based).</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation}</td>
* <td>{@code Integer}</td>
* <td>The index of the first character after this token (0-based).</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.TokenBeginAnnotation}</td>
* <td>{@code Integer}</td>
* <td>The index of the first token of this time expression (0-based).</td>
* </tr>
* <tr>
* <td>{@link edu.stanford.nlp.ling.CoreAnnotations.TokenEndAnnotation}</td>
* <td>{@code Integer}</td>
* <td>The index of the first token after this time expression (0-based).</td>
* </tr>
* </table>
*/
public class TimeAnnotator implements Annotator {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(TimeAnnotator.class);
private final TimeExpressionExtractorImpl timexExtractor;
private final boolean quiet;
public TimeAnnotator() {
this(false);
}
public TimeAnnotator(boolean quiet) {
timexExtractor = new TimeExpressionExtractorImpl();
this.quiet = quiet;
}
public TimeAnnotator(String name, Properties props) {
this(name, props, false);
}
public TimeAnnotator(String name, Properties props, boolean quiet) {
timexExtractor = new TimeExpressionExtractorImpl(name, props);
this.quiet = quiet;
}
@Override
public void annotate(Annotation annotation) {
SUTime.TimeIndex timeIndex = new SUTime.TimeIndex();
String docDate = annotation.get(CoreAnnotations.DocDateAnnotation.class);
if (docDate == null) {
Calendar cal = annotation.get(CoreAnnotations.CalendarAnnotation.class);
if (cal == null) {
if ( ! quiet) { log.warn("No document date specified"); }
} else {
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd:hh:mm:ss");
docDate = dateFormat.format(cal.getTime());
}
}
List<CoreMap> allTimeExpressions; // initialized below = null;
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
if (sentences != null) {
allTimeExpressions = new ArrayList<>();
List<CoreMap> allNumerics = new ArrayList<>();
for (CoreMap sentence: sentences) {
// make sure that token character offsets align with the actual sentence text
// They may not align due to token normalizations, such as "(" to "-LRB-".
CoreMap alignedSentence = NumberSequenceClassifier.alignSentence(sentence);
// uncomment the next line for verbose dumping of tokens....
// log.info("SENTENCE: " + ((ArrayCoreMap) sentence).toShorterString());
List<CoreMap> timeExpressions =
timexExtractor.extractTimeExpressionCoreMaps(alignedSentence, docDate, timeIndex);
if (timeExpressions != null) {
allTimeExpressions.addAll(timeExpressions);
sentence.set(TimeAnnotations.TimexAnnotations.class, timeExpressions);
for (CoreMap timeExpression:timeExpressions) {
timeExpression.set(CoreAnnotations.SentenceIndexAnnotation.class, sentence.get(CoreAnnotations.SentenceIndexAnnotation.class));
}
}
List<CoreMap> numbers = alignedSentence.get(CoreAnnotations.NumerizedTokensAnnotation.class);
if(numbers != null){
sentence.set(CoreAnnotations.NumerizedTokensAnnotation.class, numbers);
allNumerics.addAll(numbers);
}
}
annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, allNumerics);
} else {
allTimeExpressions = annotateSingleSentence(annotation, docDate, timeIndex);
}
annotation.set(TimeAnnotations.TimexAnnotations.class, allTimeExpressions);
}
/**
* Helper method for people not working from a complete Annotation.
*
* @return A list of CoreMap. Each CoreMap represents a detected temporal expression.
*/
public List<CoreMap> annotateSingleSentence(CoreMap sentence, String docDate, SUTime.TimeIndex timeIndex) {
CoreMap annotationCopy = NumberSequenceClassifier.alignSentence(sentence);
if (docDate != null && docDate.isEmpty()) {
docDate = null;
}
return timexExtractor.extractTimeExpressionCoreMaps(annotationCopy, docDate, timeIndex);
}
@Override
public Set<Class<? extends CoreAnnotation>> requires() {
return Collections.singleton(CoreAnnotations.TokensAnnotation.class);
}
@Override
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
return Collections.singleton(TimeAnnotations.TimexAnnotations.class);
}
}