HeidelTimeAnnotator.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.time;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.util.*;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.*;
import java.util.regex.Pattern;


/**
 * Annotates text using HeidelTime.
 *
 * GUTIME/TimeML specifications can be found at:
 * <a href="http://www.timeml.org/site/tarsqi/modules/gutime/index.html">
 * http://www.timeml.org/site/tarsqi/modules/gutime/index.html</a>.
 *
 * @author Gabor Angeli
 */
public class HeidelTimeAnnotator implements Annotator {

  // TODO HeidelTime doesn't actually run on the NLP machines :( (TreeTagger doesn't run.)
  // This could probably be fixed in newer HeidelTime versions, which even support using our tagger.

  private static final String BASE_PATH = "$NLP_DATA_HOME/packages/heideltime/";
  private static final String DEFAULT_PATH = DataFilePaths.convert(BASE_PATH);
  private final File heideltimePath;
  private final boolean outputResults;

  // if used in a pipeline or constructed with a Properties object,
  // this property tells the annotator where to find the script
  public static final String HEIDELTIME_PATH_PROPERTY = "heideltime.path";
  public static final String HEIDELTIME_OUTPUT_RESULTS = "heideltime.outputResults";

  public HeidelTimeAnnotator() {
    this(new File(System.getProperty("heideltime", DEFAULT_PATH)));
  }

  public HeidelTimeAnnotator(File heideltimePath) {
    this.heideltimePath = heideltimePath;
    this.outputResults = false;
  }

  public HeidelTimeAnnotator(String name, Properties props) {
    String path = props.getProperty(HEIDELTIME_PATH_PROPERTY,
            System.getProperty("heideltime",
                    DEFAULT_PATH));
    this.heideltimePath = new File(path);

    this.outputResults =
            Boolean.valueOf(props.getProperty(HEIDELTIME_OUTPUT_RESULTS, "false"));
  }

  @Override
  public void annotate(Annotation annotation) {
    try {
      this.annotate((CoreMap)annotation);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  public void annotate(CoreMap document) throws IOException {
    //--Create Input File
    //(create file)
    File inputFile = File.createTempFile("heideltime", ".input");
    //(write to file)
    PrintWriter inputWriter = new PrintWriter(inputFile);
    inputWriter.println(document.get(CoreAnnotations.TextAnnotation.class));
    inputWriter.close();

    //--Get Date
    //(error checks)
    if(!document.containsKey(CoreAnnotations.CalendarAnnotation.class) && !document.containsKey(CoreAnnotations.DocDateAnnotation.class)){
      throw new IllegalArgumentException("CoreMap must have either a Calendar or DocDate annotation"); //not strictly necessary, technically...
    }
    //(variables)
    Calendar dateCalendar = document.get(CoreAnnotations.CalendarAnnotation.class);
    String pubDate = null;
    if (dateCalendar != null) {
      //(case: calendar annotation)
      pubDate = String.format("%TF", dateCalendar);
    } else {
      //(case: docdateannotation)
      String s = document.get(CoreAnnotations.DocDateAnnotation.class);
      if (s != null) {
        pubDate = s;
      }
    }

    //--Build Command
    ArrayList<String> args = new ArrayList<>();
    args.add("java");
    args.add("-jar"); args.add(this.heideltimePath.getPath() + "/heideltime.jar");
    args.add("-c"); args.add(this.heideltimePath.getPath()+"/config.props");
    args.add("-t"); args.add("NEWS");
    if(pubDate != null){
      args.add("-dct"); args.add(pubDate);
    }
    args.add(inputFile.getPath());
    // run HeidelTime on the input file
    ProcessBuilder process = new ProcessBuilder(args);

    StringWriter outputWriter = new StringWriter();
    SystemUtils.run(process, outputWriter, null);
    String output = outputWriter.getBuffer().toString();
    Pattern docClose = Pattern.compile("</DOC>.*", Pattern.DOTALL);
    output = docClose.matcher(output).replaceAll("</DOC>").replaceAll("<!DOCTYPE TimeML SYSTEM \"TimeML.dtd\">",""); //TODO TimeML.dtd? FileNotFoundException if we leave it in
    Pattern badNestedTimex = Pattern.compile(Pattern.quote("<T</TIMEX3>IMEX3"));
    output = badNestedTimex.matcher(output).replaceAll("</TIMEX3><TIMEX3");
    Pattern badNestedTimex2 = Pattern.compile(Pattern.quote("<TI</TIMEX3>MEX3"));
    output = badNestedTimex2.matcher(output).replaceAll("</TIMEX3><TIMEX3");
    output = output.replaceAll("\\n\\n<TimeML>\\n\\n","<TimeML>");

    // parse the HeidelTime output
    Element outputXML;
    try {
      outputXML = XMLUtils.parseElement(output);
    } catch (Exception ex) {
      throw new RuntimeException(String.format("error:\n%s\ninput:\n%s\noutput:\n%s",
              ex, IOUtils.slurpFile(inputFile), output), ex);
    }
    inputFile.delete();

    // get Timex annotations
    List<CoreMap> timexAnns = toTimexCoreMaps(outputXML, document);
    document.set(TimeAnnotations.TimexAnnotations.class, timexAnns);
    if (outputResults) {
      System.out.println(timexAnns);
    }

    // align Timex annotations to sentences
    int timexIndex = 0;
    for (CoreMap sentence: document.get(CoreAnnotations.SentencesAnnotation.class)) {
      int sentBegin = beginOffset(sentence);
      int sentEnd = endOffset(sentence);

      // skip times before the sentence
      while (timexIndex < timexAnns.size() && beginOffset(timexAnns.get(timexIndex)) < sentBegin) {
        ++timexIndex;
      }

      // determine times within the sentence
      int sublistBegin = timexIndex;
      int sublistEnd = timexIndex;
      while (timexIndex < timexAnns.size() &&
              sentBegin <= beginOffset(timexAnns.get(timexIndex)) &&
              endOffset(timexAnns.get(timexIndex)) <= sentEnd) {
        ++sublistEnd;
        ++timexIndex;
      }

      // set the sentence timexes
      sentence.set(TimeAnnotations.TimexAnnotations.class, timexAnns.subList(sublistBegin, sublistEnd));
    }

  }


  private static int beginOffset(CoreMap ann) {
    return ann.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
  }

  private static int endOffset(CoreMap ann) {
    return ann.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
  }

  private static List<CoreMap> toTimexCoreMaps(Element docElem, CoreMap originalDocument) {
    //--Collect Token Offsets
    Map<Integer,Integer> beginMap = Generics.newHashMap();
    Map<Integer,Integer> endMap = Generics.newHashMap();
    boolean haveTokenOffsets = true;
    for(CoreMap sent : originalDocument.get(CoreAnnotations.SentencesAnnotation.class)){
      for(CoreLabel token : sent.get(CoreAnnotations.TokensAnnotation.class)){
        Integer tokBegin = token.get(CoreAnnotations.TokenBeginAnnotation.class);
        Integer tokEnd = token.get(CoreAnnotations.TokenEndAnnotation.class);
        if(tokBegin == null || tokEnd == null){ haveTokenOffsets = false; }
        int charBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int charEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        beginMap.put(charBegin,tokBegin);
        endMap.put(charEnd,tokEnd);
      }
    }
    List<CoreMap> timexMaps = new ArrayList<>();
    int offset = 0;
    NodeList docNodes = docElem.getChildNodes();
    for (int i = 0; i < docNodes.getLength(); i++) {
      Node content = docNodes.item(i);
      if (content instanceof Text) {
        Text text = (Text)content;
        offset += text.getWholeText().length();
      } else if (content instanceof Element) {
        Element child = (Element)content;
        if (child.getNodeName().equals("TIMEX3")) {
          Timex timex = new Timex(child);
          if (child.getChildNodes().getLength() != 1) {
            throw new RuntimeException("TIMEX3 should only contain text " + child);
          }
          String timexText = child.getTextContent();
          CoreMap timexMap = new ArrayCoreMap();
          timexMap.set(TimeAnnotations.TimexAnnotation.class, timex);
          timexMap.set(CoreAnnotations.TextAnnotation.class, timexText);
          int charBegin = offset;
          timexMap.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
          offset += timexText.length();
          timexMap.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
          int charEnd = offset;
          //(tokens)
          if(haveTokenOffsets){
            Integer tokBegin = beginMap.get(charBegin);
            int searchStep = 1;          //if no exact match, search around the character offset
            while(tokBegin == null){
              tokBegin = beginMap.get(charBegin - searchStep);
              if(tokBegin == null){
                tokBegin = beginMap.get(charBegin + searchStep);
              }
              searchStep += 1;
            }
            searchStep = 1;
            Integer tokEnd = endMap.get(charEnd);
            while(tokEnd == null){
              tokEnd = endMap.get(charEnd - searchStep);
              if(tokEnd == null){
                tokEnd = endMap.get(charEnd + searchStep);
              }
              searchStep += 1;
            }
            timexMap.set(CoreAnnotations.TokenBeginAnnotation.class, tokBegin);
            timexMap.set(CoreAnnotations.TokenEndAnnotation.class, tokEnd);
          }
          timexMaps.add(timexMap);
        } else {
          throw new RuntimeException("unexpected element " + child);
        }
      } else {
        throw new RuntimeException("unexpected content " + content);
      }
    }
    return timexMaps;
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class
    )));
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.singleton(TimeAnnotations.TimexAnnotations.class);
  }

}