GUTimeAnnotator.java example

Explorer
CoreNLP-master

package edu.stanford.nlp.time;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.util.*;
import org.w3c.dom.*;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.*;
import java.util.regex.Pattern;


/**
 * Annotates text using GUTime perl script.
 *
 * GUTIME/TimeML specifications can be found at:
 * <a href="http://www.timeml.org/site/tarsqi/modules/gutime/index.html">
 * http://www.timeml.org/site/tarsqi/modules/gutime/index.html</a>.
 */
public class GUTimeAnnotator implements Annotator {
  
  private static final String BASE_PATH = "$NLP_DATA_HOME/packages/GUTime";
  private static final String DEFAULT_PATH = DataFilePaths.convert(BASE_PATH);
  private final File gutimePath;
  private final boolean outputResults;
  
  // if used in a pipeline or constructed with a Properties object,
  // this property tells the annotator where to find the script
  public static final String GUTIME_PATH_PROPERTY = "gutime.path";
  public static final String GUTIME_OUTPUT_RESULTS = "gutime.outputResults";

  public GUTimeAnnotator() {
    this(new File(System.getProperty("gutime", DEFAULT_PATH)));
  }

  public GUTimeAnnotator(File gutimePath) {
    this.gutimePath = gutimePath;
    this.outputResults = false;
  }

  public GUTimeAnnotator(String name, Properties props) {
    String path = props.getProperty(GUTIME_PATH_PROPERTY,
                                    System.getProperty("gutime", 
                                                       DEFAULT_PATH));
    this.gutimePath = new File(path);

    this.outputResults = 
      Boolean.valueOf(props.getProperty(GUTIME_OUTPUT_RESULTS, "false"));
  }

  public void annotate(Annotation annotation) {
    try {
      this.annotate((CoreMap)annotation);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  public void annotate(CoreMap document) throws IOException {
    // write input file in GUTime format
    Element inputXML = toInputXML(document);
    File inputFile = File.createTempFile("gutime", ".input");

    //Document doc = new Document(inputXML);
    PrintWriter inputWriter = new PrintWriter(inputFile);
    inputWriter.println(XMLUtils.nodeToString(inputXML, false));
   // new XMLOutputter().output(inputXML, inputWriter);
    inputWriter.close();

    boolean useFirstDate =
      (!document.containsKey(CoreAnnotations.CalendarAnnotation.class) && !document.containsKey(CoreAnnotations.DocDateAnnotation.class));
    
    ArrayList<String> args = new ArrayList<>();
    args.add("perl");
    args.add("-I" + this.gutimePath.getPath());
    args.add(new File(this.gutimePath, "TimeTag.pl").getPath());
    if (useFirstDate)
      args.add("-FDNW");
    args.add(inputFile.getPath());
    // run GUTime on the input file
    ProcessBuilder process = new ProcessBuilder(args);

    StringWriter outputWriter = new StringWriter();
    SystemUtils.run(process, outputWriter, null);
    String output = outputWriter.getBuffer().toString();
    Pattern docClose = Pattern.compile("</DOC>.*", Pattern.DOTALL);
    output = docClose.matcher(output).replaceAll("</DOC>");


   //The TimeTag.pl result file contains next tags which must be removed
    output = output.replaceAll("<lex.*?>", "");
    output = output.replace("</lex>", "");
    output = output.replace("<NG>", "");
    output = output.replace("</NG>", "");
    output = output.replace("<VG>", "");
    output = output.replace("</VG>", "");
    output = output.replace("<s>", "");
    output = output.replace("</s>", "");

    // parse the GUTime output
    Element outputXML;
    try {
      outputXML = XMLUtils.parseElement(output);
    } catch (Exception ex) {
      throw new RuntimeException(String.format("error:\n%s\ninput:\n%s\noutput:\n%s",
      		ex, IOUtils.slurpFile(inputFile), output), ex);
    }
    /*
    try {
      outputXML = new SAXBuilder().build(new StringReader(output)).getRootElement();
    } catch (JDOMException e) {
      throw new RuntimeException(String.format("error:\n%s\ninput:\n%s\noutput:\n%s",
      		e, IOUtils.slurpFile(inputFile), output));
    } */
    inputFile.delete();

    


    // get Timex annotations
    List<CoreMap> timexAnns = toTimexCoreMaps(outputXML, document);
    document.set(TimeAnnotations.TimexAnnotations.class, timexAnns);
    if (outputResults) {
      System.out.println(timexAnns);
    }
    
    // align Timex annotations to sentences
    int timexIndex = 0;
    for (CoreMap sentence: document.get(CoreAnnotations.SentencesAnnotation.class)) {
    	int sentBegin = beginOffset(sentence);
    	int sentEnd = endOffset(sentence);
    	
    	// skip times before the sentence
    	while (timexIndex < timexAnns.size() && beginOffset(timexAnns.get(timexIndex)) < sentBegin) {
    		++timexIndex;
    	}
    	
    	// determine times within the sentence
    	int sublistBegin = timexIndex;
    	int sublistEnd = timexIndex;
    	while (timexIndex < timexAnns.size() &&
    			   sentBegin <= beginOffset(timexAnns.get(timexIndex)) &&
    			   endOffset(timexAnns.get(timexIndex)) <= sentEnd) {
    		++sublistEnd;
    		++timexIndex;
    	}
    	
    	// set the sentence timexes
    	sentence.set(TimeAnnotations.TimexAnnotations.class, timexAnns.subList(sublistBegin, sublistEnd));
    }
  }
  
  private static int beginOffset(CoreMap ann) {
  	return ann.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
  }
  
  private static int endOffset(CoreMap ann) {
  	return ann.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
  }
  
  private static Element toInputXML(CoreMap document) {
    // construct GUTime format XML
    Element doc = XMLUtils.createElement("DOC");
    doc.appendChild(XMLUtils.createTextNode("\n"));
    // populate the date element
    Calendar dateCalendar = 
      document.get(CoreAnnotations.CalendarAnnotation.class);
    if (dateCalendar != null) {
      Element date = XMLUtils.createElement("date");
      date.appendChild(XMLUtils.createTextNode(String.format("%TF", dateCalendar)));
      doc.appendChild(date);
      doc.appendChild(XMLUtils.createTextNode("\n"));
    } else {
      String s = document.get(CoreAnnotations.DocDateAnnotation.class);
      if (s != null) {
        Element date = XMLUtils.createElement("date");
        date.appendChild(XMLUtils.createTextNode(s));
        doc.appendChild(date);
        doc.appendChild(XMLUtils.createTextNode("\n"));
      }
    }
    Element textElem = XMLUtils.createElement("text");
    doc.appendChild(textElem);
    doc.appendChild(XMLUtils.createTextNode("\n"));
    
    // populate the text element
    String text = document.get(CoreAnnotations.TextAnnotation.class);
    int offset = 0;
    for (CoreMap sentence: document.get(CoreAnnotations.SentencesAnnotation.class)) {
      int sentBegin = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int sentEnd = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      
      // add text before the first token
      textElem.appendChild(XMLUtils.createTextNode(text.substring(offset, sentBegin)));
      offset = sentBegin;
      
      // add one "s" element per sentence
      Element s = XMLUtils.createElement("s");
      textElem.appendChild(s);
      for (CoreLabel token: sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        int tokenBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int tokenEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        s.appendChild(XMLUtils.createTextNode(text.substring(offset, tokenBegin)));
        offset = tokenBegin;
        
        // add one "lex" element per token
        Element lex = XMLUtils.createElement("lex");
        s.appendChild(lex);
        String posTag = 
          token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        if (posTag != null){
          lex.setAttribute("pos", posTag);
        }
        assert token.word().equals(text.substring(offset, tokenEnd));
        lex.appendChild(XMLUtils.createTextNode(text.substring(offset, tokenEnd)));
        offset = tokenEnd;
      }
      
      // add text after the last token
      textElem.appendChild(XMLUtils.createTextNode(text.substring(offset, sentEnd)));
      offset = sentEnd;
    }
    
    // add text after the last sentence
    textElem.appendChild(XMLUtils.createTextNode(text.substring(offset, text.length())));
    
    // return the document
    return doc;
  }
  
  private static List<CoreMap> toTimexCoreMaps(Element docElem, CoreMap originalDocument) {
    //--Collect Token Offsets 
    Map<Integer,Integer> beginMap = Generics.newHashMap();
    Map<Integer,Integer> endMap = Generics.newHashMap();
    boolean haveTokenOffsets = true;
    for(CoreMap sent : originalDocument.get(CoreAnnotations.SentencesAnnotation.class)){
      for(CoreLabel token : sent.get(CoreAnnotations.TokensAnnotation.class)){
        Integer tokBegin = token.get(CoreAnnotations.TokenBeginAnnotation.class);
        Integer tokEnd = token.get(CoreAnnotations.TokenEndAnnotation.class);
        if(tokBegin == null || tokEnd == null){ haveTokenOffsets = false; }
        int charBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int charEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        beginMap.put(charBegin,tokBegin);
        endMap.put(charEnd,tokEnd);
      }
    }
    //--Set Timexes
    List<CoreMap> timexMaps = new ArrayList<>();
    int offset = 0;
    NodeList docNodes = docElem.getChildNodes();
    Element textElem = null;
    // Find first "text" elem
    for (int i = 0; i < docNodes.getLength(); i++) {
      Node n = docNodes.item(i);
      if ("text".equals(n.getNodeName())) {
        textElem = (Element) n;
        break;
      }
    }
    NodeList textNodes = textElem.getChildNodes();
    for (int i = 0; i < textNodes.getLength(); i++) {
      Node content = textNodes.item(i);
      if (content instanceof Text) {
        Text text = (Text)content;
        offset += text.getWholeText().length();
      } else if (content instanceof Element) {
        Element child = (Element)content;
        if (child.getNodeName().equals("TIMEX3")) {
          Timex timex = new Timex(child);
          if (child.getChildNodes().getLength() != 1) {
            throw new RuntimeException("TIMEX3 should only contain text " + child);
          }
          String timexText = child.getTextContent();
          CoreMap timexMap = new ArrayCoreMap();
          //(timex)
          timexMap.set(TimeAnnotations.TimexAnnotation.class, timex);
          //(text)
          timexMap.set(CoreAnnotations.TextAnnotation.class, timexText);
          //(characters)
          int charBegin = offset;
          timexMap.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, charBegin);
          offset += timexText.length();
          int charEnd = offset;
          timexMap.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, charEnd);
          //(tokens)
          if(haveTokenOffsets){
            Integer tokBegin = beginMap.get(charBegin);
            int searchStep = 1;          //if no exact match, search around the character offset
            while(tokBegin == null){
              tokBegin = beginMap.get(charBegin - searchStep);
              if(tokBegin == null){
                tokBegin = beginMap.get(charBegin + searchStep);
              }
              searchStep += 1;
            }
            searchStep = 1;
            Integer tokEnd = endMap.get(charEnd);
            while(tokEnd == null){
              tokEnd = endMap.get(charEnd - searchStep);
              if(tokEnd == null){
                tokEnd = endMap.get(charEnd + searchStep);
              }
              searchStep += 1;
            }
            timexMap.set(CoreAnnotations.TokenBeginAnnotation.class, tokBegin);
            timexMap.set(CoreAnnotations.TokenEndAnnotation.class, tokEnd);
          }
          //(add)
          timexMaps.add(timexMap);
        } else {
          throw new RuntimeException("unexpected element " + child);
        }
      } else {
        throw new RuntimeException("unexpected content " + content);
      }
    }
    return timexMaps;
  }


  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class
    )));
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.singleton(TimeAnnotations.TimexAnnotations.class);
  }
}