AnnotatedTextReader.java example

package edu.stanford.nlp.patterns.surface;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.TypesafeMap;

/**
 * CanNOT handle overlapping labeled text (that is one token cannot belong to
 * multiple labels)! Note that there has to be spaces around the tags <label>
 * and </label> for the reader to work correctly!
 * 
 * @author Sonal Gupta (sonalg@stanford.edu)
 * 
 */
public class AnnotatedTextReader {
  public static List<CoreMap> parseFile(
      BufferedReader reader,
      Set<String> categoriesAllowed,
      Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels,
      boolean setGoldClass, boolean splitOnPunct, boolean lowercase, String sentIDprefix)
      throws IOException {

    Pattern startingLabelToken = Pattern.compile("<("
        + StringUtils.join(categoriesAllowed, "|") + ")>");
    Pattern endLabelToken = Pattern.compile("</("
        + StringUtils.join(categoriesAllowed, "|") + ")>");
    String backgroundSymbol = "O";

    List<CoreMap> sentences = new ArrayList<CoreMap>();
    int lineNum = -1;
    String l = null;

    while ((l = reader.readLine()) != null) {
      lineNum++;
      String[] t = l.split("\t", 2);
      String id = null;
      String text = null;
      if (t.length == 2) {
        id = t[0];
        text = t[1];
      } else if (t.length == 1) {
        text = t[0];
        id = String.valueOf(lineNum);
      }
      id = sentIDprefix + id;
      DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
      PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory
          .newCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
      dp.setTokenizerFactory(tokenizerFactory);

      String label = backgroundSymbol;
      int sentNum = -1;

      for (List<HasWord> sentence : dp) {
        sentNum++;
        String sentStr = "";
        List<CoreLabel> sent = new ArrayList<CoreLabel>();
        for (HasWord tokw : sentence) {
          String tok = tokw.word();
          Matcher startingMatcher = startingLabelToken.matcher(tok);
          Matcher endMatcher = endLabelToken.matcher(tok);
          if (startingMatcher.matches()) {
            label = startingMatcher.group(1);
          } else if (endMatcher.matches()) {
            label = backgroundSymbol;
          } else {

            CoreLabel c = new CoreLabel();

            List<String> toks = new ArrayList<String>();

            toks.add(tok);

            for (String toksplit : toks) {

              sentStr += " " + toksplit;

              c.setWord(toksplit);
              c.setLemma(toksplit);
              c.setValue(toksplit);
              c.set(CoreAnnotations.TextAnnotation.class, toksplit);
              c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);

              if (setGoldClass){
                 
                c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
              }
              
              if (setClassForTheseLabels != null
                  && setClassForTheseLabels.containsKey(label))
                c.set(setClassForTheseLabels.get(label), label);

              sent.add(c);
            }
          }
        }
        CoreMap sentcm = new ArrayCoreMap();
        sentcm.set(CoreAnnotations.TextAnnotation.class, sentStr.trim());
        sentcm.set(CoreAnnotations.TokensAnnotation.class, sent);
        sentcm.set(CoreAnnotations.DocIDAnnotation.class, id + "-" + sentNum);
        sentences.add(sentcm);
      }
    }
    return sentences;
  }
}