package edu.stanford.nlp.patterns.surface;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.patterns.DataInstance;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.TypesafeMap;

/**
 * Reads labeled text for pattern learning, either from CoNLL-style column
 * files or from inline-tagged plain text.
 *
 * <p>CanNOT handle overlapping labeled text (that is, one token cannot belong
 * to multiple labels)! Note that there have to be spaces around the tags
 * {@code <label>} and {@code </label>} for {@link #parseFile} to work
 * correctly!
 *
 * @author Sonal Gupta (sonalg@stanford.edu)
 */
public class AnnotatedTextReader {

  /**
   * Parses a CoNLL-style column file into sentences keyed by generated ids.
   *
   * <p>Sentences are delimited by {@link CoNLLDocumentReaderAndWriter#BOUNDARY}
   * or {@code -DOCSTART-} tokens; the token's answer annotation (as read by
   * the CoNLL reader, with entity-subclass prefixes stripped) is used as its
   * label.
   *
   * @param reader source of the column-formatted text
   * @param categoriesAllowed labels of interest (currently not consulted by
   *        this method; kept for signature compatibility with
   *        {@link #parseFile})
   * @param setClassForTheseLabels optional map from label to the annotation
   *        key that should additionally be set to that label on each token;
   *        may be null
   * @param setGoldClass if true, also store the label under
   *        {@link CoreAnnotations.GoldAnswerAnnotation}
   * @param sentIDprefix prefix for the generated sentence ids
   *        ({@code prefix-0}, {@code prefix-1}, ...)
   * @return map from sentence id to the corresponding {@link DataInstance}
   */
  public static Map<String, DataInstance> parseColumnFile(BufferedReader reader,
      Set<String> categoriesAllowed,
      Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels,
      boolean setGoldClass, String sentIDprefix) {

    CoNLLDocumentReaderAndWriter conllReader = new CoNLLDocumentReaderAndWriter();
    Properties props = new Properties();
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    // Strip B-/I- chunk prefixes so labels are plain category names.
    flags.entitySubclassification = "noprefix";
    flags.retainEntitySubclassification = false;
    conllReader.init(flags);

    Iterator<List<CoreLabel>> docIter = conllReader.getIterator(reader);
    int num = -1;
    Map<String, DataInstance> sents = new HashMap<>();

    while (docIter.hasNext()) {
      List<CoreLabel> doc = docIter.next();
      List<String> words = new ArrayList<>();
      List<CoreLabel> sentCore = new ArrayList<>();
      int tokenIndex = 0;

      for (CoreLabel l : doc) {
        // Sentence/document boundary: flush the accumulated sentence, if any.
        if (l.word().equals(CoNLLDocumentReaderAndWriter.BOUNDARY)
            || l.word().equals("-DOCSTART-")) {
          if (!words.isEmpty()) {
            num++;
            String docId = sentIDprefix + "-" + num;
            DataInstance sentInst = DataInstance.getNewSurfaceInstance(sentCore);
            sents.put(docId, sentInst);
            words = new ArrayList<>();
            sentCore = new ArrayList<>();
            tokenIndex = 0;
          }
          continue;
        }

        tokenIndex++;
        words.add(l.word());
        l.set(CoreAnnotations.IndexAnnotation.class, tokenIndex);
        l.set(CoreAnnotations.ValueAnnotation.class, l.word());
        String label = l.get(CoreAnnotations.AnswerAnnotation.class);
        assert label != null : "label cannot be null";
        l.set(CoreAnnotations.TextAnnotation.class, l.word());
        l.set(CoreAnnotations.OriginalTextAnnotation.class, l.word());
        if (setGoldClass) {
          l.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
        }
        if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
          l.set(setClassForTheseLabels.get(label), label);
        }
        sentCore.add(l);
      }

      // Flush the trailing sentence that was not followed by a boundary token.
      if (!words.isEmpty()) {
        num++;
        String docId = sentIDprefix + "-" + num;
        sents.put(docId, DataInstance.getNewSurfaceInstance(sentCore));
      }
    }
    return sents;
  }

  /**
   * Parses plain text in which labeled spans are marked with inline tags,
   * e.g. {@code ... <drug> aspirin </drug> ...}. Each input line is either
   * {@code id<TAB>text} or just {@code text} (in which case the line number
   * is used as the id). The tag tokens must be surrounded by whitespace so
   * the tokenizer emits them as separate tokens.
   *
   * @param reader source of the tagged text, one document per line
   * @param categoriesAllowed tag names that are recognized as labels
   * @param setClassForTheseLabels optional map from label to the annotation
   *        key that should additionally be set to that label on each token;
   *        may be null
   * @param setGoldClass if true, also store the label under
   *        {@link CoreAnnotations.GoldAnswerAnnotation}
   * @param sentIDprefix prefix prepended to each document id; sentence ids
   *        are {@code prefix+id + "-" + sentenceIndex}
   * @return one {@link CoreMap} per sentence with text, tokens, and doc id set
   * @throws IOException if reading from {@code reader} fails
   */
  public static List<CoreMap> parseFile(BufferedReader reader, Set<String> categoriesAllowed,
      Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels,
      boolean setGoldClass, String sentIDprefix) throws IOException {

    Pattern startingLabelToken =
        Pattern.compile("<(" + StringUtils.join(categoriesAllowed, "|") + ")>");
    Pattern endLabelToken =
        Pattern.compile("</(" + StringUtils.join(categoriesAllowed, "|") + ")>");
    String backgroundSymbol = "O";

    // Loop-invariant: build the tokenizer factory once, not once per input line.
    PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizerFactory
        .newCoreLabelTokenizerFactory(
            "ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");

    List<CoreMap> sentences = new ArrayList<>();
    int lineNum = -1;
    String line;
    while ((line = reader.readLine()) != null) {
      lineNum++;
      // Optional id column, separated from the text by a single tab.
      String[] t = line.split("\t", 2);
      String id;
      String text;
      if (t.length == 2) {
        id = t[0];
        text = t[1];
      } else {
        text = t[0];
        id = String.valueOf(lineNum);
      }
      id = sentIDprefix + id;

      DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
      dp.setTokenizerFactory(tokenizerFactory);

      // The current label; tag tokens toggle it and are not emitted as tokens.
      // NOTE(review): label deliberately carries across sentence boundaries,
      // so a span that the sentence splitter breaks in two keeps its label.
      String label = backgroundSymbol;
      int sentNum = -1;
      for (List<HasWord> sentence : dp) {
        sentNum++;
        StringBuilder sentStr = new StringBuilder();
        List<CoreLabel> sent = new ArrayList<>();

        for (HasWord tokw : sentence) {
          String tok = tokw.word();
          Matcher startingMatcher = startingLabelToken.matcher(tok);
          Matcher endMatcher = endLabelToken.matcher(tok);
          if (startingMatcher.matches()) {
            // Opening tag: subsequent tokens get this label.
            label = startingMatcher.group(1);
          } else if (endMatcher.matches()) {
            // Closing tag: back to the background symbol.
            label = backgroundSymbol;
          } else {
            sentStr.append(' ').append(tok);
            CoreLabel c = new CoreLabel();
            c.setWord(tok);
            c.setLemma(tok);
            c.setValue(tok);
            c.set(CoreAnnotations.TextAnnotation.class, tok);
            c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);
            if (setGoldClass) {
              c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
            }
            if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
              c.set(setClassForTheseLabels.get(label), label);
            }
            sent.add(c);
          }
        }

        CoreMap sentCm = new ArrayCoreMap();
        sentCm.set(CoreAnnotations.TextAnnotation.class, sentStr.toString().trim());
        sentCm.set(CoreAnnotations.TokensAnnotation.class, sent);
        sentCm.set(CoreAnnotations.DocIDAnnotation.class, id + "-" + sentNum);
        sentences.add(sentCm);
      }
    }
    return sentences;
  }
}