package edu.stanford.nlp.util;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.simple.Sentence;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;

import static edu.stanford.nlp.util.TSVUtils.unescapeSQL;

/**
 * Reads sentences from a TSV, provided a list of fields to populate.
 */
public class TSVSentenceIterator implements Iterator<Sentence> {

  /** A list of possible fields in the sentence table. */
  public enum SentenceField {
    ID,
    DEPENDENCIES_BASIC,
    DEPENDENCIES_COLLAPSED,
    DEPENDENCIES_COLLAPSED_CC,
    DEPENDENCIES_ALTERNATE,
    WORDS,
    LEMMAS,
    POS_TAGS,
    NER_TAGS,
    DOC_ID,
    SENTENCE_INDEX,
    CORPUS_ID,
    DOC_CHAR_BEGIN,
    DOC_CHAR_END,
    GLOSS,
    IGNORE;  // Ignore this field.

    /** Returns true if this field carries token-level values (one value per token). */
    public boolean isToken() {
      switch (this) {
        case WORDS:
        case LEMMAS:
        case POS_TAGS:
        case NER_TAGS:
          return true;
        default:
          return false;
      }
    }
  }

  private final Iterator<List<String>> source;
  private final List<SentenceField> fields;

  public TSVSentenceIterator(Iterator<List<String>> recordSource, List<SentenceField> fields) {
    this.source = recordSource;
    this.fields = fields;
  }

  /**
   * Populates the fields of a sentence from a single TSV record.
   *
   * @param fields the fields each record contains, in column order
   * @param entries the raw column values of the record
   * @return a {@link Sentence} built from the record
   */
  public static Sentence toSentence(List<SentenceField> fields, List<String> entries) {
    return new Sentence(toCoreMap(fields, entries));
  }

  public static CoreMap toCoreMap(List<SentenceField> fields, List<String> entries) {
    CoreMap map = new ArrayCoreMap(fields.size());
    Optional<List<CoreLabel>> tokens = Optional.empty();

    // First pass - process all token level stuff.
    for (Pair<SentenceField, String> entry : Iterables.zip(fields, entries)) {
      SentenceField field = entry.first;
      String value = unescapeSQL(entry.second);
      switch (field) {
        case WORDS: {
          List<String> values = TSVUtils.parseArray(value);
          if (!tokens.isPresent()) {
            tokens = Optional.of(new ArrayList<>(values.size()));
            for (int i = 0; i < values.size(); i++) tokens.get().add(new CoreLabel());
          }
          // Provisional character offsets assuming single-space separation;
          // overwritten below if DOC_CHAR_BEGIN / DOC_CHAR_END are present.
          int beginChar = 0;
          for (int i = 0; i < values.size(); i++) {
            tokens.get().get(i).setValue(values.get(i));
            tokens.get().get(i).setWord(values.get(i));
            tokens.get().get(i).setBeginPosition(beginChar);
            tokens.get().get(i).setEndPosition(beginChar + values.get(i).length());
            beginChar += values.get(i).length() + 1;
          }
        } break;
        case LEMMAS: {
          List<String> values = TSVUtils.parseArray(value);
          if (!tokens.isPresent()) {
            tokens = Optional.of(new ArrayList<>(values.size()));
            for (int i = 0; i < values.size(); i++) tokens.get().add(new CoreLabel());
          }
          for (int i = 0; i < values.size(); i++) {
            tokens.get().get(i).setLemma(values.get(i));
          }
        } break;
        case POS_TAGS: {
          List<String> values = TSVUtils.parseArray(value);
          if (!tokens.isPresent()) {
            tokens = Optional.of(new ArrayList<>(values.size()));
            for (int i = 0; i < values.size(); i++) tokens.get().add(new CoreLabel());
          }
          for (int i = 0; i < values.size(); i++) {
            tokens.get().get(i).setTag(values.get(i));
          }
        } break;
        case NER_TAGS: {
          List<String> values = TSVUtils.parseArray(value);
          if (!tokens.isPresent()) {
            tokens = Optional.of(new ArrayList<>(values.size()));
            for (int i = 0; i < values.size(); i++) tokens.get().add(new CoreLabel());
          }
          for (int i = 0; i < values.size(); i++) {
            tokens.get().get(i).setNER(values.get(i));
          }
        } break;
        default:
          // ignore.
          break;
      }
    }

    // Document specific stuff.
    Optional<String> docId = Optional.empty();
    Optional<String> sentenceId = Optional.empty();
    Optional<Integer> sentenceIndex = Optional.empty();
    for (Pair<SentenceField, String> entry : Iterables.zip(fields, entries)) {
      SentenceField field = entry.first;
      String value = unescapeSQL(entry.second);
      switch (field) {
        case ID:
          sentenceId = Optional.of(value);
          break;
        case DOC_ID:
          docId = Optional.of(value);
          break;
        case SENTENCE_INDEX:
          sentenceIndex = Optional.of(Integer.parseInt(value));
          break;
        case GLOSS:
          value = value.replace("\\n", "\n").replace("\\t", "\t");
          map.set(CoreAnnotations.TextAnnotation.class, value);
          break;
        default:
          // ignore.
          break;
      }
    }

    // High level document stuff
    map.set(CoreAnnotations.SentenceIDAnnotation.class, sentenceId.orElse("-1"));
    map.set(CoreAnnotations.DocIDAnnotation.class, docId.orElse("???"));
    map.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex.orElse(0));

    // Doc-char
    if (tokens.isPresent()) {
      for (Pair<SentenceField, String> entry : Iterables.zip(fields, entries)) {
        SentenceField field = entry.first;
        String value = unescapeSQL(entry.second);
        switch (field) {
          case DOC_CHAR_BEGIN: {
            List<String> values = TSVUtils.parseArray(value);
            for (int i = 0; i < tokens.get().size(); i++) {
              tokens.get().get(i).setBeginPosition(Integer.parseInt(values.get(i)));
            }
          } break;
          case DOC_CHAR_END: {
            List<String> values = TSVUtils.parseArray(value);
            for (int i = 0; i < tokens.get().size(); i++) {
              tokens.get().get(i).setEndPosition(Integer.parseInt(values.get(i)));
            }
          } break;
          default:
            // ignore.
            break;
        }
      }
    }

    // Final token level stuff.
    if (tokens.isPresent()) {
      for (int i = 0; i < tokens.get().size(); i++) {
        tokens.get().get(i).set(CoreAnnotations.DocIDAnnotation.class, docId.orElse("???"));
        tokens.get().get(i).set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex.orElse(-1));
        tokens.get().get(i).set(CoreAnnotations.IndexAnnotation.class, i + 1);
        tokens.get().get(i).set(CoreAnnotations.TokenBeginAnnotation.class, i);
        tokens.get().get(i).set(CoreAnnotations.TokenEndAnnotation.class, i + 1);
      }
    }

    // Dependency trees
    if (tokens.isPresent()) {
      map.set(CoreAnnotations.TokensAnnotation.class, tokens.get());
      map.set(CoreAnnotations.TokenBeginAnnotation.class, 0);
      map.set(CoreAnnotations.TokenEndAnnotation.class, tokens.get().size());
      for (Pair<SentenceField, String> entry : Iterables.zip(fields, entries)) {
        SentenceField field = entry.first;
        String value = unescapeSQL(entry.second);
        switch (field) {
          case DEPENDENCIES_BASIC: {
            SemanticGraph graph = TSVUtils.parseJsonTree(value, tokens.get());
            map.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph);
//            if (!map.containsKey(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class))
//              map.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, graph);
//            if (!map.containsKey(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class))
//              map.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, graph);
          } break;
          case DEPENDENCIES_COLLAPSED: {
            SemanticGraph graph = TSVUtils.parseJsonTree(value, tokens.get());
            map.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, graph);
          } break;
          case DEPENDENCIES_COLLAPSED_CC: {
            SemanticGraph graph = TSVUtils.parseJsonTree(value, tokens.get());
//            if (!map.containsKey(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class))
//              map.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph);
//
            map.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, graph);
            map.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, graph);
          } break;
          case DEPENDENCIES_ALTERNATE: {
            SemanticGraph graph = TSVUtils.parseJsonTree(value, tokens.get());
            map.set(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class, graph);
          } break;
          default:
            // ignore.
            break;
        }
      }
    }

    return map;
  }

  @Override
  public boolean hasNext() {
    return source.hasNext();
  }

  @Override
  public Sentence next() {
    return toSentence(fields, source.next());
  }
}
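// ---------------------------------------------------------------------------
// Usage sketch (not part of this class). Shows one way to drive
// TSVSentenceIterator from a TSV file on disk. The file name "sentences.tsv"
// and the three-column layout (doc_id, sentence_index, gloss) are hypothetical;
// the iterator only requires an Iterator<List<String>> whose records line up
// with the given SentenceField list. Note that each cell is passed through
// unescapeSQL, so values are expected in the escaped form produced by the
// original SQL export. Imports (java.nio.file.*, java.util.*, java.util.stream.*)
// are omitted here, and Files.lines throws IOException, which the caller
// must handle.
//
//   List<TSVSentenceIterator.SentenceField> fields = Arrays.asList(
//       TSVSentenceIterator.SentenceField.DOC_ID,
//       TSVSentenceIterator.SentenceField.SENTENCE_INDEX,
//       TSVSentenceIterator.SentenceField.GLOSS);
//   try (Stream<String> lines = Files.lines(Paths.get("sentences.tsv"))) {
//     Iterator<List<String>> records =
//         lines.map(line -> Arrays.asList(line.split("\t", -1))).iterator();
//     TSVSentenceIterator it = new TSVSentenceIterator(records, fields);
//     while (it.hasNext()) {
//       System.out.println(it.next().text());
//     }
//   }
// ---------------------------------------------------------------------------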