TSVSentenceProcessor.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.process;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.TSVUtils;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;

/**
 * A callback function which operates on each line of a TSV file representing a collection of sentences.
 * This is a useful callback for processing a large batch of sentences; e.g., out of a Greenplum database.
 *
 * <p>Implementations supply {@link #process(long, Annotation)}. The default {@code runAndExit} methods
 * read tab-separated lines from a stream, build one {@link Annotation} per line via
 * {@link TSVUtils}, hand it to {@code process}, and finally terminate the JVM.</p>
 *
 * @author Gabor Angeli
 */
public interface TSVSentenceProcessor {

  /** A list of possible fields in the sentence table */
  enum SentenceField {
    ID,
    DEPENDENCIES_STANFORD,
    DEPENDENCIES_EXTRAS,
    DEPENDENCIES_MALT,
    DEPENDENCIES_MALT_ALT1,
    DEPENDENCIES_MALT_ALT2,
    WORDS,
    LEMMAS,
    POS_TAGS,
    NER_TAGS,
    DOC_ID,
    SENTENCE_INDEX,
    CORPUS_ID,
    DOC_CHAR_BEGIN,
    DOC_CHAR_END,
    GLOSS
  }

  /**
   * The list of fields actually in the sentence table being passed as a query to ForEachSentence.
   * The position of a field in this (unmodifiable) list is the index of its column in each TSV line.
   */
  List<SentenceField> DEFAULT_SENTENCE_TABLE = Collections.unmodifiableList(Arrays.asList(
          SentenceField.ID,
          SentenceField.DEPENDENCIES_STANFORD,
          SentenceField.DEPENDENCIES_EXTRAS,
          SentenceField.DEPENDENCIES_MALT,
          SentenceField.DEPENDENCIES_MALT_ALT1,
          SentenceField.DEPENDENCIES_MALT_ALT2,
          SentenceField.WORDS,
          SentenceField.LEMMAS,
          SentenceField.POS_TAGS,
          SentenceField.NER_TAGS,
          SentenceField.DOC_ID,
          SentenceField.SENTENCE_INDEX,
          SentenceField.CORPUS_ID,
          SentenceField.DOC_CHAR_BEGIN,
          SentenceField.DOC_CHAR_END,
          SentenceField.GLOSS));

  /**
   * Process a given sentence.
   *
   * @param id The sentence id (database id) of the sentence being processed.
   * @param doc The single-sentence document to annotate. This contains:
   *            <ul>
   *              <li>Tokens</li>
   *              <li>A parse tree (Collapsed dependencies)</li>
   *              <li>POS Tags</li>
   *              <li>NER tags</li>
   *              <li>Lemmas</li>
   *              <li>DocID</li>
   *              <li>Sentence index</li>
   *            </ul>
   */
  void process(long id, Annotation doc);


  /**
   * Runs the given implementation of ForEachSentence, and then exits with the appropriate error code
   * (that is, the number of exceptions encountered during processing, as adjusted by {@code cleanup}).
   *
   * <p>Each input line is split on tab characters and its columns are located by their position in
   * {@code sentenceTableSpec}. An exception raised while handling a single line is written to
   * {@code debugStream}, counted, and processing continues with the next line. An exception raised
   * outside the per-line handling (e.g., while reading the stream) is reported as fatal, counted once,
   * and ends the loop. A progress message is printed after every 1000 successfully processed lines.</p>
   *
   * <p>{@code debugStream} is flushed and closed before {@code cleanup} is invoked. This method then
   * calls {@link System#exit(int)} with the value returned by {@code cleanup}.</p>
   *
   * @param in The input stream to read examples off of.
   * @param debugStream The stream to write debugging information to (e.g., stderr).
   * @param cleanup A function to run after annotation is over, to clean up open files, etc.
   *                Takes as input the candidate error code, and returns a new error code to exit on.
   * @param sentenceTableSpec The header of the sentence table fields being fed as input to this function.
   *                          By default, this can be {@link TSVSentenceProcessor#DEFAULT_SENTENCE_TABLE}.
   */
  default void runAndExit(InputStream in, PrintStream debugStream, Function<Integer, Integer> cleanup,
                          List<SentenceField> sentenceTableSpec) {
    int exceptions = 0;

    try {
      BufferedReader stdin = new BufferedReader(new InputStreamReader(in));
      int linesProcessed = 0;
      long startTime = System.currentTimeMillis();

      for (String line; (line = stdin.readLine()) != null; ) {
        // Stays at -1 until the id column has been parsed, so the error message can flag unknown ids
        long id = -1;
        try {
          // Parse line
          String[] fields = line.split("\t");
          // Locate the id through the table spec, like every other column, so that the id handed to
          // process() is the same one that is passed to TSVUtils.parseSentence below.
          id = Long.parseLong(fields[sentenceTableSpec.indexOf(SentenceField.ID)]);

          // Create Annotation
          Annotation doc = TSVUtils.parseSentence(
              Optional.of(fields[sentenceTableSpec.indexOf(SentenceField.DOC_ID)]),
              Optional.of(fields[sentenceTableSpec.indexOf(SentenceField.SENTENCE_INDEX)]),
              fields[sentenceTableSpec.indexOf(SentenceField.GLOSS)],
              fields[sentenceTableSpec.indexOf(SentenceField.DEPENDENCIES_STANFORD)],
              fields[sentenceTableSpec.indexOf(SentenceField.DEPENDENCIES_MALT)],
              fields[sentenceTableSpec.indexOf(SentenceField.WORDS)],
              fields[sentenceTableSpec.indexOf(SentenceField.LEMMAS)],
              fields[sentenceTableSpec.indexOf(SentenceField.POS_TAGS)],
              fields[sentenceTableSpec.indexOf(SentenceField.NER_TAGS)],
              Optional.of(fields[sentenceTableSpec.indexOf(SentenceField.ID)])
          );

          // Process document
          process(id, doc);

          // Debug
          linesProcessed += 1;
          if (linesProcessed % 1000 == 0) {
            long currTime = System.currentTimeMillis();
            long elapsedMillis = currTime - startTime;
            // Work in milliseconds and clamp the denominator to 1: dividing by whole elapsed seconds
            // would throw ArithmeticException (and be miscounted as a sentence failure) whenever the
            // first batch finishes in under a second.
            long sentPerSec = linesProcessed * 1000L / Math.max(1L, elapsedMillis);
            debugStream.println("[" + Redwood.formatTimeDifference(elapsedMillis) + "] Processed " + linesProcessed + " sentences {" + sentPerSec + " sentences / second}... ");
          }
        } catch (Exception t) {
          // A failure on one sentence must not abort the batch; record it and move on
          debugStream.println("CAUGHT EXCEPTION ON SENTENCE ID: " + id + " (-1 if not known)");
          t.printStackTrace(debugStream);
          exceptions += 1;
        }
      }

      // DONE
      debugStream.println("[" + Redwood.formatTimeDifference(System.currentTimeMillis() - startTime) + "] DONE");
    } catch (Exception t) {
      debugStream.println("FATAL EXCEPTION!");
      t.printStackTrace(debugStream);
      exceptions += 1;
    } finally {
      debugStream.flush();
      debugStream.close();
    }
    // The exception count is the candidate exit code; cleanup decides the final one
    System.exit(cleanup.apply(exceptions));
  }

  /**
   * Convenience overload which assumes the input columns follow
   * {@link TSVSentenceProcessor#DEFAULT_SENTENCE_TABLE}.
   *
   * @param in The input stream to read examples off of.
   * @param debugStream The stream to write debugging information to (e.g., stderr).
   * @param cleanup A function to run after annotation is over; maps the candidate error code to the
   *                error code to exit on.
   * @see TSVSentenceProcessor#runAndExit(InputStream, PrintStream, Function, List)
   */
  default void runAndExit(InputStream in, PrintStream debugStream, Function<Integer, Integer> cleanup) {
    runAndExit(in, debugStream, cleanup, DEFAULT_SENTENCE_TABLE);
  }

}