package edu.stanford.nlp.process;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.TSVUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
/**
* A callback function which operates on each line of a TSV file representing a collection of sentences.
* This is a useful callback for processing a large batch of sentences; e.g., out of a Greenplum database.
*
* @author Gabor Angeli
*/
public interface TSVSentenceProcessor {

  /** A list of possible fields in the sentence table. */
  enum SentenceField {
    ID,
    DEPENDENCIES_STANFORD,
    DEPENDENCIES_EXTRAS,
    DEPENDENCIES_MALT,
    DEPENDENCIES_MALT_ALT1,
    DEPENDENCIES_MALT_ALT2,
    WORDS,
    LEMMAS,
    POS_TAGS,
    NER_TAGS,
    DOC_ID,
    SENTENCE_INDEX,
    CORPUS_ID,
    DOC_CHAR_BEGIN,
    DOC_CHAR_END,
    GLOSS
  }

  /**
   * The default column layout of the sentence table: every {@link SentenceField}, in declaration order.
   * The position of a field in this list is the index of its tab-separated column on each input line.
   * The list is unmodifiable.
   */
  List<SentenceField> DEFAULT_SENTENCE_TABLE = Collections.unmodifiableList(Arrays.asList(
      SentenceField.ID,
      SentenceField.DEPENDENCIES_STANFORD,
      SentenceField.DEPENDENCIES_EXTRAS,
      SentenceField.DEPENDENCIES_MALT,
      SentenceField.DEPENDENCIES_MALT_ALT1,
      SentenceField.DEPENDENCIES_MALT_ALT2,
      SentenceField.WORDS,
      SentenceField.LEMMAS,
      SentenceField.POS_TAGS,
      SentenceField.NER_TAGS,
      SentenceField.DOC_ID,
      SentenceField.SENTENCE_INDEX,
      SentenceField.CORPUS_ID,
      SentenceField.DOC_CHAR_BEGIN,
      SentenceField.DOC_CHAR_END,
      SentenceField.GLOSS));

  /**
   * Process a given sentence.
   *
   * @param id The sentence id (database id) of the sentence being processed.
   * @param doc The single-sentence document to annotate. This contains:
   *            <ul>
   *              <li>Tokens</li>
   *              <li>A parse tree (Collapsed dependencies)</li>
   *              <li>POS Tags</li>
   *              <li>NER tags</li>
   *              <li>Lemmas</li>
   *              <li>DocID</li>
   *              <li>Sentence index</li>
   *            </ul>
   */
  void process(long id, Annotation doc);

  /**
   * Reads tab-separated sentences from {@code in}, one per line, hands each one to
   * {@link #process(long, Annotation)}, and then terminates the JVM. This method never returns.
   *
   * <p>An exception raised while parsing or processing a single line is printed to
   * {@code debugStream} and counted, after which processing continues with the next line.
   * An exception raised outside of a single line (for example, while reading the stream) is reported
   * as fatal, counted once, and ends the loop. The resulting count is passed to {@code cleanup},
   * and the value it returns is used as the process exit code.</p>
   *
   * <p>Note that {@code debugStream} is flushed and closed <em>before</em> {@code cleanup} is invoked.</p>
   *
   * @param in The input stream to read examples off of.
   * @param debugStream The stream to write debugging information to (e.g., stderr).
   * @param cleanup A function to run after annotation is over, to clean up open files, etc.
   *                Takes as input the candidate error code (the number of exceptions encountered),
   *                and returns a new error code to exit on.
   * @param sentenceTableSpec The header of the sentence table fields being fed as input to this function;
   *                          the position of each field in the list is its column index on every line.
   *                          By default, this can be {@link TSVSentenceProcessor#DEFAULT_SENTENCE_TABLE}.
   */
  default void runAndExit(InputStream in, PrintStream debugStream, Function<Integer, Integer> cleanup,
                          List<SentenceField> sentenceTableSpec) {
    int exceptions = 0;
    try {
      // Column positions depend only on the table spec, so resolve them once instead of once per line.
      // A field missing from the spec resolves to -1 and makes every line fail when it is indexed below.
      final int idCol = sentenceTableSpec.indexOf(SentenceField.ID);
      final int docIdCol = sentenceTableSpec.indexOf(SentenceField.DOC_ID);
      final int sentenceIndexCol = sentenceTableSpec.indexOf(SentenceField.SENTENCE_INDEX);
      final int glossCol = sentenceTableSpec.indexOf(SentenceField.GLOSS);
      final int stanfordDepsCol = sentenceTableSpec.indexOf(SentenceField.DEPENDENCIES_STANFORD);
      final int maltDepsCol = sentenceTableSpec.indexOf(SentenceField.DEPENDENCIES_MALT);
      final int wordsCol = sentenceTableSpec.indexOf(SentenceField.WORDS);
      final int lemmasCol = sentenceTableSpec.indexOf(SentenceField.LEMMAS);
      final int posTagsCol = sentenceTableSpec.indexOf(SentenceField.POS_TAGS);
      final int nerTagsCol = sentenceTableSpec.indexOf(SentenceField.NER_TAGS);

      // The reader is deliberately not closed: the caller owns `in`, and the JVM exits below.
      // NOTE(review): this decodes with the platform default charset -- confirm it matches the producer.
      BufferedReader stdin = new BufferedReader(new InputStreamReader(in));
      int linesProcessed = 0;
      long startTime = System.currentTimeMillis();
      for (String line; (line = stdin.readLine()) != null; ) {
        long id = -1;  // stays -1 in the error report if the id column could not be parsed
        try {
          // Parse line. A limit of -1 keeps trailing empty columns; the single-argument split() drops
          // them, which would turn a row ending in an empty field into an out-of-bounds access.
          String[] fields = line.split("\t", -1);
          // Read the id from the column named by the spec, consistent with every other field.
          id = Long.parseLong(fields[idCol]);

          // Create Annotation
          Annotation doc = TSVUtils.parseSentence(
              Optional.of(fields[docIdCol]),
              Optional.of(fields[sentenceIndexCol]),
              fields[glossCol],
              fields[stanfordDepsCol],
              fields[maltDepsCol],
              fields[wordsCol],
              fields[lemmasCol],
              fields[posTagsCol],
              fields[nerTagsCol],
              Optional.of(fields[idCol])
          );

          // Process document
          process(id, doc);

          // Debug
          linesProcessed += 1;
          if (linesProcessed % 1000 == 0) {
            long elapsedMillis = System.currentTimeMillis() - startTime;
            // Clamp to at least 1ms: the first thousand sentences can finish in under a second, and a
            // zero divisor here would be miscounted as a sentence failure by the catch block below.
            long sentPerSec = linesProcessed * 1000L / Math.max(1L, elapsedMillis);
            debugStream.println("[" + Redwood.formatTimeDifference(elapsedMillis) + "] Processed " + linesProcessed + " sentences {" + sentPerSec + " sentences / second}... ");
          }
        } catch (Exception t) {
          // A bad sentence must not abort the batch: report it, count it, and move on.
          debugStream.println("CAUGHT EXCEPTION ON SENTENCE ID: " + id + " (-1 if not known)");
          t.printStackTrace(debugStream);
          exceptions += 1;
        }
      }

      // DONE
      debugStream.println("[" + Redwood.formatTimeDifference(System.currentTimeMillis() - startTime) + "] DONE");
    } catch (Exception t) {
      // Failure outside of any single sentence (e.g., the input stream could not be read).
      debugStream.println("FATAL EXCEPTION!");
      t.printStackTrace(debugStream);
      exceptions += 1;
    } finally {
      debugStream.flush();
      debugStream.close();
    }
    System.exit(cleanup.apply(exceptions));
  }

  /**
   * Runs {@link #runAndExit(InputStream, PrintStream, Function, List)} with the
   * {@link #DEFAULT_SENTENCE_TABLE} column layout.
   *
   * @see TSVSentenceProcessor#runAndExit(InputStream, PrintStream, Function, List)
   */
  default void runAndExit(InputStream in, PrintStream debugStream, Function<Integer, Integer> cleanup) {
    runAndExit(in, debugStream, cleanup, DEFAULT_SENTENCE_TABLE);
  }
}