package edu.jhu.agiga;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.logging.Logger;
import edu.jhu.agiga.AgigaConstants.DependencyForm;
/**
 * Command line tools for printing human-readable versions of the XML
 * annotations.
 *
 * <p>Invoked as {@code java edu.jhu.agiga.AgigaPrinter <type> <gzipped input file>}.
 * The {@code <type>} argument selects one of the private {@code print*} methods
 * below; the full list of accepted types is in {@link #OPTIONS}.
 *
 * @author mgormley
 *
 */
public class AgigaPrinter {

    private static final Logger log = Logger.getLogger(AgigaPrinter.class.getName());

    /** Each entry is {printer type, description shown in the usage message}. */
    private static final String[][] OPTIONS = new String[][] {
            { "words", "Words only, one sentence per line" },
            { "lemmas", "Lemmas only, one sentence per line" },
            { "pos", "Part-of-speech tags" },
            { "ner", "Named entity types" },
            { "basic-deps", "Basic dependency parses in CONNL-X format" },
            { "col-deps", "Collapsed dependency parses in CONNL-X format" },
            { "col-ccproc-deps", "Collapsed and propagated dependency parses in CONNL-X format" },
            { "phrase-structure", "Phrase structure parses" },
            { "coref", "Coreference resolution as SGML similar to MUC" },
            { "stanford-deps", "toString() methods of Stanford dependency parse annotations" },
            { "stanford-phrase-structure", "toString() method of Stanford phrase structure parses" },
            { "headlines", "Headlines and Datelines" },
            { "for-testing-only", "**For use in testing this API only**" } };

    /**
     * Entry point. Expects exactly two arguments: the printer type and the
     * path of the gzipped input file. On a wrong argument count or an
     * unrecognized type, logs the usage message at SEVERE level and exits
     * with status 1.
     *
     * @param args {@code <type> <gzipped input file>}
     * @throws Exception if any of the print methods fails
     */
    public static void main(String[] args) throws Exception {
        Util.initializeLogging();

        String usage = buildUsage();

        // Check for correct args
        if (args.length != 2) {
            log.severe(usage);
            System.exit(1);
        }
        String type = args[0];
        String inputFile = args[1];

        // Print. Every method that takes the writer sends its output through it,
        // so it is flushed once more at the end to push out anything still buffered.
        Writer writer = new PrintWriter(System.out, true);
        if (type.equals("words")) {
            printWords(inputFile, writer);
        } else if (type.equals("lemmas")) {
            printLemmas(inputFile, writer);
        } else if (type.equals("pos")) {
            printPos(inputFile, writer);
        } else if (type.equals("ner")) {
            printNer(inputFile, writer);
        } else if (type.equals("basic-deps")) {
            printDeps(inputFile, writer, DependencyForm.BASIC_DEPS);
        } else if (type.equals("col-deps")) {
            printDeps(inputFile, writer, DependencyForm.COL_DEPS);
        } else if (type.equals("col-ccproc-deps")) {
            printDeps(inputFile, writer, DependencyForm.COL_CCPROC_DEPS);
        } else if (type.equals("phrase-structure")) {
            printPhraseStructure(inputFile, writer);
        } else if (type.equals("coref")) {
            printCoref(inputFile, writer);
        } else if (type.equals("stanford-deps")) {
            printStanfordDeps(inputFile);
        } else if (type.equals("stanford-phrase-structure")) {
            printStanfordPhraseStructure(inputFile);
        } else if (type.equals("headlines")) {
            printHeadlineDateline(inputFile, writer);
        } else if (type.equals("for-testing-only")) {
            printForTestingOnly(inputFile, writer);
        } else {
            log.severe("Printer type not recognized: " + type);
            log.severe(usage);
            System.exit(1);
        }
        writer.flush();
    }

    /**
     * Builds the multi-line usage message listing every entry of {@link #OPTIONS}.
     *
     * @return the usage text, beginning with a newline
     */
    private static String buildUsage() {
        StringBuilder usage = new StringBuilder();
        usage.append("\nusage: java ").append(AgigaPrinter.class.getName())
                .append(" <type> <gzipped input file>")
                .append("\n where <type> is one of:");
        for (String[] pair : OPTIONS) {
            usage.append(String.format("\n %-25s (%s)", pair[0], pair[1]));
        }
        usage.append("\n and where <gzipped input file> is an .xml.gz file");
        usage.append("\n from Annotated Gigaword");
        return usage.toString();
    }

    /**
     * Writes {@code String.valueOf(value)} followed by the platform line
     * separator to {@code writer}, then flushes. This mirrors what
     * {@code System.out.println(value)} emits while keeping the text in the
     * same stream as everything else written through {@code writer}.
     *
     * @param writer destination
     * @param value  object to print; {@code null} is written as "null"
     * @throws IOException if writing or flushing fails
     */
    private static void writeLine(Writer writer, Object value) throws IOException {
        writer.write(String.valueOf(value));
        writer.write(System.getProperty("line.separator"));
        writer.flush();
    }

    /**
     * Reads only the words of each sentence in {@code inputFile} and writes
     * them with {@code AgigaSentence.writeWords}. Logs the file id and the
     * document and sentence counts reported by the reader.
     *
     * @param inputFile path of the gzipped input file
     * @param writer    destination for the output
     * @throws IOException if writing fails
     */
    private static void printWords(String inputFile, Writer writer) throws IOException {
        // Only read the words
        AgigaPrefs prefs = new AgigaPrefs();
        prefs.setAll(false);
        prefs.setWord(true);

        // Iterate through the sentences, writing each one
        StreamingSentenceReader reader = new StreamingSentenceReader(inputFile, prefs);
        log.info("Parsing XML for file: " + reader.getFileId());
        for (AgigaSentence sent : reader) {
            sent.writeWords(writer);
        }
        log.info("Number of docs: " + reader.getNumDocs());
        log.info("Number of sentences: " + reader.getNumSents());
    }

    /**
     * Reads only the lemmas of each sentence in {@code inputFile} and writes
     * them with {@code AgigaSentence.writeLemmas}. Logs the file id and the
     * document and sentence counts reported by the reader.
     *
     * @param inputFile path of the gzipped input file
     * @param writer    destination for the output
     * @throws IOException if writing fails
     */
    private static void printLemmas(String inputFile, Writer writer) throws IOException {
        // Only read the lemmas
        AgigaPrefs prefs = new AgigaPrefs();
        prefs.setAll(false);
        prefs.setLemma(true);

        // Iterate through the sentences, writing each one
        StreamingSentenceReader reader = new StreamingSentenceReader(inputFile, prefs);
        log.info("Parsing XML for file: " + reader.getFileId());
        for (AgigaSentence sent : reader) {
            sent.writeLemmas(writer);
        }
        log.info("Number of docs: " + reader.getNumDocs());
        log.info("Number of sentences: " + reader.getNumSents());
    }

    /**
     * Reads the words and POS tags of each sentence in {@code inputFile} and
     * writes them with {@code AgigaSentence.writePosTags}.
     *
     * @param inputFile path of the gzipped input file
     * @param writer    destination for the output
     * @throws IOException if writing fails
     */
    private static void printPos(String inputFile, Writer writer) throws IOException {
        // Only read the words and POS tags
        AgigaPrefs prefs = new AgigaPrefs();
        prefs.setAll(false);
        prefs.setWord(true);
        prefs.setPos(true);

        // Iterate through the sentences, writing each one
        StreamingSentenceReader reader = new StreamingSentenceReader(inputFile, prefs);
        log.info("Parsing XML for file: " + reader.getFileId());
        for (AgigaSentence sent : reader) {
            sent.writePosTags(writer);
        }
        log.info("Number of sentences: " + reader.getNumSents());
    }

    /**
     * Reads the words and NER tags from {@code inputFile} document by
     * document and writes every sentence with
     * {@code AgigaSentence.writeNerTags}.
     *
     * @param inputFile path of the gzipped input file
     * @param writer    destination for the output
     * @throws IOException if writing fails
     */
    private static void printNer(String inputFile, Writer writer) throws IOException {
        // Only read the words and NER tags
        AgigaPrefs prefs = new AgigaPrefs();
        prefs.setAll(false);
        prefs.setWord(true);
        prefs.setNer(true);

        // Iterate through the documents, writing each of their sentences
        StreamingDocumentReader reader = new StreamingDocumentReader(inputFile, prefs);
        log.info("Parsing XML for file: " + reader.getFileId());
        for (AgigaDocument doc : reader) {
            for (AgigaSentence sent : doc.getSents()) {
                sent.writeNerTags(writer);
            }
        }
        log.info("Number of sentences: " + reader.getNumSents());
    }

    /**
     * Writes the dependency parse of each sentence in {@code inputFile} with
     * {@code AgigaSentence.writeConnlStyleDeps} for the given form.
     *
     * @param inputFile path of the gzipped input file
     * @param writer    destination for the output
     * @param form      which dependency form to read and write
     * @throws IOException if writing fails
     */
    private static void printDeps(String inputFile, Writer writer, DependencyForm form) throws IOException {
        // Only read what's needed for CoNLL-X style output
        AgigaPrefs prefs = new AgigaPrefs();
        prefs.setForConnlStyleDeps(form);

        // Iterate through the sentences, writing each one
        StreamingSentenceReader reader = new StreamingSentenceReader(inputFile, prefs);
        log.info("Parsing XML");
        for (AgigaSentence sent : reader) {
            sent.writeConnlStyleDeps(writer, form);
        }
        log.info("Number of sentences: " + reader.getNumSents());
    }

    /**
     * Reads only the parse of each sentence in {@code inputFile} and writes
     * it with {@code AgigaSentence.writeParseText}.
     *
     * @param inputFile path of the gzipped input file
     * @param writer    destination for the output
     * @throws IOException if writing fails
     */
    private static void printPhraseStructure(String inputFile, Writer writer) throws IOException {
        // Only read the parse text
        AgigaPrefs prefs = new AgigaPrefs();
        prefs.setAll(false);
        prefs.setParse(true);

        // Iterate through the sentences, writing each one
        StreamingSentenceReader reader = new StreamingSentenceReader(inputFile, prefs);
        log.info("Parsing XML");
        for (AgigaSentence sent : reader) {
            sent.writeParseText(writer);
        }
        log.info("Number of sentences: " + reader.getNumSents());
    }

    /**
     * Reads the words and coreference annotations from {@code inputFile} and
     * writes each document with {@code AgigaDocument.writeMucStyleCoref}.
     *
     * @param inputFile path of the gzipped input file
     * @param writer    destination for the output
     * @throws IOException if writing fails
     */
    private static void printCoref(String inputFile, Writer writer) throws IOException {
        // Only read the coref and the words
        AgigaPrefs prefs = new AgigaPrefs();
        prefs.setAll(false);
        prefs.setWord(true);
        prefs.setCoref(true);

        // Iterate through the documents, writing each one
        StreamingDocumentReader reader = new StreamingDocumentReader(inputFile, prefs);
        log.info("Parsing XML");
        for (AgigaDocument doc : reader) {
            doc.writeMucStyleCoref(writer);
        }
        log.info("Number of docs: " + reader.getNumDocs());
    }

    /**
     * Prints, for each sentence in {@code inputFile}, the Stanford
     * word/lemma/tag list followed by the tree graph nodes and typed
     * dependencies for all three dependency forms. Output goes directly to
     * {@code System.out}.
     *
     * @param inputFile path of the gzipped input file
     */
    private static void printStanfordDeps(String inputFile) {
        // Only read the words, lemmas, tags and the three dependency forms
        AgigaPrefs prefs = new AgigaPrefs();
        prefs.setAll(false);
        prefs.setWord(true);
        prefs.setLemma(true);
        prefs.setPos(true);
        prefs.setDeps(DependencyForm.BASIC_DEPS);
        prefs.setDeps(DependencyForm.COL_DEPS);
        prefs.setDeps(DependencyForm.COL_CCPROC_DEPS);

        // Iterate through the sentences, printing each one to stdout
        StreamingSentenceReader reader = new StreamingSentenceReader(inputFile, prefs);
        log.info("Parsing XML");
        for (AgigaSentence sent : reader) {
            // Print out all the dependency forms
            System.out.println(sent.getStanfordWordLemmaTags());
            System.out.println("---");
            System.out.println(sent.getStanfordTreeGraphNodes(DependencyForm.BASIC_DEPS));
            System.out.println(sent.getStanfordTreeGraphNodes(DependencyForm.COL_DEPS));
            System.out.println(sent.getStanfordTreeGraphNodes(DependencyForm.COL_CCPROC_DEPS));
            System.out.println("---");
            System.out.println(sent.getStanfordTypedDependencies(DependencyForm.BASIC_DEPS));
            System.out.println(sent.getStanfordTypedDependencies(DependencyForm.COL_DEPS));
            System.out.println(sent.getStanfordTypedDependencies(DependencyForm.COL_CCPROC_DEPS));
            System.out.println();
        }
        log.info("Number of sentences: " + reader.getNumSents());
    }

    /**
     * Prints the Stanford constituency tree of each sentence in
     * {@code inputFile} directly to {@code System.out}.
     *
     * @param inputFile path of the gzipped input file
     */
    private static void printStanfordPhraseStructure(String inputFile) {
        // Only read the parse
        AgigaPrefs prefs = new AgigaPrefs();
        prefs.setAll(false);
        prefs.setParse(true);

        // Iterate through the sentences, printing each one to stdout
        StreamingSentenceReader reader = new StreamingSentenceReader(inputFile, prefs);
        log.info("Parsing XML");
        for (AgigaSentence sent : reader) {
            System.out.println(sent.getStanfordContituencyTree());
        }
        log.info("Number of sentences: " + reader.getNumSents());
    }

    /**
     * Writes a {@code HEADLINE: ...} and/or {@code DATELINE: ...} line for
     * each document in {@code inputFile} that has a non-null headline or
     * dateline. Also logs each document's id and type.
     *
     * @param inputFile path of the gzipped input file
     * @param writer    destination for the output
     * @throws IOException if writing fails
     */
    private static void printHeadlineDateline(String inputFile, Writer writer) throws IOException {
        // Only read the headline and dateline
        AgigaPrefs prefs = new AgigaPrefs();
        prefs.setAll(false);
        prefs.setHeadline(true);
        prefs.setDateline(true);

        // Iterate through the docs, writing each one's headline and dateline
        StreamingDocumentReader dReader = new StreamingDocumentReader(inputFile, prefs);
        log.info("Parsing XML for file: " + dReader.getFileId());
        for (AgigaDocument doc : dReader) {
            log.info("Parsing doc: id=" + doc.getDocId() + " type=" + doc.getType());
            // Go through the supplied writer rather than System.out so the
            // caller's choice of destination is honored.
            if (doc.getHeadline() != null) {
                writeLine(writer, "HEADLINE: " + doc.getHeadline());
            }
            if (doc.getDateline() != null) {
                writeLine(writer, "DATELINE: " + doc.getDateline());
            }
        }
        log.info("Number of docs: " + dReader.getNumDocs());
    }

    /**
     * Reads {@code inputFile} twice with every annotation enabled: first
     * sentence by sentence, then document by document. Every sentence is
     * passed to {@link #printAllSentenceAnnotations}; in the document pass the
     * headline and dateline are logged when non-null and the coreference
     * output is written for each document.
     *
     * @param inputFile path of the gzipped input file
     * @param writer    destination for the output
     * @throws IOException if writing fails
     */
    private static void printForTestingOnly(String inputFile, Writer writer) throws IOException {
        // Read everything
        AgigaPrefs prefs = new AgigaPrefs();
        prefs.setAll(true);

        // Iterate through the sentences, printing each one
        StreamingSentenceReader sReader = new StreamingSentenceReader(inputFile, prefs);
        log.info("Parsing XML for file: " + sReader.getFileId());
        for (AgigaSentence sent : sReader) {
            printAllSentenceAnnotations(writer, sent);
        }
        log.info("Number of docs: " + sReader.getNumDocs());
        log.info("Number of sentences: " + sReader.getNumSents());

        // Iterate through the docs, printing each one
        StreamingDocumentReader dReader = new StreamingDocumentReader(inputFile, prefs);
        log.info("Parsing XML for file: " + dReader.getFileId());
        for (AgigaDocument doc : dReader) {
            log.info("Parsing doc: id=" + doc.getDocId() + " type=" + doc.getType());
            if (doc.getHeadline() != null) {
                log.info("Found headline: " + doc.getHeadline());
            }
            if (doc.getDateline() != null) {
                log.info("Found dateline: " + doc.getDateline());
            }
            for (AgigaSentence sent : doc.getSents()) {
                printAllSentenceAnnotations(writer, sent);
            }
            doc.writeMucStyleCoref(writer);
        }
        log.info("Number of docs: " + dReader.getNumDocs());
    }

    /**
     * Writes every available rendering of one sentence: words, POS tags, NER
     * tags, combined tags, the dependency output for every
     * {@link DependencyForm}, the parse text, the token indices, and the
     * string forms of the Stanford objects built from the sentence.
     *
     * <p>All output goes through {@code writer}. Writing part of it directly
     * to {@code System.out} could reorder the output when {@code writer}
     * buffers on top of the same stream.
     *
     * @param writer destination for the output
     * @param sent   the sentence to print
     * @throws IOException if writing fails
     */
    private static void printAllSentenceAnnotations(Writer writer, AgigaSentence sent) throws IOException {
        log.info("Printing sent: id=" + sent.getSentIdx());
        sent.writeWords(writer);
        sent.writePosTags(writer);
        sent.writeNerTags(writer);
        sent.writeTags(writer, true, true, true);
        for (DependencyForm df : DependencyForm.values()) {
            sent.writeConnlStyleDeps(writer, df);
        }
        sent.writeParseText(writer);

        // Token indices on a single space-separated line
        for (AgigaToken tok : sent.getTokens()) {
            writer.write(tok.getTokIdx() + " ");
        }
        writeLine(writer, "");

        writeLine(writer, sent.getStanfordWordLemmaTags());
        for (DependencyForm df : DependencyForm.values()) {
            writeLine(writer, sent.getStanfordTreeGraphNodes(df));
            writeLine(writer, sent.getStanfordTypedDependencies(df));
        }
        writeLine(writer, sent.getStanfordContituencyTree());
    }
}