package edu.stanford.nlp.coref.hybrid;
import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.logging.Logger;
import edu.stanford.nlp.coref.CorefAlgorithm;
import edu.stanford.nlp.coref.CorefPrinter;
import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.CorefScorer;
import edu.stanford.nlp.coref.CorefUtils;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.coref.data.CorefCluster;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.data.DocumentMaker;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.hybrid.sieve.Sieve;
import edu.stanford.nlp.coref.hybrid.sieve.Sieve.ClassifierType;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.util.logging.RedwoodConfiguration;
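/**
 * Hybrid coreference resolution system: applies an ordered list of {@link Sieve}s
 * (rule-based and statistical) to a {@link Document} and returns the resulting
 * coref chains.
 *
 * <p>A minimal usage sketch; the properties to set depend on your models and data
 * (see {@link HybridCorefProperties} and {@link CorefProperties} for the options):
 *
 * <pre>{@code
 * Properties props = new Properties(); // configure sieves, models, etc. as needed
 * HybridCorefSystem cs = new HybridCorefSystem(props);
 * Annotation anno = ...; // produced by a StanfordCoreNLP pipeline with the annotators coref requires
 * Map<Integer, CorefChain> chains = cs.coref(anno);
 * }</pre>
 */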
public class HybridCorefSystem implements CorefAlgorithm {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(HybridCorefSystem.class);
public Properties props;
public List<Sieve> sieves;
public Dictionaries dictionaries;
public DocumentMaker docMaker = null;
  /** Construct with preloaded dictionaries. Note that no DocumentMaker is created here,
   *  so {@link #coref(Annotation)} only works when built via the single-argument constructor. */
  public HybridCorefSystem(Properties props, Dictionaries dictionaries) throws Exception {
this.props = props;
this.dictionaries = dictionaries;
sieves = Sieve.loadSieves(props);
    // enable word-embedding loading if any statistical (non-rule) sieve uses embeddings
for(Sieve sieve : sieves) {
if(sieve.classifierType == ClassifierType.RULE) continue;
if(HybridCorefProperties.useWordEmbedding(props, sieve.sievename)) {
props.setProperty(HybridCorefProperties.LOAD_WORD_EMBEDDING_PROP, "true");
}
}
}
  /** Construct from properties alone: loads sieves, dictionaries, and a DocumentMaker. */
  public HybridCorefSystem(Properties props) throws Exception {
this.props = props;
sieves = Sieve.loadSieves(props);
    // enable word-embedding loading if any statistical (non-rule) sieve uses embeddings
for(Sieve sieve : sieves) {
if(sieve.classifierType == ClassifierType.RULE) continue;
if(HybridCorefProperties.useWordEmbedding(props, sieve.sievename)) {
props.setProperty(HybridCorefProperties.LOAD_WORD_EMBEDDING_PROP, "true");
}
}
dictionaries = new Dictionaries(props);
docMaker = new DocumentMaker(props, dictionaries);
}
public Dictionaries dictionaries() { return dictionaries; }
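  /**
   * Command-line entry point: parses the arguments into Properties via
   * {@code StringUtils.argsToProperties} (flags are given as {@code -key value}
   * pairs, and {@code -props <file>} loads a property file) and runs end-to-end
   * coref. The property file name below is illustrative, not required:
   *
   * <pre>{@code
   * java edu.stanford.nlp.coref.hybrid.HybridCorefSystem -props coref.properties
   * }</pre>
   */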
public static void runCoref(String[] args) throws Exception {
runCoref(StringUtils.argsToProperties(args));
}
public static void runCoref(Properties props) throws Exception {
/*
* property, environment setting
*/
Redwood.hideChannelsEverywhere(
"debug-cluster", "debug-mention", "debug-preprocessor", "debug-docreader", "debug-mergethres",
"debug-featureselection", "debug-md"
);
int nThreads = HybridCorefProperties.getThreadCounts(props);
String timeStamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
Logger logger = Logger.getLogger(HybridCorefSystem.class.getName());
// set log file path
if(props.containsKey(HybridCorefProperties.LOG_PROP)){
File logFile = new File(props.getProperty(HybridCorefProperties.LOG_PROP));
RedwoodConfiguration.current().handlers(
RedwoodConfiguration.Handlers.file(logFile)).apply();
Redwood.log("Starting coref log");
}
log.info(props.toString());
if(HybridCorefProperties.checkMemory(props)) checkMemoryUsage();
HybridCorefSystem cs = new HybridCorefSystem(props);
/*
output setting
*/
// prepare conll output
String goldOutput = null;
String beforeCorefOutput = null;
String afterCorefOutput = null;
PrintWriter writerGold = null;
PrintWriter writerBeforeCoref = null;
PrintWriter writerAfterCoref = null;
if (HybridCorefProperties.doScore(props)) {
String pathOutput = CorefProperties.conllOutputPath(props);
      (new File(pathOutput)).mkdirs(); // create the output directory (and any missing parents)
goldOutput = pathOutput + "output-" + timeStamp + ".gold.txt";
beforeCorefOutput = pathOutput + "output-" + timeStamp + ".predicted.txt";
afterCorefOutput = pathOutput + "output-" + timeStamp + ".coref.predicted.txt";
writerGold = new PrintWriter(new FileOutputStream(goldOutput));
writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
writerAfterCoref = new PrintWriter(new FileOutputStream(afterCorefOutput));
}
    // run coref in parallel: each worker resolves one document and returns its four
    // output buffers (gold CoNLL, before-coref CoNLL, after-coref CoNLL, sieve log)
MulticoreWrapper<Pair<Document, HybridCorefSystem>, StringBuilder[]> wrapper = new MulticoreWrapper<>(
nThreads, new ThreadsafeProcessor<Pair<Document, HybridCorefSystem>, StringBuilder[]>() {
@Override
public StringBuilder[] process(Pair<Document, HybridCorefSystem> input) {
try {
Document document = input.first;
HybridCorefSystem cs = input.second;
            StringBuilder[] outputs = new StringBuilder[4]; // [0] gold CoNLL, [1] before-coref CoNLL, [2] after-coref CoNLL, [3] sieve log
cs.coref(document, outputs);
return outputs;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
@Override
public ThreadsafeProcessor<Pair<Document, HybridCorefSystem>, StringBuilder[]> newInstance() {
return this;
}
});
Date startTime = null;
if(HybridCorefProperties.checkTime(props)) {
startTime = new Date();
System.err.printf("END-TO-END COREF Start time: %s\n", startTime);
}
    // stream documents into the wrapper as they are read, draining finished results
    // after each put so memory use stays bounded
int docCnt = 0;
while (true) {
Document document = cs.docMaker.nextDoc();
if (document == null) break;
wrapper.put(Pair.makePair(document, cs));
docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
}
// Finished reading the input. Wait for jobs to finish
wrapper.join();
docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
IOUtils.closeIgnoringExceptions(writerGold);
IOUtils.closeIgnoringExceptions(writerBeforeCoref);
IOUtils.closeIgnoringExceptions(writerAfterCoref);
if(HybridCorefProperties.checkTime(props)) {
System.err.printf("END-TO-END COREF Elapsed time: %.3f seconds\n", (((new Date()).getTime() - startTime.getTime()) / 1000F));
// System.err.printf("CORENLP PROCESS TIME TOTAL: %.3f seconds\n", cs.mentionExtractor.corenlpProcessTime);
}
if(HybridCorefProperties.checkMemory(props)) checkMemoryUsage();
// scoring
if (HybridCorefProperties.doScore(props)) {
String summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, beforeCorefOutput);
CorefScorer.printScoreSummary(summary, logger, false);
summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, afterCorefOutput);
CorefScorer.printScoreSummary(summary, logger, true);
CorefScorer.printFinalConllScore(summary);
}
}
  /**
   * Drain finished documents from the wrapper, writing their CoNLL output (if scoring)
   * and logging progress; returns the updated processed-document count.
   */
private static int logOutput(MulticoreWrapper<Pair<Document, HybridCorefSystem>, StringBuilder[]> wrapper,
PrintWriter writerGold,
PrintWriter writerBeforeCoref,
PrintWriter writerAfterCoref,
int docCnt) {
while (wrapper.peek()) {
StringBuilder[] output = wrapper.poll();
      // the writers are null when scoring is disabled, so guard each print
      if (writerGold != null) writerGold.print(output[0]);
      if (writerBeforeCoref != null) writerBeforeCoref.print(output[1]);
      if (writerAfterCoref != null) writerAfterCoref.print(output[2]);
if (output[3].length() > 0) {
log.info(output[3]);
}
if ((++docCnt) % 10 == 0) log.info(docCnt + " document(s) processed");
}
return docCnt;
}
@Override
public void runCoref(Document document) {
try {
coref(document);
} catch (Exception e) {
throw new RuntimeException("Error running hybrid coref system", e);
}
}
  /**
   * Main entry point of the coreference system for a single document.
   *
   * @param document Input document in coref format (wraps the Annotation plus optional gold information)
   * @param output Output buffers for the coref system; the array must have length 4:
   *               [0] gold CoNLL, [1] CoNLL before coref, [2] CoNLL after coref, [3] log messages from the sieves
   * @return Map from coref chain ID to the corresponding chain
   * @throws Exception
   */
public Map<Integer, CorefChain> coref(Document document, StringBuilder[] output) throws Exception {
if(HybridCorefProperties.printMDLog(props)) {
Redwood.log(HybridCorefPrinter.printMentionDetectionLog(document));
}
if(HybridCorefProperties.doScore(props)) {
      output[0] = new StringBuilder(CorefPrinter.printConllOutput(document, true)); // gold
      output[1] = new StringBuilder(CorefPrinter.printConllOutput(document, false)); // before coref
}
output[3] = new StringBuilder(); // log from sieves
for(Sieve sieve : sieves){
CorefUtils.checkForInterrupt();
output[3].append(sieve.resolveMention(document, dictionaries, props));
}
// post processing
if(HybridCorefProperties.doPostProcessing(props)) postProcessing(document);
if(HybridCorefProperties.doScore(props)) {
      output[2] = new StringBuilder(CorefPrinter.printConllOutput(document, false, true)); // after coref
}
return makeCorefOutput(document);
}
  /**
   * Runs coreference over a document, discarding the CoNLL and log output buffers.
   *
   * @param document Input document in coref format (wraps the Annotation plus optional gold information)
   * @return Map from coref chain ID to the corresponding chain
   * @throws Exception
   */
public Map<Integer, CorefChain> coref(Document document) throws Exception {
return coref(document, new StringBuilder[4]);
}
  /**
   * Runs coreference over a raw annotation. This needs the DocumentMaker, so the
   * system must have been constructed with the single-argument constructor.
   *
   * @param anno Input annotation.
   * @return Map from coref chain ID to the corresponding chain
   * @throws Exception
   */
public Map<Integer, CorefChain> coref(Annotation anno) throws Exception {
return coref(docMaker.makeDocument(anno));
}
/** Extract final coreference output from coreference document format. */
private static Map<Integer, CorefChain> makeCorefOutput(Document document) {
Map<Integer, CorefChain> result = Generics.newHashMap();
for(CorefCluster c : document.corefClusters.values()) {
result.put(c.clusterID, new CorefChain(c, document.positions));
}
return result;
}
  /** Remove singleton clusters, and mentions that are appositives, predicate nominatives, or relative pronouns. */
private static void postProcessing(Document document) {
Set<Mention> removeSet = Generics.newHashSet();
Set<Integer> removeClusterSet = Generics.newHashSet();
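    // first pass over the clusters: detach appositive, predicate-nominative, and
    // relative-pronoun mentions, and note which clusters end up as singletons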
for(CorefCluster c : document.corefClusters.values()){
Set<Mention> removeMentions = Generics.newHashSet();
for(Mention m : c.getCorefMentions()) {
if(HybridCorefProperties.REMOVE_APPOSITION_PREDICATENOMINATIVES
&& ((m.appositions!=null && m.appositions.size() > 0)
|| (m.predicateNominatives!=null && m.predicateNominatives.size() > 0)
|| (m.relativePronouns!=null && m.relativePronouns.size() > 0))){
removeMentions.add(m);
removeSet.add(m);
m.corefClusterID = m.mentionID;
}
}
c.corefMentions.removeAll(removeMentions);
if(HybridCorefProperties.REMOVE_SINGLETONS && c.getCorefMentions().size()==1) {
removeClusterSet.add(c.clusterID);
}
}
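    // second pass: drop singleton clusters and remove detached mentions from the position map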
for (int removeId : removeClusterSet){
document.corefClusters.remove(removeId);
}
for(Mention m : removeSet){
document.positions.remove(m);
}
}
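  /** Log current heap usage, after suggesting a GC (which the JVM is free to ignore). */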
private static void checkMemoryUsage() {
Runtime runtime = Runtime.getRuntime();
runtime.gc();
long memory = runtime.totalMemory() - runtime.freeMemory();
log.info("USED MEMORY (bytes): " + memory);
}
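  /** Timed end-to-end run; delegates to {@link #runCoref(String[])}. */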
public static void main(String[] args) throws Exception {
Date startTime = new Date();
System.err.printf("Start time: %s\n", startTime);
runCoref(args);
System.err.printf("Elapsed time: %.3f seconds\n", (((new Date()).getTime() - startTime.getTime()) / 1000F));
}
}