package arkref.analysis;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.List;
import arkref.parsestuff.AnalysisUtilities;
import arkref.parsestuff.U;
import arkref.sent.SentenceBreaker;
import edu.stanford.nlp.trees.Tree;
/**
 * Command-line / library entry point that turns a text file into the
 * side-car files used by later stages: a {@code .parse} file (one parse
 * result per sentence), a {@code .sst} file (one line of
 * {@code token/supersense} pairs per sentence) and, when the input is raw
 * text rather than pre-split sentences, an {@code .osent} file with the
 * character offsets of each sentence.
 *
 * <p>All output files share the input's "short path", i.e. the input path
 * with its trailing {@code .txt} or {@code .sent} extension removed.
 */
public class Preprocess {

    /** Extension of a raw text input that still has to be sentence-split. */
    private static final String TXT_EXT = ".txt";

    /** Extension of an input that already holds one sentence per line. */
    private static final String SENT_EXT = ".sent";

    /**
     * Runs {@link #go(String)} on the first command-line argument.
     *
     * @param args {@code args[0]} is the path of the file to preprocess
     * @throws IOException if the input cannot be found or an output cannot be written
     */
    public static void main(String[] args) throws IOException {
        if (args.length == 0) {
            System.err.println("You need to pass a text file as a command line argument.");
            // A missing argument is a usage error, so report failure to the shell.
            System.exit(1);
        }
        Preprocess.go(args[0]);
    }

    /**
     * Tells whether both the {@code .sst} and the {@code .parse} output
     * already exist for the given input.
     *
     * @param path input path, with or without a {@code .txt}/{@code .sent} extension
     * @return {@code true} only if both output files exist
     */
    public static boolean alreadyPreprocessed(String path) {
        String shortpath = shortPath(path);
        return new File(shortpath + ".sst").exists()
                && new File(shortpath + ".parse").exists();
    }

    /**
     * Strips a trailing {@code .txt} or {@code .sent} extension from a path.
     *
     * <p>Only the end of the path is examined, so a directory or base name
     * that merely contains {@code .txt} or {@code .sent} is left intact.
     *
     * @param path input path
     * @return {@code path} without its trailing extension, or {@code path}
     *         itself if it ends in neither extension
     */
    public static String shortPath(String path) {
        if (path.endsWith(TXT_EXT)) {
            return path.substring(0, path.length() - TXT_EXT.length());
        }
        if (path.endsWith(SENT_EXT)) {
            return path.substring(0, path.length() - SENT_EXT.length());
        }
        return path;
    }

    /**
     * Preprocesses {@code path}, keeping every generated file.
     *
     * @param path input path, with or without a {@code .txt}/{@code .sent} extension
     * @throws IOException if the input cannot be found or an output cannot be written
     */
    public static void go(String path) throws IOException {
        go(path, false);
    }

    /**
     * Writes {@code shortpath + ".osent"}: one line per sentence holding its
     * start offset, end offset and cleaned text, separated by tabs.
     *
     * @param sentences    sentences to record
     * @param shortpath    output path without extension
     * @param useTempFiles if {@code true}, the file is registered for deletion on JVM exit
     * @throws FileNotFoundException if the output file cannot be opened for writing
     */
    public static void writeOffsetSentenceFile(List<SentenceBreaker.Sentence> sentences, String shortpath, boolean useTempFiles) throws FileNotFoundException {
        File osentOutputFile = new File(shortpath + ".osent");
        if (useTempFiles) {
            osentOutputFile.deleteOnExit();
        }
        PrintWriter pwOSent = new PrintWriter(new FileOutputStream(osentOutputFile));
        try {
            for (SentenceBreaker.Sentence s : sentences) {
                pwOSent.printf("%d\t%d\t%s\n", s.charStart, s.charEnd, s.cleanText);
            }
        } finally {
            // Release the file handle even if formatting a sentence throws.
            pwOSent.close();
        }
    }

    /**
     * Preprocesses the input identified by {@code path}.
     *
     * <p>The input is located with {@link #findInputFile(String, String)}.
     * A {@code .sent} input is split on line breaks; a {@code .txt} input is
     * sentence-split with {@code AnalysisUtilities.cleanAndBreakSentences}
     * and its offsets are written via {@link #writeOffsetSentenceFile}.
     * Each sentence is then passed to {@code parseSentence} and
     * {@code annotateSentenceWithSupersenses}; the results go to
     * {@code <shortpath>.parse} and {@code <shortpath>.sst}, where
     * {@code shortpath} is {@link #shortPath(String) shortPath(path)}, so the
     * outputs land where {@link #alreadyPreprocessed(String)} looks for them.
     *
     * @param path         input path, with or without a {@code .txt}/{@code .sent} extension
     * @param useTempFiles if {@code true}, generated files are registered for
     *                     deletion on JVM exit; the parse/sst files only when
     *                     neither of them existed beforehand
     * @throws IOException if no input file is found or an output cannot be written
     */
    public static void go(String path, boolean useTempFiles) throws IOException {
        String shortpath = shortPath(path);

        // Locate and read the input BEFORE opening any output: opening a
        // FileOutputStream truncates the file, which would destroy earlier
        // results when the input turns out to be missing.
        String textpath = findInputFile(path, shortpath);
        String text = U.readFile(textpath);

        String[] sentenceTexts;
        if (textpath.endsWith(SENT_EXT)) {
            // One sentence per line; tolerate Windows line endings.
            sentenceTexts = text.split("\r?\n");
        } else {
            List<SentenceBreaker.Sentence> sentences = AnalysisUtilities.cleanAndBreakSentences(text);
            writeOffsetSentenceFile(sentences, shortpath, useTempFiles);
            sentenceTexts = new String[sentences.size()];
            for (int i = 0; i < sentences.size(); i++) {
                sentenceTexts[i] = sentences.get(i).cleanText;
            }
        }

        File parseOutputFile = new File(shortpath + ".parse");
        File sstOutputFile = new File(shortpath + ".sst");
        // Only mark the outputs as temporary when this run creates them, so
        // pre-existing results are never scheduled for deletion.
        if (useTempFiles && !parseOutputFile.exists() && !sstOutputFile.exists()) {
            parseOutputFile.deleteOnExit();
            sstOutputFile.deleteOnExit();
        }

        PrintWriter pwParse = null;
        PrintWriter pwSST = null;
        try {
            pwParse = new PrintWriter(new FileOutputStream(parseOutputFile));
            pwSST = new PrintWriter(new FileOutputStream(sstOutputFile));

            for (String sentence : sentenceTexts) {
                AnalysisUtilities.ParseResult res = AnalysisUtilities.getInstance().parseSentence(sentence);
                List<String> supersenses = AnalysisUtilities.getInstance().annotateSentenceWithSupersenses(res.parse);

                String status = res.success ? "PARSE" : "ERROR";
                U.pf("%s\t%s\t%s\n", status, res.score, res.parse);
                pwParse.printf("%s\t%s\t%s\n", status, res.score, res.parse);

                // Fetch the leaves once per sentence rather than once per token.
                List<?> leaves = res.parse.getLeaves();
                for (int i = 0; i < supersenses.size(); i++) {
                    String ss = supersenses.get(i);
                    if (i > 0) {
                        pwSST.print(" ");
                    }
                    // Keep only what follows the first '-' of the tag
                    // (the whole tag when it contains no '-').
                    pwSST.print(leaves.get(i) + "/" + ss.substring(ss.indexOf("-") + 1));
                }
                pwSST.println();
            }

            // PrintWriter never throws on write failure; surface it explicitly.
            if (pwParse.checkError() || pwSST.checkError()) {
                throw new IOException("failed writing " + parseOutputFile + " or " + sstOutputFile);
            }
        } finally {
            if (pwSST != null) {
                pwSST.close();
            }
            if (pwParse != null) {
                pwParse.close();
            }
        }
    }

    /**
     * Picks the input file for {@link #go(String, boolean)}.
     *
     * <p>If {@code path} itself names an existing {@code .txt} or
     * {@code .sent} file it is used as given. Otherwise
     * {@code shortpath + ".sent"} is preferred over {@code shortpath + ".txt"}.
     *
     * @param path      path as supplied by the caller
     * @param shortpath {@code path} without its extension
     * @return path of an existing file ending in {@code .sent} or {@code .txt}
     * @throws FileNotFoundException if none of the candidates exists
     */
    private static String findInputFile(String path, String shortpath) throws FileNotFoundException {
        boolean hasKnownExtension = path.endsWith(TXT_EXT) || path.endsWith(SENT_EXT);
        if (hasKnownExtension && new File(path).exists()) {
            return path;
        }
        String sentPath = shortpath + SENT_EXT;
        if (new File(sentPath).exists()) {
            return sentPath;
        }
        String txtPath = shortpath + TXT_EXT;
        if (new File(txtPath).exists()) {
            return txtPath;
        }
        throw new FileNotFoundException(
                "need a sentence or text file: tried " + sentPath + " and " + txtPath);
    }
}