package edu.stanford.nlp.parser.charniak;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.StringOutputStream;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.util.AbstractIterator;
import edu.stanford.nlp.util.IterableIterator;
import edu.stanford.nlp.util.ScoredObject;
import edu.stanford.nlp.util.Timing;
/**
* Utility routines for printing/reading scored parses for the Charniak Parser
*
* @author Angel Chang
*/
public class CharniakScoredParsesReaderWriter {

  private static final Logger logger = Logger.getLogger(CharniakScoredParsesReaderWriter.class.getName());

  /** Splits a line on runs of whitespace. */
  private static final Pattern wsDelimiter = Pattern.compile("\\s+");

  /**
   * Reads scored parses from a file written in the charniak parser output format.
   *
   * <p>File format of the scored parses:
   * <pre>{@code
   * <# of parses>\t<sentenceid>
   * <score>
   * <parse>
   * <score>
   * <parse>
   * ...
   * }</pre>
   *
   * <p>The first sentence is read eagerly, when this method is called. The underlying
   * reader is closed once the input has been exhausted or a read/parse error occurs.
   *
   * @param filename - File to read parses from
   * @return iterable with list of scored parse trees (one list per sentence)
   * @throws RuntimeException wrapping the {@link IOException} if the file cannot be opened or read
   */
  public Iterable<List<ScoredObject<Tree>>> readScoredTrees(String filename) {
    try {
      ScoredParsesIterator iter = new ScoredParsesIterator(filename);
      return new IterableIterator<>(iter);
    } catch (IOException ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Reads scored parses (in the charniak parser output format) from a reader.
   *
   * <p>The first sentence is read eagerly, when this method is called. The returned
   * iterator closes {@code br} once the input has been exhausted or a read/parse error occurs.
   *
   * @param inputDesc - Description of input used in log messages
   * @param br - input reader
   * @return iterable with list of scored parse trees (one list per sentence)
   */
  public Iterable<List<ScoredObject<Tree>>> readScoredTrees(String inputDesc, BufferedReader br) {
    ScoredParsesIterator iter = new ScoredParsesIterator(inputDesc, br);
    return new IterableIterator<>(iter);
  }

  /**
   * Convert string representing scored parses (in the charniak parser output format)
   * to list of scored parse trees. Only the first sentence found in the string is returned.
   *
   * @param parseStr - string holding the scored parses
   * @return list of scored parse trees, or {@code null} if the string contains no complete sentence
   */
  public List<ScoredObject<Tree>> stringToParses(String parseStr) {
    // try-with-resources guarantees the reader is closed even if parsing throws.
    try (BufferedReader br = new BufferedReader(new StringReader(parseStr))) {
      Iterable<List<ScoredObject<Tree>>> trees = readScoredTrees("", br);
      List<ScoredObject<Tree>> res = null;
      if (trees != null) {
        Iterator<List<ScoredObject<Tree>>> iter = trees.iterator();
        if (iter != null && iter.hasNext()) {
          res = iter.next();
        }
      }
      return res;
    } catch (IOException ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Convert list of scored parse trees to string representing scored parses
   * (in the charniak parser output format). The sentence id written is always 0.
   *
   * @param parses - list of scored parse trees
   * @return string representing scored parses, or {@code null} if {@code parses} is {@code null}
   */
  public String parsesToString(List<ScoredObject<Tree>> parses) {
    if (parses == null) {
      return null;
    }
    // Write characters straight into a StringWriter: going through an OutputStream
    // would encode the text to bytes in the platform default charset on the way.
    StringWriter sw = new StringWriter();
    try (PrintWriter pw = new PrintWriter(sw)) {
      printScoredTrees(pw, 0, parses);
    }
    return sw.toString();
  }

  /**
   * Print scored parse trees in format used by charniak parser.
   * Sentences are numbered consecutively starting from 0, in iteration order.
   *
   * @param trees - trees to output
   * @param filename - file to output to
   * @throws RuntimeException wrapping the {@link IOException} if the file cannot be opened
   */
  public void printScoredTrees(Iterable<List<ScoredObject<Tree>>> trees, String filename) {
    // try-with-resources closes (and flushes) the writer even if iteration or printing throws.
    try (PrintWriter pw = IOUtils.getPrintWriter(filename)) {
      int i = 0;
      for (List<ScoredObject<Tree>> treeList : trees) {
        printScoredTrees(pw, i, treeList);
        i++;
      }
    } catch (IOException ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Print scored parse trees for one sentence in format used by charniak parser:
   * a header line {@code <# of parses>\t<id>} followed by a score line and a parse line
   * for every tree.
   *
   * @param pw - printwriter
   * @param id - sentence id
   * @param trees - trees to output
   */
  public void printScoredTrees(PrintWriter pw, int id, List<ScoredObject<Tree>> trees) {
    pw.println(trees.size() + "\t" + id);
    for (ScoredObject<Tree> scoredTree : trees) {
      pw.println(scoredTree.score());
      pw.println(scoredTree.object());
    }
  }

  /**
   * Iterator over the sentences of a scored-parses input; each element is the list of
   * scored parses of one sentence. It always holds one sentence read ahead, so the
   * first sentence is read by the constructor.
   *
   * <p>NOTE: a header line that announces 0 parses does not produce an (empty) element;
   * the reader simply moves on to the next header line.
   */
  private static class ScoredParsesIterator extends AbstractIterator<List<ScoredObject<Tree>>> {

    private final String inputDesc;
    private final BufferedReader br;
    private final Timing timing;
    /** Sentence read ahead; returned by the following call to {@link #next()}. */
    private List<ScoredObject<Tree>> next;
    private int processed = 0;
    private boolean done = false;
    /** Whether this iterator closes {@link #br} when it is finished with it. */
    private final boolean closeBufferNeeded = true;
    /** Whether to warn when sentence ids are not consecutive. */
    private final boolean expectConsecutiveSentenceIds = true;
    private int lastSentenceId = -1;

    private ScoredParsesIterator(String filename) throws IOException {
      this(filename, IOUtils.getBufferedFileReader(filename));
    }

    private ScoredParsesIterator(String inputDesc, BufferedReader br) {
      this.inputDesc = inputDesc;
      this.br = br;
      logger.info("Reading cached parses from " + inputDesc);
      timing = new Timing();
      timing.start();
      next = getNext();
      if (next == null) {
        // Input holds no sentence at all: release the reader right away, since
        // next() will never get the chance to do it.
        finish();
      }
    }

    /**
     * Reads the next sentence. On any failure the iterator is marked as exhausted
     * and the reader is released before the exception is propagated.
     *
     * @return the scored parses of the next sentence, or {@code null} at end of input
     * @throws RuntimeException wrapping the {@link IOException} if reading fails
     */
    private List<ScoredObject<Tree>> getNext() {
      try {
        return readNextSentence();
      } catch (IOException ex) {
        abort();
        throw new RuntimeException(ex);
      } catch (RuntimeException ex) {
        abort();
        throw ex;
      }
    }

    /**
     * Reads lines until one complete sentence (header plus all announced score/parse
     * pairs) has been consumed. Blank lines are ignored.
     *
     * @return the scored parses of the sentence, or {@code null} if the end of input
     *         is reached first
     */
    private List<ScoredObject<Tree>> readNextSentence() throws IOException {
      String line;
      int parsesExpected = 0;
      Double score = null;
      List<ScoredObject<Tree>> curParses = null;
      while ((line = br.readLine()) != null) {
        line = line.trim();
        if (line.isEmpty()) {
          continue;
        }
        if (parsesExpected == 0) {
          // Header line: <# of parses> <sentenceid>
          String[] fields = wsDelimiter.split(line, 2);
          if (fields.length < 2) {
            throw new RuntimeException("Malformed header line in " + inputDesc
                + " (expected \"<# of parses> <sentenceid>\"): " + line);
          }
          parsesExpected = Integer.parseInt(fields[0]);
          int sentenceId = Integer.parseInt(fields[1]);
          checkSentenceId(sentenceId);
          lastSentenceId = sentenceId;
          curParses = new ArrayList<>(parsesExpected);
        } else if (score == null) {
          // Score line: only the first whitespace-delimited field is used
          score = Double.parseDouble(wsDelimiter.split(line, 2)[0]);
        } else {
          // Parse line belonging to the score read just before
          curParses.add(new ScoredObject<>(Trees.readTree(line), score));
          score = null;
          parsesExpected--;
          if (parsesExpected == 0) {
            return curParses;
          }
        }
      }
      if (parsesExpected > 0) {
        // End of input in the middle of a sentence: the partial sentence is dropped.
        logger.warning("Unexpected end of input in " + inputDesc + ": sentence " + lastSentenceId
            + " is missing " + parsesExpected + " parse(s); discarding the "
            + curParses.size() + " parse(s) already read for it");
      }
      return null;
    }

    /** Logs a warning if {@code sentenceId} does not directly follow the last sentence id seen. */
    private void checkSentenceId(int sentenceId) {
      if (!expectConsecutiveSentenceIds || sentenceId == lastSentenceId + 1) {
        return;
      }
      if (lastSentenceId < sentenceId) {
        StringBuilder sb = new StringBuilder("Missing sentences");
        for (int i = lastSentenceId + 1; i < sentenceId; i++) {
          sb.append(" ").append(i);
        }
        logger.warning(sb.toString());
      } else {
        logger.warning("sentenceIds are not increasing (last="
            + lastSentenceId + ", curr=" + sentenceId + ")");
      }
    }

    /** Marks the iterator as exhausted, logs the read statistics and releases the reader. */
    private void finish() {
      logger.finer("Read " + processed + " trees, from "
          + inputDesc + " in " + timing.toSecondsString() + " secs");
      done = true;
      closeReader();
    }

    /** Marks the iterator as exhausted after a failure and releases the reader. */
    private void abort() {
      done = true;
      closeReader();
    }

    /** Closes the reader if this iterator is responsible for it; a close failure is only logged. */
    private void closeReader() {
      if (closeBufferNeeded) {
        try {
          br.close();
        } catch (IOException ex) {
          logger.log(Level.WARNING, "Failed to close reader for " + inputDesc, ex);
        }
      }
    }

    @Override
    public boolean hasNext() {
      return !done;
    }

    @Override
    public List<ScoredObject<Tree>> next() {
      if (done) {
        throw new NoSuchElementException("No more elements from " + inputDesc);
      }
      List<ScoredObject<Tree>> cur = next;
      next = getNext();
      processed++;
      if (next == null) {
        finish();
      }
      return cur;
    }
  }
}