package edu.stanford.nlp.parser.ensemble.utils;
import java.io.*;
import java.util.*;
/**
 * Selects sentences on which several dependency parsers agree.
 *
 * <p>Each input file holds the output of one parser for the same sequence of
 * sentences, read with {@code Token.readNextSentCoNLLX}. For every sentence the
 * fraction of tokens on which all parsers agree is computed, and sentences at
 * or above a threshold are written out using the parse that agrees most with
 * the others.
 */
public class Agreement {

  /** Lowest agreement percentage for which an output directory is produced. */
  private static final int MIN_AGREEMENT_PERCENT = 70;

  /** Highest agreement percentage for which an output directory is produced. */
  private static final int MAX_AGREEMENT_PERCENT = 100;

  /** Step between consecutive agreement percentages. */
  private static final int AGREEMENT_PERCENT_STEP = 10;

  /**
   * Selects the fully agreeing (100%) sentences from the parser outputs found
   * in a fixed development directory.
   *
   * @param args ignored
   * @throws IOException if the directory cannot be listed or a file cannot be
   *         read or written
   */
  public static void main(String[] args) throws IOException {
    // To run the selection over a whole directory instead:
    // runSentenceSelection(
    // "/home/mcclosky/data/scr/StanfordParsed/gigaword/apw_eng/words_and_tags",
    // "/home/mcclosky/data/gigaword-selected/apw_eng");
    File develOutputs = new File("/home/mcclosky/data/gigaword-selected/devel_outputs");
    String[] filenames = develOutputs.list(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        return name.contains("libsvm") || name.contains("mstparser");
      }
    });
    // File.list returns null when the path is not a readable directory.
    if (filenames == null) {
      throw new IOException("Cannot list directory: " + develOutputs);
    }
    selectSentencesForAgreement(develOutputs, filenames, 100, develOutputs
        + "/" + "dev-6malt+mst-agree-100");
  }

  /**
   * Runs sentence selection for every {@code .parse.gz} file in a directory at
   * agreement thresholds of 70, 80, 90 and 100 percent.
   *
   * <p>For each file whose name ends in {@code .parse.gz}, all other files in
   * the directory whose names start with that name are treated as parses of
   * the same sentences. When more than one such parse exists, the selected
   * sentences are written to {@code output/<threshold>/<filename>}. Files that
   * fail with an {@link IOException} are reported and skipped.
   *
   * <p>The running total that is printed adds the sentence count returned for
   * each threshold, so a file processed at all four thresholds contributes its
   * sentences four times.
   *
   * @param input directory containing the parser outputs
   * @param output directory under which one sub-directory per threshold is
   *        created
   * @throws IOException if the input directory cannot be listed
   */
  public static void runSentenceSelection(String input, String output)
      throws IOException {
    int numSentences = 0;
    File inputDirectory = new File(input);
    File outputDirectory = new File(output);

    // make output directories
    Map<Integer, File> agreementPercentToOutputDir = new HashMap<Integer, File>();
    for (int i = MIN_AGREEMENT_PERCENT; i <= MAX_AGREEMENT_PERCENT; i += AGREEMENT_PERCENT_STEP) {
      File subDir = new File(outputDirectory, Integer.toString(i));
      subDir.mkdirs();
      agreementPercentToOutputDir.put(i, subDir);
    }

    // first, find all the simple filenames (ones with words and tags only)
    String[] wordsAndTagsFilenames = inputDirectory.list(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        return name.endsWith(".parse.gz");
      }
    });
    if (wordsAndTagsFilenames == null) {
      throw new IOException("Cannot list input directory: " + inputDirectory);
    }

    /*
     * given the simple filenames, we find out which filenames start with
     * them since parses using the words and tags filenames use them as a
     * prefix.
     */
    for (final String wordsAndTagsFilename : wordsAndTagsFilenames) {
      System.out.println("filename: " + wordsAndTagsFilename);
      String[] allParses = inputDirectory.list(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
          return name.startsWith(wordsAndTagsFilename)
              && !name.equals(wordsAndTagsFilename);
        }
      });
      // The listing can come back null (e.g. directory removed meanwhile);
      // treat that the same as "no parses".
      int numParses = (allParses == null) ? 0 : allParses.length;
      System.out.println("# parses: " + numParses);

      if (numParses > 1) {
        // we have parses for this words and tags filename
        for (int i = MIN_AGREEMENT_PERCENT; i <= MAX_AGREEMENT_PERCENT; i += AGREEMENT_PERCENT_STEP) {
          File outputDir = agreementPercentToOutputDir.get(i);
          File outputFile = new File(outputDir, wordsAndTagsFilename);
          try {
            numSentences += selectSentencesForAgreement(
                inputDirectory, allParses, i, outputFile.getAbsolutePath());
          } catch (IOException e) {
            // skip cases where not all files are available (this is
            // mostly an issue while the parses are being generated)
            System.out.println("(skipping due to IOError/permission problem: "
                + e.getMessage() + ")");
          }
        }
      }
      System.out.println("Sentences so far: " + numSentences);
    }
  }

  /**
   * Given files in a specific directory and an agreement threshold, writes
   * sentences that agree at least that amount to an output file.
   *
   * <p>Sentences are read in lock step from all input files. Reading stops
   * when the first file is exhausted; any further sentences in the other files
   * are not read. For each selected sentence the parse returned by
   * {@link #getHighestAgreementParse(List[])} is written. All readers and the
   * writer are closed before this method returns, whether or not it succeeds.
   *
   * @param inputDirectory directory containing the input files
   * @param inputFilenames names of the parser output files; at least one
   * @param agreementThreshold minimum labelled agreement, as a percentage
   *        (0 to 100), for a sentence to be written
   * @param outputFilename path of the file to write
   * @return number of sentences processed
   * @throws IllegalArgumentException if no input filenames are given
   * @throws IOException if a file cannot be opened, read or written, or if an
   *         input file ends before the first input file does
   */
  public static int selectSentencesForAgreement(File inputDirectory,
      String[] inputFilenames, double agreementThreshold,
      String outputFilename) throws IOException {
    if (inputFilenames == null || inputFilenames.length == 0) {
      throw new IllegalArgumentException("At least one input file is required");
    }

    BufferedReader[] is = makeReaders(inputDirectory, inputFilenames);
    BufferedWriter bw = null;
    int numSentences = 0;
    try {
      bw = FileUtils.openForWriting(outputFilename);
      @SuppressWarnings("unchecked")
      List<Token>[] sents = new List[is.length];

      while ((sents[0] = Token.readNextSentCoNLLX(is[0])) != null) {
        for (int i = 1; i < is.length; i++) {
          sents[i] = Token.readNextSentCoNLLX(is[i]);
          if (sents[i] == null) {
            // A shorter file (e.g. one still being generated) would otherwise
            // surface as a NullPointerException further down.
            throw new IOException(inputFilenames[i] + " ended after "
                + numSentences + " sentence(s) but " + inputFilenames[0]
                + " has more");
          }
        }
        numSentences++;

        double agreement = labelledNodesInAgreement(sents);
        if (agreement * 100 >= agreementThreshold) {
          List<Token> bestParse = getHighestAgreementParse(sents);
          Token.writeSentCoNLLX(bestParse, bw);
        }
      }

      // Close explicitly on the success path so that a failure while flushing
      // the output is reported to the caller rather than swallowed.
      BufferedWriter toClose = bw;
      bw = null;
      toClose.close();
    } finally {
      closeQuietly(bw);
      for (BufferedReader reader : is) {
        closeQuietly(reader);
      }
    }
    return numSentences;
  }

  /**
   * Updates per-threshold counters for one sentence.
   *
   * <p>Bucket {@code i} (0 to 20) corresponds to an agreement of at least
   * {@code i * 5} percent. For every bucket the sentence reaches, the sentence
   * count is incremented and the sentence's token count is added.
   *
   * @param numSentences not used by this method
   * @param numSentencesAboveThreshold counters of sentences per bucket; must
   *        have at least 21 entries
   * @param numTokensAboveThreshold counters of tokens per bucket; must have at
   *        least 21 entries
   * @param readers not used by this method
   * @param sentences parses of one sentence, one per parser
   * @throws IOException never thrown by the current implementation
   */
  @SuppressWarnings("unused")
  private static void collectAgreementStats(int numSentences,
      int[] numSentencesAboveThreshold, int[] numTokensAboveThreshold,
      BufferedReader[] readers, List<Token>[] sentences)
      throws IOException {
    // number of tokens in this sentence (assume all parsers
    // agree on this...)
    int numTokens = sentences[0].size();
    double agreementPercent = labelledNodesInAgreement(sentences);

    for (int i = 0; i < 21; i++) {
      if (agreementPercent >= i * 0.05) {
        numSentencesAboveThreshold[i]++;
        numTokensAboveThreshold[i] += numTokens;
      }
    }
  }

  /**
   * Opens one reader per filename, resolving each name against a directory.
   *
   * <p>If any file fails to open, the readers opened so far are closed before
   * the exception propagates. On success the caller owns the returned readers
   * and is responsible for closing them.
   *
   * @param baseDirectory directory the filenames are relative to
   * @param filenames names of the files to open
   * @return readers in the same order as {@code filenames}
   * @throws IOException if a file cannot be opened
   */
  public static BufferedReader[] makeReaders(File baseDirectory,
      String[] filenames) throws IOException {
    BufferedReader[] is = new BufferedReader[filenames.length];
    boolean allOpened = false;
    try {
      for (int i = 0; i < filenames.length; i++) {
        String filename = baseDirectory + "/" + filenames[i];
        is[i] = FileUtils.openForReading(filename);
      }
      allOpened = true;
    } finally {
      if (!allOpened) {
        for (BufferedReader reader : is) {
          closeQuietly(reader);
        }
      }
    }
    return is;
  }

  /**
   * Returns the sentence with the highest average agreement with all other
   * sentences.
   *
   * <p>Agreement is the labelled agreement of each pair of parses, summed per
   * parse. On a tie the parse with the lowest index wins; in particular the
   * first parse is returned when only one parse is given or when no pair of
   * parses agrees on any token.
   *
   * @param sentences array of sentences; must not be empty
   * @return sentence with highest average agreement
   * @throws IllegalArgumentException if {@code sentences} is empty
   */
  public static List<Token> getHighestAgreementParse(List<Token>[] sentences) {
    int numParses = sentences.length;
    if (numParses == 0) {
      throw new IllegalArgumentException("At least one parse is required");
    }

    // Sum the pairwise agreements per parse. Each pair is scored once and
    // credited to both members.
    double[] totalAgreement = new double[numParses];
    for (int i = 0; i < numParses; i++) {
      for (int j = i + 1; j < numParses; j++) {
        @SuppressWarnings("unchecked")
        List<Token>[] justTheseSentences = new List[]{sentences[i], sentences[j]};
        double a = labelledNodesInAgreement(justTheseSentences);
        totalAgreement[i] += a;
        totalAgreement[j] += a;
      }
    }

    // Default to the first parse so that all-zero (or NaN) totals still yield
    // a valid index; only a strictly larger total replaces the current best.
    int bestSoFar = 0;
    for (int i = 1; i < numParses; i++) {
      if (totalAgreement[i] > totalAgreement[bestSoFar]) {
        bestSoFar = i;
      }
    }
    return sentences[bestSoFar];
  }

  /**
   * Calculates the fraction of tokens whose head and label are the same in
   * all parses of a sentence (labelled agreement).
   *
   * <p>The token count is taken from the first parse. If that parse is empty
   * the result is {@code NaN}.
   *
   * @param sentences list of parses of the same sentence
   * @return fraction of agreeing tokens (0.0 to 1.0)
   */
  public static double labelledNodesInAgreement(List<Token>[] sentences) {
    return fractionOfTokensInAgreement(sentences, true);
  }

  /**
   * Calculates the fraction of tokens whose head is the same in all parses of
   * a sentence, ignoring labels (unlabelled agreement).
   *
   * <p>The token count is taken from the first parse. If that parse is empty
   * the result is {@code NaN}.
   *
   * @param sentences list of parses of the same sentence
   * @return fraction of agreeing tokens (0.0 to 1.0)
   */
  public static double nodesInAgreement(List<Token>[] sentences) {
    return fractionOfTokensInAgreement(sentences, false);
  }

  /**
   * Counts the token positions at which every parse gives the same analysis
   * and divides by the number of tokens in the first parse.
   *
   * @param sentences parses of the same sentence
   * @param compareLabels whether the label must match in addition to the head
   * @return fraction of agreeing tokens, or {@code NaN} for an empty sentence
   */
  private static double fractionOfTokensInAgreement(List<Token>[] sentences,
      boolean compareLabels) {
    // how many tokens all parsers agree on
    int numTokensAllAgreed = 0;
    int numTokens = sentences[0].size();

    for (int tokenIndex = 0; tokenIndex < numTokens; tokenIndex++) {
      Set<String> analyses = new HashSet<String>();
      for (List<Token> sent : sentences) {
        Token currentToken = sent.get(tokenIndex);
        // A string key avoids importing Pair (and with it JavaNLP
        // dependencies). The tab keeps head and label apart so that e.g.
        // head 1 + label "2x" is not confused with head 12 + label "x";
        // this assumes labels never contain a tab.
        String key = compareLabels
            ? currentToken.head + "\t" + currentToken.label
            : Integer.toString(currentToken.head);
        analyses.add(key);
      }
      if (analyses.size() == 1) {
        numTokensAllAgreed++;
      }
    }
    return (double) numTokensAllAgreed / numTokens;
  }

  /**
   * Closes a resource, ignoring a null argument and any failure to close.
   *
   * @param resource resource to close; may be null
   */
  private static void closeQuietly(Closeable resource) {
    if (resource == null) {
      return;
    }
    try {
      resource.close();
    } catch (IOException ignored) {
      // Best-effort cleanup: this runs while another exception may already be
      // propagating, which must not be masked.
    }
  }
}