package edu.berkeley.cs.nlp.ocular.eval;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import edu.berkeley.cs.nlp.ocular.eval.MarkovEditDistanceComputer.EditDistanceParams;
import tberg.murphy.fileio.f;
import tberg.murphy.tuple.Pair;
/**
* @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu)
*/
public class ErrorSampler {
public static class Error implements Comparable<Error> {
public final int docIdx;
public final int lineIdx;
public final int guessTokenIdx;
public final String guess;
public final String gold;
public static final String INSERTION = "<INSERTION>";
public static final String DELETION = "<DELETION>";
public Error(int docIdx, int lineIdx, int guessColumn, String guess, String gold) {
this.docIdx = docIdx;
this.lineIdx = lineIdx;
this.guessTokenIdx = guessColumn;
this.guess = guess;
this.gold = gold;
}
@Override
public int compareTo(Error e1) {
if (this.docIdx != e1.docIdx) {
return this.docIdx - e1.docIdx;
} else if (this.lineIdx != e1.lineIdx) {
return this.lineIdx - e1.lineIdx;
}
return this.guessTokenIdx - e1.guessTokenIdx;
}
public String toString() {
return "Doc " + docIdx + ", line " + lineIdx + ", guess idx " + guessTokenIdx + ": guess = " + guess + ", gold = " + gold;
}
}
public static void main(String[] args) {
List<Error> errors = aggregateWordErrors(args);
final int NUM_ERRORS = 50;
Collections.shuffle(errors, new Random(0));
List<Error> selectedErrors = errors.subList(0, Math.min(errors.size(), NUM_ERRORS));
Collections.sort(selectedErrors);
for (int i = 0; i < selectedErrors.size(); i++) {
System.out.println(selectedErrors.get(i).toString());
}
}
public static List<Error> aggregateWordErrors(String[] fileNames) {
List<Error> allErrors = new ArrayList<Error>();
for (int fileIdx = 0; fileIdx < fileNames.length; fileIdx++) {
String fileName = fileNames[fileIdx];
Pair<List<String>,List<String>> goldGuessLines = getGoldGuessLinesFromOutput(fileName);
List<String> goldLines = goldGuessLines.getFirst();
List<String> guessLines = goldGuessLines.getSecond();
assert goldLines.size() == guessLines.size();
for (int i = 0; i < goldLines.size(); i++) {
String goldStr = goldLines.get(i).replaceAll("\\|", "s");
String guessStr = guessLines.get(i).replaceAll("\\|", "s");
Form guessForm = Form.wordsAsGlyphs(Arrays.asList(guessStr.split("\\s+")));
Form goldForm = Form.wordsAsGlyphs(Arrays.asList(goldStr.split("\\s+")));
EditDistanceParams params = EditDistanceParams.getStandardParams(guessForm, goldForm, false);
MarkovEditDistanceComputer medc = new MarkovEditDistanceComputer(params);
AlignedFormPair alignedPair = medc.runEditDistance();
assert alignedPair.trg.length() == goldForm.length();
int srcGuessIdx = 0;
int trgGoldIdx = 0;
for (Operation op : alignedPair.ops) {
switch (op) {
case EQUAL:
srcGuessIdx++;
trgGoldIdx++;
break;
case SUBST:
allErrors.add(new Error(fileIdx, i, srcGuessIdx, guessForm.charAt(srcGuessIdx).toString(), goldForm.charAt(trgGoldIdx).toString()));
srcGuessIdx++;
trgGoldIdx++;
break;
case INSERT:
allErrors.add(new Error(fileIdx, i, srcGuessIdx, Error.INSERTION, goldForm.charAt(trgGoldIdx).toString()));
trgGoldIdx++;
break;
case DELETE:
allErrors.add(new Error(fileIdx, i, srcGuessIdx, guessForm.charAt(srcGuessIdx).toString(), Error.DELETION));
srcGuessIdx++;
break;
}
}
}
System.out.println("Processed file " + fileNames[fileIdx] + " with " + goldLines.size() + " lines, cumulative errors = " + allErrors.size());
}
return allErrors;
}
public static Pair<List<String>,List<String>> getGoldGuessLinesFromOutput(String outFile) {
List<String> lines = f.readLines(outFile);
List<String> guessLines = new ArrayList<String>();
List<String> goldLines = new ArrayList<String>();
for (int i = 0; i < lines.size(); i++) {
String currLine = lines.get(i).trim();
if (i % 3 == 0 && currLine.equals("")) {
break;
}
switch (i % 3) {
case 0: guessLines.add(currLine);
break;
case 1: goldLines.add(currLine);
break;
case 2: assert currLine.equals("");
break;
}
}
return Pair.makePair(goldLines, guessLines);
}
}