package edu.berkeley.cs.nlp.ocular.eval; import static edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat.ALTO; import static edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat.COMP; import static edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat.DIPL; import static edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat.HTML; import static edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat.NORM; import static edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat.NORMLINES; import static edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat.WHITESPACE; import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2; import static edu.berkeley.cs.nlp.ocular.util.Tuple3.Tuple3; import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Set; import edu.berkeley.cs.nlp.ocular.data.Document; import edu.berkeley.cs.nlp.ocular.data.textreader.Charset; import edu.berkeley.cs.nlp.ocular.eval.Evaluator.EvalSuffStats; import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar; import edu.berkeley.cs.nlp.ocular.gsm.GlyphChar.GlyphType; import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel; import edu.berkeley.cs.nlp.ocular.main.FonttrainTranscribeShared.OutputFormat; import edu.berkeley.cs.nlp.ocular.model.DecodeState; import edu.berkeley.cs.nlp.ocular.model.transition.SparseTransitionModel.TransitionState; import edu.berkeley.cs.nlp.ocular.output.AltoOutputWriter; import edu.berkeley.cs.nlp.ocular.output.HtmlOutputWriter; import edu.berkeley.cs.nlp.ocular.util.ArrayHelper; import edu.berkeley.cs.nlp.ocular.util.FileHelper; import edu.berkeley.cs.nlp.ocular.util.FileUtil; import edu.berkeley.cs.nlp.ocular.util.Tuple2; import edu.berkeley.cs.nlp.ocular.util.Tuple3; import tberg.murphy.fileio.f; import tberg.murphy.indexer.Indexer; /** * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu) * @author Dan Garrette (dhgarrette@gmail.com) */ public class BasicSingleDocumentEvaluatorAndOutputPrinter implements SingleDocumentEvaluatorAndOutputPrinter { private Indexer<String> charIndexer; private Indexer<String> langIndexer; private boolean allowGlyphSubstitution; private boolean charIncludesDiacritic; private List<String> commandLineArgs; public BasicSingleDocumentEvaluatorAndOutputPrinter(Indexer<String> charIndexer, Indexer<String> langIndexer, boolean allowGlyphSubstitution, boolean charIncludesDiacritic, List<String> commandLineArgs) { this.charIndexer = charIndexer; this.langIndexer = langIndexer; this.allowGlyphSubstitution = allowGlyphSubstitution; this.charIncludesDiacritic = charIncludesDiacritic; this.commandLineArgs = commandLineArgs; } private String joinLineForPrinting(List<String> chars) { StringBuilder b = new StringBuilder(); for (String c : chars) b.append(Charset.unescapeChar(c)); return b.toString(); } public Tuple2<Map<String, EvalSuffStats>,Map<String, EvalSuffStats>> evaluateAndPrintTranscription(int iter, int batchId, Document doc, DecodeState[][] decodeStates, String inputDocPath, String outputPath, Set<OutputFormat> outputFormats, CodeSwitchLanguageModel lm) { Tuple2<Tuple3<String[][], String[][], List<String>>, DecodeState[][]> goldTranscriptionData = loadGoldTranscriptions(doc, decodeStates); String[][] goldDiplomaticLineChars = goldTranscriptionData._1._1; String[][] goldNormalizedLineChars = goldTranscriptionData._1._2; List<String> goldNormalizedChars = goldTranscriptionData._1._3; decodeStates = goldTranscriptionData._2; // in case we needed to add blank rows int numLines = decodeStates.length; // // Get the model output // ModelTranscriptions mt = new ModelTranscriptions(decodeStates, charIndexer, langIndexer); String outputFilenameBase = makeOutputFilenameBase(iter, batchId, doc, inputDocPath, outputPath); new File(outputFilenameBase).getAbsoluteFile().getParentFile().mkdirs(); // // Evaluate the comparison // Map<String, EvalSuffStats> diplomaticEvals = goldDiplomaticLineChars != null ? Evaluator.getUnsegmentedEval(mt.getViterbiDiplomaticCharLines(), toArrayOfLists(goldDiplomaticLineChars), charIncludesDiacritic) : null; Map<String, EvalSuffStats> normalizedEvals = goldNormalizedLineChars != null ? Evaluator.getUnsegmentedEval(mt.getViterbiNormalizedCharLines(), toArrayOfLists(goldNormalizedLineChars), charIncludesDiacritic) : null; // // Diplomatic transcription output // { String transcriptionOutputFilename = diplomaticTranscriptionOutputFile(outputFilenameBase); StringBuffer transcriptionOutputBuffer = new StringBuffer(); for (int line = 0; line < numLines; ++line) { transcriptionOutputBuffer.append(joinLineForPrinting(mt.getViterbiDiplomaticCharLines()[line]) + "\n"); } System.out.println("\n" + transcriptionOutputBuffer.toString()); if (outputFormats.contains(DIPL)) { System.out.println("Writing transcription output to " + transcriptionOutputFilename); FileHelper.writeString(transcriptionOutputFilename, transcriptionOutputBuffer.toString()); } } // // Normalized transcription lines output // if (allowGlyphSubstitution) { String transcriptionOutputFilename = normalizedLinesTranscriptionOutputFile(outputFilenameBase); StringBuffer transcriptionOutputBuffer = new StringBuffer(); for (int line = 0; line < numLines; ++line) { transcriptionOutputBuffer.append(joinLineForPrinting(mt.getViterbiNormalizedCharLines()[line]) + "\n"); } //System.out.println("\n" + transcriptionOutputBuffer.toString()); if (outputFormats.contains(NORMLINES)) { System.out.println("Writing normalized transcription lines output to " + transcriptionOutputFilename); FileHelper.writeString(transcriptionOutputFilename, transcriptionOutputBuffer.toString()); } } // // Normalized transcription cleaned output // if (allowGlyphSubstitution) { String transcriptionOutputFilename = normalizedTranscriptionOutputFile(outputFilenameBase); String transcriptionOutputBuffer = joinLineForPrinting(mt.getViterbiNormalizedCharRunning()); //System.out.println("\n" + transcriptionOutputBuffer.toString() + "\n"); if (outputFormats.contains(NORM)) { System.out.println("Writing normalized transcription output to " + transcriptionOutputFilename); FileHelper.writeString(transcriptionOutputFilename, transcriptionOutputBuffer); } } // // Make comparison file // //if ((allowGlyphSubstitution || goldDiplomaticLineChars != null || goldNormalizedLineChars != null) && outputFormats.contains(COMP)) { if (outputFormats.contains(COMP)) { String transcriptionOutputFilename = comparisonsTranscriptionOutputFile(outputFilenameBase); List<String> transcriptionWithSubsOutputLines = getTranscriptionLinesWithSubs(mt.getViterbiDecodeStates()); StringBuffer goldComparisonOutputBuffer = new StringBuffer(); if (allowGlyphSubstitution) goldComparisonOutputBuffer.append("MN: " + "Model normalized transcription\n"); if (goldNormalizedLineChars != null) goldComparisonOutputBuffer.append("GN: " + "Gold normalized transcription\n"); /* */ goldComparisonOutputBuffer.append("MD: " + "Model diplomatic transcription\n"); if (goldDiplomaticLineChars != null) goldComparisonOutputBuffer.append("GD: " + "Gold diplomatic transcription\n"); if (allowGlyphSubstitution) goldComparisonOutputBuffer.append("MS: " + "Model transcription with substitutions\n"); goldComparisonOutputBuffer.append("\n\n"); for (int line = 0; line < numLines; ++line) { if (allowGlyphSubstitution) goldComparisonOutputBuffer.append("MN: " + joinLineForPrinting(mt.getViterbiNormalizedCharLines()[line]).trim() + "\n"); if (goldNormalizedLineChars != null) goldComparisonOutputBuffer.append("GN: " + joinLineForPrinting(Arrays.asList(goldNormalizedLineChars[line])).trim() + "\n"); /* */ goldComparisonOutputBuffer.append("MD: " + joinLineForPrinting(mt.getViterbiDiplomaticCharLines()[line]).trim() + "\n"); if (goldDiplomaticLineChars != null) goldComparisonOutputBuffer.append("GD: " + joinLineForPrinting(Arrays.asList(goldDiplomaticLineChars[line])).trim() + "\n"); if (allowGlyphSubstitution) goldComparisonOutputBuffer.append("MS: " + transcriptionWithSubsOutputLines.get(line).trim()+"\n"); goldComparisonOutputBuffer.append("\n"); } goldComparisonOutputBuffer.append("\n"); if ((allowGlyphSubstitution && mt.getViterbiNormalizedCharRunning() != null) || goldNormalizedChars != null) { if ((allowGlyphSubstitution && mt.getViterbiNormalizedCharRunning() != null) && goldNormalizedChars != null) { goldComparisonOutputBuffer.append("Model (top) vs. Gold (bottom) normalized transcriptions\n"); } else if (allowGlyphSubstitution && mt.getViterbiNormalizedCharRunning() != null) { goldComparisonOutputBuffer.append("Model normalized transcription\n"); } else if (goldNormalizedChars != null) { goldComparisonOutputBuffer.append("Gold normalized transcription\n"); } if (allowGlyphSubstitution && mt.getViterbiNormalizedCharRunning() != null) { goldComparisonOutputBuffer.append(joinLineForPrinting(mt.getViterbiNormalizedCharRunning()) + "\n"); } if (goldNormalizedChars != null) { goldComparisonOutputBuffer.append(joinLineForPrinting(goldNormalizedChars) + "\n"); } } goldComparisonOutputBuffer.append("\n"); if (goldDiplomaticLineChars != null) { goldComparisonOutputBuffer.append("\nDiplomatic evaluation\n"); goldComparisonOutputBuffer.append(Evaluator.renderEval(diplomaticEvals)); } if (goldNormalizedLineChars != null) { goldComparisonOutputBuffer.append("\nNormalized evaluation\n"); goldComparisonOutputBuffer.append(Evaluator.renderEval(normalizedEvals)); } System.out.println("Writing comparisons to " + transcriptionOutputFilename); f.writeString(transcriptionOutputFilename, goldComparisonOutputBuffer.toString()); } double lmPerplexity = 0; // new LmPerplexity(lm).perplexity(mt.viterbiNormalizedTranscriptionCharIndices, mt.viterbiNormalizedTranscriptionLangIndices); // System.out.println("LM perplexity = " + lmPerplexity); // // Other files // if (outputFormats.contains(ALTO)) { new AltoOutputWriter(charIndexer, langIndexer).write(numLines, mt.getViterbiDecodeStates(), doc, outputFilenameBase, inputDocPath, commandLineArgs, false, lmPerplexity); if (allowGlyphSubstitution) { new AltoOutputWriter(charIndexer, langIndexer).write(numLines, mt.getViterbiDecodeStates(), doc, outputFilenameBase, inputDocPath, commandLineArgs, true, lmPerplexity); } } if (outputFormats.contains(HTML)) { new HtmlOutputWriter(charIndexer, langIndexer).write(numLines, mt.getViterbiDecodeStates(), doc.baseName(), outputFilenameBase); } if (outputFormats.contains(WHITESPACE)) { StringBuilder whitespaceFileBuf = new StringBuilder(); Indexer<String> charIndexer = lm.getCharacterIndexer(); for (List<DecodeState> decodeStateLine : mt.getViterbiDecodeStates()) { int whitespace = 0; for (DecodeState ds : decodeStateLine) { int c = ds.ts.getGlyphChar().templateCharIndex; if (c == charIndexer.getIndex(Charset.SPACE)) { whitespace += ds.charWidth; } else { if (whitespace > 0) { whitespaceFileBuf.append("{" + whitespace + "}"); whitespace = 0; } whitespaceFileBuf.append(Charset.unescapeChar(charIndexer.getObject(c))); } whitespace += ds.padWidth; } if (whitespace > 0) { whitespaceFileBuf.append("{" + whitespace + "}"); } whitespaceFileBuf.append("\n"); } String whitespaceOutputFilename = outputFilenameBase + "_whitespace.txt"; System.out.println("Writing whitespace layout to " + whitespaceOutputFilename); f.writeString(whitespaceOutputFilename, whitespaceFileBuf.toString()); } // // Transcription with widths // // if (allowGlyphSubstitution) { // System.out.println("Transcription with widths"); // StringBuffer transcriptionWithWidthsOutputBuffer = new StringBuffer(); // for (int line = 0; line < numLines; ++line) { // transcriptionWithWidthsOutputBuffer.append(transcriptionWithSubsOutputLines.get(line)); // for (int i = 0; i < viterbiTrmt.viterbies[line].size(); ++i) { // TransitionState ts = viterbiDecodeStates[line].get(i); // int w = viterbiWidths[line].get(i); // String sglyphChar = Charset.unescapeChar(charIndexer.getObject(ts.getGlyphChar().templateCharIndex)); // transcriptionWithWidthsOutputBuffer.append(sglyphChar + "[" + ts.getGlyphChar().toString(charIndexer) + "][" + w + "]\n"); // } // transcriptionWithWidthsOutputBuffer.append("\n"); // } // //System.out.println(transcriptionWithWidthsOutputBuffer.toString()); // FileHelper.writeString(transcriptionWithWidthsOutputFilename, transcriptionWithWidthsOutputBuffer.toString()); // } return Tuple2(diplomaticEvals, normalizedEvals); } private Tuple2<Tuple3<String[][], String[][], List<String>>, DecodeState[][]> loadGoldTranscriptions(Document doc, DecodeState[][] decodeStates) { String[][] goldDiplomaticCharLines = doc.loadDiplomaticTextLines(); String[][] goldNormalizedCharLines = doc.loadNormalizedTextLines(); List<String> goldNormalizedChars = doc.loadNormalizedText(); // // Make sure the decoded states and the text have the same number of lines (numLines) // int numLines = ArrayHelper.max( decodeStates.length, goldDiplomaticCharLines != null ? goldDiplomaticCharLines.length : 0, goldNormalizedCharLines != null ? goldNormalizedCharLines.length : 0); if (decodeStates.length < numLines) { // if we need to pad the end with blank lines numLines = goldDiplomaticCharLines.length; DecodeState[][] newDecodeStates = new DecodeState[numLines][]; for (int line = 0; line < numLines; ++line) { newDecodeStates[line] = line < decodeStates.length ? decodeStates[line] : new DecodeState[0]; } decodeStates = newDecodeStates; } if (goldDiplomaticCharLines != null && goldDiplomaticCharLines.length < numLines) { // if we need to pad the end with blank lines String[][] newText = new String[numLines][]; for (int line = 0; line < numLines; ++line) { newText[line] = line < goldDiplomaticCharLines.length ? goldDiplomaticCharLines[line] : new String[0]; } goldDiplomaticCharLines = newText; } if (goldNormalizedCharLines != null && goldNormalizedCharLines.length < numLines) { // if we need to pad the end with blank lines String[][] newText = new String[numLines][]; for (int line = 0; line < numLines; ++line) { newText[line] = line < goldNormalizedCharLines.length ? goldNormalizedCharLines[line] : new String[0]; } goldNormalizedCharLines = newText; } return Tuple2(Tuple3(goldDiplomaticCharLines, goldNormalizedCharLines, goldNormalizedChars), decodeStates); } public static String makeOutputFilenameBase(Document doc, String inputDocPath, String outputPath) { return makeOutputFilenameBase(0, 0, doc, inputDocPath, outputPath); } private static String makeOutputFilenameBase(int iter, int batchId, Document doc, String inputDocPath, String outputPath) { String fileParent = FileUtil.removeCommonPathPrefixOfParents(new File(inputDocPath), new File(doc.baseName()))._2; String preext = FileUtil.withoutExtension(new File(doc.baseName()).getName()); String outputFilenameBase = outputPath + "/all_transcriptions/" + fileParent + "/" + preext; if (iter > 0) outputFilenameBase += "_iter-" + iter; if (batchId > 0) outputFilenameBase += "_batch-" + batchId; return outputFilenameBase; } public static String diplomaticTranscriptionOutputFile /* */ (String outputFilenameBase) { return outputFilenameBase + "_transcription.txt"; } public static String normalizedLinesTranscriptionOutputFile /**/ (String outputFilenameBase) { return outputFilenameBase + "_transcription_normalized_lines.txt"; } public static String normalizedTranscriptionOutputFile /* */ (String outputFilenameBase) { return outputFilenameBase + "_transcription_normalized.txt"; } public static String comparisonsTranscriptionOutputFile /* */ (String outputFilenameBase) { return outputFilenameBase + "_comparisons.txt"; } private List<String> getTranscriptionLinesWithSubs(List<DecodeState>[] viterbiDecodeStates) { List<String> transcriptionWithSubsOutputLines = new ArrayList<String>(); for (List<DecodeState> lineStates : viterbiDecodeStates) { StringBuilder lineBuffer = new StringBuilder(); for (DecodeState ds : lineStates) { TransitionState ts = ds.ts; int lmChar = ts.getLmCharIndex(); GlyphChar glyph = ts.getGlyphChar(); int glyphChar = glyph.templateCharIndex; String sglyphChar = Charset.unescapeChar(charIndexer.getObject(glyphChar)); if (lmChar != glyphChar || glyph.glyphType != GlyphType.NORMAL_CHAR) { String norm = Charset.unescapeChar(charIndexer.getObject(lmChar)); String dipl = (glyph.glyphType == GlyphType.DOUBLED ? "2x"+sglyphChar : glyph.isElided() ? "" : sglyphChar); lineBuffer.append("[" + norm + "/" + dipl + "]"); } else { lineBuffer.append(sglyphChar); } } transcriptionWithSubsOutputLines.add(lineBuffer.toString()); } return transcriptionWithSubsOutputLines; } private <A> List<A>[] toArrayOfLists(A[][] as) { @SuppressWarnings("unchecked") List<A>[] r = new List[as.length]; for (int i = 0; i < as.length; ++i) { r[i] = Arrays.asList(as[i]); } return r; } }