/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ package cc.mallet.extract; import java.io.File; import java.io.PrintWriter; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.List; import cc.mallet.types.Label; import cc.mallet.types.LabelAlphabet; import cc.mallet.util.ColorUtils; /** * Diagnosis class that outputs HTML pages that allows you to view errors on a more * global per-instance basis. * * Created: Mar 30, 2005 * * @author <A HREF="mailto:casutton@cs.umass.edu>casutton@cs.umass.edu</A> * @version $Id: DocumentViewer.java,v 1.1 2007/10/22 21:37:44 mccallum Exp $ */ public class DocumentViewer { private static final String DOC_ERRS_CSS_FNAME = "docerrs.css"; private static final String DOC_ERRS_PRED_CSS_FNAME = "docerrs-by-pred.css"; private static final String DOC_ERRS_TRUE_CSS_FNAME = "docerrs-by-true.css"; private static final double SATURATION = 0.4; private static class DualLabeledSpans { DualLabeledSpans (LabeledSpans ls1, LabeledSpans ls2) { ls = new LabeledSpans[] { ls1, ls2 }; } private LabeledSpans[] ls; int size () { return ls[0].size(); } LabeledSpan get (int t, int i) { return ls[i].getLabeledSpan (t); } } /** * Writes several HTML files describing a given extraction. Each HTML file shows an entire * document, with the extracted fields color-coded. * @param directory Directory to write files to * @param extraction Extraction to describe * @throws IOException */ public static void writeExtraction (File directory, Extraction extraction) throws IOException { outputIndex (directory, extraction); outputStylesheets (directory, extraction); outputDocuments (directory, extraction); } private static void outputStylesheets (File directory, Extraction extraction) throws IOException { // ERRS css PrintWriter out = new PrintWriter (new FileWriter (new File (directory, DOC_ERRS_CSS_FNAME))); out.println (".tf_legend { border-style: dashed; border-width: 2px; padding: 10px; padding-top: 0ex; float: right; margin:2em; }"); out.println (".class_legend { visibility: hidden; }"); out.println (".correct { background-color:#33FF33; }"); out.println (".wrong { background-color:pink }"); out.println (".true { background-color:#99CCFF; }"); out.println (".pred { background-color:#FFFF66 }"); out.close (); //PRED css LabelAlphabet dict = extraction.getLabelAlphabet (); String[] fields = determineFieldNames (dict); String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1); out = new PrintWriter (new FileWriter (new File (directory, DOC_ERRS_PRED_CSS_FNAME))); out.println (".class_legend { border-style: dashed; border-width: 2px; padding: 10px; padding-top: 0ex; float: right; margin:2em; }"); out.println (".tf_legend { visibility: hidden; }"); for (int i = 0; i < fields.length; i++) { out.println (".pred_"+fields[i]+" { background-color:"+colors[i]+"; }"); } out.close (); //TRUE css out = new PrintWriter (new FileWriter (new File (directory, DOC_ERRS_TRUE_CSS_FNAME))); out.println (".class_legend { border-style: dashed; border-width: 2px; padding: 10px; padding-top: 0ex; float: right; margin:2em; }"); out.println (".tf_legend { visibility: hidden; }"); for (int i = 0; i < fields.length; i++) { out.println (".true_"+fields[i]+" { background-color:"+colors[i]+"; }"); } out.close (); } private static void outputDocuments (File directory, Extraction extraction) throws IOException { for (int i = 0; i < extraction.getNumDocuments (); i++) { PrintWriter out = new PrintWriter (new FileWriter (new File (directory, "extraction"+i+".html"))); outputOneDocument (out, extraction.getDocumentExtraction (i)); out.close (); } } private static void outputOneDocument (PrintWriter out, DocumentExtraction docExtr) { String name = docExtr.getName (); out.println ("<HTML><HEAD><TITLE>"+name+": Extraction from Document</TITLE>"); out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_CSS_FNAME+"\" title=\"Agreement\" />"); out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_PRED_CSS_FNAME+"\" title=\"Pred\" />"); out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_TRUE_CSS_FNAME+"\" title=\"True\" />"); out.println ("</HEAD><BODY>"); outputClassLegend (out, docExtr.getExtractedSpans ().getLabeledSpan (0).getLabel ().getLabelAlphabet ()); outputRightWrongLegend (out); DualLabeledSpans spans = intersectSpans (docExtr); for (int i = 0; i < spans.size(); i++) { LabeledSpan predSpan = spans.get (i, 0); LabeledSpan trueSpan = spans.get (i, 1); Label predLabel = predSpan.getLabel (); Label trueLabel = trueSpan.getLabel (); boolean predNonBgrnd = !predSpan.isBackground (); boolean trueNonBgrnd = !trueSpan.isBackground (); boolean isBackground = !predNonBgrnd && !trueNonBgrnd; String spanClass = null; if (predNonBgrnd && trueNonBgrnd) { if (predLabel == trueLabel) { spanClass = "correct"; } else { spanClass = "wrong"; } } else if (predNonBgrnd) { spanClass = "pred"; } else if (trueNonBgrnd) { spanClass = "true"; } if (!isBackground) out.print ("<SPAN CLASS=\"pred_"+predLabel+"\">"); if (!isBackground) out.print ("<SPAN CLASS=\"true_"+trueLabel+"\">"); if (spanClass != null) { out.print ("<SPAN CLASS=\""+spanClass+"\">"); } String text = predSpan.getSpan ().getText (); text = text.replaceAll ("<", "<"); text = text.replaceAll ("\n", "\n<P>"); out.print (text); if (spanClass != null) { out.print ("</SPAN>"); } if (!isBackground) out.print ("</SPAN></SPAN>"); out.println (); } out.println ("</BODY></HTML>"); } private static void outputRightWrongLegend (PrintWriter out) { out.println ("<DIV CLASS=\"tf_legend\"><B>LEGEND</B><BR>"); out.println ("<SPAN CLASS='correct'>Correct</SPAN><BR />"); out.println ("<SPAN CLASS='wrong'>Wrong</SPAN><BR />"); out.println ("<SPAN CLASS='true'>False Negative</SPAN> (True field but predicted background)<BR />"); out.println ("<SPAN CLASS='pred'>False Positive</SPAN> (True background but predicted field)<BR />"); out.println ("</DIV>"); } private static void outputClassLegend (PrintWriter out, LabelAlphabet dict) { out.println ("<DIV CLASS=\"class_legend\">"); out.println ("<H4>LEGEND</H4>"); String[] fields = determineFieldNames (dict); String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1); for (int i = 0; i < fields.length; i++) { out.println ("<SPAN STYLE=\"background-color:"+colors[i]+"\">"+fields[i]+"</SPAN><BR />"); } out.println ("</DIV>"); } private static String[] determineFieldNames (LabelAlphabet dict) { List l = new ArrayList (); for (int i = 0; i < dict.size (); i++) { String lname = dict.lookupLabel (i).toString (); if (!lname.startsWith ("B-") && !lname.startsWith ("I-")) { l.add (lname); } } return (String[]) l.toArray (new String [l.size ()]); } private static DualLabeledSpans intersectSpans (DocumentExtraction docExtr) { int predIdx = 0; int trueIdx = 0; LabeledSpans trueSpans = docExtr.getTargetSpans (); LabeledSpans predSpans = docExtr.getExtractedSpans (); LabeledSpans retPredSpans = new LabeledSpans (predSpans.getDocument ()); LabeledSpans retTrueSpans = new LabeledSpans (predSpans.getDocument ()); while ((predIdx < predSpans.size()) && (trueIdx < trueSpans.size ())) { LabeledSpan predSpan = predSpans.getLabeledSpan (predIdx); LabeledSpan trueSpan = trueSpans.getLabeledSpan (trueIdx); LabeledSpan newPredSpan = (LabeledSpan) predSpan.intersection (trueSpan); LabeledSpan newTrueSpan = (LabeledSpan) trueSpan.intersection (predSpan); retPredSpans.add (newPredSpan); retTrueSpans.add (newTrueSpan); if (predSpan.getEndIdx () <= trueSpan.getEndIdx ()) { predIdx++; } if (trueSpan.getEndIdx () <= predSpan.getEndIdx ()) { trueIdx++; } } assert (retPredSpans.size() == retTrueSpans.size()); return new DualLabeledSpans (retPredSpans, retTrueSpans); } private static void outputIndex (File directory, Extraction extraction) throws IOException { PrintWriter out = new PrintWriter (new FileWriter (new File (directory, "index.html"))); out.println ("<HTML><HEAD><TITLE>Extraction Results</TITLE></HEAD><BODY><OL>"); for (int i = 0; i < extraction.getNumDocuments(); i++) { String name = extraction.getDocumentExtraction (i).getName (); out.println (" <LI><A HREF=\"extraction"+i+".html\">"+name+"</A></LI>"); } out.println ("</OL></BODY></HTML>"); out.close (); } }