// DocumentViewer.java example
//
// Explorer
// topic-modeling-master
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;


import java.io.File;
import java.io.PrintWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.util.ColorUtils;

/**
 * Diagnostic class that outputs HTML pages that allow you to view errors on a more
 *  global, per-instance basis.
 *
 * Created: Mar 30, 2005
 *
 * @author <A HREF="mailto:casutton@cs.umass.edu">casutton@cs.umass.edu</A>
 * @version $Id: DocumentViewer.java,v 1.1 2007/10/22 21:37:44 mccallum Exp $
 */
public class DocumentViewer {

  private static final String DOC_ERRS_CSS_FNAME = "docerrs.css";
  private static final String DOC_ERRS_PRED_CSS_FNAME = "docerrs-by-pred.css";
  private static final String DOC_ERRS_TRUE_CSS_FNAME = "docerrs-by-true.css";
  private static final double SATURATION = 0.4;

  /**
   * Two parallel span lists of equal length: index 0 holds the predicted spans,
   *  index 1 the true spans (see {@link #intersectSpans}).
   */
  private static class DualLabeledSpans {

    DualLabeledSpans (LabeledSpans ls1, LabeledSpans ls2) {
      ls = new LabeledSpans[] { ls1, ls2 };
    }

    private final LabeledSpans[] ls;

    /** Number of aligned positions; taken from the first list. */
    int size () { return ls[0].size(); }

    /**
     * @param t Position within the aligned lists
     * @param i Which list to read: 0 for the first list, 1 for the second
     */
    LabeledSpan get (int t, int i) { return ls[i].getLabeledSpan (t); }
  }

  /**
   * Writes several HTML files describing a given extraction.  Each HTML file shows an entire
   *  document, with the extracted fields color-coded.
   * <p>
   * The files written are <tt>index.html</tt>, the three stylesheets
   *  (<tt>docerrs.css</tt>, <tt>docerrs-by-pred.css</tt>, <tt>docerrs-by-true.css</tt>),
   *  and one <tt>extraction<i>N</i>.html</tt> per document.
   *
   * @param directory Directory to write files to
   * @param extraction Extraction to describe
   * @throws IOException If one of the output files cannot be opened.  Errors that occur
   *   while writing to an already-open file are not reported, because
   *   {@link PrintWriter} does not throw on write failures.
   */
  public static void writeExtraction (File directory, Extraction extraction) throws IOException
  {
    outputIndex (directory, extraction);
    outputStylesheets (directory, extraction);
    outputDocuments (directory, extraction);
  }

  /**
   * Writes the three stylesheets that the document pages link to: one that colors
   *  spans by agreement, one that colors by predicted label, and one that colors by
   *  true label.
   */
  private static void outputStylesheets (File directory, Extraction extraction) throws IOException
  {
    // ERRS css
    PrintWriter out = openWriter (directory, DOC_ERRS_CSS_FNAME);
    try {
      out.println (".tf_legend { border-style: dashed; border-width: 2px; padding: 10px; padding-top: 0ex; float: right; margin:2em; }");
      out.println (".class_legend { visibility: hidden; }");
      out.println (".correct { background-color:#33FF33; }");
      out.println (".wrong { background-color:pink }");
      out.println (".true { background-color:#99CCFF; }");
      out.println (".pred { background-color:#FFFF66 }");
    } finally {
      out.close ();
    }

    // The PRED and TRUE sheets share field names and colors; only the class prefix differs.
    LabelAlphabet dict = extraction.getLabelAlphabet ();
    String[] fields = determineFieldNames (dict);
    String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1);
    writeLabelStylesheet (directory, DOC_ERRS_PRED_CSS_FNAME, "pred_", fields, colors);
    writeLabelStylesheet (directory, DOC_ERRS_TRUE_CSS_FNAME, "true_", fields, colors);
  }

  /**
   * Writes one per-label stylesheet: shows the class legend, hides the right/wrong
   *  legend, and emits one background-color rule per field.
   *
   * @param directory Directory to write the stylesheet to
   * @param fname File name of the stylesheet
   * @param classPrefix Prefix of the generated CSS class names ("pred_" or "true_")
   * @param fields Field names; <tt>fields[i]</tt> is colored with <tt>colors[i]</tt>
   * @param colors CSS color values, parallel to <tt>fields</tt>
   */
  private static void writeLabelStylesheet (File directory, String fname, String classPrefix,
                                            String[] fields, String[] colors) throws IOException
  {
    PrintWriter out = openWriter (directory, fname);
    try {
      out.println (".class_legend { border-style: dashed; border-width: 2px; padding: 10px; padding-top: 0ex; float: right; margin:2em; }");
      out.println (".tf_legend { visibility: hidden; }");
      for (int i = 0; i < fields.length; i++) {
        out.println ("."+classPrefix+fields[i]+" { background-color:"+colors[i]+"; }");
      }
    } finally {
      out.close ();
    }
  }

  /** Writes one <tt>extraction<i>N</i>.html</tt> page per document in the extraction. */
  private static void outputDocuments (File directory, Extraction extraction) throws IOException
  {
    for (int i = 0; i < extraction.getNumDocuments (); i++) {
      PrintWriter out = openWriter (directory, "extraction"+i+".html");
      try {
        outputOneDocument (out, extraction.getDocumentExtraction (i));
      } finally {
        // Close even if rendering throws, so the file handle is not leaked.
        out.close ();
      }
    }
  }

  /**
   * Renders one document as an HTML page.  Every aligned span is wrapped in up to
   *  three nested SPAN elements (predicted label, true label, agreement class) so that
   *  each of the three linked stylesheets can color it.
   */
  private static void outputOneDocument (PrintWriter out, DocumentExtraction docExtr)
  {
    String name = escapeHtml (docExtr.getName ());
    out.println ("<HTML><HEAD><TITLE>"+name+": Extraction from Document</TITLE>");
    out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_CSS_FNAME+"\" title=\"Agreement\" />");
    out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_PRED_CSS_FNAME+"\" title=\"Pred\" />");
    out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_TRUE_CSS_FNAME+"\" title=\"True\" />");
    out.println ("</HEAD><BODY>");

    // The label alphabet is read off the first extracted span, so a document with no
    // extracted spans has no alphabet to show; skip the class legend instead of failing.
    LabeledSpans extracted = docExtr.getExtractedSpans ();
    if (extracted.size () > 0) {
      outputClassLegend (out, extracted.getLabeledSpan (0).getLabel ().getLabelAlphabet ());
    }
    outputRightWrongLegend (out);

    DualLabeledSpans spans = intersectSpans (docExtr);
    for (int i = 0; i < spans.size(); i++) {
      LabeledSpan predSpan = spans.get (i, 0);
      LabeledSpan trueSpan = spans.get (i, 1);

      Label predLabel = predSpan.getLabel ();
      Label trueLabel = trueSpan.getLabel ();

      // Reference comparison is kept as in the original code.
      // NOTE(review): this assumes both labels come from the same LabelAlphabet and that
      // the alphabet hands out a single Label object per entry -- confirm.
      String spanClass = agreementClass (!predSpan.isBackground (), !trueSpan.isBackground (),
                                         predLabel == trueLabel);
      // agreementClass returns null exactly when both spans are background.
      boolean highlighted = (spanClass != null);

      if (highlighted) {
        out.print ("<SPAN CLASS=\"pred_"+predLabel+"\">");
        out.print ("<SPAN CLASS=\"true_"+trueLabel+"\">");
        out.print ("<SPAN CLASS=\""+spanClass+"\">");
      }

      // Escape before inserting markup, so the <P> tags added below stay real tags.
      String text = escapeHtml (predSpan.getSpan ().getText ());
      text = text.replaceAll ("\n", "\n<P>");
      out.print (text);

      if (highlighted) {
        out.print ("</SPAN></SPAN></SPAN>");
      }
      out.println ();
    }

    out.println ("</BODY></HTML>");
  }

  /**
   * Chooses the CSS class of the agreement stylesheet for one aligned span pair.
   *
   * @param predNonBgrnd Whether the predicted span is a non-background span
   * @param trueNonBgrnd Whether the true span is a non-background span
   * @param sameLabel Whether predicted and true labels agree; only consulted when both
   *   spans are non-background
   * @return "correct", "wrong", "pred" (predicted only), "true" (true only), or
   *   <tt>null</tt> when both spans are background
   */
  private static String agreementClass (boolean predNonBgrnd, boolean trueNonBgrnd, boolean sameLabel)
  {
    if (predNonBgrnd && trueNonBgrnd) {
      return sameLabel ? "correct" : "wrong";
    }
    if (predNonBgrnd) {
      return "pred";
    }
    if (trueNonBgrnd) {
      return "true";
    }
    return null;
  }

  /** Writes the legend explaining the colors of the agreement stylesheet. */
  private static void outputRightWrongLegend (PrintWriter out)
  {
    out.println ("<DIV CLASS=\"tf_legend\"><B>LEGEND</B><BR>");
    out.println ("<SPAN CLASS='correct'>Correct</SPAN><BR />");
    out.println ("<SPAN CLASS='wrong'>Wrong</SPAN><BR />");
    out.println ("<SPAN CLASS='true'>False Negative</SPAN> (True field but predicted background)<BR />");
    out.println ("<SPAN CLASS='pred'>False Positive</SPAN> (True background but predicted field)<BR />");
    out.println ("</DIV>");
  }

  /**
   * Writes the legend mapping each field name to its color.  Colors are generated
   *  with the same call as in {@link #outputStylesheets}.
   */
  private static void outputClassLegend (PrintWriter out, LabelAlphabet dict)
  {
    out.println ("<DIV CLASS=\"class_legend\">");
    out.println ("<H4>LEGEND</H4>");
    String[] fields = determineFieldNames (dict);
    String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1);
    for (int i = 0; i < fields.length; i++) {
      out.println ("<SPAN STYLE=\"background-color:"+colors[i]+"\">"+escapeHtml (fields[i])+"</SPAN><BR />");
    }
    out.println ("</DIV>");
  }

  /**
   * Returns the names of all labels in <tt>dict</tt>, in alphabet order, except those
   *  whose name starts with "B-" or "I-".
   */
  private static String[] determineFieldNames (LabelAlphabet dict)
  {
    // Raw List retained to match the file's pre-generics style.
    List names = new ArrayList ();
    for (int i = 0; i < dict.size (); i++) {
      String lname = dict.lookupLabel (i).toString ();
      if (!lname.startsWith ("B-") && !lname.startsWith ("I-")) {
        names.add (lname);
      }
    }
    return (String[]) names.toArray (new String [names.size ()]);
  }

  /**
   * Aligns the predicted and true span lists of a document into two lists of equal
   *  length, by walking both lists in parallel and intersecting the current spans.
   * <p>
   * The walk stops as soon as either list is exhausted; spans remaining in the other
   *  list are not emitted.
   *
   * @return The aligned lists, predicted spans at index 0 and true spans at index 1
   */
  private static DualLabeledSpans intersectSpans (DocumentExtraction docExtr)
  {
    int predIdx = 0;
    int trueIdx = 0;
    LabeledSpans trueSpans = docExtr.getTargetSpans ();
    LabeledSpans predSpans = docExtr.getExtractedSpans ();

    // Both result lists are created over the predicted spans' document.
    LabeledSpans retPredSpans = new LabeledSpans (predSpans.getDocument ());
    LabeledSpans retTrueSpans = new LabeledSpans (predSpans.getDocument ());

    while ((predIdx < predSpans.size()) && (trueIdx < trueSpans.size ())) {
      LabeledSpan predSpan = predSpans.getLabeledSpan (predIdx);
      LabeledSpan trueSpan = trueSpans.getLabeledSpan (trueIdx);

      // Intersect in both directions so that each result list gets its own span object.
      // NOTE(review): presumably each intersection keeps the receiver's label -- confirm
      // against LabeledSpan.intersection.
      LabeledSpan newPredSpan = (LabeledSpan) predSpan.intersection (trueSpan);
      LabeledSpan newTrueSpan = (LabeledSpan) trueSpan.intersection (predSpan);
      retPredSpans.add (newPredSpan);
      retTrueSpans.add (newTrueSpan);

      // Advance whichever span ends first; advance both when they end together.
      if (predSpan.getEndIdx () <= trueSpan.getEndIdx ()) {
        predIdx++;
      }
      if (trueSpan.getEndIdx () <= predSpan.getEndIdx ()) {
        trueIdx++;
      }
    }

    assert (retPredSpans.size() == retTrueSpans.size());

    return new DualLabeledSpans (retPredSpans, retTrueSpans);
  }

  /** Writes <tt>index.html</tt>, an ordered list linking to every document page. */
  private static void outputIndex (File directory, Extraction extraction) throws IOException
  {
    PrintWriter out = openWriter (directory, "index.html");
    try {
      out.println ("<HTML><HEAD><TITLE>Extraction Results</TITLE></HEAD><BODY><OL>");
      for (int i = 0; i < extraction.getNumDocuments(); i++) {
        String name = escapeHtml (extraction.getDocumentExtraction (i).getName ());
        out.println ("  <LI><A HREF=\"extraction"+i+".html\">"+name+"</A></LI>");
      }
      out.println ("</OL></BODY></HTML>");
    } finally {
      out.close ();
    }
  }

  /** Opens a writer on the file <tt>fname</tt> inside <tt>directory</tt>. */
  private static PrintWriter openWriter (File directory, String fname) throws IOException
  {
    return new PrintWriter (new FileWriter (new File (directory, fname)));
  }

  /**
   * Escapes the characters that are significant in HTML text content, so that
   *  document text and names are displayed literally instead of being parsed as markup.
   *
   * @param text Text to escape; <tt>null</tt> is rendered as the string "null", which
   *   is what string concatenation would have produced
   * @return <tt>text</tt> with <tt>&amp;</tt>, <tt>&lt;</tt> and <tt>&gt;</tt> replaced
   *   by character entities
   */
  private static String escapeHtml (String text)
  {
    String escaped = String.valueOf (text);
    // The ampersand must go first, otherwise the entities produced below get re-escaped.
    escaped = escaped.replaceAll ("&", "&amp;");
    escaped = escaped.replaceAll ("<", "&lt;");
    escaped = escaped.replaceAll (">", "&gt;");
    return escaped;
  }
}