/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.classify.tui;
import java.util.logging.*;
import java.io.*;
import cc.mallet.classify.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.types.*;
import cc.mallet.util.*;
/**
 * Diagnostic facilities for a vector file: a command-line tool that loads an
 * instance list and prints, depending on the options given, its class labels,
 * its data alphabet, the top features by information gain, and/or the
 * instance-by-feature matrix.
 *
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */
public class Vectors2Info
{
    private static final Logger logger = MalletLogger.getLogger(Vectors2Info.class.getName());

    static CommandOption.File inputFile = new CommandOption.File
        (Vectors2Info.class, "input", "FILE", true, new File("-"),
         "Read the instance list from this file; Using - indicates stdin.", null);

    static CommandOption.Integer printInfogain = new CommandOption.Integer
        (Vectors2Info.class, "print-infogain", "N", false, 0,
         "Print top N words by information gain, sorted.", null);

    static CommandOption.Boolean printLabels = new CommandOption.Boolean
        (Vectors2Info.class, "print-labels", "[TRUE|FALSE]", false, false,
         "Print class labels known to instance list, one per line.", null);

    static CommandOption.Boolean printFeatures = new CommandOption.Boolean
        (Vectors2Info.class, "print-features", "[TRUE|FALSE]", false, false,
         "Print the data alphabet, one feature per line.", null);

    static CommandOption.String printMatrix = new CommandOption.String
        (Vectors2Info.class, "print-matrix", "STRING", false, "sic",
         "Print word/document matrix in the specified format (a|s)(b|i)(n|w|c|e), for (all vs. sparse), (binary vs. integer), (number vs. word vs. combined vs. empty)", null)
        {
            /**
             * Validates the three-character matrix format (a la Rainbow) and stores it.
             * A null argument falls back to the option's default value.
             *
             * @param arg the raw option value, or null to use the default
             * @throws IllegalArgumentException if the value is not exactly
             *         (a|s)(b|i)(n|w|c|e)
             */
            public void parseArg(java.lang.String arg) {
                if (arg == null) arg = this.defaultValue;

                // The length test must come first: && short-circuits, so charAt()
                // is never reached for a string that is too short. Reading the
                // characters before checking the length made short values fail
                // with an index exception instead of the intended error below.
                boolean wellFormed =
                    arg.length() == 3
                    && (arg.charAt(0) == 's' || arg.charAt(0) == 'a')
                    && (arg.charAt(1) == 'b' || arg.charAt(1) == 'i')
                    && (arg.charAt(2) == 'n' || arg.charAt(2) == 'w'
                        || arg.charAt(2) == 'c' || arg.charAt(2) == 'e');

                if (!wellFormed) {
                    throw new IllegalArgumentException("Illegal argument = " + arg + " in --print-matrix=" + arg);
                }
                value = arg;
            }
        };

    /**
     * Command-line entry point. Processes the options, loads the instance list
     * named by --input, and prints each report whose option was requested, in
     * the order: labels, features, information gain, matrix.
     *
     * @param args command-line arguments; with none, usage is printed and the
     *        JVM exits with status -1
     * @throws FileNotFoundException declared for callers; raised by the code
     *         this method calls, not thrown directly here
     * @throws IOException declared for callers; raised by the code this method
     *         calls, not thrown directly here
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        // Process the command-line options
        CommandOption.setSummary(Vectors2Info.class,
            "A tool for printing information about instance lists of feature vectors.");
        CommandOption.process(Vectors2Info.class, args);

        // Print some helpful messages for error cases
        if (args.length == 0) {
            CommandOption.getList(Vectors2Info.class).printUsage(false);
            System.exit(-1);
        }

        // --input is deliberately not enforced: the option defaults to "-",
        // which its help text documents as meaning stdin.

        // Read the InstanceList
        InstanceList instances = InstanceList.load(inputFile.value);

        if (printLabels.value) {
            Alphabet labelAlphabet = instances.getTargetAlphabet();
            for (int i = 0; i < labelAlphabet.size(); i++) {
                System.out.println(labelAlphabet.lookupObject(i));
            }
            System.out.print("\n");
        }

        if (printFeatures.value) {
            Alphabet alphabet = instances.getDataAlphabet();
            for (int i = 0; i < alphabet.size(); i++) {
                System.out.println(alphabet.lookupObject(i));
            }
            System.out.print("\n");
        }

        if (printInfogain.value > 0) {
            InfoGain ig = new InfoGain(instances);
            // Never ask for more ranks than there are features; a larger N
            // would presumably run past the end of the ranking (TODO confirm
            // against InfoGain.getObjectAtRank).
            int numRanks = Math.min(printInfogain.value, instances.getDataAlphabet().size());
            for (int rank = 0; rank < numRanks; rank++) {
                System.out.println("" + rank + " " + ig.getObjectAtRank(rank));
            }
            System.out.print("\n");
        }

        if (printMatrix.wasInvoked()) {
            printInstanceList(instances, printMatrix.value);
        }
    }

    /**
     * Prints an instance list to standard output according to the format string,
     * one instance per line, each line starting with the instance name and target.
     *
     * @param instances the instances to print
     * @param formatString three characters (a|s)(b|i)(n|w|c|e): all vs. sparse,
     *        binary vs. integer, number vs. word vs. combined vs. empty
     * @throws IllegalArgumentException if an instance's data is neither a
     *         FeatureVector nor a FeatureSequence
     */
    private static void printInstanceList(InstanceList instances, String formatString) {
        Alphabet dataAlphabet = instances.getDataAlphabet();
        int numFeatures = dataAlphabet.size();

        // These two format characters do not change between instances.
        boolean dense = formatString.charAt(0) == 'a';
        char identityFormat = formatString.charAt(2);

        for (int i = 0; i < instances.size(); i++) {
            Instance instance = instances.get(i);
            Object data = instance.getData();

            if (data instanceof FeatureVector) {
                FeatureVector fv = (FeatureVector) data;
                System.out.print(instance.getName() + " " + instance.getTarget());

                if (dense) {
                    // Dense: Print all features, even those with value 0.
                    for (int fvi = 0; fvi < numFeatures; fvi++) {
                        printFeature(dataAlphabet.lookupObject(fvi), fvi, fv.value(fvi), formatString);
                    }
                }
                else {
                    // Sparse: Print only the features stored in the vector.
                    for (int location = 0; location < fv.numLocations(); location++) {
                        int fvi = fv.indexAtLocation(location);
                        printFeature(dataAlphabet.lookupObject(fvi), fvi, fv.valueAtLocation(location), formatString);
                    }
                }
            }
            else if (data instanceof FeatureSequence) {
                // Sequences carry no per-feature value, so only the identity
                // part of the format applies; 'e' prints nothing per feature.
                FeatureSequence featureSequence = (FeatureSequence) data;
                StringBuilder output = new StringBuilder();
                output.append(instance.getName() + " " + instance.getTarget());

                for (int position = 0; position < featureSequence.size(); position++) {
                    int featureIndex = featureSequence.getIndexAtPosition(position);
                    if (identityFormat == 'w') {
                        output.append(" " + dataAlphabet.lookupObject(featureIndex));
                    }
                    else if (identityFormat == 'n') {
                        output.append(" " + featureIndex);
                    }
                    else if (identityFormat == 'c') {
                        output.append(" " + dataAlphabet.lookupObject(featureIndex) + ":" + featureIndex);
                    }
                }

                // NOTE: println here plus the println that ends every instance
                // below leaves a blank line after each sequence instance. Kept
                // as-is so existing consumers see unchanged output.
                System.out.println(output);
            }
            else {
                throw new IllegalArgumentException("Printing is supported for FeatureVector and FeatureSequence data, found " + data.getClass());
            }

            System.out.println();
        }

        System.out.println();
    }

    /**
     * Helper for printInstanceList: prints a single feature within an instance,
     * first its identity and then its value, each preceded by a space.
     *
     * @param o the alphabet entry for the feature
     * @param fvi the feature's index in the data alphabet
     * @param featureValue the feature's value in the current instance
     * @param formatString the (a|s)(b|i)(n|w|c|e) format; only the second and
     *        third characters are consulted here
     */
    private static void printFeature(Object o, int fvi, double featureValue, String formatString) {
        // Third character: how to identify the feature (n, w, c, e).
        char identityFormat = formatString.charAt(2);
        if (identityFormat == 'w') { // word
            System.out.print(" " + o);
        } else if (identityFormat == 'n') { // index of word
            System.out.print(" " + fvi);
        } else if (identityFormat == 'c') { // word and index
            System.out.print(" " + o + ":" + fvi);
        }
        // 'e': no word identity, nothing to print.

        // Second character: how to render the value (i, b).
        char valueFormat = formatString.charAt(1);
        if (valueFormat == 'i') { // integer count
            // Round to nearest. floor() rather than a bare (int) cast so that
            // negative values are not truncated toward zero (-1.0 must print
            // as -1, not 0); non-negative values give the same result as before.
            System.out.print(" " + ((int) Math.floor(featureValue + .5)));
        } else if (valueFormat == 'b') { // boolean present/not present
            System.out.print(" " + ((featureValue > 0.5) ? "1" : "0"));
        }
    }
}