/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package cc.mallet.classify.tui;

import java.util.logging.*;
import java.util.Iterator;
import java.util.Random;
import java.util.BitSet;
import java.util.ArrayList;
import java.util.Collections;
import java.io.*;

import cc.mallet.classify.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.types.*;
import cc.mallet.util.*;

/**
 * A command-line tool for manipulating InstanceLists.  For example,
 * reducing the feature space by information gain.
 *
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */
public class Vectors2Vectors {

    private static Logger logger = MalletLogger.getLogger(Vectors2Vectors.class.getName());

    static CommandOption.File inputFile = new CommandOption.File
        (Vectors2Vectors.class, "input", "FILE", true, new File("-"),
         "Read the instance list from this file; Using - indicates stdin.", null);

    static CommandOption.File outputFile = new CommandOption.File
        (Vectors2Vectors.class, "output", "FILE", true, new File("-"),
         "Write the pruned instance list to this file (use --training-file etc. if you are splitting the list); Using - indicates stdout.", null);

    static CommandOption.File trainingFile = new CommandOption.File
        (Vectors2Vectors.class, "training-file", "FILE", true, new File("training.vectors"),
         "Write the training set instance list to this file (or use --output if you are only pruning features); Using - indicates stdout.", null);

    static CommandOption.File testFile = new CommandOption.File
        (Vectors2Vectors.class, "testing-file", "FILE", true, new File("test.vectors"),
         "Write the test set instance list to this file; Using - indicates stdout.", null);

    static CommandOption.File validationFile = new CommandOption.File
        (Vectors2Vectors.class, "validation-file", "FILE", true, new File("validation.vectors"),
         "Write the validation set instance list to this file; Using - indicates stdout.", null);

    static CommandOption.Double trainingProportion = new CommandOption.Double
        (Vectors2Vectors.class, "training-portion", "DECIMAL", true, 1.0,
         "The fraction of the instances that should be used for training.", null);

    static CommandOption.Double validationProportion = new CommandOption.Double
        (Vectors2Vectors.class, "validation-portion", "DECIMAL", true, 0.0,
         "The fraction of the instances that should be used for validation.", null);

    static CommandOption.Integer randomSeed = new CommandOption.Integer
        (Vectors2Vectors.class, "random-seed", "INTEGER", true, 0,
         "The random seed for randomly selecting a proportion of the instance list for training.", null);

    static CommandOption.Integer pruneInfogain = new CommandOption.Integer
        (Vectors2Vectors.class, "prune-infogain", "N", false, 0,
         "Reduce features to the top N by information gain.", null);

    static CommandOption.Integer pruneCount = new CommandOption.Integer
        (Vectors2Vectors.class, "prune-count", "N", false, 0,
         "Reduce features to those that occur more than N times.", null);

    static CommandOption.Boolean vectorToSequence = new CommandOption.Boolean
        (Vectors2Vectors.class, "vector-to-sequence", "[TRUE|FALSE]", false, false,
         "Convert FeatureVector's to FeatureSequence's.", null);
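    // Illustrative invocations (sketches only; the exact launcher command depends on how
    // MALLET is installed, so only the class name and the flags defined in this class are
    // assumed here):
    //
    //   Keep only features occurring more than 5 times:
    //     ... Vectors2Vectors --input data.vectors --output pruned.vectors --prune-count 5
    //
    //   Split into 80% training, 10% validation, 10% test
    //   (writes training.vectors, test.vectors, and validation.vectors by default):
    //     ... Vectors2Vectors --input data.vectors --training-portion 0.8 --validation-portion 0.1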
    static CommandOption.Boolean hideTargets = new CommandOption.Boolean
        (Vectors2Vectors.class, "hide-targets", "[TRUE|FALSE]", false, false,
         "Hide targets.", null);

    static CommandOption.Boolean revealTargets = new CommandOption.Boolean
        (Vectors2Vectors.class, "reveal-targets", "[TRUE|FALSE]", false, false,
         "Reveal targets.", null);

    public static void main (String[] args) throws FileNotFoundException, IOException {

        // Process the command-line options
        CommandOption.setSummary (Vectors2Vectors.class,
                                  "A tool for manipulating instance lists of feature vectors.");
        CommandOption.process (Vectors2Vectors.class, args);

        // Print some helpful messages for error cases
        if (args.length == 0) {
            CommandOption.getList(Vectors2Vectors.class).printUsage(false);
            System.exit (-1);
        }

        Random r = randomSeed.wasInvoked() ? new Random (randomSeed.value) : new Random ();
        double t = trainingProportion.value;
        double v = validationProportion.value;
        logger.info ("Training portion = "+t);
        logger.info ("Validation portion = "+v);
        logger.info ("Testing portion = "+(1-v-t));
        logger.info ("Prune info gain = "+pruneInfogain.value);
        logger.info ("Prune count = "+pruneCount.value);

        // Read the InstanceList
        InstanceList instances = InstanceList.load (inputFile.value);

        if (t == 1.0 && !vectorToSequence.value &&
            ! (pruneInfogain.wasInvoked() || pruneCount.wasInvoked()) &&
            ! (hideTargets.wasInvoked() || revealTargets.wasInvoked())) {
            logger.warning("Vectors2Vectors was invoked, but did not change anything");
            instances.save(trainingFile.value());
            System.exit(0);
        }

        if (pruneInfogain.wasInvoked() || pruneCount.wasInvoked()) {

            // Are we also splitting the instances?
            //  Current code doesn't want to do this, so I'm
            //  not changing it, but I don't know a reason. -DM
            if (t != 1.0) {
                throw new UnsupportedOperationException("Infogain/count processing of test or validation lists not yet supported.");
            }

            if (pruneCount.value > 0) {

                // Check which type of data element the instances contain
                Instance firstInstance = instances.get(0);

                if (firstInstance.getData() instanceof FeatureSequence) {
                    // Version for feature sequences

                    Alphabet oldAlphabet = instances.getDataAlphabet();
                    Alphabet newAlphabet = new Alphabet();

                    // It's necessary to create a new instance list in
                    //  order to make sure that the data alphabet is correct.
                    Noop newPipe = new Noop (newAlphabet, instances.getTargetAlphabet());
                    InstanceList newInstanceList = new InstanceList (newPipe);

                    // Iterate over the instances in the old list, adding
                    //  up occurrences of features.
                    int numFeatures = oldAlphabet.size();
                    double[] counts = new double[numFeatures];

                    for (int ii = 0; ii < instances.size(); ii++) {
                        Instance instance = instances.get(ii);
                        FeatureSequence fs = (FeatureSequence) instance.getData();

                        fs.addFeatureWeightsTo(counts);
                    }

                    Instance instance, newInstance;

                    // Next, iterate over the same list again, adding
                    //  each instance to the new list after pruning.
                    while (instances.size() > 0) {
                        instance = instances.get(0);
                        FeatureSequence fs = (FeatureSequence) instance.getData();

                        fs.prune(counts, newAlphabet, pruneCount.value);

                        newInstanceList.add(newPipe.instanceFrom(new Instance(fs, instance.getTarget(),
                                                                              instance.getName(),
                                                                              instance.getSource())));
                        instances.remove(0);
                    }

                    logger.info("features: " + oldAlphabet.size() + " -> " + newAlphabet.size());

                    // Make the new list the official list.
                    instances = newInstanceList;
                }
                else if (firstInstance.getData() instanceof FeatureVector) {
                    // Version for FeatureVector

                    Alphabet alpha2 = new Alphabet ();
                    Noop pipe2 = new Noop (alpha2, instances.getTargetAlphabet());
                    InstanceList instances2 = new InstanceList (pipe2);
                    int numFeatures = instances.getDataAlphabet().size();
                    double[] counts = new double[numFeatures];

                    for (int ii = 0; ii < instances.size(); ii++) {
                        Instance instance = instances.get(ii);
                        FeatureVector fv = (FeatureVector) instance.getData();
                        fv.addTo(counts);
                    }

                    BitSet bs = new BitSet(numFeatures);

                    for (int fi = 0; fi < numFeatures; fi++) {
                        if (counts[fi] > pruneCount.value) {
                            bs.set(fi);
                        }
                    }

                    logger.info ("Pruning "+(numFeatures-bs.cardinality())+" features out of "+numFeatures
                                 +"; leaving "+(bs.cardinality())+" features.");

                    FeatureSelection fs = new FeatureSelection (instances.getDataAlphabet(), bs);

                    for (int ii = 0; ii < instances.size(); ii++) {
                        Instance instance = instances.get(ii);

                        FeatureVector fv = (FeatureVector) instance.getData();
                        FeatureVector fv2 = FeatureVector.newFeatureVector (fv, alpha2, fs);

                        instances2.add(new Instance(fv2, instance.getTarget(), instance.getName(), instance.getSource()),
                                       instances.getInstanceWeight(ii));
                        instance.unLock();
                        instance.setData(null); // So it can be freed by the garbage collector
                    }
                    instances = instances2;
                }
                else {
                    throw new UnsupportedOperationException("Pruning features from " +
                                                            firstInstance.getClass().getName() +
                                                            " is not currently supported");
                }
            }

            if (pruneInfogain.value > 0) {
                Alphabet alpha2 = new Alphabet ();
                Noop pipe2 = new Noop (alpha2, instances.getTargetAlphabet());
                InstanceList instances2 = new InstanceList (pipe2);
                InfoGain ig = new InfoGain (instances);
                FeatureSelection fs = new FeatureSelection (ig, pruneInfogain.value);

                for (int ii = 0; ii < instances.size(); ii++) {
                    Instance instance = instances.get(ii);
                    FeatureVector fv = (FeatureVector) instance.getData();
                    FeatureVector fv2 = FeatureVector.newFeatureVector (fv, alpha2, fs);
                    instance.unLock();
                    instance.setData(null); // So it can be freed by the garbage collector
                    instances2.add(pipe2.instanceFrom(new Instance(fv2, instance.getTarget(), instance.getName(), instance.getSource())),
                                   instances.getInstanceWeight(ii));
                }
                instances = instances2;
            }

            if (vectorToSequence.value) {
                // Convert FeatureVector's to FeatureSequence's by simply randomizing the order
                // of all the word occurrences, including repetitions due to values larger than 1.
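                // For example (illustrative only), a vector {"cat"=2.0, "dog"=1.0} becomes a
                // sequence containing "cat" twice and "dog" once, in shuffled order.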
                Alphabet alpha = instances.getDataAlphabet();
                Noop pipe2 = new Noop (alpha, instances.getTargetAlphabet());
                InstanceList instances2 = new InstanceList (pipe2);

                for (int ii = 0; ii < instances.size(); ii++) {
                    Instance instance = instances.get(ii);
                    FeatureVector fv = (FeatureVector) instance.getData();
                    ArrayList seq = new ArrayList();
                    for (int loc = 0; loc < fv.numLocations(); loc++)
                        for (int count = 0; count < fv.valueAtLocation(loc); count++)
                            seq.add (new Integer(fv.indexAtLocation(loc)));
                    Collections.shuffle(seq);
                    int[] indices = new int[seq.size()];
                    for (int i = 0; i < indices.length; i++)
                        indices[i] = ((Integer)seq.get(i)).intValue();
                    FeatureSequence fs = new FeatureSequence (alpha, indices);
                    instance.unLock();
                    instance.setData(null); // So it can be freed by the garbage collector
                    instances2.add(pipe2.instanceFrom(new Instance(fs, instance.getTarget(), instance.getName(), instance.getSource())),
                                   instances.getInstanceWeight(ii));
                }
                instances = instances2;
            }

            if (outputFile.wasInvoked()) {
                writeInstanceList (instances, outputFile.value());
            }
            else if (trainingFile.wasInvoked()) {
                writeInstanceList (instances, trainingFile.value());
            }
            else {
                throw new IllegalArgumentException("You must specify a file to write to, using --output [filename]");
            }
        }
        else if (vectorToSequence.value) {
            // Convert FeatureVector's to FeatureSequence's by simply randomizing the order
            // of all the word occurrences, including repetitions due to values larger than 1.
            Alphabet alpha = instances.getDataAlphabet();
            Noop pipe2 = new Noop (alpha, instances.getTargetAlphabet());
            InstanceList instances2 = new InstanceList (pipe2);

            for (int ii = 0; ii < instances.size(); ii++) {
                Instance instance = instances.get(ii);
                FeatureVector fv = (FeatureVector) instance.getData();
                ArrayList seq = new ArrayList();
                for (int loc = 0; loc < fv.numLocations(); loc++)
                    for (int count = 0; count < fv.valueAtLocation(loc); count++)
                        seq.add (new Integer(fv.indexAtLocation(loc)));
                Collections.shuffle(seq);
                int[] indices = new int[seq.size()];
                for (int i = 0; i < indices.length; i++)
                    indices[i] = ((Integer)seq.get(i)).intValue();
                FeatureSequence fs = new FeatureSequence (alpha, indices);
                instance.unLock();
                instance.setData(null); // So it can be freed by the garbage collector
                instances2.add(pipe2.instanceFrom(new Instance(fs, instance.getTarget(), instance.getName(), instance.getSource())),
                               instances.getInstanceWeight(ii));
            }
            instances = instances2;

            if (outputFile.wasInvoked()) {
                writeInstanceList (instances, outputFile.value());
            }
        }
        else if (trainingProportion.wasInvoked() || validationProportion.wasInvoked()) {

            // Split into three lists...
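            // The proportions {t, 1-t-v, v} passed to split() produce the training,
            // test, and validation lists, in that order.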
            InstanceList[] instanceLists =
                instances.split (r, new double[] {t, 1-t-v, v});

            // And write them out
            if (instanceLists[0].size() > 0)
                writeInstanceList(instanceLists[0], trainingFile.value());
            if (instanceLists[1].size() > 0)
                writeInstanceList(instanceLists[1], testFile.value());
            if (instanceLists[2].size() > 0)
                writeInstanceList(instanceLists[2], validationFile.value());
        }
        else if (hideTargets.wasInvoked()) {
            Iterator<Instance> iter = instances.iterator();
            while (iter.hasNext()) {
                Instance instance = iter.next();
                instance.unLock();
                instance.setProperty("target", instance.getTarget());
                instance.setTarget(null);
                instance.lock();
            }

            if (outputFile.wasInvoked()) {
                writeInstanceList (instances, outputFile.value());
            }
        }
        else if (revealTargets.wasInvoked()) {
            Iterator<Instance> iter = instances.iterator();
            while (iter.hasNext()) {
                Instance instance = iter.next();
                instance.unLock();
                instance.setTarget(instance.getProperty("target"));
                instance.lock();
            }

            if (outputFile.wasInvoked()) {
                writeInstanceList (instances, outputFile.value());
            }
        }
    }

    private static void writeInstanceList(InstanceList instances, File file)
        throws FileNotFoundException, IOException {

        logger.info ("Writing instance list to "+file);
        instances.save(file);
    }
}