package LBJ2.parse; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.EOFException; import java.io.FileInputStream; import java.util.zip.ZipFile; import java.util.zip.ZipInputStream; import LBJ2.classify.FeatureVector; import LBJ2.learn.Learner; import LBJ2.parse.FoldSeparator; import LBJ2.util.ExceptionlessInputStream; /** * This parser returns an array of arrays representing each example. The * first array represents the integer keys of the example's features; the * second array holds the values of those features. The third array holds the * example's label(s), and the fourth array holds the values of those labels. * These arrays are read in through files, and the paths to these files are * passed in through the constructor. * * <p> When run as a stand-alone program, this class takes the names of * example, lexicon, and model files as input and prints all the feature * vectors in the dataset to <code>STDOUT</code>. * * @author Michael Paul **/ public class ArrayFileParser implements Parser { /** Reader for file currently being parsed. */ protected DataInputStream in; /** The name of the file to parse. */ protected String exampleFileName; /** A single array from which all examples can be parsed. */ protected byte[] exampleData; /** Whether or not the input stream is zipped. */ protected boolean zipped; /** Whether the returned example arrays should include pruned features. */ protected boolean includePruned = false; /** * Initializes the parser with a file name assuming the input stream is not * zipped. * * @param exampleFile The name of the file containing the examples. **/ public ArrayFileParser(String exampleFile) { this(exampleFile, true); } /** * Initializes the parser with a file name, specifying whether the data is * zipped. * * @param exampleFile The name of the file containing the examples. * @param zip Whether or not the input stream is zipped. **/ public ArrayFileParser(String exampleFile, boolean zip) { exampleFileName = exampleFile; zipped = zip; reset(); } /** * Initializes the parser with a data array assuming the input stream is * not zipped. * * @param data The examples can be parsed out of this array. **/ public ArrayFileParser(byte[] data) { this(data, true); } /** * Initializes the parser with a data array, specifying whether the data is * zipped. * * @param data The examples can be parsed out of this array. * @param zip Whether or not the input stream is zipped. **/ public ArrayFileParser(byte[] data, boolean zip) { exampleData = data; zipped = zip; reset(); } /** Setter for {@link #includePruned}. */ public void setIncludePruned(boolean b) { includePruned = b; } /** * Returns the number of examples left in the example file. This may be * slow to compute as it must read through the entire file and increment * the count. {@link #reset()} is called after the examples are counted. * * @return The number of examples left in the example file. **/ public int getNumExamples() { int result = 0; try { while (true) { int L = in.readInt(); if (L == -1) continue; ++result; in.skipBytes(12 * L); // 4 for label index, 8 for its value L = in.readInt() + in.readInt(); in.skipBytes(12 * L); // 4 for feature index, 8 for its value } } catch (EOFException eof) { } catch (Exception e) { System.err.println("Can't read from '" + exampleFileName + "':"); e.printStackTrace(); System.exit(1); } reset(); return result; } /** * Returns either an <code>Object[]</code> or a {@link FoldSeparator} * deserialized out of the given file. **/ public Object next() { Object[] result = new Object[4]; try { int L = in.readInt(); // A -1 means that there was a fold separator here if (L == -1) return FoldSeparator.separator; else { int[] exampleLabels = new int[L]; double[] labelValues = new double[L]; for (int i = 0; i < L; ++i) { exampleLabels[i] = in.readInt(); labelValues[i] = in.readDouble(); } int Fup = in.readInt(); // # unpruned int Fp = in.readInt(); // # pruned int F = (includePruned) ? (Fup+Fp) : Fup; int[] exampleFeatures = new int[F]; double[] exampleValues = new double[F]; for (int i = 0; i < Fup+Fp; ++i) { int ef = in.readInt(); double ev = in.readDouble(); if (i < F) { exampleFeatures[i] = ef; exampleValues[i] = ev; } } result[0] = exampleFeatures; result[1] = exampleValues; result[2] = exampleLabels; result[3] = labelValues; } } catch (EOFException eof) { result = null; } catch (Exception e) { System.err.println("Can't read from '" + exampleFileName + "':"); e.printStackTrace(); System.exit(1); } return result; } /** Resets the example file stream to the beginning. */ public void reset() { close(); try { if (exampleFileName != null) { if (zipped) { ZipFile zip = new ZipFile(exampleFileName); in = new DataInputStream( new BufferedInputStream( zip.getInputStream( zip.getEntry(ExceptionlessInputStream.zipEntryName)))); } else in = new DataInputStream( new BufferedInputStream( new FileInputStream(exampleFileName))); } else if (zipped) { ZipInputStream zip = new ZipInputStream( new ByteArrayInputStream(exampleData)); zip.getNextEntry(); in = new DataInputStream(new BufferedInputStream(zip)); } else in = new DataInputStream( new ByteArrayInputStream(exampleData)); } catch (Exception e) { System.err.println("Can't open '" + exampleFileName + "' for input:"); e.printStackTrace(); System.exit(1); } } /** Frees any resources this parser may be holding. */ public void close() { if (in == null) return; try { in.close(); } catch (Exception e) { System.err.println("Can't close '" + exampleFileName + "':"); e.printStackTrace(); System.exit(1); } } public static void main(String[] args) { String exFileName = null; String lexFileName = null; String lcFileName = null; try { exFileName = args[0]; lexFileName = args[1]; lcFileName = args[2]; if (args.length > 3) throw new Exception(); } catch (Exception e) { System.err.println( "usage: java LBJ2.parse.ArrayFileParser <example file> <lexicon file> <lc file>"); System.exit(1); } ArrayFileParser parser = new ArrayFileParser(exFileName); Learner learner = Learner.readLearner(lcFileName); learner.readLexicon(lexFileName); for (Object e = parser.next(); e != null; e = parser.next()) { FeatureVector v = new FeatureVector((Object[]) e, learner.getLexicon(), learner.getLabelLexicon()); v.sort(); System.out.println(v); } } }