package LBJ2.nlp; import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; import java.io.FileReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Random; import LBJ2.parse.Parser; /** * This parser creates and returns labeled arrays of <code>String</code>s, * each representing all the words in a document. It assumes the documents * can be found in the subdirectories of a user supplied directory. The * names of each subdirectory becomes the label for every document it * contains. That label appears as the first element of the returned array. * The documents are returned in a randomized order by default, but that * behavior is configurable. The "words" in each document are computed * simply by splitting on whitespace. * * @author Nick Rizzolo **/ public class WordsInDocumentByDirectory implements Parser { /** The list of all files to be parsed. */ protected List files; /** Points to the next element of {@link #files} to be parsed. */ protected int filesIndex; /** * Creates a new parser that reads all subdirectories and randomizes the * order in which their contents are returned. * * @param directory This directory contains subdirectories (whose names * will be used as labels) which contain the documents to * be parsed. **/ public WordsInDocumentByDirectory(String directory) { this(directory, null); } /** * Creates a new parser that reads all subdirectories except for the named * exceptions and randomizes the order in which their contents are * returned. * * @param directory This directory contains subdirectories (whose names * will be used as labels) which contain the documents to * be parsed. * @param exceptions None of the subdirectories whose names appear in this * array will be parsed. It may be null if there are no * exceptions. **/ public WordsInDocumentByDirectory(String directory, String[] exceptions) { this(directory, exceptions, true); } /** * Creates a new parser that reads all subdirectories except for the named * exceptions. * * @param directory This directory contains subdirectories (whose names * will be used as labels) which contain the documents to * be parsed. * @param exceptions None of the subdirectories whose names appear in this * array will be parsed. It may be null if there are no * exceptions. * @param shuffle Whether or not to randomly shuffle the order in which * examples are returned. **/ public WordsInDocumentByDirectory(String directory, String[] exceptions, boolean shuffle) { this(directory, exceptions, true, -1); } /** * Creates a new parser that reads all subdirectories except for the named * exceptions. * * @param directory This directory contains subdirectories (whose names * will be used as labels) which contain the documents to * be parsed. * @param exceptions None of the subdirectories whose names appear in this * array will be parsed. It may be null if there are no * exceptions. * @param shuffle Whether or not to randomly shuffle the order in which * examples are returned. * @param seed For the random number generator. If set to -1, no * seed is used. **/ public WordsInDocumentByDirectory(String directory, String[] exceptions, boolean shuffle, long seed) { File d = new File(directory); if (!d.exists() || !d.isDirectory()) { System.err.println( "Error: '" + directory + "' does not exist or is not a directory."); new Exception().printStackTrace(); System.exit(1); } final String[] e = exceptions == null ? new String[0] : exceptions; Arrays.sort(e); File[] ds = d.listFiles( new FileFilter() { public boolean accept(File f) { return f.isDirectory() && Arrays.binarySearch(e, f.getName()) < 0; } }); files = new ArrayList(); for (int i = 0; i < ds.length; ++i) { File[] fs = ds[i].listFiles( new FileFilter() { public boolean accept(File f) { return f.isFile(); } }); for (int j = 0; j < fs.length; ++j) files.add(fs[j]); } if (shuffle) { Random random = seed < 0 ? new Random() : new Random(seed); Collections.shuffle(files, random); } } /** Sets {@link #filesIndex} back to 0. */ public void reset() { filesIndex = 0; } /** Returns the next labeled array of words. */ public Object next() { if (filesIndex == files.size()) return null; File current = (File) files.get(filesIndex++); String label = current.getAbsolutePath(); int lastSeparator = label.lastIndexOf(File.separatorChar); label = label.substring( label.lastIndexOf(File.separatorChar, lastSeparator - 1) + 1, lastSeparator); return fileToArray(current, label); } /** Frees any resources this parser may be holding. */ public void close() { } /** * Reads in the specified file, splits it on whitespace, and adds all * resulting words to an array which it returns. The specified label * string appears in the returned array first, before any of the file's * words. * * @param file The file to read in. * @param label The label associated with this file, which should appear * as the first element of the returned array. * @return An array containing <code>label</code> followed by all the words * in the file in the order they appeared. **/ public static String[] fileToArray(File file, String label) { LinkedList words = new LinkedList(); words.add(label); BufferedReader in = openReader(file); for (String line = readLine(in, file); line != null; line = readLine(in, file)) { String[] lineWords = line.split("\\s+"); for (int i = 0; i < lineWords.length; ++i) if (!lineWords[i].matches("^\\s*$")) words.add(lineWords[i]); } closeReader(in, file); return (String[]) words.toArray(new String[words.size()]); } /** * Opens a new input stream reading from the specified file, handling any * exception by reporting the error and exiting the program. * * @param inputFile The file to read from. * @return A reader to read from the input file. **/ static BufferedReader openReader(File inputFile) { BufferedReader in = null; try { in = new BufferedReader(new FileReader(inputFile)); } catch (Exception e) { System.err.println("Can't open '" + inputFile + "' for input: " + e); System.exit(1); } return in; } /** * Reads a single line from the specified input stream, handling any * exception by reporting the error and exiting the program. * * @param in The input stream. * @param inputFile The name of the file being read from (for inclusion in * any error message). * @return A single line of text from the input stream, including a * terminating newline character if any. **/ static String readLine(BufferedReader in, File inputFile) { String line = null; try { line = in.readLine(); } catch (Exception e) { System.err.println("Can't read from '" + inputFile + "': " + e); System.exit(1); } return line; } /** * Closes the specified input stream, handling any exception by reporting * the error and exiting the program. * * @param in The input stream. * @param inputFile The name of the file being read from (for inclusion in * any error message). **/ static void closeReader(BufferedReader in, File inputFile) { try { in.close(); } catch (Exception e) { System.err.println("Can't close input file '" + inputFile + "': " + e); System.exit(1); } } }