package edu.nd.nina.graph.load;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import edu.nd.nina.types.Instance;

/**
 * An iterator that generates one {@link Instance} per usable line of a text file.
 * <p>
 * The whole file is read eagerly by the constructor. Every line is trimmed;
 * lines that are empty after trimming and lines that start with {@code #}
 * (comment lines) are skipped. The remaining lines are buffered in memory, in
 * file order, and handed out one at a time by {@link #next()}.
 * <p>
 * Each generated instance is built from the trimmed line, a {@code null}
 * target, the 1-based position of the line among the retained lines, and a
 * {@code null} fourth argument.
 * <p>
 * NOTE(review): the file is decoded with {@link FileReader}, i.e. with the
 * platform default charset -- confirm that this is what the input files expect.
 *
 * @author Tim Weninger
 */
public class LineIterator implements Iterator<Instance> {

	/** Trimmed, non-empty, non-comment lines of the file, in file order. */
	ArrayList<String> lineArray;

	/** Cursor over {@link #lineArray} that backs {@link #hasNext()} and {@link #next()}. */
	Iterator<String> subIterator;

	/** Number of instances handed out so far by {@link #next()}. */
	int lineCount;

	// added by Fuchun Peng
	/**
	 * Returns the buffered lines.
	 * <p>
	 * This is the live internal list, not a copy. Structurally modifying it
	 * while iterating makes the next call to {@link #next()} fail with a
	 * {@link java.util.ConcurrentModificationException}.
	 *
	 * @return the list of retained lines, never {@code null}
	 */
	public ArrayList<String> getLineArray() {
		return lineArray;
	}

	/**
	 * Constructs a LineIterator over the usable lines of {@code file}.
	 * <p>
	 * The file is read completely and closed again before the constructor
	 * returns. If an I/O error occurs while reading, the stack trace is
	 * printed and iteration is limited to the lines read up to that point.
	 *
	 * @param file the text file to read; must not be {@code null}
	 * @throws NullPointerException if {@code file} is {@code null}
	 * @throws IllegalArgumentException if {@code file} does not exist, is a
	 *         directory, or cannot be opened for reading
	 */
	public LineIterator(File file) {
		if (file == null) {
			throw new NullPointerException("file must not be null");
		}
		this.lineArray = new ArrayList<String>();

		BufferedReader reader;
		try {
			reader = new BufferedReader(new FileReader(file));
		} catch (FileNotFoundException e) {
			// Fail with the path and the real cause instead of continuing
			// with a null reader and dying on a bare NullPointerException.
			throw new IllegalArgumentException("Cannot open file for reading: " + file, e);
		}

		try {
			String line;
			while ((line = reader.readLine()) != null) {
				line = line.trim();
				// Skip blank lines and '#' comment lines.
				if (line.isEmpty() || line.startsWith("#")) {
					continue;
				}
				lineArray.add(line);
			}
		} catch (IOException e) {
			// Best effort, as before: report the problem and keep what was read.
			e.printStackTrace();
		} finally {
			try {
				reader.close();
			} catch (IOException ignored) {
				// Reading has already finished; a failure to close loses no data.
			}
		}

		subIterator = lineArray.iterator();
	}

	// The PipeInputIterator interface

	/**
	 * Returns an instance for the next retained line.
	 *
	 * @return a new {@link Instance} constructed from the line, a
	 *         {@code null} target, the 1-based line counter and {@code null}
	 * @throws java.util.NoSuchElementException if no lines are left
	 */
	@Override
	public Instance next() {
		String nextLine = subIterator.next();
		// Kept as a typed local so the same Instance constructor is selected.
		String targetName = null;
		lineCount++;
		return new Instance(nextLine, targetName, lineCount, null);
	}

	/**
	 * Not supported.
	 *
	 * @throws IllegalStateException always
	 */
	@Override
	public void remove() {
		throw new IllegalStateException("This Iterator<Instance> does not support remove().");
	}

	/**
	 * Reports whether another line is available.
	 *
	 * @return {@code true} if {@link #next()} would return an instance
	 */
	@Override
	public boolean hasNext() {
		return subIterator.hasNext();
	}
}