/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ /** @author Gary Huang <a href="mailto:ghuang@cs.umass.edu">ghuang@cs.umass.edu</a> */ package cc.mallet.pipe.iterator; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.net.URI; import java.util.regex.*; import java.io.*; import cc.mallet.pipe.Pipe; import cc.mallet.types.Alphabet; import cc.mallet.types.Instance; import cc.mallet.types.Label; import cc.mallet.util.Strings; /** * An iterator that generates instances for a pipe from a list of filenames. * Each file is treated as a text file whose target is determined by * a user-specified regular expression pattern applied to the filename * * @author Gary Huang <a href="mailto:ghuang@cs.umass.edu">ghuang@cs.umass.edu</a> */ public class FileListIterator implements Iterator<Instance> { FileFilter fileFilter; ArrayList fileArray; Iterator subIterator; Pattern targetPattern; // Set target slot to string coming from 1st group of this Pattern int commonPrefixIndex; /** Special value that means to use the directories[i].getPath() as the target name */ // xxx Note that these are specific to UNIX directory delimiter characters! Fix this. /** Use as label names the directories of the given files, * optionally removing common prefix of all starting directories */ public static final Pattern STARTING_DIRECTORIES = Pattern.compile ("_STARTING_DIRECTORIES_"); /** Use as label names the first directory in the filename. */ public static final Pattern FIRST_DIRECTORY = Pattern.compile ("/?([^/]*)/.+"); /** Use as label name the last directory in the filename. */ public static final Pattern LAST_DIRECTORY = Pattern.compile(".*/([^/]+)/[^/]+"); // was ("([^/]*)/[^/]+"); /** Use as label names all the directory names in the filename. */ public static final Pattern ALL_DIRECTORIES = Pattern.compile ("^(.*)/[^/]+"); /* Pass null as targetPattern to get null targets */ /** * Construct an iterator over the given arry of Files * * The instances constructed from the files are returned in the same order * as they appear in the given array * * @param files Array of files from which to construct instances * @param fileFilter class implementing interface FileFilter that will decide which names to accept. * May be null. * @param targetPattern regex Pattern applied to the filename whose first parenthesized group * on matching is taken to be the target value of the generated instance. * The pattern is applied to the filename with the matcher.find() method. * @param removeCommonPrefix boolean that modifies the behavior of the STARTING_DIRECTORIES * pattern, removing the common prefix of all initially specified * directories, leaving the remainder of each filename as the target value. * */ public FileListIterator(File[] files, FileFilter fileFilter, Pattern targetPattern, boolean removeCommonPrefix) { this.fileFilter = fileFilter; this.fileArray = new ArrayList(); this.targetPattern = targetPattern; fillFileArrayAssignCommonPrefixIndexAndSubIterator(files, removeCommonPrefix); } public FileListIterator(String[] filenames, FileFilter fileFilter, Pattern targetPattern, boolean removeCommonPrefix) { this(FileIterator.stringArray2FileArray(filenames), fileFilter, targetPattern, removeCommonPrefix); } /** * Construct a FileListIterator with the file containing the list of files, which * contains one filename per line. * * The instances constructed from the filelist are returned in the same order * as listed */ public FileListIterator(File filelist, FileFilter fileFilter, Pattern targetPattern, boolean removeCommonPrefix) throws FileNotFoundException, IOException { this.fileFilter = fileFilter; this.fileArray = new ArrayList(); this.targetPattern = targetPattern; List filenames = readFileNames (filelist); File[] fa = stringList2FileArray (filenames, null); fillFileArrayAssignCommonPrefixIndexAndSubIterator(fa, removeCommonPrefix); } /** * Construct a FileListIterator with the file containing the list of files * of RELATIVE pathnames, one filename per line. * <p> * The instances constructed from the filelist are returned in the same order * as listed * @param filelist List of relative file names. * @param baseDirectory Base directory for relative file names. * */ public FileListIterator(File filelist, File baseDirectory, FileFilter fileFilter, Pattern targetPattern, boolean removeCommonPrefix) throws FileNotFoundException, IOException { this.fileFilter = fileFilter; this.fileArray = new ArrayList(); this.targetPattern = targetPattern; List filenames = readFileNames (filelist); File[] fa = stringList2FileArray (filenames, baseDirectory); fillFileArrayAssignCommonPrefixIndexAndSubIterator(fa, removeCommonPrefix); } private static File[] stringList2FileArray (List filenames, File baseDir) { File[] fa = new File[filenames.size()]; for (int i = 0; i < filenames.size(); i++) if (baseDir != null) { fa[i] = new File (baseDir, (String) filenames.get(i)); } else { fa[i] = new File ((String) filenames.get(i)); } return fa; } private static List readFileNames (File filelist) throws IOException { ArrayList filenames = new ArrayList(); BufferedReader reader = new BufferedReader(new FileReader (filelist)); String filename = reader.readLine(); while (filename != null && filename.trim().length() > 0) { filenames.add(filename.trim()); filename = reader.readLine(); } reader.close(); return filenames; } public FileListIterator(String filelistName, FileFilter fileFilter, Pattern targetPattern, boolean removeCommonPrefix) throws FileNotFoundException, IOException { this (new File(filelistName), fileFilter, targetPattern, removeCommonPrefix); } public FileListIterator(String filelistName, Pattern targetPattern) throws FileNotFoundException, IOException { this (new File(filelistName), null, targetPattern, true); } // The PipeInputIterator interface public Instance next () { File nextFile = (File) subIterator.next(); String path = nextFile.getParent(); String targetName = null; if (targetPattern == STARTING_DIRECTORIES) { targetName = path.substring(commonPrefixIndex); } else if (targetPattern != null) { Matcher m = targetPattern.matcher(path); if (m.find ()){ targetName = m.group (1); } } return new Instance (nextFile, targetName, nextFile.toURI(), null); } public File nextFile () { return (File) subIterator.next(); } public boolean hasNext () { return subIterator.hasNext(); } public void remove () { throw new IllegalStateException ("This Iterator<Instance> does not support remove()."); } public ArrayList getFileArray() { return fileArray; } private void fillFileArrayAssignCommonPrefixIndexAndSubIterator(File[] files, boolean removeCommonPrefix) { ArrayList filenames = new ArrayList(); for (int i = 0; i < files.length; i++) { if (files[i].isDirectory()) throw new IllegalArgumentException(files[i] + " is not a file."); else if (! files[i].exists()) throw new IllegalArgumentException(files[i] + " does not exist."); if (this.fileFilter == null || this.fileFilter.accept(files[i])) { this.fileArray.add(files[i]); if (removeCommonPrefix) filenames.add(files[i].getPath()); } } this.subIterator = this.fileArray.iterator(); if (removeCommonPrefix) { // find the common prefix index of all filenames String[] fn = new String[filenames.size()]; for (int i = 0; i < fn.length; i++) fn[i] = (String) filenames.get(i); this.commonPrefixIndex = Strings.commonPrefixIndex(fn); } else this.commonPrefixIndex = 0; } }