package weka.datagenerators;
import weka.datagenerators.TextSource.Real;
import weka.core.Option;
import weka.core.Utils;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
* Reads a collection of text documents from a set of directories.
* In each directory there are a number of text files, and each text
* file contains a single document. The name of the directory
* specifies the class of all the documents that it contains.
*
* <p><b>WEKA options:</b>
* <ul>
* <li><code>-D &lt;str&gt;</code> - The pathname of the base
* directory of the subdirectories. This parameter has no default
* value and is not optional.
*
* <li><code>-u &lt;str&gt;</code> - The regular expression for
* choosing subdirectories. Only those subdirectories whose name
* matches this expression will be searched. The substring captured
* by group 1 will be used as the document class, or the whole
* subdirectory name will be used if there are no capturing groups
* in the regular expression. For example, if there are three
* subdirectories, namely <code>etc</code>,
* <code>news:comp.lang.c</code> and
* <code>news:comp.lang.c++</code>, and we use
* <code>news:(.+)</code> as the mask, then only
* <code>news:comp.lang.c</code> and <code>news:comp.lang.c++</code>
* will be searched, and the document classes that they represent
* will be <code>comp.lang.c</code> and <code>comp.lang.c++</code>
* respectively. If we omit the parentheses inside the mask, then
* the document classes will be <code>news:comp.lang.c</code> and
* <code>news:comp.lang.c++</code> instead. By default the mask is
* <code>.*</code>.
*
* <li><code>-l &lt;str&gt;</code> - The regular
* expression for choosing text files. Only those files whose name
* matches this expression will be read. By default the mask is
* <code>.*</code>.
* </ul>
*
* @author ywwong
* @version $Id: DirectoryDocumentReader.java,v 1.1.1.1 2003/01/22 07:48:27 mbilenko Exp $
*/
class DirectoryDocumentReader implements DocumentReader {

    /**
     * A file filter that accepts only readable directories whose name
     * matches the subdirectory mask.
     *
     * <p>Accepting a directory has a side effect: its document class is
     * registered with the text source, and the value returned by
     * <code>registerClass</code> is appended to
     * <code>m_aClassIndices</code>.
     */
    protected class AcceptsDirectories implements FileFilter {

        /** The text source that document classes are registered with. */
        private TextSource m_ts;

        /** The compiled subdirectory mask. */
        private Pattern m_p;

        /**
         * Creates the filter.
         *
         * @param ts The text source that receives the document classes.
         * @param p The compiled subdirectory mask.
         */
        public AcceptsDirectories(TextSource ts, Pattern p) {
            m_ts = ts;
            m_p = p;
        }

        /**
         * Tests a directory entry and, if it is accepted, registers its
         * document class.
         *
         * @param f The directory entry to test.
         * @return <code>true</code> if <code>f</code> is a readable
         * directory whose whole name matches the mask.
         */
        public boolean accept(File f) {
            Matcher m;
            Real d;

            if (!f.canRead() || !f.isDirectory())
                return false;
            m = m_p.matcher(f.getName());
            if (!m.matches())
                return false;

            // Group 1 names the class when the mask has capturing groups;
            // otherwise the whole directory name is the class.
            // NOTE(review): m.group(1) is null when group 1 exists but did
            // not take part in the match (e.g. "(a)?b") -- confirm how
            // registerClass treats a null class name.
            if (m.groupCount() > 0)
                d = m_ts.registerClass(m.group(1));
            else
                d = m_ts.registerClass(m.group());
            m_aClassIndices.add(d);
            return true;
        }
    }

    /**
     * A file filter that accepts only readable regular files whose name
     * matches the file mask.
     */
    protected class AcceptsFiles implements FileFilter {

        /** The compiled file mask. */
        private Pattern m_p;

        /**
         * Creates the filter.
         *
         * @param p The compiled file mask.
         */
        public AcceptsFiles(Pattern p) { m_p = p; }

        /**
         * Tests a directory entry.
         *
         * @param f The directory entry to test.
         * @return <code>true</code> if <code>f</code> is a readable
         * regular file whose whole name matches the mask.
         */
        public boolean accept(File f) {
            Matcher m;

            if (!f.canRead() || !f.isFile())
                return false;
            m = m_p.matcher(f.getName());
            return m.matches();
        }
    }

    /**
     * The list of all class indices, one <code>Real</code> per accepted
     * subdirectory, in the order the subdirectories were accepted.
     */
    protected ArrayList m_aClassIndices;

    /** The list of all files (<code>File</code> objects) to be read. */
    protected ArrayList m_aFiles;

    /**
     * Parallel to <code>m_aFiles</code>: for each file, an
     * <code>Integer</code> position into <code>m_aClassIndices</code>
     * identifying the class the file belongs to.
     */
    protected ArrayList m_aClasses;

    /** The index of the next file to be read. */
    protected int m_nFile;

    /** The file being read, or <code>null</code> if no file is open. */
    protected BufferedInputStream m_reader;

    ////// WEKA specific. //////

    /** The option string for directory. */
    protected String m_strDir;

    /** The option string for subdirectory mask. */
    protected String m_strSubdirMask;

    /** The option string for file mask. */
    protected String m_strFileMask;

    ////// Ends WEKA specific. //////

    /**
     * Creates a directory document reader. The base directory is scanned
     * once, here; the matching files are remembered and opened lazily by
     * <code>nextDocument()</code>.
     *
     * @param ts The TextSource object that document classes are
     * registered with.
     * @param options The WEKA option strings; <code>-D</code> is
     * required, <code>-u</code> and <code>-l</code> default to
     * <code>.*</code>.
     * @throws Exception if <code>-D</code> is missing, if a mask is not a
     * valid regular expression, or (as an <code>IOException</code>) if
     * the base directory or one of its subdirectories cannot be listed.
     */
    public DirectoryDocumentReader(TextSource ts, String[] options)
        throws Exception {
        super();

        File fDir;
        File[] aSubdirs;
        File[] aFiles;
        Pattern patSubdirMask;
        Pattern patFileMask;
        AcceptsDirectories adFilter;
        AcceptsFiles afFilter;

        ////// WEKA specific. //////
        m_strDir = Utils.getOption('D', options);
        if (m_strDir.length() == 0)
            throw new Exception("Base directory (-D) not set.");
        m_strSubdirMask = Utils.getOption('u', options);
        if (m_strSubdirMask.length() == 0)
            m_strSubdirMask = ".*";
        m_strFileMask = Utils.getOption('l', options);
        if (m_strFileMask.length() == 0)
            m_strFileMask = ".*";
        ////// Ends WEKA specific. //////

        // Must exist before listFiles() runs: the directory filter appends
        // to m_aClassIndices while it is being applied.
        m_aClassIndices = new ArrayList();
        m_aFiles = new ArrayList();
        m_aClasses = new ArrayList();

        patSubdirMask = Pattern.compile(m_strSubdirMask);
        patFileMask = Pattern.compile(m_strFileMask);
        adFilter = new AcceptsDirectories(ts, patSubdirMask);
        afFilter = new AcceptsFiles(patFileMask);

        fDir = new File(m_strDir);
        aSubdirs = fDir.listFiles(adFilter);
        // listFiles() returns null (not an empty array) when the path is
        // not a directory or cannot be read.
        if (aSubdirs == null)
            throw new IOException("Base directory (-D) cannot be listed: "
                                  + m_strDir);

        // aSubdirs[i] is paired with m_aClassIndices.get(i); this relies on
        // listFiles() returning the accepted entries in the same order in
        // which the filter accepted them.
        for (int i = 0; i < aSubdirs.length; ++i) {
            aFiles = aSubdirs[i].listFiles(afFilter);
            if (aFiles == null)
                throw new IOException("Subdirectory cannot be listed: "
                                      + aSubdirs[i].getPath());
            for (int j = 0; j < aFiles.length; ++j) {
                m_aFiles.add(aFiles[j]);
                m_aClasses.add(new Integer(i));
            }
        }

        m_nFile = 0;
        m_reader = null;
    }

    /**
     * Tests if there are any unread documents.
     *
     * @return <code>true</code> if there are unread documents;
     * <code>false</code> if otherwise.
     */
    public boolean hasNextDocument() {
        return m_nFile < m_aFiles.size();
    }

    /**
     * Advances to the next document so that subsequent calls to
     * <code>read()</code> return its content. Any document that is still
     * open is closed first. If there are no more documents the reader is
     * left untouched.
     *
     * @return The class index of the next document; or
     * <code>null</code> if there are no more documents.
     * @throws IOException if the previous document cannot be closed or
     * the next one cannot be opened.
     */
    public Real nextDocument() throws IOException {
        File f;
        int nClass;

        if (m_nFile >= m_aFiles.size())
            return null;

        // Release the previous file handle before opening another one;
        // otherwise a partially read document leaks its descriptor.
        closeCurrentDocument();

        f = (File) m_aFiles.get(m_nFile);
        m_reader = new BufferedInputStream(new FileInputStream(f));
        nClass = ((Integer) m_aClasses.get(m_nFile)).intValue();
        ++m_nFile;
        return (Real) m_aClassIndices.get(nClass);
    }

    /**
     * Reads the next character from the current document. The document
     * is read as a raw byte stream, so the value returned is a single
     * byte in the range 0-255; no character decoding is performed.
     *
     * @return The next character; or -1 if
     * <code>nextDocument()</code> has never been called, or there are
     * no more characters for the current document.
     * @throws IOException if the underlying file cannot be read or
     * closed.
     */
    public int read() throws IOException {
        int c;

        if (m_reader == null)
            return -1;
        c = m_reader.read();
        // At end of file, close the stream instead of merely dropping the
        // reference, so that the file descriptor is released promptly.
        if (c == -1)
            closeCurrentDocument();
        return c;
    }

    /**
     * Closes the document that is currently open, if any, and clears
     * <code>m_reader</code>. The field is cleared before closing so that
     * the reader never keeps a reference to a stream whose
     * <code>close()</code> failed.
     *
     * @throws IOException if closing the underlying file fails.
     */
    private void closeCurrentDocument() throws IOException {
        BufferedInputStream in = m_reader;

        m_reader = null;
        if (in != null)
            in.close();
    }

    ////// WEKA specific. //////

    /**
     * Lists the command-line options understood by this reader.
     *
     * @return A collection of <code>Option</code> objects describing
     * <code>-D</code>, <code>-u</code> and <code>-l</code>.
     */
    public static Collection listOptions() {
        ArrayList aOpts;

        aOpts = new ArrayList();
        aOpts.add(new Option("\tDirectoryDocumentReader: Base directory",
                             "D", 1, "-D <str>"));
        aOpts.add(new Option("\tDirectoryDocumentReader: Subdir mask " +
                             "(default .*)",
                             "u", 1, "-u <str>"));
        aOpts.add(new Option("\tDirectoryDocumentReader: File mask " +
                             "(default .*)",
                             "l", 1, "-l <str>"));
        return aOpts;
    }

    /**
     * Gets the current option settings of this reader.
     *
     * @return A collection of strings: each option flag followed by its
     * current value, in the order <code>-D</code>, <code>-u</code>,
     * <code>-l</code>.
     */
    public Collection getOptions() {
        ArrayList aOpts;

        aOpts = new ArrayList();
        aOpts.add("-D");
        aOpts.add(m_strDir);
        aOpts.add("-u");
        aOpts.add(m_strSubdirMask);
        aOpts.add("-l");
        aOpts.add(m_strFileMask);
        return aOpts;
    }
}