/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* TextDirectoryLoader.java
* Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.converters;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.LinkedList;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
/**
<!-- globalinfo-start -->
* Loads all text files in a directory and uses the subdirectory names as class labels. The content of the text files will be stored in a String attribute, the filename can be stored as well.
* <p/>
<!-- globalinfo-end -->
*
<!-- options-start -->
* Valid options are: <p/>
*
* <pre> -D
* Enables debug output.
* (default: off)</pre>
*
* <pre> -F
* Stores the filename in an additional attribute.
* (default: off)</pre>
*
* <pre> -dir <directory>
* The directory to work on.
* (default: current directory)</pre>
*
* <pre> -charset <charset name>
* The character set to use, e.g UTF-8.
* (default: use the default character set)</pre>
*
* <pre> -R
* Retain all string attribute values when reading incrementally.</pre>
*
<!-- options-end -->
*
* Based on code from the TextDirectoryToArff tool:
* <ul>
* <li><a href="https://list.scms.waikato.ac.nz/mailman/htdig/wekalist/2002-October/000685.html" target="_blank">Original tool</a></li>
* <li><a href="https://list.scms.waikato.ac.nz/mailman/htdig/wekalist/2004-January/002160.html" target="_blank">Current version</a></li>
* <li><a href="http://weka.wikispaces.com/ARFF+files+from+Text+Collections" target="_blank">Wiki article</a></li>
* </ul>
*
* @author Ashraf M. Kibriya (amk14 at cs.waikato.ac.nz)
* @author Richard Kirkby (rkirkby at cs.waikato.ac.nz)
* @author fracpete (fracpete at waikato dot ac dot nz)
* @version $Revision: 7784 $
* @see Loader
*/
public class TextDirectoryLoader
  extends AbstractLoader
  implements BatchConverter, IncrementalConverter,
             OptionHandler {

  /** for serialization */
  private static final long serialVersionUID = 2592118773712247647L;

  /** Holds the determined structure (header) of the data set. */
  protected Instances m_structure = null;

  /** Holds the source of the data set. */
  protected File m_sourceFile = new File(System.getProperty("user.dir"));

  /** whether to print some debug information */
  protected boolean m_Debug = false;

  /** whether to include the filename as an extra attribute */
  protected boolean m_OutputFilename = false;

  /**
   * The charset to use when loading text files (default is to just use the
   * default charset).
   */
  protected String m_charSet = "";

  /**
   * If false, and reading incrementally, then only one string value (the current
   * one) will be available in the header for each String attribute
   */
  protected boolean m_retainStringValues = false;

  /**
   * Used when reading incrementally: for each class label (in the order of
   * the class attribute's values) the names of the files in the corresponding
   * sub-directory that have not been returned yet. Built lazily by
   * {@link #getNextInstance(Instances)}, discarded by {@link #reset()}.
   */
  protected List<LinkedList<String>> m_filesByClass;

  /**
   * Used when reading incrementally: index of the class sub-directory that
   * the next instance is taken from.
   */
  protected int m_lastClassDir = 0;

  /**
   * default constructor
   */
  public TextDirectoryLoader() {
    // No instances retrieved yet
    setRetrieval(NONE);
  }

  /**
   * Returns a string describing this loader
   *
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return
        "Loads all text files in a directory and uses the subdirectory names "
      + "as class labels. The content of the text files will be stored in a "
      + "String attribute, the filename can be stored as well.";
  }

  /**
   * Lists the available options
   *
   * @return an enumeration of the available options
   */
  public Enumeration listOptions() {
    Vector<Option> result = new Vector<Option>();

    result.add(new Option(
        "\tEnables debug output.\n"
        + "\t(default: off)",
        "D", 0, "-D"));

    result.add(new Option(
        "\tStores the filename in an additional attribute.\n"
        + "\t(default: off)",
        "F", 0, "-F"));

    // -dir is followed by the directory name, i.e., it takes one argument
    result.add(new Option(
        "\tThe directory to work on.\n"
        + "\t(default: current directory)",
        "dir", 1, "-dir <directory>"));

    result.add(new Option("\tThe character set to use, e.g UTF-8.\n\t" +
        "(default: use the default character set)", "charset", 1,
        "-charset <charset name>"));

    result.add(new Option(
        "\tRetain all string attribute values when reading " +
        "incrementally.", "R", 0, "-R"));

    return result.elements();
  }

  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -D
   * Enables debug output.
   * (default: off)</pre>
   *
   * <pre> -F
   * Stores the filename in an additional attribute.
   * (default: off)</pre>
   *
   * <pre> -dir <directory>
   * The directory to work on.
   * (default: current directory)</pre>
   *
   * <pre> -charset <charset name>
   * The character set to use, e.g UTF-8.
   * (default: use the default character set)</pre>
   *
   * <pre> -R
   * Retain all string attribute values when reading incrementally.</pre>
   *
   <!-- options-end -->
   *
   * @param options the options
   * @throws Exception if options cannot be set
   */
  public void setOptions(String[] options) throws Exception {
    setDebug(Utils.getFlag("D", options));

    setOutputFilename(Utils.getFlag("F", options));

    // an absent -dir yields an empty string; fall back to the documented
    // default (current directory) instead of failing on new File("")
    String dir = Utils.getOption("dir", options);
    if (dir.length() > 0) {
      setDirectory(new File(dir));
    } else {
      setDirectory(new File(System.getProperty("user.dir")));
    }

    String charSet = Utils.getOption("charset", options);
    m_charSet = "";
    if (charSet.length() > 0) {
      m_charSet = charSet;
    }

    setRetainStringValues(Utils.getFlag("R", options));
  }

  /**
   * Gets the setting
   *
   * @return the current setting
   */
  public String[] getOptions() {
    Vector<String> options = new Vector<String>();

    if (getDebug()) {
      options.add("-D");
    }

    if (getOutputFilename()) {
      options.add("-F");
    }

    options.add("-dir");
    options.add(getDirectory().getAbsolutePath());

    if (m_charSet != null && m_charSet.length() > 0) {
      options.add("-charset");
      options.add(m_charSet);
    }

    if (getRetainStringValues()) {
      options.add("-R");
    }

    return options.toArray(new String[options.size()]);
  }

  /**
   * the tip text for this property
   *
   * @return the tip text
   */
  public String charSetTipText() {
    return "The character set to use when reading text files (eg UTF-8) - leave" +
        " blank to use the default character set.";
  }

  /**
   * Set the character set to use when reading text files (an empty string
   * indicates that the default character set will be used).
   *
   * @param charSet the character set to use.
   */
  public void setCharSet(String charSet) {
    m_charSet = charSet;
  }

  /**
   * Get the character set to use when reading text files. An empty
   * string indicates that the default character set will be used.
   *
   * @return the character set name to use (or empty string to indicate
   * that the default character set will be used).
   */
  public String getCharSet() {
    return m_charSet;
  }

  /**
   * Sets whether to print some debug information.
   *
   * @param value if true additional debug information will be printed.
   */
  public void setDebug(boolean value) {
    m_Debug = value;
  }

  /**
   * Gets whether additional debug information is printed.
   *
   * @return true if additional debug information is printed
   */
  public boolean getDebug() {
    return m_Debug;
  }

  /**
   * the tip text for this property
   *
   * @return the tip text
   */
  public String debugTipText() {
    return "Whether to print additional debug information to the console.";
  }

  /**
   * Sets whether the filename will be stored as an extra attribute.
   * Changes the layout of the header, hence the loader gets reset.
   *
   * @param value if true the filename will be stored in an extra
   * attribute
   */
  public void setOutputFilename(boolean value) {
    m_OutputFilename = value;
    reset();
  }

  /**
   * Gets whether the filename will be stored as an extra attribute.
   *
   * @return true if the filename is stored in an extra attribute
   */
  public boolean getOutputFilename() {
    return m_OutputFilename;
  }

  /**
   * the tip text for this property
   *
   * @return the tip text
   */
  public String outputFilenameTipText() {
    return "Whether to store the filename in an additional attribute.";
  }

  /**
   * Returns a description of the file type, actually it's directories.
   *
   * @return a short file description
   */
  public String getFileDescription() {
    return "Directories";
  }

  /**
   * get the Dir specified as the source
   *
   * @return the source directory
   */
  public File getDirectory() {
    return new File(m_sourceFile.getAbsolutePath());
  }

  /**
   * sets the source directory
   *
   * @param dir the source directory
   * @throws IOException if an error occurs
   */
  public void setDirectory(File dir) throws IOException {
    setSource(dir);
  }

  /**
   * Set whether to retain all string values for string in the header
   * when reading incrementally
   *
   * @param r true if all string values are to be stored (as opposed to
   * just the current one).
   */
  public void setRetainStringValues(boolean r) {
    m_retainStringValues = r;
  }

  /**
   * Get whether to retain all string values for string in the header
   * when reading incrementally
   *
   * @return true if all string values are to be stored (as opposed to
   * just the current one).
   */
  public boolean getRetainStringValues() {
    return m_retainStringValues;
  }

  /**
   * the tip text for this property
   *
   * @return the tip text
   */
  public String retainStringValuesTipText() {
    return "When reading incrementally, whether to retain all " +
        "values for string attributes. When set to false " +
        "only the values for string attributes in the currently " +
        "read instance will be held in memory.";
  }

  /**
   * Resets the loader ready to read a new data set: forgets the header and
   * the state of the incremental reading.
   */
  public void reset() {
    m_structure = null;
    m_filesByClass = null;
    m_lastClassDir = 0;
    setRetrieval(NONE);
  }

  /**
   * Resets the Loader object and sets the source of the data set to be
   * the supplied File object.
   *
   * @param dir the source directory.
   * @throws IOException if dir is null, does not exist or is not a directory
   */
  public void setSource(File dir) throws IOException {
    reset();

    if (dir == null) {
      throw new IOException("Source directory object is null!");
    }

    m_sourceFile = dir;
    if (!dir.exists() || !dir.isDirectory()) {
      throw new IOException("Directory '" + dir + "' not found");
    }
  }

  /**
   * Determines and returns (if possible) the structure (internally the
   * header) of the data set as an empty set of instances. The header
   * consists of a string attribute "text", optionally a string attribute
   * "filename" and a nominal class attribute with the names of the
   * sub-directories as labels.
   *
   * @return the structure of the data set as an empty
   * set of Instances
   * @throws IOException if no source is set or the directory cannot be listed
   */
  public Instances getStructure() throws IOException {
    File source = getDirectory();
    if (source == null) {
      throw new IOException("No directory/source has been specified");
    }

    if (m_structure == null) {
      String directoryPath = source.getAbsolutePath();

      // File.list() returns null if the path is no (readable) directory
      String[] entries = new File(directoryPath).list();
      if (entries == null) {
        throw new IOException(
            "Directory '" + directoryPath + "' could not be listed");
      }

      // determine class labels, i.e., sub-dirs
      ArrayList<String> classes = new ArrayList<String>();
      for (String entry : entries) {
        File candidate = new File(directoryPath + File.separator + entry);
        if (candidate.isDirectory()) {
          classes.add(entry);
        }
      }

      ArrayList<Attribute> atts = new ArrayList<Attribute>();
      atts.add(new Attribute("text", (ArrayList<String>) null));
      if (m_OutputFilename) {
        atts.add(new Attribute("filename", (ArrayList<String>) null));
      }
      // make sure that the name of the class attribute is unlikely to
      // clash with any attribute created via the StringToWordVector filter
      atts.add(new Attribute("@@class@@", classes));

      // relation name: the path with separators and colons replaced
      String relName = directoryPath
          .replace('/', '_')
          .replace('\\', '_')
          .replace(':', '_');

      m_structure = new Instances(relName, atts, 0);
      m_structure.setClassIndex(m_structure.numAttributes() - 1);
    }

    return m_structure;
  }

  /**
   * Return the full data set. If the structure hasn't yet been determined
   * by a call to getStructure then method should do so before processing
   * the rest of the data set. Files that cannot be read are reported on
   * stderr and skipped. The instances are added to a new dataset and not to
   * the header kept by the loader, so the header stays free of instances.
   * The filename attribute (if enabled) holds the path relative to the
   * source directory.
   *
   * @return the full data set
   * @throws IOException if there is no source or parsing fails
   */
  public Instances getDataSet() throws IOException {
    File source = getDirectory();
    if (source == null) {
      throw new IOException("No directory/source has been specified");
    }

    String directoryPath = source.getAbsolutePath();
    Instances structure = getStructure();
    Attribute classAtt = structure.classAttribute();
    // don't fill the cached header itself with instances
    Instances data = new Instances(structure, 0);

    int fileCount = 0;
    for (int k = 0; k < classAtt.numValues(); k++) {
      String subdirPath = classAtt.value(k);
      File subdir = new File(directoryPath + File.separator + subdirPath);
      String[] files = subdir.list();
      if (files == null) {
        // directory vanished or is not readable; keep going with the others
        System.err.println("failed to list directory: " + subdir);
        continue;
      }

      for (int j = 0; j < files.length; j++) {
        File txt = new File(directoryPath + File.separator + subdirPath
            + File.separator + files[j]);
        // only regular files hold documents (same rule as incremental mode)
        if (!txt.isFile()) {
          continue;
        }

        try {
          fileCount++;
          if (getDebug()) {
            System.err.println(
                "processing " + fileCount + " : " + subdirPath + " : " + files[j]);
          }

          String text = readTextFile(txt);

          double[] newInst = new double[m_OutputFilename ? 3 : 2];
          newInst[0] = (double) data.attribute(0).addStringValue(text);
          if (m_OutputFilename) {
            newInst[1] = (double) data.attribute(1).addStringValue(
                subdirPath + File.separator + files[j]);
          }
          newInst[data.classIndex()] = (double) k;
          data.add(new DenseInstance(1.0, newInst));
        }
        catch (Exception e) {
          // best effort: report the file together with the reason and go on
          System.err.println("failed to convert file: " + directoryPath
              + File.separator + subdirPath + File.separator + files[j]
              + " (" + e + ")");
        }
      }
    }

    return data;
  }

  /**
   * Reads the next document and returns it as an instance. On the first
   * call the regular files of every class sub-directory are collected;
   * afterwards each call takes one file, moving on to the next class
   * sub-directory after every returned instance and skipping
   * sub-directories that have no files left.
   * <p/>
   * If string values are retained, the text (and the filename) get added to
   * the string attributes of the supplied header, otherwise they replace the
   * attributes' current values. The filename attribute (if enabled) holds the
   * absolute path of the file.
   *
   * @param structure the header the returned instance will belong to,
   * as obtained from getStructure
   * @return the next instance or null if there are no more files
   * @throws IOException if reading the file fails
   */
  public Instance getNextInstance(Instances structure) throws IOException {
    String directoryPath = getDirectory().getAbsolutePath();
    Attribute classAtt = structure.classAttribute();
    int numClasses = classAtt.numValues();

    if (m_filesByClass == null) {
      m_filesByClass = new ArrayList<LinkedList<String>>();
      for (int i = 0; i < numClasses; i++) {
        m_filesByClass.add(
            listDocuments(directoryPath + File.separator + classAtt.value(i)));
      }
    }

    // no sub-directories, no instances
    if (numClasses == 0) {
      return null;
    }

    // cycle through the classes, starting with the current one, until one
    // with remaining files turns up or all of them have been checked
    LinkedList<String> classContents = null;
    for (int checked = 0; checked < numClasses; checked++) {
      LinkedList<String> candidate = m_filesByClass.get(m_lastClassDir);
      if (!candidate.isEmpty()) {
        classContents = candidate;
        break;
      }
      m_lastClassDir = (m_lastClassDir + 1) % numClasses;
    }

    if (classContents == null) {
      return null; // done!
    }

    String nextDoc = classContents.poll();
    File txt = new File(directoryPath + File.separator +
        classAtt.value(m_lastClassDir) + File.separator + nextDoc);
    String text = readTextFile(txt);

    double[] newInst = new double[m_OutputFilename ? 3 : 2];

    if (getRetainStringValues()) {
      newInst[0] = (double) structure.attribute(0).addStringValue(text);
    } else {
      // the header holds just the current value, which then has index 0
      newInst[0] = 0;
      structure.attribute(0).setStringValue(text);
    }

    if (m_OutputFilename) {
      if (getRetainStringValues()) {
        newInst[1] = (double) structure.attribute(1).
            addStringValue(txt.getAbsolutePath());
      } else {
        newInst[1] = 0;
        structure.attribute(1).setStringValue(txt.getAbsolutePath());
      }
    }

    newInst[structure.classIndex()] = (double) m_lastClassDir;
    Instance inst = new DenseInstance(1.0, newInst);
    inst.setDataset(structure);

    // the next call continues with the following class
    m_lastClassDir = (m_lastClassDir + 1) % numClasses;

    return inst;
  }

  /**
   * Collects the names of the regular files located directly in the given
   * directory.
   *
   * @param dirPath the path of the directory to list
   * @return the names of the files, empty if the directory cannot be listed
   */
  private LinkedList<String> listDocuments(String dirPath) {
    LinkedList<String> docs = new LinkedList<String>();
    String[] names = new File(dirPath).list();
    if (names != null) {
      for (String name : names) {
        if (new File(dirPath + File.separator + name).isFile()) {
          docs.add(name);
        }
      }
    }
    return docs;
  }

  /**
   * Reads the complete content of a text file, using the configured
   * character set (or the default one if none is set). The file is closed
   * in any case, also when reading fails.
   *
   * @param file the file to read
   * @return the content of the file
   * @throws IOException if the file cannot be opened or read or the
   * character set is not supported
   */
  private String readTextFile(File file) throws IOException {
    FileInputStream in = new FileInputStream(file);
    BufferedReader reader = null;
    try {
      if (m_charSet == null || m_charSet.length() == 0) {
        reader = new BufferedReader(new InputStreamReader(in));
      } else {
        reader = new BufferedReader(new InputStreamReader(in, m_charSet));
      }

      StringBuilder text = new StringBuilder();
      char[] buffer = new char[4096];
      int read;
      while ((read = reader.read(buffer)) != -1) {
        text.append(buffer, 0, read);
      }
      return text.toString();
    } finally {
      // closing the reader closes the stream as well; the stream has to be
      // closed directly if the reader could not be created
      if (reader != null) {
        reader.close();
      } else {
        in.close();
      }
    }
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 7784 $");
  }

  /**
   * Main method. Prints the header and then all instances, read
   * incrementally, to stdout. Without arguments the available options are
   * printed to stderr.
   *
   * @param args the options for the loader, see {@link #listOptions()}
   */
  public static void main(String[] args) {
    if (args.length > 0) {
      try {
        TextDirectoryLoader loader = new TextDirectoryLoader();
        loader.setOptions(args);
        Instances structure = loader.getStructure();
        System.out.println(structure);
        Instance temp = loader.getNextInstance(structure);
        while (temp != null) {
          System.out.println(temp);
          temp = loader.getNextInstance(structure);
        }
      }
      catch (Exception e) {
        e.printStackTrace();
      }
    }
    else {
      System.err.println(
          "\nUsage:\n"
          + "\tTextDirectoryLoader [options]\n"
          + "\n"
          + "Options:\n");

      Enumeration enm = ((OptionHandler) new TextDirectoryLoader()).listOptions();
      while (enm.hasMoreElements()) {
        Option option = (Option) enm.nextElement();
        System.err.println(option.synopsis());
        System.err.println(option.description());
      }

      System.err.println();
    }
  }
}