package weka.datagenerators;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.SparseInstance;
import weka.core.Utils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.Vector;
import java.util.regex.Pattern;
/**
* Reads a collection of text documents and transforms them into
* sparse vectors. The sparse vectors are then put into an ARFF file
* for further processing by WEKA.
*
* <p><b>WEKA options:</b>
* <ul>
* <li><code>-I</code> - Include TFIDF scores instead of TF.
*
* <li><code>-R &lt;str&gt;</code> - The document reader. Now only
* one is supported, namely <code>directory</code>. This parameter
* has no default value and is not optional.
*
* <li><code>-L &lt;str&gt;</code> - The lexer. Now only one lexer
* is supported, namely <code>simple</code>. This parameter has no
* default value and is not optional.
*
* <li><code>-F &lt;str&gt;[:&lt;str&gt;...]</code> - A
* colon-separated list of filters being applied on the tokens.
* Four filters are supported, namely <code>lower_case</code>,
* <code>porter_stemmer</code>, <code>stop_word</code>, and
* <code>word_length</code>. Order of listing is significant. For
* example, if the value for <code>filters</code> is
* <code>stop_word:porter_stemmer</code>, then the
* <code>stop_word</code> filter is applied before
* <code>porter_stemmer</code>. By default the list is empty.
*
* <li>Document readers, filters and lexers have their own
* parameters. See their documentation for detail.
* </ul>
*
* <p>The generic generator options <code>-a</code>, <code>-c</code>
* and <code>-n</code> are ignored.
*
* <p>Here are some sample command lines:
*
<pre>
java weka.datagenerators.TextSource
-r news -R directory -D cmu-newsgroup-random-100/
-L simple -y whitespace -o news.arff
</pre>
*
* <p>The name of the dataset is <code>news</code>. We use the
* <code>directory</code> document reader. The directory being read
* is <code>cmu-newsgroup-random-100/</code>. We use the
* <code>simple</code> lexer and all tokens are delimited by
* whitespace. The output file is <code>news.arff</code>.
*
<pre>
java weka.datagenerators.TextSource
-r news -R directory -D cmu-newsgroup-random-100/
-L simple -y alphanum -o news.arff
</pre>
*
* <p>In this case all tokens consist of only alphanumeric characters.
*
<pre>
java weka.datagenerators.TextSource
-r news -R directory -D cmu-newsgroup-random-100/
-L simple -y alpha -o news.arff
</pre>
*
* <p>All tokens consist of alphabetic characters only.
*
<pre>
java weka.datagenerators.TextSource
-r news -R directory -D cmu-newsgroup-random-100/
-L simple -y alpha -F lower_case -o news.arff
</pre>
*
* <p>All tokens are converted to lower case before being indexed.
*
<pre>
java weka.datagenerators.TextSource
-r news -R directory -D cmu-newsgroup-random-100/
-L simple -y alpha -F lower_case:stop_word -o news.arff
</pre>
*
* <p>All stop words are removed. The default SMART stop list is used.
*
<pre>
java weka.datagenerators.TextSource
-r news -R directory -D cmu-newsgroup-random-100/
-L simple -y alpha -F lower_case:stop_word:porter_stemmer -o news.arff
</pre>
*
* <p>After removing the stop words, we apply the Porter stemmer.
*
<pre>
java weka.datagenerators.TextSource
-r news -R directory -D cmu-newsgroup-random-100/
-L simple -y alpha
-F lower_case:stop_word:porter_stemmer:word_length -N 5 -o news.arff
</pre>
*
* <p>After stemming the tokens, we throw away all tokens whose length
* is less than five.
*
<pre>
java weka.datagenerators.TextSource
-r news -R directory -D cmu-newsgroup-random-100/
-L simple -y alpha
-F lower_case:stop_word:word_length:porter_stemmer -N 5 -o news.arff
</pre>
*
* <p>We throw away tokens whose length is less than five before
* applying the Porter stemmer.
*
<pre>
java weka.datagenerators.TextSource
-r news -R directory -D cmu-newsgroup-random-100/ -u 'talk.*'
-L simple -y alpha
-F lower_case:stop_word:word_length:porter_stemmer -N 5 -o news.arff
</pre>
*
* <p>Read only documents that belong to the classes
* <code>talk.*</code>. The argument for <code>-u</code> can be any
* regular expression.
*
* @author ywwong
* @version $Id: TextSource.java,v 1.1.1.1 2003/01/22 07:48:27 mbilenko Exp $
*/
public class TextSource extends Generator
implements OptionHandler, Serializable {
/** A simpler wrapper for int than Integer. */
public class Int implements Comparable {
public int m_i;
public Int(int i) { m_i = i; }
public int compareTo(Object o) {
Int n = (Int) o;
if (m_i < n.m_i)
return -1;
else if (m_i == n.m_i)
return 0;
else
return 1;
}
public boolean equals(Object n) {
if (n.getClass() == Int.class)
return m_i == ((Int) n).m_i;
else
return false;
}
public int hashCode() { return m_i; }
public String toString() { return Integer.toString(m_i); }
}
/** A simpler wrapper for double than Double. */
public class Real {
    /** The wrapped value; mutable so callers can update it in place. */
    public double m_d;

    public Real(double d) {
        m_d = d;
    }

    /** Renders the value exactly as Double.toString() would. */
    public String toString() {
        return String.valueOf(m_d);
    }
}
/** Sparse map data row structure with public hash map. */
public class DataRow {
    /** The class value of this row. */
    public double m_dClass;
    /** Maps Int attribute index to Real value; zero entries are absent. */
    public TreeMap m_data;

    public DataRow() {
        m_data = new TreeMap();
    }

    /** Stores a value at an index; a value of zero removes the entry. */
    public void set(Int nIndex, Real dVal) {
        if (dVal.m_d != 0.0)
            m_data.put(nIndex, dVal);
        else
            m_data.remove(nIndex);
    }

    /** Records the class value of this row. */
    public void setClass(Real dClass) {
        m_dClass = dClass.m_d;
    }

    // WEKA specific.
    /** Converts this row into a SparseInstance tied to the table's format. */
    public Instance makeInstance(Table table) {
        int nSize = m_data.size();
        // One extra slot holds the class value at the class index.
        double[] aVals = new double[nSize + 1];
        int[] aIndices = new int[nSize + 1];
        int nPos = 0;
        Iterator it = m_data.entrySet().iterator();
        while (it.hasNext()) {
            Entry ent = (Entry) it.next();
            aIndices[nPos] = ((Int) ent.getKey()).m_i;
            aVals[nPos] = ((Real) ent.getValue()).m_d;
            nPos++;
        }
        aIndices[nSize] = table.m_nIndex;
        aVals[nSize] = m_dClass;
        Instance inst = new SparseInstance(0.0, aVals, aIndices,
                                           table.m_format.numAttributes());
        inst.setDataset(table.m_format);
        return inst;
    }
}
/** Table that allows incremental addition of attributes. */
public class Table {
// Owning generator; supplies the class map and relation name.
protected TextSource m_ts;
// One Attribute per distinct token, in token-ID order.
protected FastVector m_attribs;
// Dataset header; built by makeDataFormat().
protected Instances m_format;
// All DataRow objects read so far, in document order.
protected LinkedList m_data;
protected int m_nIndex; // class index
protected ListIterator m_it; // used by getNextInstance()
public Table(TextSource ts) {
m_ts = ts;
m_attribs = new FastVector();
m_data = new LinkedList();
m_it = null;
}
/** Appends one finished row to the table. */
public void add(DataRow vector) {
m_data.add(vector);
}
/** Appends one attribute; its index is the current attribute count. */
public void addAttribute(Attribute attrib) {
m_attribs.addElement(attrib);
}
// WEKA specific.
/**
 * Builds the Instances header: all token attributes plus a nominal
 * "__class__" attribute appended last, which becomes the class index.
 * Also pushes example/class/attribute counts back into the generator.
 */
public Instances makeDataFormat() throws Exception {
FastVector attribs;
FastVector aClasses;
Set setKeys;
Iterator it;
// Add class index as one of the attributes.
// m_hashClasses is a LinkedHashMap, so label order is insertion order.
aClasses = new FastVector(m_ts.m_hashClasses.size());
setKeys = m_ts.m_hashClasses.keySet();
for (it = setKeys.iterator(); it.hasNext(); )
aClasses.addElement(it.next());
m_nIndex = m_attribs.size();
attribs = (FastVector) m_attribs.copy();
attribs.addElement(new Attribute("__class__", aClasses));
m_format = new Instances(m_ts.getRelationName(), attribs, 0);
m_format.setClassIndex(m_nIndex);
// Update generator variables.
m_ts.setNumClasses(aClasses.size());
m_ts.setNumExamples(m_data.size());
m_ts.setNumExamplesAct(m_data.size());
m_ts.setNumAttributes(m_attribs.size() + 1);
return m_format;
}
// WEKA specific.
/**
 * Returns the next row as an Instance, lazily starting iteration on the
 * first call. NOTE(review): no hasNext() guard — calling this more than
 * getNumExamples() times throws NoSuchElementException; callers appear
 * to rely on the example count to stop.
 */
public Instance getNextInstance() {
if (m_it == null)
m_it = m_data.listIterator();
return ((DataRow) m_it.next()).makeInstance(this);
}
}
/** Information about a particular token. */
protected class Token {
    /** The token string. */
    public String m_strToken;
    /** The token ID, which equals this token's attribute index. */
    public Int m_nID;
    /** The document frequency (starts at zero, bumped once per document). */
    public int m_nDF;

    public Token(String str, Int id) {
        m_strToken = str;
        m_nID = id;
        m_nDF = 0;
    }
}
/** The example table holding all rows read so far. */
protected Table m_table;
/** A map for looking up tokens: token string -> Token. */
protected HashMap m_hashTokens;
/** An ordered list for looking up tokens: m_aTokens.get(n) has ID n. */
protected ArrayList m_aTokens;
/** The next token ID; assigned sequentially as tokens are first seen. */
protected int m_nNextToken;
/** A map for looking up classes: class name -> Real ID (insertion-ordered). */
protected LinkedHashMap m_hashClasses;
/** The next class ID; assigned sequentially as classes are registered. */
protected double m_dNextClass;
/** Collect TFIDF statistics instead of TF (set by the -I flag). */
protected boolean m_bTFIDF;
/** The document reader (created from the -R option). */
protected DocumentReader m_reader;
/** The lexer (created from the -L option). */
protected Lexer m_lexer;
/** The list of token filters which are applied in order (from -F). */
protected LinkedList m_lstFilters;
/**
 * Creates a generator with an empty vocabulary and class map; token and
 * class IDs are handed out sequentially as they are first encountered.
 */
public TextSource() {
    m_hashTokens = new HashMap();
    m_aTokens = new ArrayList();
    m_nNextToken = 0;
    m_hashClasses = new LinkedHashMap();
    m_dNextClass = 0.0;
    m_table = new Table(this);
    m_bTFIDF = false;
    m_bFormatDefined = false;
}
/**
 * Called by document readers to intern a class label. Returns the
 * existing ID if the label has been seen before; otherwise assigns the
 * next sequential ID and remembers it.
 *
 * @param strClass the class label to register
 * @return the (shared) Real holding the numeric class ID
 */
public Real registerClass(String strClass) {
    Real existing = (Real) m_hashClasses.get(strClass);
    if (existing != null)
        return existing;
    Real fresh = new Real(m_dNextClass);
    m_dNextClass += 1.0;
    m_hashClasses.put(strClass, fresh);
    return fresh;
}
/**
 * Tokenizes a document and transforms it into a sparse vector.
 *
 * <p>Each token from the lexer is pushed through the filter chain (a
 * filter may drop a token by returning null). Surviving tokens are
 * assigned attribute indices on first sight, and the vector accumulates
 * raw term frequencies. Document frequencies in m_aTokens are bumped
 * once per distinct token occurring in this document.
 *
 * @param dClass The class index of the document to be read.
 * @return the document as a sparse row of term frequencies
 * @throws IOException if the lexer fails while reading the document
 */
protected DataRow getInstance(Real dClass) throws IOException {
DataRow vector;
String strToken;
ListIterator itFilter;
TokenFilter filter;
Token token;
Int nTokenID;
Real dTF;
Attribute attrib;
Set setKeys;
Iterator itKey;
vector = new DataRow();
vector.setClass(dClass);
strToken = m_lexer.nextToken();
while (strToken != null) {
// Push token through the filters.
// Filters run in -F listing order; a null return drops the token.
for (itFilter = m_lstFilters.listIterator();
itFilter.hasNext(); ) {
filter = (TokenFilter) itFilter.next();
strToken = filter.apply(strToken);
if (strToken == null)
break;
}
if (strToken != null) {
// Update token info.
token = (Token) m_hashTokens.get(strToken);
if (token != null)
nTokenID = token.m_nID;
else {
// First sighting: allot the next ID and a matching attribute.
nTokenID = new Int(m_nNextToken++);
token = new Token(strToken, nTokenID);
attrib = new Attribute(strToken);
m_table.addAttribute(attrib);
m_hashTokens.put(strToken, token);
// The token with ID n can be found in m_aTokens[n].
m_aTokens.add(token);
}
// Update sparse vector.
// Bump the TF in place if present, else start it at 1.
dTF = (Real) vector.m_data.get(nTokenID);
if (dTF != null)
dTF.m_d += 1.0;
else
vector.m_data.put(nTokenID, new Real(1.0));
}
strToken = m_lexer.nextToken();
}
// Update token info again.
// Each key appears once per document, so DF is bumped at most once here.
setKeys = vector.m_data.keySet();
for (itKey = setKeys.iterator(); itKey.hasNext(); ) {
nTokenID = (Int) itKey.next();
token = (Token) m_aTokens.get(nTokenID.m_i);
++token.m_nDF;
}
return vector;
}
/**
 * Reads all documents and converts them all to sparse vectors.
 *
 * <p>If -I was given, term frequencies are rewritten in place to TFIDF:
 * each TF is normalized by the maximum TF in its document, then scaled
 * by ln(numDocs / documentFrequency).
 *
 * @throws Exception if the reader or lexer fails
 */
protected void readInstances() throws Exception {
DataRow vector;
while (m_reader.hasNextDocument()) {
vector = getInstance(m_reader.nextDocument());
m_table.add(vector);
}
// Convert to TFIDF if necessary.
if (m_bTFIDF) {
Iterator itr, itw;
DataRow row;
Entry ent;
Real r;
Token t;
double nDocs, max, d;
int nTokens, i;
nDocs = m_table.m_data.size();
nTokens = m_aTokens.size();
for (itr = m_table.m_data.iterator(); itr.hasNext(); ) {
row = (DataRow) itr.next();
// First pass: find the maximum TF in this document.
max = 0.0;
for (itw = row.m_data.entrySet().iterator(); itw.hasNext(); ) {
ent = (Entry) itw.next();
// Defensive bound check against non-token indices; in practice
// all keys are token IDs (< nTokens) since the class value is
// stored separately in m_dClass.
if (((Int) ent.getKey()).m_i < nTokens) {
d = ((Real) ent.getValue()).m_d;
if (max < d)
max = d;
}
}
// Second pass: rewrite each TF to (TF / maxTF) * ln(nDocs / DF),
// mutating the shared Real values in place.
for (itw = row.m_data.entrySet().iterator(); itw.hasNext(); ) {
ent = (Entry) itw.next();
i = ((Int) ent.getKey()).m_i;
if (i < nTokens) {
r = (Real) ent.getValue();
t = ((Token) m_aTokens.get(i));
r.m_d /= max;
r.m_d *= Math.log(nDocs / t.m_nDF);
}
}
}
}
}
////// WEKA specific stuff. //////
/** The option string for document reader (-R); null until setOptions(). */
protected String m_strDocReader;
/** The option string for lexer (-L); null until setOptions(). */
protected String m_strLexer;
/** The option string for token filters (-F); null until setOptions(). */
protected String m_strFilters;
/** True iff defineDataFormat() has been called. */
protected boolean m_bFormatDefined;
/**
 * Returns a short description of this generator for display purposes.
 *
 * @return a one-sentence description of what the generator does
 */
public String globalInfo() {
    return "A data generator that reads a collection of text documents "
        + "and transforms them into sparse vectors.";
}
/**
 * Lists all available options: this generator's own flags followed by
 * the options of every known document reader, lexer and token filter.
 *
 * @return an enumeration of Option objects
 */
public Enumeration listOptions() {
    Vector vec = new Vector();
    vec.add(new Option("\tCompute TFIDF instead of TF (default false)",
                       "I", 0, "-I"));
    vec.add(new Option("\tDocument reader", "R", 1, "-R <str>"));
    vec.add(new Option("\tLexer", "L", 1, "-L <str>"));
    vec.add(new Option("\tFilters (default empty)",
                       "F", 1, "-F <str>[:<str>...]"));
    // Sub-components contribute their own options.
    vec.addAll(DirectoryDocumentReader.listOptions());
    vec.addAll(SimpleLexer.listOptions());
    vec.addAll(LowerCaseFilter.listOptions());
    vec.addAll(PorterStemmer.listOptions());
    vec.addAll(StopWordFilter.listOptions());
    vec.addAll(WordLengthFilter.listOptions());
    return vec.elements();
}
/**
 * Parses the option string and instantiates the document reader, lexer
 * and filter chain. Remaining options are passed through to each
 * sub-component's constructor.
 *
 * <p>Fixes over the previous version: the unused local {@code Integer n}
 * is removed and locals are declared where first used.
 *
 * @param options the command-line options; consumed entries are cleared
 * @throws Exception if -R or -L is missing or has an invalid value, or
 *                   if -F names an unknown filter
 */
public void setOptions(String[] options) throws Exception {
    m_bTFIDF = Utils.getFlag('I', options);

    // Document reader: mandatory, only "directory" is supported.
    m_strDocReader = Utils.getOption('R', options);
    if (m_strDocReader.length() == 0)
        throw new Exception("Document reader (-R) not set.");
    else if (m_strDocReader.equals("directory"))
        m_reader = new DirectoryDocumentReader(this, options);
    else
        throw new Exception("Invalid document reader (-R).");

    // Lexer: mandatory, only "simple" is supported.
    m_strLexer = Utils.getOption('L', options);
    if (m_strLexer.length() == 0)
        throw new Exception("Lexer (-L) not set.");
    else if (m_strLexer.equals("simple"))
        m_lexer = new SimpleLexer(this, m_reader, options);
    else
        throw new Exception("Invalid lexer (-L).");

    // Filters: optional colon-separated list, applied in listing order.
    m_strFilters = Utils.getOption('F', options);
    m_lstFilters = new LinkedList();
    if (m_strFilters.length() > 0) {
        Pattern patSep = Pattern.compile(":");
        String[] aFilters = patSep.split(m_strFilters);
        for (int i = 0; i < aFilters.length; ++i) {
            if (aFilters[i].length() == 0)
                continue;  // tolerate empty segments such as "a::b"
            if (aFilters[i].equals("lower_case"))
                m_lstFilters.addLast(new LowerCaseFilter(this, options));
            else if (aFilters[i].equals("porter_stemmer"))
                m_lstFilters.addLast(new PorterStemmer(this, options));
            else if (aFilters[i].equals("stop_word"))
                m_lstFilters.addLast(new StopWordFilter(this, options));
            else if (aFilters[i].equals("word_length"))
                m_lstFilters.addLast(new WordLengthFilter(this, options));
            else
                throw new Exception
                    ("Invalid filter (-F): " + aFilters[i] + ".");
        }
    }
}
/**
 * Returns the current option settings, including those of the document
 * reader, the lexer and every configured filter.
 *
 * <p>Fix over the previous version: this method is null-safe when
 * called before {@link #setOptions} (e.g. on a freshly constructed
 * generator), where m_strDocReader, m_strLexer, m_strFilters, m_reader
 * and m_lexer are all still null and the old code threw a
 * NullPointerException.
 *
 * @return the options as a string array, in -I, -R, -L, -F order
 */
public String[] getOptions() {
    ArrayList aOpts = new ArrayList();
    if (m_bTFIDF)
        aOpts.add("-I");
    if (m_strDocReader != null) {
        // m_reader is always non-null when m_strDocReader is set.
        aOpts.add("-R");
        aOpts.add(m_strDocReader);
        aOpts.addAll(m_reader.getOptions());
    }
    if (m_strLexer != null) {
        // m_lexer is always non-null when m_strLexer is set.
        aOpts.add("-L");
        aOpts.add(m_strLexer);
        aOpts.addAll(m_lexer.getOptions());
    }
    if (m_strFilters != null && m_strFilters.length() > 0) {
        aOpts.add("-F");
        aOpts.add(m_strFilters);
        for (ListIterator it = m_lstFilters.listIterator(); it.hasNext(); ) {
            TokenFilter filter = (TokenFilter) it.next();
            aOpts.addAll(filter.getOptions());
        }
    }
    String[] array = new String[aOpts.size()];
    return (String[]) aOpts.toArray(array);
}
/**
 * Reads every document and defines the dataset format.
 *
 * <p>Fix over the previous version: the format-defined flag is set only
 * after reading and format construction succeed, so a failure here no
 * longer leaves the generator claiming a valid format (which would let
 * generateExample() run against a half-built table).
 *
 * @return the dataset header (token attributes plus "__class__")
 * @throws Exception if reading the documents or building the format fails
 */
public Instances defineDataFormat() throws Exception {
    readInstances();
    Instances format = m_table.makeDataFormat();
    m_bFormatDefined = true;
    return format;
}
/**
 * Returns the next document as a sparse instance.
 *
 * @return the next example in document order
 * @throws Exception if defineDataFormat() has not been called yet
 */
public Instance generateExample() throws Exception {
    if (m_bFormatDefined)
        return m_table.getNextInstance();
    throw new Exception("Dataset format not defined.");
}
/**
 * Batch generation is not supported; this generator works in single
 * (incremental) mode only. See getSingleModeFlag().
 *
 * @throws Exception always
 */
public Instances generateExamples() throws Exception {
throw new Exception("Only single mode supported.");
}
/**
 * Returns the comment to append after generation; this generator has
 * nothing to add.
 *
 * @return the empty string
 */
public String generateFinished() throws Exception {
return "";
}
/**
 * Indicates that examples are produced one at a time via
 * generateExample() rather than all at once.
 *
 * @return always true
 */
public boolean getSingleModeFlag() throws Exception {
return true;
}
/**
 * Runs the generator from the command line.
 *
 * @param argv the command-line options (see the class documentation)
 * @throws Exception if option parsing or data generation fails
 */
public static void main(String[] argv) throws Exception {
Generator.makeData(new TextSource(), argv);
}
}