package edu.stanford.nlp.classify;
import java.util.ArrayList;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.io.File;
import edu.stanford.nlp.ling.RVFDatum;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.logging.Redwood;
/**
* A class to read some UCI datasets into RVFDatum. Will incrementally add formats.
*
* @author Kristina Toutanova
* Sep 14, 2004
*
* Made type-safe by Sarah Spikes (sdspikes@cs.stanford.edu)
*/
public class NominalDataReader {

  /**
   * An Index for each feature so that its values are coded as integers.
   * NOTE(review): no method visible in this class reads or writes this field, and it is
   * keyed by String whereas the reader methods below take a map keyed by Integer;
   * confirm whether other code in the package still relies on it.
   */
  Map<String, Index<String>> indices = Generics.newHashMap();

  /** Redwood channel used for this class's diagnostics. */
  static final Redwood.RedwoodChannels logger = Redwood.channels(NominalDataReader.class);

  /**
   * Parses one line of a data file into a datum. The class is the last column, and the
   * next-to-last column is skipped because it is a unique id in the audiology data.
   *
   * <p>The line is split with a {@link StringTokenizer}, so every character of
   * {@code separator} acts as a delimiter on its own and empty fields are never produced.
   *
   * @param line one line of the data file
   * @param separator the set of delimiter characters handed to the tokenizer
   * @param indices per-attribute value indices, keyed by attribute number; missing entries are created
   * @return the datum built from the tokens of {@code line}; a line without tokens yields a
   *         datum with no label and no features
   */
  static RVFDatum<String, Integer> readDatum(String line, String separator, Map<Integer, Index<String>> indices) {
    StringTokenizer st = new StringTokenizer(line, separator);
    // countTokens() is the number of nextToken() calls still available, so the array can be
    // sized up front instead of going through a temporary list.
    String[] arr = new String[st.countTokens()];
    for (int i = 0; i < arr.length; i++) {
      arr[i] = st.nextToken();
    }
    Set<Integer> skip = Generics.newHashSet();
    // With fewer than two tokens this is a negative number that matches no column.
    skip.add(arr.length - 2);
    return readDatum(arr, arr.length - 1, skip, indices);
  }

  /**
   * Builds a datum from already separated column values.
   *
   * <p>The value in {@code classColumn} becomes the label. Every other column not listed in
   * {@code skip} becomes one feature. Features are keyed by a running attribute number that
   * only advances for columns actually used, so it can differ from the column position.
   * The feature's count is the integer position of the column's string value in the
   * {@link Index} registered for that attribute.
   *
   * @param values the column values of one example
   * @param classColumn position in {@code values} of the class label
   * @param skip column positions to ignore
   * @param indices per-attribute value indices, keyed by attribute number; an empty index is
   *                created and stored for any attribute that has none yet
   * @return the datum holding the label and the integer-coded features
   */
  static RVFDatum<String, Integer> readDatum(String[] values, int classColumn, Set<Integer> skip, Map<Integer, Index<String>> indices) {
    ClassicCounter<Integer> c = new ClassicCounter<>();
    RVFDatum<String, Integer> d = new RVFDatum<>(c);
    int attrNo = 0;
    for (int index = 0; index < values.length; index++) {
      String value = values[index];
      if (index == classColumn) {
        d.setLabel(value);
        continue;
      }
      if (skip.contains(index)) {
        continue;
      }
      Integer featKey = attrNo;
      Index<String> ind = indices.get(featKey);
      if (ind == null) {
        ind = new HashIndex<>();
        indices.put(featKey, ind);
      }
      // MG: no isLocked() guard is needed here, since add(E) contains such a condition.
      ind.add(value);
      int valInd = ind.indexOf(value);
      if (valInd == -1) {
        // The value is not in the index (and add did not insert it): fall back to code 0.
        valInd = 0;
        logger.info("unknown attribute value " + value + " of attribute " + attrNo);
      }
      c.incrementCount(featKey, valInd);
      attrNo++;
    }
    return d;
  }

  /**
   * Read the data as a list of RVFDatum objects, one per line of the file. For the test set
   * we must reuse the indices from the training set.
   *
   * <p>Lines are tokenized with the delimiter characters {@code ','} and {@code ' '}.
   *
   * @param filename path of the data file
   * @param indices per-attribute value indices shared across calls
   * @return the examples in file order, or {@code null} if reading or parsing failed; the
   *         failure is passed to this class's logger together with the file name
   */
  static ArrayList<RVFDatum<String, Integer>> readData(String filename, Map<Integer, Index<String>> indices) {
    String sep = ", ";
    ArrayList<RVFDatum<String, Integer>> examples = new ArrayList<>();
    try {
      for (String line : ObjectBank.getLineIterator(new File(filename))) {
        examples.add(readDatum(line, sep, indices));
      }
    } catch (Exception e) {
      // Best effort is kept: callers still receive null, but the failure now goes through
      // the class logger with the file name instead of a bare printStackTrace().
      logger.warn("Could not read nominal data from " + filename, e);
      return null;
    }
    return examples;
  }

}