/* Copyright 2003, Carnegie Mellon, All Rights Reserved */
package edu.cmu.minorthird.classify;
import edu.cmu.minorthird.classify.multi.*;
import edu.cmu.minorthird.classify.sequential.SequenceDataset;
import edu.cmu.minorthird.util.*;
import edu.cmu.minorthird.util.gui.ViewerFrame;
import org.apache.log4j.Logger;
import edu.cmu.minorthird.classify.relational.*;
import java.io.*;
import java.util.*;
/**
* Dataset i/o.
*
* For ordinary datasets, format is one example per line, and the
* format for a line is:
* <pre>
* <code>type subpopid label feature1 feature2 ...</code>
* </pre>where
* <ul>
* <li>type=b or k (for binary or ordinary examples)
* <li>subpopid is NUL or a string, naming the subpopulation from which the example was drawn
* <li>label is +1 or -1 for binary
* <li>features are a feature name (for binary features) and a featureName=value for numeric
* features.
* </ul>
*
* For example: <code>
* k subpop1 2 f1=4 fOrange=1 fGreen=92 ...
* k subpop1 1 f1=1 fBlue=10 fGreen=2 ...
* k subpop2 3 f1=2 fYellow=1 fRed=42 ...
* </code> or <code>
* b NUL +1 f1=2 fOrange=1 fGreen=92 ...
* b NUL -1 f1=1 fBlue=10 fGreen=2 ...
* </code> or <code>
* k subpop1 2 f1 fOrange fGreen ...
* k subpop1 1 f1 fBlue fGreen ...
* k subpop2 3 f1 fYellow fRed ...
* </code>
*
* For SequenceDatasets, examples from a diffrerent sequence are separated by a single line
* containing a "*".
*
* @author William Cohen
*/
public class DatasetLoader{
static private Logger log=Logger.getLogger(DatasetLoader.class);
static private final StringEncoder stringCoder=new StringEncoder('%'," \t");
static private final StringEncoder featureCoder=
new StringEncoder('%',"=. \t");
static private Map<String,ClassLabel> classLabelDict=new HashMap<String,ClassLabel>();
static{
classLabelDict.put(ExampleSchema.POS_CLASS_NAME,ClassLabel
.positiveLabel(+1));
classLabelDict.put(ExampleSchema.NEG_CLASS_NAME,ClassLabel
.negativeLabel(-1));
}
/** Save a dataset to a file. This should save each example in
* the order provided by the dataset.iterator()
*/
static public void save(Dataset dataset,File file) throws IOException{
PrintStream out=new PrintStream(new FileOutputStream(file));
for(Iterator<Example> i=dataset.iterator();i.hasNext();){
out.println(asParsableString(i.next()));
}
}
/** Save a dataset that can be used for regression */
static public void saveRegression(Dataset dataset,File file)
throws IOException{
PrintStream out=new PrintStream(new FileOutputStream(file));
for(Iterator<Example> i=dataset.iterator();i.hasNext();){
Example x=i.next();
StringBuffer buf=new StringBuffer("");
buf.append(x.getLabel().posWeight());
buf.append('\t');
buf.append(asParsableString(x));
out.println(buf.toString());
}
out.close();
}
/** Save a dataset that can be used for regression */
static public Dataset loadRegression(File file) throws IOException,
NumberFormatException{
Dataset dataset=new BasicDataset();
LineNumberReader in=new LineNumberReader(new FileReader(file));
String line;
while((line=in.readLine())!=null){
int tab=line.indexOf('\t');
Example x=parseLine(line.substring(tab+1),file,in);
double score=StringUtil.atof(line.substring(0,tab));
dataset.add(new Example(x.asInstance(),ClassLabel.positiveLabel(score)));
}
log.info("loaded "+dataset.size()+" examples from "+file.getName());
in.close();
return dataset;
}
/** Load a dataset from a file */
static public Dataset loadFile(File file) throws IOException,
NumberFormatException{
Dataset dataset=new BasicDataset();
ProgressCounter pc=
new ProgressCounter("loading file "+file.getName(),"line");
LineNumberReader in=new LineNumberReader(new FileReader(file));
String line;
while((line=in.readLine())!=null){
dataset.add(parseLine(line,file,in));
pc.progress();
}
log.info("loaded "+dataset.size()+" examples from "+file.getName());
in.close();
pc.finished();
return dataset;
}
/** Load a relational dataset from a file specifying objs */
static public void loadRelFile(File file,RealRelationalDataset dataset)
throws IOException,NumberFormatException{
// Dataset dataset = new BasicDataset();
ProgressCounter pc=
new ProgressCounter("loading file "+file.getName(),"line");
LineNumberReader in=new LineNumberReader(new FileReader(file));
String line;
while((line=in.readLine())!=null){
dataset.addSGM(RelparseLine(line,file,in));
// System.out.println(dataset);
pc.progress();
}
log.info("loaded "+dataset.size()+" examples from "+file.getName());
in.close();
pc.finished();
// return dataset;
}
/** Load a link file */
static public void loadLinkFile(File file,RealRelationalDataset dataset)
throws IOException,NumberFormatException{
// Dataset dataset = new RealRelationalDataset();
ProgressCounter pc=
new ProgressCounter("loading file "+file.getName(),"line");
LineNumberReader in=new LineNumberReader(new FileReader(file));
String line;
while((line=in.readLine())!=null){
CoreRelationalDataset.addLink(LinkparseLine(line,file,in));
pc.progress();
}
log.info("loaded "+dataset.size()+" examples from "+file.getName());
in.close();
pc.finished();
// return dataset;
}
/** Load a relational template file */
static public void loadRelTempFile(File file,RealRelationalDataset dataset)
throws IOException,NumberFormatException{
// Dataset dataset = new RealRelationalDataset();
ProgressCounter pc=
new ProgressCounter("loading file "+file.getName(),"line");
LineNumberReader in=new LineNumberReader(new FileReader(file));
String line;
while((line=in.readLine())!=null){
String[] arr=line.split("\\s+");
if(arr.length<3)
throw new IllegalArgumentException("too few values at line#"+
in.getLineNumber()+" of "+file.getName());
if(!arr[1].equals("ON"))
throw new IllegalArgumentException(
"the format of the relational template is COUNT ON LEFT");
RealRelationalDataset.addAggregator(arr[0],arr[2]);
pc.progress();
}
log.info("loaded "+dataset.size()+" examples from "+file.getName());
in.close();
pc.finished();
// return dataset;
}
/** Load a dataset from a file */
static public Dataset loadMulti(File file,int numDim) throws IOException,
NumberFormatException{
MultiDataset dataset=new MultiDataset();
ProgressCounter pc=
new ProgressCounter("loading file "+file.getName(),"line");
LineNumberReader in=new LineNumberReader(new FileReader(file));
String line;
while((line=in.readLine())!=null){
dataset.addMulti(parseMultiLine(line,file,in,numDim));
pc.progress();
}
log.info("loaded "+dataset.size()+" examples from "+file.getName());
in.close();
pc.finished();
return dataset;
}
/**
* Save a SequenceDataset to a file
*
* Each Example in a sequence is saved on a seperate line.
* An asterix (*) alone on a line seperates the sequences.
*
* Example1,0
* Example1,1
* Example1,2
* ...
* *
* Example2,0
* Example2,1
* ...
* *
*/
static public void saveSequence(SequenceDataset dataset,File file)
throws IOException{
PrintStream out=new PrintStream(new FileOutputStream(file));
for(Iterator<Example[]> i=dataset.sequenceIterator();i.hasNext();){
Example[] seq=i.next();
for(int j=0;j<seq.length;j++){
out.println(asParsableString(seq[j]));
}
out.println("*");
}
out.close();
}
/**
* Load a SequenceDataset from a file
* @see #saveSequence for format
*/
static public SequenceDataset loadSequence(File file) throws IOException,
NumberFormatException{
SequenceDataset dataset=new SequenceDataset();
LineNumberReader in=new LineNumberReader(new FileReader(file));
String line;
List<Example> list=new ArrayList<Example>();
while((line=in.readLine())!=null){
if("*".equals(line))
clearBuffer(list,dataset);
else
list.add(parseLine(line,file,in));
}
if(list.size()>0){
clearBuffer(list,dataset);
}
log.info("loaded "+dataset.size()+" examples from "+file.getName());
in.close();
return dataset;
}
static private void clearBuffer(List<Example> list,SequenceDataset dataset){
Example[] seq=list.toArray(new Example[list.size()]);
dataset.addSequence(seq);
list.clear();
}
static private String asParsableString(Example x){
StringBuffer buf=new StringBuffer("");
//buf.append((x instanceof BinaryExample) ? "b" : "k" );
buf.append('k'); // backward compatibility for binary examples
buf.append(' ');
buf.append(stringCoder.encode(x.getSubpopulationId()!=null?x
.getSubpopulationId():"NUL"));
buf.append(' ');
buf.append(stringCoder.encode(x.getLabel().bestClassName()));
buf.append(' ');
appendParsableFeatures(buf,x);
return buf.toString();
}
static void appendParsableFeatures(StringBuffer buf,Example x){
for(Iterator<Feature> i=x.binaryFeatureIterator();i.hasNext();){
Feature f=i.next();
buf.append(' ');
for(int j=0;j<f.size();j++){
if(j>0)
buf.append('.');
buf.append(featureCoder.encode(f.getPart(j)));
}
}
for(Iterator<Feature> i=x.numericFeatureIterator();i.hasNext();){
Feature f=i.next();
buf.append(' ');
for(int j=0;j<f.size();j++){
if(j>0)
buf.append('.');
buf.append(featureCoder.encode(f.getPart(j)));
}
buf.append("="+x.getWeight(f));
}
}
/** The value that will be returned by example.getSource() for the example
* read in from the designated location.
*/
static public String getSourceAssignedToExample(String fileName,int lineNumber){
return fileName+":"+lineNumber;
}
static private Example parseLine(String line,File file,LineNumberReader in){
String[] arr=line.split("\\s+");
if(arr.length<3)
throw new IllegalArgumentException("too few values at line#"+
in.getLineNumber()+" of "+file.getName());
for(int i=0;i<3;i++)
arr[i]=stringCoder.decode(arr[i]);
String subpopulationId=arr[1];
String source=getSourceAssignedToExample(file.getName(),in.getLineNumber());
if("NUL".equals(arr[1]))
subpopulationId=null;
MutableInstance instance=new MutableInstance(source,subpopulationId);
for(int i=3;i<arr.length;i++){
int eqPos=arr[i].indexOf("=");
if(eqPos>=0){
try{
String feature=arr[i].substring(0,eqPos);
String value=arr[i].substring(eqPos+1);
double weight=Double.parseDouble(value);
instance.addNumeric(parseFeatureName(feature),weight);
}catch(NumberFormatException e){
throw new IllegalArgumentException("bad feature# "+i+" line#"+
in.getLineNumber()+" of "+file.getName());
}
}else{
instance.addBinary(parseFeatureName(arr[i]));
}
}
ClassLabel label=classLabelDict.get(arr[2]);
if(label==null){
if("b".equals(arr[0])){
throw new IllegalArgumentException("should be POS/NEG but label is '"+
arr[2]+"' at line#"+in.getLineNumber()+" of "+file.getName());
}
classLabelDict.put(arr[2],(label=new ClassLabel(arr[2])));
}
return new Example(instance,label);
}
static private SGMExample RelparseLine(String line,File file,
LineNumberReader in){
String[] arr=line.split("\\s+");
if(arr.length<4)
throw new IllegalArgumentException("too few values at line#"+
in.getLineNumber()+" of "+file.getName());
String ID=arr[0];
for(int i=1;i<4;i++)
arr[i]=stringCoder.decode(arr[i]);
String subpopulationId=arr[2];
String source=getSourceAssignedToExample(file.getName(),in.getLineNumber());
if("NUL".equals(arr[2]))
subpopulationId=null;
MutableInstance instance=new MutableInstance(source,subpopulationId);
for(int i=4;i<arr.length;i++){
int eqPos=arr[i].indexOf("=");
if(eqPos>=0){
try{
String feature=arr[i].substring(0,eqPos);
String value=arr[i].substring(eqPos+1);
double weight=Double.parseDouble(value);
instance.addNumeric(parseFeatureName(feature),weight);
}catch(NumberFormatException e){
throw new IllegalArgumentException("bad feature# "+i+" line#"+
in.getLineNumber()+" of "+file.getName());
}
}else{
instance.addBinary(parseFeatureName(arr[i]));
}
}
ClassLabel label=classLabelDict.get(arr[3]);
if(label==null){
if("b".equals(arr[1])){
throw new IllegalArgumentException("should be POS/NEG but label is '"+
arr[3]+"' at line#"+in.getLineNumber()+" of "+file.getName());
}
classLabelDict.put(arr[3],(label=new ClassLabel(arr[3])));
}
return new SGMExample(instance,label,ID);
}
static private Link LinkparseLine(String line,File file,LineNumberReader in){
String[] arr=line.split("\\s+");
if(arr.length<3)
throw new IllegalArgumentException("too few values at line#"+
in.getLineNumber()+" of "+file.getName());
return new Link(arr[0],arr[1],arr[2]);
}
static private MultiExample parseMultiLine(String line,File file,
LineNumberReader in,int numDim){
String[] arr=line.split("\\s+");
if(arr.length<2+numDim)
throw new IllegalArgumentException("too few values at line#"+
in.getLineNumber()+" of "+file.getName());
for(int i=0;i<2+numDim;i++)
arr[i]=stringCoder.decode(arr[i]);
String subpopulationId=arr[1];
String source=file.getName()+":"+in.getLineNumber();
if("NUL".equals(arr[1]))
subpopulationId=null;
MutableInstance instance=new MutableInstance(source,subpopulationId);
for(int i=2+numDim;i<arr.length;i++){
int eqPos=arr[i].indexOf("=");
if(eqPos>=0){
try{
String feature=arr[i].substring(0,eqPos);
String value=arr[i].substring(eqPos+1);
double weight=Double.parseDouble(value);
instance.addNumeric(parseFeatureName(feature),weight);
}catch(NumberFormatException e){
throw new IllegalArgumentException("bad feature# "+i+" line#"+
in.getLineNumber()+" of "+file.getName());
}
}else{
instance.addBinary(parseFeatureName(arr[i]));
}
}
ClassLabel[] labels=new ClassLabel[numDim];
for(int i=2;i<2+numDim;i++){
ClassLabel label=classLabelDict.get(arr[i]);
labels[i-2]=label;
}
for(int i=0;i<labels.length;i++){
if(labels[i]==null){
if("b".equals(arr[0])){
throw new IllegalArgumentException(
"should be POS/NEG but label is '"+arr[2+i]+"' at line#"+
in.getLineNumber()+" of "+file.getName());
}
classLabelDict.put(arr[2+i],(labels[i]=new ClassLabel(arr[2+i])));
}
}
//
MultiClassLabel multiLabel=new MultiClassLabel(labels);
return new MultiExample(instance,multiLabel);
}
static private Feature parseFeatureName(String string){
String[] featureParts=string.split("\\.");
for(int j=0;j<featureParts.length;j++)
featureParts[j]=featureCoder.decode(featureParts[j]);
return new Feature(featureParts);
}
public static Dataset loadSVMStyle(File file) throws IOException{
Dataset dataset=new BasicDataset();
BufferedReader in=new BufferedReader(new FileReader(file));
while(in.ready()){
String line=in.readLine();
StringTokenizer st=new StringTokenizer(line," \t\n\r\f:");
//label - yes this is necessary:
//the original string representation and the reconstituted versions are different.
//ex: (string)+1 => (double)1.0 => (string)1.0
MutableInstance instance=new MutableInstance();
String label=st.nextToken();
double labelDouble=Double.parseDouble(label);
label=""+labelDouble;
//num features
// int numFeatures = st.countTokens()/2;
while(st.hasMoreTokens()){
//add features to instance
// note for svm these should be numeric
String featureName=st.nextToken();
String featureValue=st.nextToken();
instance.addNumeric(new Feature(featureName),Double
.parseDouble(featureValue));
}
//build Example
Example example=new Example(instance,ClassLabel.binaryLabel(labelDouble));
dataset.add(example);
}
return dataset;
}
/**
* Calls loadFile. The Dataset is temporarily swallowed.
* In other words, don't call this method.
* @param f
* @throws IOException
*/
public Object load(File f) throws IOException{
return loadFile(f);
}
static public void main(String[] args){
try{
boolean sequential=args[0].startsWith("-seq");
boolean regression=args[0].startsWith("-reg");
String dbName=(sequential||regression)?args[1]:args[0];
Dataset d=null;
if(sequential)
d=DatasetLoader.loadSequence(new File(dbName));
else if(regression)
d=DatasetLoader.loadRegression(new File(dbName));
else
d=DatasetLoader.loadFile(new File(dbName));
new ViewerFrame("Data from "+dbName,d.toGUI());
}catch(Exception e){
e.printStackTrace();
System.out.println("usage: file");
}
}
}