package rainbownlp.machinelearning.convertor;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import rainbownlp.core.FeatureValuePair;
import rainbownlp.machinelearning.MLExample;
import rainbownlp.machinelearning.MLExampleFeature;
import rainbownlp.util.FileUtil;
import rainbownlp.util.HibernateUtil;
import rainbownlp.util.ConfigurationUtil;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SparseInstance;
import weka.core.converters.ArffSaver;
import weka.core.converters.Saver;
/**
 * Exports machine-learning examples to a Weka ARFF file.
 *
 * <p>Attribute columns are taken from the {@code FeatureValuePair} rows whose
 * {@code tempFeatureIndex} differs from {@code Integer.MAX_VALUE}; one extra
 * nominal attribute named {@code class} is appended as the last column.
 */
public class WekaFormatConvertor {
    // Loaded once, at class initialisation, from the "numClassesRatio" configuration key.
    static int numClassRatio = ConfigurationUtil.getValueInteger("numClassesRatio");

    /**
     * Writes the given examples to {@code filePath} in ARFF format, one sparse
     * instance per example.
     *
     * <p>If {@code filePath} already exists the method returns immediately and
     * leaves the file untouched. Examples whose expected class is {@code null}
     * are logged to the debug file and skipped. If the export fails part-way,
     * the incomplete file is deleted so that the "already exists" shortcut never
     * picks up a truncated file on a later call.
     *
     * @param exampleIdsToWrite ids of the examples to export, resolved through
     *                          {@code MLExample.getExampleById}
     * @param filePath          destination ARFF file
     * @param taskName          relation name given to the Weka data set
     * @param possibleClasses   nominal values of the {@code class} attribute; an
     *                          example's label is looked up here as the integer
     *                          part of its numeric expected class plus one
     * @throws Exception if an example's class label is not in
     *                   {@code possibleClasses}, or if reading the examples or
     *                   writing the file fails
     */
    public static void writeToFile(List<Integer> exampleIdsToWrite, String filePath, String taskName,
            String[] possibleClasses)
            throws Exception
    {
        File target = new File(filePath);
        if (target.exists()) {
            return;
        }

        // One representative FeatureValuePair per tempFeatureIndex defines the columns.
        @SuppressWarnings("unchecked")
        List<FeatureValuePair> fvps = (List<FeatureValuePair>)
                HibernateUtil.executeReader("from FeatureValuePair where tempFeatureIndex<>" + Integer.MAX_VALUE +
                        " group by tempFeatureIndex");

        FastVector classVals = new FastVector();
        for (int i = 0; i < possibleClasses.length; i++) {
            classVals.addElement(possibleClasses[i]);
        }

        FastVector atts = buildAttributes(fvps, classVals);
        Instances data = new Instances(taskName, atts, 0);

        FileOutputStream file_writer = new FileOutputStream(filePath);
        boolean completed = false;
        try {
            ArffSaver saver = new ArffSaver();
            saver.setDestination(file_writer);
            saver.setRetrieval(Saver.INCREMENTAL);
            saver.setStructure(data);

            int counter = 0;
            for (Integer example_id : exampleIdsToWrite) {
                counter++;
                MLExample example = MLExample.getExampleById(example_id);
                if (example.getExpectedClass() == null) {
                    FileUtil.logLine(FileUtil.DEBUG_FILE, "expected class is null!");
                    continue;
                }
                double expectedClass = example.getNumericExpectedClass() + 1;

                double[] vals = buildFeatureValues(example, fvps);
                int classIndex = classVals.indexOf(String.valueOf((int) expectedClass));
                if (classIndex == -1) {
                    throw new Exception("Expected class not found in possible class values: " + expectedClass);
                }
                // The class label occupies the last slot, matching the attribute order.
                vals[vals.length - 1] = classIndex;

                SparseInstance instance = new SparseInstance(1.0, vals);
                instance.setDataset(data);
                saver.writeIncremental(instance);

                FileUtil.logLine(null, "example processed: " + counter + "/" + exampleIdsToWrite.size());
                if (counter % 100 == 0) {
                    file_writer.flush();
                }
            }

            // Passing null signals end-of-data to the incremental saver so that it
            // can write out whatever it is still holding and finish the file.
            saver.writeIncremental(null);
            completed = true;
        } finally {
            try {
                // close() is safe to call even if the saver already closed the stream.
                file_writer.close();
            } finally {
                if (!completed) {
                    // Do not leave a partial file behind: its mere existence would
                    // make the next call return early without regenerating it.
                    target.delete();
                }
            }
        }
    }

    /**
     * Builds the attribute list: one numeric attribute per feature column,
     * followed by the nominal {@code class} attribute.
     */
    private static FastVector buildAttributes(List<FeatureValuePair> fvps, FastVector classVals) {
        FastVector atts = new FastVector();
        for (int i = 0; i < fvps.size(); i++) {
            FeatureValuePair fvp = fvps.get(i);
            if (fvp.getFeatureValueAuxiliary() == null) {
                atts.addElement(new Attribute(fvp.getFeatureName()));
            } else {
                // Columns with an auxiliary value are named feature name + feature value.
                atts.addElement(new Attribute(fvp.getFeatureName() + fvp.getFeatureValue()));
            }
        }
        atts.addElement(new Attribute("class", classVals));
        return atts;
    }

    /**
     * Builds the value row for one example: one slot per feature column plus a
     * trailing slot (left at 0 here) for the class label. Columns the example
     * has no feature for stay at 0.
     */
    private static double[] buildFeatureValues(MLExample example, List<FeatureValuePair> fvps) {
        // Index the example's features by tempFeatureIndex once, keeping the first
        // occurrence of each index, instead of rescanning the list for every column.
        Map<Integer, FeatureValuePair> firstByIndex = new HashMap<Integer, FeatureValuePair>();
        List<MLExampleFeature> features = example.getExampleFeatures();
        for (MLExampleFeature feature : features) {
            FeatureValuePair featureFVP = feature.getFeatureValuePair();
            Integer key = featureFVP.getTempFeatureIndex();
            if (!firstByIndex.containsKey(key)) {
                firstByIndex.put(key, featureFVP);
            }
        }

        double[] vals = new double[fvps.size() + 1];
        for (int i = 0; i < fvps.size(); i++) {
            FeatureValuePair fvp = fvps.get(i);
            FeatureValuePair match = firstByIndex.get(fvp.getTempFeatureIndex());
            if (match == null) {
                continue;
            }
            if (fvp.getFeatureValueAuxiliary() == null) { // single value
                vals[i] = Double.parseDouble(match.getFeatureValue());
            } else {
                vals[i] = Double.parseDouble(match.getFeatureValueAuxiliary());
            }
        }
        return vals;
    }
}