package quickml.supervised.classifier.logisticRegression;
import quickml.data.instances.ClassifierInstance;
import quickml.data.instances.ClassifierInstanceFactory;
import quickml.supervised.dataProcessing.AttributeCharacteristics;
import quickml.supervised.dataProcessing.BasicTrainingDataSurveyor;
import quickml.supervised.dataProcessing.instanceTranformer.*;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
/**
 * Base class for transformers that convert a list of {@link ClassifierInstance}s into a data
 * transfer object ({@code D}) that can be processed by the logistic regression builder.
 *
 * <p>The conversion itself is supplied by subclasses through {@link #transformData(List)}; this
 * class only holds the shared configuration (fluent setters below) and a helper for building a
 * one-hot encoder from the training data.
 *
 * <p>It assumes that all attributes with numeric values are numeric, and are not in need of one
 * hot encoding. Product feature appending, as well as common co-occurrences, are intended to be
 * hyper-parameters within logistic regression.
 *
 * <p>Created by alexanderhawk on 10/14/15.
 *
 * @param <D> the concrete DTO type produced by {@link #transformData(List)}
 */
public abstract class StandardDataTransformer<D extends LogisticRegressionDTO<D>> implements DataTransformer<ClassifierInstance, SparseClassifierInstance, D> {

    /** Optional appender of product features; {@code null} until one is configured. */
    protected ProductFeatureAppender<ClassifierInstance> productFeatureAppender;

    /** Flag set through {@link #usingProductFeatures(boolean)}; off by default. */
    protected boolean useProductFeatures = false;

    /** Flag set through {@link #doLabelToDigitConversion(boolean)}; on by default. */
    protected boolean doLabelToDigitConversion = true;

    /** Value handed to the one-hot encoder by subclasses; 0 unless configured. */
    protected int minObservationsOfAttribute;

    /**
     * Mapping from class label to its numeric value. It is never assigned in this class, so it
     * stays {@code null} unless a subclass populates it.
     */
    protected Map<Serializable, Double> numericClassLabels;

    public StandardDataTransformer() {}

    /**
     * Sets the product feature appender.
     *
     * @param productFeatureAppender the appender to store (may be {@code null} to clear it)
     * @return this transformer, for call chaining
     */
    public StandardDataTransformer<D> productFeatureAppender(ProductFeatureAppender<ClassifierInstance> productFeatureAppender) {
        this.productFeatureAppender = productFeatureAppender;
        return this;
    }

    /**
     * Reports whether a product feature appender has been configured.
     *
     * <p>NOTE(review): this checks only that {@link #productFeatureAppender} is non-null; it does
     * not consult the {@link #useProductFeatures} flag set by {@link #usingProductFeatures(boolean)}.
     * Confirm with the subclasses which of the two is meant to be authoritative.
     *
     * @return {@code true} if {@link #productFeatureAppender} is non-null
     */
    public boolean usingProductFeatures() {
        return productFeatureAppender != null;
    }

    /**
     * Sets the label-to-digit conversion flag.
     *
     * @param doLabelToDigitConversion the new flag value
     */
    public void doLabelToDigitConversion(boolean doLabelToDigitConversion) {
        this.doLabelToDigitConversion = doLabelToDigitConversion;
    }

    /**
     * Sets the minimum number of observations of an attribute.
     *
     * @param minObservationsOfAttribute the value to store; presumably the threshold later passed
     *        to {@link #getOneHotEncoder(List, int)} by subclasses (verify against subclasses)
     * @return this transformer, for call chaining
     */
    public StandardDataTransformer<D> minObservationsOfAttribute(int minObservationsOfAttribute) {
        this.minObservationsOfAttribute = minObservationsOfAttribute;
        return this;
    }

    /**
     * Returns the stored label-to-number mapping.
     *
     * @return the internal map itself (not a copy); {@code null} if no subclass has populated it
     */
    public Map<Serializable, Double> getNumericClassLabels() {
        return numericClassLabels;
    }

    /**
     * Sets the {@link #useProductFeatures} flag.
     *
     * @param useProductFeatures the new flag value
     * @return this transformer, for call chaining
     */
    public StandardDataTransformer<D> usingProductFeatures(boolean useProductFeatures) {
        this.useProductFeatures = useProductFeatures;
        return this;
    }

    /**
     * Transforms the training data into the DTO type {@code D}.
     *
     * @param trainingData the classifier instances to transform
     * @return the DTO produced by the concrete transformer
     */
    @Override
    public abstract D transformData(List<ClassifierInstance> trainingData);

    /**
     * Builds a one-hot encoder from the attribute characteristics of the training data.
     *
     * @param trainingData the instances that are surveyed for their attribute characteristics
     * @param minObservationsOfAttribute forwarded unchanged to the {@link OneHotEncoder} constructor
     * @return a new encoder that creates instances through a {@link ClassifierInstanceFactory}
     */
    static OneHotEncoder<Serializable, ClassifierInstance, ClassifierInstance> getOneHotEncoder(List<ClassifierInstance> trainingData, int minObservationsOfAttribute) {
        // NOTE(review): the meaning of the `false` constructor argument is not visible here — confirm
        // against BasicTrainingDataSurveyor.
        BasicTrainingDataSurveyor<ClassifierInstance> surveyor = new BasicTrainingDataSurveyor<>(false);
        Map<String, AttributeCharacteristics> attributeCharacteristics =
                surveyor.getMapOfAttributesToAttributeCharacteristics(trainingData);
        return new OneHotEncoder<>(attributeCharacteristics, new ClassifierInstanceFactory(), minObservationsOfAttribute);
    }
}