/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.shifu.core.binning;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import ml.shifu.shifu.container.obj.ColumnConfig;
import ml.shifu.shifu.container.obj.ModelConfig;
import ml.shifu.shifu.container.obj.ModelStatsConf.BinningMethod;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
/**
* AbstractBinning class
*/
public abstract class AbstractBinning<T> {
/**
* Special characters for object serialization
*/
public static final char FIELD_SEPARATOR = '\u0001';
public static final char SETLIST_SEPARATOR = '\u0002';
public static final char PAIR_SEPARATOR = '\u0003';
/**
* Missing data count and invalid data count
*/
protected int missingValCnt = 0;
protected int invalidValCnt = 0;
/**
* Expected missing value set. The default missing value set only contain empty string ""
*/
protected Set<String> missingValSet;
/**
* The expect bin number
*/
protected int expectedBinningNum;
/**
* Empty constructor : it is just for bin merging bin
*/
protected AbstractBinning() {
}
/**
* Constructor with expected bin number
*
* @param binningNum
* the binningNum
*/
public AbstractBinning(int binningNum) {
this(binningNum, null);
}
/**
* Constructor with expected bin number and expected missing values
*
* @param binningNum
* the binningNum
* @param missingValList
* the missing value list
*/
public AbstractBinning(int binningNum, List<String> missingValList) {
this.expectedBinningNum = binningNum;
this.missingValSet = new HashSet<String>();
this.missingValSet.add("");
if(CollectionUtils.isNotEmpty(missingValList)) {
for(String missingVal: missingValList) {
missingValSet.add(StringUtils.trimToEmpty(missingVal));
}
}
}
/**
* Get value missing count
*
* @return the missing count
*/
public int getMissingValCnt() {
return missingValCnt;
}
/**
* Get invalid value count
*
* @return invalid count
*/
public int getInvalidValCnt() {
return invalidValCnt;
}
/**
* Add data into bin generator
*
* @param val
* the value to be added
*/
public abstract void addData(String val);
/**
* Generate the bin boundary or bin category
*
* @return data bin list
*/
public abstract List<T> getDataBin();
/**
* Check some value is missing value or not
*
* @param val
* the value to be checked
* @return if it is missing value
*/
protected boolean isMissingVal(String val) {
return missingValSet.contains(val);
}
/**
* Increase the missing value count
*/
protected void incMissingValCnt() {
missingValCnt++;
}
/**
* Increase the invalid value count
*/
protected void incInvalidValCnt() {
invalidValCnt++;
}
/**
* Merge another binning info to this. Currently for the expected bin number, the max value will be used.
*
* @param another
* the second binning to be mergerd
*/
public void mergeBin(AbstractBinning<?> another) {
this.expectedBinningNum = Math.max(this.expectedBinningNum, another.expectedBinningNum);
this.missingValCnt += another.missingValCnt;
this.invalidValCnt += another.invalidValCnt;
if(missingValSet == null) {
missingValSet = new HashSet<String>();
missingValSet.add("");
}
missingValSet.addAll(another.missingValSet);
}
/**
* convert @AbstractBinning to String
*
* @param objValStr
* value string
*/
protected void stringToObj(String objValStr) {
String[] objStrArr = objValStr.split(Character.toString(FIELD_SEPARATOR), -1);
if(objStrArr.length < 4) {
throw new IllegalArgumentException("The size of argument is incorrect");
}
missingValCnt = Integer.parseInt(StringUtils.trim(objStrArr[0]));
invalidValCnt = Integer.parseInt(StringUtils.trim(objStrArr[1]));
expectedBinningNum = Integer.parseInt(StringUtils.trim(objStrArr[2]));
if(missingValSet == null) {
missingValSet = new HashSet<String>();
} else {
missingValSet.clear();
}
String[] elements = objStrArr[3].split(Character.toString(SETLIST_SEPARATOR), -1);
for(String element: elements) {
missingValSet.add(element);
}
}
/**
* convert @AbstractBinning to String
*
* @return string type of binning
*/
public String objToString() {
List<String> strList = new ArrayList<String>();
strList.add(Integer.toString(missingValCnt));
strList.add(Integer.toString(invalidValCnt));
strList.add(Integer.toString(expectedBinningNum));
String missingValStr = StringUtils.join(missingValSet, SETLIST_SEPARATOR);
strList.add(missingValStr);
return StringUtils.join(strList, FIELD_SEPARATOR);
}
/**
* Construct Binning class object from String
*
* @param modelConfig
* - the @ModelConfig to use
* @param columnConfig
* - the @ColumnConfig to create bin
* @param objValStr
* - the string present of object
* @return the Binning object for the ColumnConfig
*/
public static AbstractBinning<?> constructBinningFromStr(ModelConfig modelConfig, ColumnConfig columnConfig,
String objValStr) {
AbstractBinning<?> binning;
if(columnConfig.isCategorical()) {
binning = new CategoricalBinning();
} else {
if(modelConfig.getBinningMethod().equals(BinningMethod.EqualInterval)) {
binning = new EqualIntervalBinning();
} else {
binning = new EqualPopulationBinning();
}
}
binning.stringToObj(objValStr);
return binning;
}
}