/* * Copyright [2012-2014] PayPal Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package ml.shifu.shifu.container.obj; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.databind.annotation.JsonDeserialize; /** * {@link ModelStatsConf} is 'stats' part configuration in ModelConfig.json */ @JsonIgnoreProperties(ignoreUnknown = true) public class ModelStatsConf { /** * Binning strategy used in stats step. * * @author Zhang David (pengzhang@paypal.com) */ @JsonDeserialize(using = BinningMethodDeserializer.class) public static enum BinningMethod { EqualNegtive, EqualInterval, EqualPositive, EqualTotal, WeightEqualNegative, WeightEqualInterval, WeightEqualPositive, WeightEqualTotal } /** * Binning algorithm on how to scale binning in 10k features well. * * @author Zhang David (pengzhang@paypal.com) */ public static enum BinningAlgorithm { Native, // sorting way SPDT, // paper reference: www.jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf SPDTI, // paper reference: www.jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf, improvement for last // binning updating step MunroPat, // paper reference: www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf MunroPatI, // paper reference: www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf, improvement for last // binning updating step DynamicBinning } /** * Max num bin per each numerical column. */ private Integer maxNumBin = 10; /** * Max num bin for each categorical column */ private Integer cateMaxNumBin = -1; /** * Binning method used in stats. By default is EqualPositive. */ private BinningMethod binningMethod = BinningMethod.EqualPositive; /** * Sampling rate in stats step. Sometimes is binning algorithm cannot be scaled well or slow. Try using smaller * sampleRate will accelerate stats. */ private Double sampleRate = Double.valueOf(1.0); /** * If only sample negative records or not, positive records in most cases is less than negative. By only sampling * negative can balance data. */ private Boolean sampleNegOnly = Boolean.FALSE; // don't open those options to user, this only works in some binning algorithm private Double numericalValueThreshold = Double.MAX_VALUE; private Boolean binningAutoTypeEnable = Boolean.FALSE; private Integer binningAutoTypeThreshold = 5; private Boolean binningMergeEnable = Boolean.TRUE; /** * Binning algorithm used to do binning. SPDTI is the best algorithm in terms of scalability. */ private BinningAlgorithm binningAlgorithm = BinningAlgorithm.SPDTI; /** * PSI feature enabled if not empty. In stats, PSI value will be computed. */ private String psiColumnName = ""; public Integer getMaxNumBin() { return maxNumBin; } public void setMaxNumBin(Integer maxNumBin) { this.maxNumBin = maxNumBin; } public Integer getCateMaxNumBin() { return cateMaxNumBin; } public void setCateMaxNumBin(Integer cateMaxNumBin) { this.cateMaxNumBin = cateMaxNumBin; } @JsonIgnore public Double getNumericalValueThreshold() { return numericalValueThreshold; } public void setNumericalValueThreshold(Double numericalValueThreshold) { this.numericalValueThreshold = numericalValueThreshold; } @JsonIgnore public Boolean getBinningAutoTypeEnable() { return binningAutoTypeEnable; } public void setBinningAutoTypeEnable(Boolean binningAutoTypeEnable) { this.binningAutoTypeEnable = binningAutoTypeEnable; } @JsonIgnore public Integer getBinningAutoTypeThreshold() { return binningAutoTypeThreshold; } public void setBinningAutoTypeThreshold(Integer binningAutoTypeThreshold) { this.binningAutoTypeThreshold = binningAutoTypeThreshold; } @JsonIgnore public Boolean getBinningMergeEnable() { return binningMergeEnable; } public void setBinningMergeEnable(Boolean binningMergeEnable) { this.binningMergeEnable = binningMergeEnable; } public BinningMethod getBinningMethod() { return binningMethod; } public void setBinningMethod(BinningMethod binningMethod) { this.binningMethod = binningMethod; } public Double getSampleRate() { return sampleRate; } public void setSampleRate(Double sampleRate) { this.sampleRate = sampleRate; } public Boolean getSampleNegOnly() { return sampleNegOnly; } public void setSampleNegOnly(Boolean sampleNegOnly) { this.sampleNegOnly = sampleNegOnly; } public BinningAlgorithm getBinningAlgorithm() { return binningAlgorithm; } public void setBinningAlgorithm(BinningAlgorithm binningAlgorithm) { this.binningAlgorithm = binningAlgorithm; } public String getPsiColumnName() { return psiColumnName; } public void setPsiColumnName(String psiColumnName) { this.psiColumnName = psiColumnName; } @Override public ModelStatsConf clone() { ModelStatsConf other = new ModelStatsConf(); other.setBinningAlgorithm(binningAlgorithm); other.setBinningAutoTypeEnable(binningAutoTypeEnable); other.setBinningAutoTypeThreshold(binningAutoTypeThreshold); other.setBinningMergeEnable(binningMergeEnable); other.setBinningMethod(binningMethod); other.setMaxNumBin(maxNumBin); other.setNumericalValueThreshold(numericalValueThreshold); other.setPsiColumnName(psiColumnName); other.setSampleNegOnly(sampleNegOnly); other.setSampleRate(sampleRate); return other; } }