/*
 * Copyright [2012-2014] PayPal Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.shifu.core;

import java.util.ArrayList;
import java.util.List;

/**
 * To compute ks, iv and woe values.
 *
 * <p>
 * All methods are static; the class cannot be instantiated.
 */
public final class ColumnStatsCalculator {

    private ColumnStatsCalculator() {
    }

    /**
     * Small constant added to numerator and denominator before taking a logarithm, so that a zero count does not
     * produce {@code log(0)} or a division by zero.
     */
    private static final double EPS = 1e-10;

    /**
     * Computes ks, iv and woe from per-bin negative and positive counts given as lists of numbers.
     *
     * @param negative
     *            per-bin negative counts
     * @param positive
     *            per-bin positive counts, expected to have the same size as {@code negative}
     * @param <T>
     *            numeric element type, read through {@link Number#doubleValue()}
     * @return the computed metrics, or {@code null} if the total of either list is zero
     */
    public static <T extends Number> ColumnMetrics calculateColumnMetrics(List<T> negative, List<T> positive) {
        assert negative != null && positive != null && negative.size() == positive.size();

        int numBins = negative.size();
        return computeMetrics(toDoubleArray(negative, numBins), toDoubleArray(positive, numBins));
    }

    /**
     * Computes ks, iv and woe from per-bin negative and positive counts given as {@code long} arrays.
     *
     * @param negative
     *            per-bin negative counts
     * @param positive
     *            per-bin positive counts, expected to have the same length as {@code negative}
     * @return the computed metrics, or {@code null} if the total of either array is zero
     */
    public static ColumnMetrics calculateColumnMetrics(long[] negative, long[] positive) {
        assert negative != null && positive != null && negative.length == positive.length;

        int numBins = negative.length;
        return computeMetrics(toDoubleArray(negative, numBins), toDoubleArray(positive, numBins));
    }

    /**
     * Computes ks, iv and woe from per-bin negative and positive counts (or weights) given as {@code double}
     * arrays.
     *
     * @param negative
     *            per-bin negative counts
     * @param positive
     *            per-bin positive counts, expected to have the same length as {@code negative}
     * @return the computed metrics, or {@code null} if the total of either array is zero
     */
    public static ColumnMetrics calculateColumnMetrics(double[] negative, double[] positive) {
        assert negative != null && positive != null && negative.length == positive.length;

        return computeMetrics(negative, positive);
    }

    /**
     * Copies the first {@code count} elements of {@code values} into a new {@code double} array.
     */
    private static double[] toDoubleArray(List<? extends Number> values, int count) {
        double[] result = new double[count];
        for(int i = 0; i < count; i++) {
            result[i] = values.get(i).doubleValue();
        }
        return result;
    }

    /**
     * Copies the first {@code count} elements of {@code values} into a new {@code double} array.
     */
    private static double[] toDoubleArray(long[] values, int count) {
        double[] result = new double[count];
        for(int i = 0; i < count; i++) {
            result[i] = values[i];
        }
        return result;
    }

    /**
     * Shared implementation behind every {@code calculateColumnMetrics} overload. The number of bins is taken from
     * {@code negative.length}.
     */
    private static ColumnMetrics computeMetrics(double[] negative, double[] positive) {
        int numBins = negative.length;

        double sumN = 0.0;
        double sumP = 0.0;
        for(int i = 0; i < numBins; i++) {
            sumN += negative[i];
            sumP += positive[i];
        }

        // Without at least some mass in both classes the per-bin rates below are undefined.
        if(sumN == 0 || sumP == 0) {
            return null;
        }

        // Column-level woe from the class totals.
        double woe = Math.log((sumP + EPS) / (sumN + EPS));

        double cumN = 0.0;
        double cumP = 0.0;
        double iv = 0.0;
        double ks = 0.0;
        List<Double> binningWoe = new ArrayList<Double>(numBins);

        for(int i = 0; i < numBins; i++) {
            double p = positive[i] / sumP;
            double n = negative[i] / sumN;

            // TODO merge bin with p or q = 0 ???
            double woePerBin = Math.log((p + EPS) / (n + EPS));
            binningWoe.add(woePerBin);
            iv += (p - n) * woePerBin;

            // ks tracks the largest gap seen so far between the two cumulative rates.
            cumP += p;
            cumN += n;
            double tmpKS = Math.abs(cumP - cumN);
            if(ks < tmpKS) {
                ks = tmpKS;
            }
        }

        // ks is reported scaled by 100.
        return new ColumnMetrics(ks * 100, iv, woe, binningWoe);
    }

    /**
     * From link {@literal <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm">Kurtosis</a>}
     *
     * <p>
     * The numerator is the expansion of the fourth central moment sum in terms of the supplied power sums; the
     * result is that sum divided by {@code count * stdDev^4}. No constant is subtracted from the result. If
     * {@code count} or {@code stdDev} is zero the floating-point division yields NaN or an infinity rather than
     * throwing.
     *
     * @param count
     *            count
     * @param mean
     *            mean
     * @param stdDev
     *            stdDev
     * @param sum
     *            sum
     * @param squaredSum
     *            squaredSum
     * @param tripleSum
     *            tripleSum
     * @param quarticSum
     *            quarticSum
     * @return Kurtosis value
     */
    public static double computeKurtosis(long count, double mean, double stdDev, double sum, double squaredSum,
            double tripleSum, double quarticSum) {
        return (quarticSum - 4 * tripleSum * mean + 6 * squaredSum * mean * mean - 4 * sum * mean * mean * mean
                + count * mean * mean * mean * mean) / (count * stdDev * stdDev * stdDev * stdDev);
    }

    /**
     * From link {@literal <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm">Skewness</a>}
     *
     * <p>
     * The numerator is the expansion of the third central moment sum in terms of the supplied power sums; the
     * result is that sum divided by {@code count * stdDev^3}. If {@code count} or {@code stdDev} is zero the
     * floating-point division yields NaN or an infinity rather than throwing.
     *
     * @param count
     *            count
     * @param mean
     *            mean
     * @param stdDev
     *            stdDev
     * @param sum
     *            sum
     * @param squaredSum
     *            squaredSum
     * @param tripleSum
     *            tripleSum
     * @return skewness value
     */
    public static double computeSkewness(long count, double mean, double stdDev, double sum, double squaredSum,
            double tripleSum) {
        return (tripleSum - 3 * squaredSum * mean + 3 * mean * mean * sum - count * mean * mean * mean)
                / (count * stdDev * stdDev * stdDev);
    }

    /**
     * Value holder for the metrics produced by {@code calculateColumnMetrics}.
     */
    public static class ColumnMetrics {

        /**
         * @param ks
         *            ks value
         * @param iv
         *            iv value
         * @param woe
         *            column-level woe value
         * @param binningWoe
         *            per-bin woe values; the list is stored as given, not copied
         */
        public ColumnMetrics(double ks, double iv, double woe, List<Double> binningWoe) {
            this.ks = ks;
            this.iv = iv;
            this.woe = woe;
            this.binningWoe = binningWoe;
        }

        private final double ks;

        private final double iv;

        private final double woe;

        private final List<Double> binningWoe;

        /**
         * @return the ks
         */
        public double getKs() {
            return ks;
        }

        /**
         * @return the iv
         */
        public double getIv() {
            return iv;
        }

        /**
         * @return the woe
         */
        public double getWoe() {
            return woe;
        }

        /**
         * @return the binningWoe
         */
        public List<Double> getBinningWoe() {
            return binningWoe;
        }
    }
}