/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.shifu.udf;
import org.apache.commons.lang.StringUtils;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import java.io.IOException;
import java.text.DecimalFormat;
import ml.shifu.shifu.container.obj.ColumnConfig;
import ml.shifu.shifu.core.ColumnStatsCalculator;
import ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics;
import ml.shifu.shifu.udf.stats.AbstractVarStats;
import ml.shifu.shifu.util.Base64Utils;
/**
 * Pig UDF that runs the variable statistics for a single column and emits one tuple holding the
 * column's binning information together with its count-based and weight-based metrics.
 * <p>
 * The layout of the emitted tuple is declared by {@link #outputSchema(Schema)}.
 */
public class CalculateNewStatsUDF extends AbstractTrainerUDF<Tuple> {

    /**
     * Experience value from modeler. A categorical column with more bin categories than this is
     * dropped, i.e. {@link #exec(Tuple)} returns {@code null} for it.
     */
    public static final int MAX_CATEGORICAL_BINC_COUNT = 5000;

    /**
     * Threshold handed to the variable-stats implementation; stays at 1e6 unless the model config
     * supplies its own value.
     */
    private Double valueThreshold = 1e6;

    /** Formatter used for every metric that is emitted as text. */
    private DecimalFormat metricFormat = new DecimalFormat("##.######");

    /**
     * Creates the UDF; all three arguments are forwarded unchanged to the parent constructor.
     *
     * @param source
     *            forwarded to the parent UDF
     * @param pathModelConfig
     *            forwarded to the parent UDF
     * @param pathColumnConfig
     *            forwarded to the parent UDF
     * @throws IOException
     *             if the parent constructor fails
     */
    public CalculateNewStatsUDF(String source, String pathModelConfig, String pathColumnConfig) throws IOException {
        super(source, pathModelConfig, pathColumnConfig);

        Double configuredThreshold = modelConfig.getNumericalValueThreshold();
        if(configuredThreshold != null) {
            valueThreshold = configuredThreshold;
        }
        log.debug("Value Threshold: " + valueThreshold);
    }

    /**
     * Computes the statistics of one column.
     * <p>
     * Reads the column id from field 0, the bag of column data from field 1 and the binning
     * information string from field 3 of {@code input}.
     *
     * @param input
     *            the incoming tuple; may be {@code null}
     * @return the statistics tuple, or {@code null} when {@code input} is {@code null}, when a
     *         categorical column has no category or more than {@link #MAX_CATEGORICAL_BINC_COUNT}
     *         categories, or when a non-categorical column has exactly one bin boundary
     * @throws IOException
     *             if reading the input tuple or running the statistics fails
     *
     * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
     */
    @Override
    public Tuple exec(Tuple input) throws IOException {
        if(input == null) {
            return null;
        }

        final Integer columnNum = (Integer) input.get(0);
        final DataBag columnData = (DataBag) input.get(1);
        // NOTE(review): field 2 is never read here - presumably it is occupied by another field
        // of the incoming relation; confirm against the calling Pig script.
        final String binningInfo = (String) input.get(3);

        log.info("start to process column id - " + columnNum.toString());
        final ColumnConfig config = super.columnConfigList.get(columnNum);

        // The getters used below are read only after the stats run has been given this config.
        AbstractVarStats.getVarStatsInst(modelConfig, config, valueThreshold).runVarStats(binningInfo, columnData);
        log.info("after to process column id - " + columnNum.toString());

        final ColumnMetrics countMetrics = ColumnStatsCalculator.calculateColumnMetrics(config.getBinCountNeg(),
                config.getBinCountPos());
        final ColumnMetrics weightMetrics = ColumnStatsCalculator.calculateColumnMetrics(
                config.getBinWeightedNeg(), config.getBinWeightedPos());

        if(!hasUsableBins(config)) {
            return null;
        }

        final boolean categorical = config.isCategorical();

        // Assemble the results - the append order must stay aligned with outputSchema().
        final Tuple row = TupleFactory.getInstance().newTuple();
        row.append(columnNum);

        if(categorical) {
            String joinedCategories = StringUtils.join(config.getBinCategory(),
                    CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
            row.append(Base64Utils.base64Encode("[" + joinedCategories + "]"));
        } else {
            row.append(config.getBinBoundary().toString());
        }

        // per-bin counts, scores and rates
        row.append(config.getBinCountNeg().toString());
        row.append(config.getBinCountPos().toString());
        row.append(config.getBinAvgScore().toString());
        row.append(config.getBinPosRate().toString());

        // count-based KS / IV
        row.append(metricFormat.format(countMetrics.getKs()));
        row.append(metricFormat.format(countMetrics.getIv()));

        // basic column statistics
        row.append(metricFormat.format(config.getColumnStats().getMax()));
        row.append(metricFormat.format(config.getColumnStats().getMin()));
        row.append(metricFormat.format(config.getColumnStats().getMean()));
        row.append(metricFormat.format(config.getColumnStats().getStdDev()));
        row.append(categorical ? "C" : "N");
        row.append(metricFormat.format(config.getColumnStats().getMedian()));

        // missing-value accounting
        row.append(config.getMissingCount());
        row.append(config.getTotalCount());
        row.append(metricFormat.format(config.getMissingPercentage()));

        // weighted per-bin counts
        row.append(config.getBinWeightedNeg().toString());
        row.append(config.getBinWeightedPos().toString());

        // NOTE(review): the two woe values and skewness/kurtosis further down are appended exactly
        // as their getters return them (not formatted to text), while outputSchema() declares
        // them as chararray - confirm the getter types.
        row.append(countMetrics.getWoe());
        row.append(weightMetrics.getWoe());
        row.append(metricFormat.format(weightMetrics.getKs()));
        row.append(metricFormat.format(weightMetrics.getIv()));
        row.append(countMetrics.getBinningWoe().toString());
        row.append(weightMetrics.getBinningWoe().toString());

        row.append(config.getColumnStats().getSkewness());
        row.append(config.getColumnStats().getKurtosis());

        return row;
    }

    /**
     * Tells whether the column's bins are worth emitting: a categorical column needs at least one
     * and at most {@link #MAX_CATEGORICAL_BINC_COUNT} categories, any other column must not have
     * exactly one bin boundary.
     */
    private boolean hasUsableBins(ColumnConfig config) {
        if(config.isCategorical()) {
            int categoryCount = config.getBinCategory().size();
            return categoryCount != 0 && categoryCount <= MAX_CATEGORICAL_BINC_COUNT;
        }
        return config.getBinBoundary().size() != 1;
    }

    /**
     * Declares the output as a single tuple named {@code ColumnStatistics} whose fields follow the
     * order in which {@link #exec(Tuple)} appends them.
     *
     * @param input
     *            the input schema; not used
     * @return the output schema, or {@code null} if building it fails
     */
    @Override
    public Schema outputSchema(Schema input) {
        try {
            Schema fields = new Schema();

            fields.add(new FieldSchema("columnId", DataType.INTEGER));
            addTextFields(fields, "binBoundary", "binCountNeg", "binCountPos", "binAvgScore", "binPosRate", "ks",
                    "iv", "max", "min", "mean", "stddev", "isCate", "median");
            fields.add(new FieldSchema("missingCount", DataType.LONG));
            fields.add(new FieldSchema("totalCount", DataType.LONG));
            addTextFields(fields, "missingRatio", "binWeightedNeg", "binWeightedPos", "woe", "weightedWoe",
                    "weightedKs", "weightedIv", "binWoe", "binWeightedWoe", "skewness", "kurtosis");

            return new Schema(new Schema.FieldSchema("ColumnStatistics", fields, DataType.TUPLE));
        } catch (IOException e) {
            log.error("Error in outputSchema", e);
            return null;
        }
    }

    /**
     * Appends one chararray field per given name to {@code schema}, in the given order.
     */
    private static void addTextFields(Schema schema, String... names) {
        for(String name: names) {
            schema.add(new FieldSchema(name, DataType.CHARARRAY));
        }
    }
}