/* * Copyright [2012-2014] PayPal Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package ml.shifu.shifu.udf; import ml.shifu.shifu.column.NSColumn; import ml.shifu.shifu.container.CaseScoreResult; import ml.shifu.shifu.container.obj.EvalConfig; import ml.shifu.shifu.container.obj.RawSourceData.SourceType; import ml.shifu.shifu.core.ModelRunner; import ml.shifu.shifu.core.Scorer; import ml.shifu.shifu.core.model.ModelSpec; import ml.shifu.shifu.fs.ShifuFileUtils; import ml.shifu.shifu.util.CommonUtils; import ml.shifu.shifu.util.Constants; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.MapUtils; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; import org.apache.pig.impl.util.UDFContext; import org.apache.pig.tools.pigstats.PigStatusReporter; import org.encog.ml.BasicML; import java.io.BufferedWriter; import java.io.File; import java.io.IOException; import java.util.Iterator; import java.util.List; import java.util.Map; /** * Calculate the score for each evaluation data */ public class EvalScoreUDF extends AbstractTrainerUDF<Tuple> { private static final String SCHEMA_PREFIX = "shifu::"; private EvalConfig evalConfig; private ModelRunner modelRunner; private String[] headers; private double maxScore = Double.MIN_VALUE; private double minScore = Double.MAX_VALUE; private Map<String, Integer> subModelsCnt; private int modelCnt; private String scale; /** * A simple weight exception validation: if over 5000 throw exceptions */ private int weightExceptions; public EvalScoreUDF(String source, String pathModelConfig, String pathColumnConfig, String evalSetName) throws IOException { this(source, pathModelConfig, pathColumnConfig, evalSetName, Integer.toString(Scorer.DEFAULT_SCORE_SCALE)); } public EvalScoreUDF(String source, String pathModelConfig, String pathColumnConfig, String evalSetName, String scale) throws IOException { super(source, pathModelConfig, pathColumnConfig); evalConfig = modelConfig.getEvalConfigByName(evalSetName); if(evalConfig.getModelsPath() != null) { // renew columnConfig this.columnConfigList = ShifuFileUtils.searchColumnConfig(evalConfig, columnConfigList); } this.headers = CommonUtils.getFinalHeaders(evalConfig); // move model runner construction in exec to avoid OOM error in client side if model is too big like RF this.modelCnt = CommonUtils.getBasicModelsCnt(modelConfig, evalConfig, evalConfig.getDataSet().getSource()); this.subModelsCnt = CommonUtils.getSubModelsCnt(modelConfig, this.columnConfigList, evalConfig, evalConfig .getDataSet().getSource()); this.scale = scale; } public Tuple exec(Tuple input) throws IOException { if(this.modelRunner == null) { // here to initialize modelRunner, this is moved from constructor to here to avoid OOM in client side. // UDF in pig client will be initialized to get some metadata issues List<BasicML> models = CommonUtils.loadBasicModels(modelConfig, evalConfig, evalConfig.getDataSet() .getSource(), evalConfig.getGbtConvertToProb()); this.modelRunner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet() .getDataDelimiter(), models); List<ModelSpec> subModels = CommonUtils.loadSubModels(modelConfig, this.columnConfigList, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb()); if(CollectionUtils.isNotEmpty(subModels)) { for(ModelSpec modelSpec: subModels) { this.modelRunner.addSubModels(modelSpec); this.subModelsCnt.put(modelSpec.getModelName(), modelSpec.getModels().size()); } } this.modelCnt = models.size(); this.modelRunner.setScoreScale(Integer.parseInt(this.scale)); } Map<NSColumn, String> rawDataNsMap = CommonUtils.convertDataIntoNsMap(input, this.headers); if ( MapUtils.isEmpty(rawDataNsMap) ) { return null; } String tag = CommonUtils.trimTag(rawDataNsMap.get( new NSColumn(modelConfig.getTargetColumnName(evalConfig)))); // filter invalid tag record out // disable the tag check, since there is no bad tag in eval data set // and user just want to score the data, but don't run performance evaluation /* * if(!tagSet.contains(tag)) { * if(System.currentTimeMillis() % 100 == 0) { * log.warn("Invalid tag: " + tag); * } * if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) { * PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_RECORDS) * .increment(1); * } * return null; * } */ long startTime = System.nanoTime(); CaseScoreResult cs = modelRunner.computeNsData(rawDataNsMap); long runInterval = (System.nanoTime() - startTime) / 1000L; if(cs == null) { if(System.currentTimeMillis() % 50 == 0) { log.warn("Get null result, for input: " + input.toDelimitedString("|")); } return null; } Tuple tuple = TupleFactory.getInstance().newTuple(); tuple.append(tag); String weight = null; if(StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName())) { weight = rawDataNsMap.get(new NSColumn(evalConfig.getDataSet().getWeightColumnName())); } else { weight = "1.0"; } incrementTagCounters(tag, weight, runInterval); Map<String, CaseScoreResult> subModelScores = cs.getSubModelScores(); tuple.append(weight); if(modelConfig.isRegression()) { if(CollectionUtils.isNotEmpty(cs.getScores())) { appendModelScore(tuple, cs, true); } if(MapUtils.isNotEmpty(subModelScores)) { Iterator<Map.Entry<String, CaseScoreResult>> iterator = subModelScores.entrySet().iterator(); while(iterator.hasNext()) { Map.Entry<String, CaseScoreResult> entry = iterator.next(); CaseScoreResult subCs = entry.getValue(); appendModelScore(tuple, subCs, false); } } } else { if(CollectionUtils.isNotEmpty(cs.getScores())) { appendSimpleScore(tuple, cs); } if(MapUtils.isNotEmpty(subModelScores)) { Iterator<Map.Entry<String, CaseScoreResult>> iterator = subModelScores.entrySet().iterator(); while(iterator.hasNext()) { Map.Entry<String, CaseScoreResult> entry = iterator.next(); CaseScoreResult subCs = entry.getValue(); appendSimpleScore(tuple, subCs); } } } // append meta data List<String> metaColumns = evalConfig.getAllMetaColumns(modelConfig); if(CollectionUtils.isNotEmpty(metaColumns)) { for(String meta: metaColumns) { tuple.append(rawDataNsMap.get(new NSColumn(meta))); } } return tuple; } /** * Append model scores (average, max, min, median, and scores) into tuple * * @param tuple * - Tuple to append * @param cs * - CaseScoreResult * @param toGetMaxMin * - to check max/min or not */ private void appendModelScore(Tuple tuple, CaseScoreResult cs, boolean toGetMaxMin) { tuple.append(cs.getAvgScore()); tuple.append(cs.getMaxScore()); tuple.append(cs.getMinScore()); tuple.append(cs.getMedianScore()); for(double score: cs.getScores()) { tuple.append(score); } if(toGetMaxMin) { // get maxScore and minScore for such mapper or reducer if(cs.getMedianScore() > maxScore) { maxScore = cs.getMedianScore(); } if(cs.getMedianScore() < minScore) { minScore = cs.getMedianScore(); } } } /** * Append model scores into tuple * * @param tuple * - Tuple to append * @param cs * - CaseScoreResult */ private void appendSimpleScore(Tuple tuple, CaseScoreResult cs) { for(int i = 0; i < cs.getScores().size(); i++) { tuple.append(cs.getScores().get(i)); } } @Override public void finish() { // Since the modelRunner is initialized in execution, if there is no records for this reducer, // / the modelRunner may not initialized. It will cause NullPointerException if(this.modelRunner != null) { this.modelRunner.close(); } if(modelConfig.isClassification()) { return; } // only for regression, in some cases like gbdt, it's regression score is not in [0,1], to do eval performance, // max and min score should be collected to set bounds. BufferedWriter writer = null; Configuration jobConf = UDFContext.getUDFContext().getJobConf(); String scoreOutput = jobConf.get(Constants.SHIFU_EVAL_MAXMIN_SCORE_OUTPUT); log.debug("shifu.eval.maxmin.score.output is {}, job id is {}, task id is {}, attempt id is {}" + scoreOutput + " " + jobConf.get("mapreduce.job.id") + " " + jobConf.get("mapreduce.task.id") + " " + jobConf.get("mapreduce.task.partition") + " " + jobConf.get("mapreduce.task.attempt.id")); try { FileSystem fileSystem = FileSystem.get(jobConf); fileSystem.mkdirs(new Path(scoreOutput)); String taskMaxMinScoreFile = scoreOutput + File.separator + "part-" + jobConf.get("mapreduce.task.attempt.id"); writer = ShifuFileUtils.getWriter(taskMaxMinScoreFile, SourceType.HDFS); writer.write(maxScore + "," + minScore); } catch (IOException e) { log.error("error in finish", e); } finally { if(writer != null) { try { writer.close(); } catch (IOException ignore) { } } } } @SuppressWarnings("deprecation") private void incrementTagCounters(String tag, String weight, long runModelInterval) { if(tag == null || weight == null) { log.warn("tag is empty " + tag + " or weight is empty " + weight); return; } double dWeight = 1.0; if(StringUtils.isNotBlank(weight)) { try { dWeight = Double.parseDouble(weight); } catch (Exception e) { if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "weight_exceptions")) { PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "weight_exceptions") .increment(1); } weightExceptions += 1; if(weightExceptions > 5000) { throw new IllegalStateException( "Please check weight column in eval, exceptional weight count is over 5000"); } } } long weightLong = (long) (dWeight * Constants.EVAL_COUNTER_WEIGHT_SCALE); // update model run time for stats if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, Constants.TOTAL_MODEL_RUNTIME)) { PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, Constants.TOTAL_MODEL_RUNTIME) .increment(runModelInterval); } if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_RECORDS)) { PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_RECORDS) .increment(1); } if(posTagSet.contains(tag)) { if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_POSTAGS)) { PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_POSTAGS) .increment(1); } if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_WPOSTAGS)) { PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_WPOSTAGS) .increment(weightLong); } } if(negTagSet.contains(tag)) { if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_NEGTAGS)) { PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_NEGTAGS) .increment(1); } if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_WNEGTAGS)) { PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_WNEGTAGS) .increment(weightLong); } } } /** * output the schema for evaluation score */ public Schema outputSchema(Schema input) { try { Schema tupleSchema = new Schema(); tupleSchema.add(new FieldSchema(SCHEMA_PREFIX + modelConfig.getTargetColumnName(evalConfig), DataType.CHARARRAY)); String weightName = StringUtils.isBlank(evalConfig.getDataSet().getWeightColumnName()) ? "weight" : evalConfig.getDataSet().getWeightColumnName(); tupleSchema.add(new FieldSchema(SCHEMA_PREFIX + weightName, DataType.CHARARRAY)); if(modelConfig.isRegression()) { if(this.modelCnt > 0) { addModelSchema(tupleSchema, this.modelCnt, ""); } if(MapUtils.isNotEmpty(this.subModelsCnt)) { Iterator<Map.Entry<String, Integer>> iterator = this.subModelsCnt.entrySet().iterator(); while(iterator.hasNext()) { Map.Entry<String, Integer> entry = iterator.next(); String modelName = entry.getKey(); Integer smCnt = entry.getValue(); if(smCnt > 0) { addModelSchema(tupleSchema, smCnt, modelName); } } } } else { if(this.modelCnt > 0) { addModelTagSchema(tupleSchema, modelCnt, ""); } if(MapUtils.isNotEmpty(this.subModelsCnt)) { Iterator<Map.Entry<String, Integer>> iterator = this.subModelsCnt.entrySet().iterator(); while(iterator.hasNext()) { Map.Entry<String, Integer> entry = iterator.next(); String modelName = entry.getKey(); Integer smCnt = entry.getValue(); if(smCnt > 0) { addModelTagSchema(tupleSchema, smCnt, modelName); } } } } List<String> metaColumns = evalConfig.getAllMetaColumns(modelConfig); if(CollectionUtils.isNotEmpty(metaColumns)) { for(String columnName: metaColumns) { tupleSchema.add(new FieldSchema(columnName, DataType.CHARARRAY)); } } return new Schema(new Schema.FieldSchema("EvalScore", tupleSchema, DataType.TUPLE)); } catch (IOException e) { log.error("Error in outputSchema", e); return null; } } /** * Add model(Regression) schema into tuple schema, if the modelCount > 0 * * @param tupleSchema * - schema for Tuple * @param modelCount * - model count * @param modelName * - model name */ private void addModelSchema(Schema tupleSchema, Integer modelCount, String modelName) { if(modelCount > 0) { tupleSchema.add(new FieldSchema(SCHEMA_PREFIX + addModelNameToField(modelName, "mean"), DataType.DOUBLE)); tupleSchema.add(new FieldSchema(SCHEMA_PREFIX + addModelNameToField(modelName, "max"), DataType.DOUBLE)); tupleSchema.add(new FieldSchema(SCHEMA_PREFIX + addModelNameToField(modelName, "min"), DataType.DOUBLE)); tupleSchema.add(new FieldSchema(SCHEMA_PREFIX + addModelNameToField(modelName, "median"), DataType.DOUBLE)); for(int i = 0; i < modelCount; i++) { tupleSchema.add(new FieldSchema(SCHEMA_PREFIX + addModelNameToField(modelName, "model" + i), DataType.DOUBLE)); } } } /** * Add model(Classification) schema into tuple schema, if the modelCount > 0 * * @param tupleSchema * - schema for Tuple * @param modelCount * - model count * @param modelName * - model name */ private void addModelTagSchema(Schema tupleSchema, Integer modelCount, String modelName) { if(modelConfig.isClassification() && !modelConfig.getTrain().isOneVsAll()) { for(int i = 0; i < modelCount; i++) { for(int j = 0; j < modelConfig.getTags().size(); j++) { tupleSchema.add(new FieldSchema(SCHEMA_PREFIX + addModelNameToField(modelName, "model_" + i + "_tag_" + j), DataType.DOUBLE)); } } } else { // one vs all for(int i = 0; i < modelCount; i++) { tupleSchema.add(new FieldSchema(SCHEMA_PREFIX + addModelNameToField(modelName, "model_" + i + "_tag_" + i), DataType.DOUBLE)); } } } /** * Add model name as the namespace of field * * @param modelName * - model name * @param field * - field name * @return - tuple name with namespace */ private String addModelNameToField(String modelName, String field) { return (StringUtils.isBlank(modelName) ? field : formatPigNS(modelName) + "::" + field); } private String formatPigNS(String name) { return name.replaceAll("-", "_"); } }