/*
 * Copyright [2012-2014] PayPal Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.shifu.core.processor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Scanner;
import java.util.Set;

import ml.shifu.shifu.actor.AkkaSystemExecutor;
import ml.shifu.shifu.column.NSColumn;
import ml.shifu.shifu.container.obj.ColumnConfig;
import ml.shifu.shifu.container.obj.EvalConfig;
import ml.shifu.shifu.container.obj.PerformanceResult;
import ml.shifu.shifu.container.obj.RawSourceData.SourceType;
import ml.shifu.shifu.core.ConfusionMatrix;
import ml.shifu.shifu.core.PerformanceEvaluator;
import ml.shifu.shifu.core.Scorer;
import ml.shifu.shifu.core.dtrain.CommonConstants;
import ml.shifu.shifu.core.eval.GainChart;
import ml.shifu.shifu.core.validator.ModelInspector.ModelStep;
import ml.shifu.shifu.exception.ShifuErrorCode;
import ml.shifu.shifu.exception.ShifuException;
import ml.shifu.shifu.fs.PathFinder;
import ml.shifu.shifu.fs.ShifuFileUtils;
import ml.shifu.shifu.pig.PigExecutor;
import ml.shifu.shifu.util.CommonUtils;
import ml.shifu.shifu.util.Constants;
import ml.shifu.shifu.util.Environment;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.pig.tools.pigstats.JobStats;
import org.apache.pig.tools.pigstats.PigStats;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * EvalModelProcessor class.
 *
 * <p>
 * Drives the {@code shifu eval} step: managing eval set configurations (list/new/delete), scoring evaluation
 * data sets (via Pig in DIST/MAPRED mode, via Akka in LOCAL mode), normalizing eval data, and computing
 * confusion-matrix / performance / gain-chart outputs.
 */
public class EvalModelProcessor extends BasicModelProcessor implements Processor {

    /**
     * log object
     */
    private final static Logger LOG = LoggerFactory.getLogger(EvalModelProcessor.class);

    /**
     * Step for evaluation
     */
    public enum EvalStep {
        LIST, NEW, DELETE, RUN, PERF, SCORE, CONFMAT, NORM, GAINCHART;
    }

    // comma-separated eval set name(s) given on the command line; null/blank means "all eval sets"
    private String evalName = null;

    // which sub-step of evaluation to execute in run()
    private EvalStep evalStep;

    // total valid eval records, populated from Hadoop counters after a distributed score job;
    // NOTE(review): stays 0 when PERF/CONFMAT are run standalone without a prior score job in the
    // same process — see FIXME notes in run() and runPerformance(EvalConfig).
    private long evalRecords = 0l;

    private static final Random RANDOM = new Random();

    /**
     * Constructor
     *
     * @param step
     *            the evaluation step
     */
    public EvalModelProcessor(EvalStep step) {
        this.evalStep = step;
    }

    /**
     * Constructor
     *
     * @param step
     *            the evaluation step
     * @param otherConfigs
     *            extra configurations passed through to {@link BasicModelProcessor}
     */
    public EvalModelProcessor(EvalStep step, Map<String, Object> otherConfigs) {
        this.evalStep = step;
        super.otherConfigs = otherConfigs;
    }

    /**
     * Constructor
     *
     * @param step
     *            the evaluation step
     * @param name
     *            the evaluation name
     */
    public EvalModelProcessor(EvalStep step, String name) {
        this.evalName = name;
        this.evalStep = step;
    }

    /**
     * Runner for evaluation
     */
    @Override
    public int run() throws Exception {
        LOG.info("Step Start: eval");
        long start = System.currentTimeMillis();
        try {
            setUp(ModelStep.EVAL);
            syncDataToHdfs(modelConfig.getDataSet().getSource());
            switch(evalStep) {
                case LIST:
                    listEvalSet();
                    break;
                case NEW:
                    createNewEval(evalName);
                    break;
                case DELETE:
                    deleteEvalSet(evalName);
                    break;
                case RUN:
                    runEval(getEvalConfigListFromInput());
                    break;
                case NORM:
                    runNormalize(getEvalConfigListFromInput());
                    break;
                case PERF:
                    // FIXME, here should be failed because of this.evalRecords is 0. how to fix it
                    runPerformance(getEvalConfigListFromInput());
                    break;
                case SCORE:
                    runScore(getEvalConfigListFromInput());
                    break;
                case CONFMAT:
                    // FIXME, here should be failed
                    runConfusionMatrix(getEvalConfigListFromInput());
                    break;
                default:
                    break;
            }
            syncDataToHdfs(modelConfig.getDataSet().getSource());
            clearUp(ModelStep.EVAL);
        } catch (Exception e) {
            LOG.error("Error:", e);
            return -1;
        }
        LOG.info("Step Finished: eval with {} ms", (System.currentTimeMillis() - start));
        return 0;
    }

    /**
     * Delete an eval set from ModelConfig and persist the change.
     *
     * @param evalSetName
     *            the eval set name to delete; a non-existent name only logs an error
     */
    private void deleteEvalSet(String evalSetName) {
        EvalConfig evalConfig = modelConfig.getEvalConfigByName(evalSetName);
        if(evalConfig == null) {
            LOG.error("{} eval set doesn't exist.", evalSetName);
        } else {
            modelConfig.getEvals().remove(evalConfig);
            try {
                saveModelConfig();
            } catch (IOException e) {
                throw new ShifuException(ShifuErrorCode.ERROR_WRITE_MODELCONFIG, e);
            }
            LOG.info("Done. Delete eval set - " + evalSetName);
        }
    }

    /**
     * Log all eval set names configured in ModelConfig.
     */
    private void listEvalSet() {
        List<EvalConfig> evals = modelConfig.getEvals();
        if(CollectionUtils.isNotEmpty(evals)) {
            LOG.info("There are {} eval sets.", evals.size());
            for(EvalConfig evalConfig: evals) {
                LOG.info("\t - {}", evalConfig.getName());
            }
        }
    }

    /**
     * Resolve the eval configs to operate on: the comma-separated names from {@link #evalName} if given
     * (unknown names are logged and skipped), otherwise all eval sets in ModelConfig.
     *
     * @return the resolved eval config list, never null
     * @throws ShifuException
     *             if no eval name given and ModelConfig contains no eval sets
     */
    private List<EvalConfig> getEvalConfigListFromInput() {
        List<EvalConfig> evalSetList = new ArrayList<EvalConfig>();
        if(StringUtils.isNotBlank(evalName)) {
            String[] evalList = evalName.split(",");
            for(String eval: evalList) {
                EvalConfig evalConfig = modelConfig.getEvalConfigByName(eval);
                if(evalConfig == null) {
                    LOG.error("The evalset - " + eval + " doesn't exist!");
                } else {
                    evalSetList.add(evalConfig);
                }
            }
        } else {
            evalSetList = modelConfig.getEvals();
            if(CollectionUtils.isEmpty(evalSetList)) {
                throw new ShifuException(ShifuErrorCode.ERROR_MODEL_EVALSET_DOESNT_EXIST);
            }
        }
        return evalSetList;
    }

    /**
     * run score only
     *
     * @param evalSetList
     *            eval config list
     * @throws IOException
     *             any io exception
     */
    private void runScore(List<EvalConfig> evalSetList) throws IOException {
        for(EvalConfig config: evalSetList) {
            runScore(config);
        }
    }

    /**
     * Run score only
     *
     * @param config
     *            the eval config instance
     * @throws IOException
     *             any io exception
     */
    private void runScore(EvalConfig config) throws IOException {
        // create evalset home directory firstly in local file system
        PathFinder pathFinder = new PathFinder(modelConfig);
        String evalSetPath = pathFinder.getEvalSetPath(config, SourceType.LOCAL);
        FileUtils.forceMkdir(new File(evalSetPath));
        syncDataToHdfs(config.getDataSet().getSource());

        switch(modelConfig.getBasic().getRunMode()) {
            case DIST:
            case MAPRED:
                runDistScore(config);
                break;
            case LOCAL:
                runAkkaScore(config);
                break;
            default:
                break;
        }
    }

    /**
     * Run normalization against the evaluation data sets based on existing ColumnConfig.json which is from training
     * data set.
     *
     * @param evalConfigList
     *            the eval config list
     * @throws IOException
     *             any io exception
     */
    private void runNormalize(List<EvalConfig> evalConfigList) throws IOException {
        for(EvalConfig evalConfig: evalConfigList) {
            runNormalize(evalConfig);
        }
    }

    /**
     * Run normalization against the evaluation data set based on existing ColumnConfig.json which is from training
     * data set.
     *
     * @param evalConfig
     *            the eval config instance
     * @throws IOException
     *             when any IO exception
     * @throws IllegalArgumentException
     *             if LOCAL run mode
     */
    private void runNormalize(EvalConfig evalConfig) throws IOException {
        String evalSetPath = super.pathFinder.getEvalSetPath(evalConfig, SourceType.LOCAL);
        FileUtils.forceMkdir(new File(evalSetPath));
        syncDataToHdfs(evalConfig.getDataSet().getSource());

        switch(modelConfig.getBasic().getRunMode()) {
            case DIST:
            case MAPRED:
                runPigNormalize(evalConfig);
                break;
            case LOCAL:
            default:
                throw new IllegalArgumentException("Eval norm doesn't support LOCAL run mode.");
        }
    }

    /**
     * run pig mode scoring
     *
     * @param evalConfig
     *            the name for evaluation
     * @return score status aggregated from Hadoop counters of the Pig job, or null if no job reported records
     * @throws IOException
     *             any io exception
     */
    @SuppressWarnings("deprecation")
    private ScoreStatus runDistScore(EvalConfig evalConfig) throws IOException {
        // clean up output directories
        SourceType sourceType = evalConfig.getDataSet().getSource();
        ShifuFileUtils.deleteFile(pathFinder.getEvalNormalizedPath(evalConfig), sourceType);
        ShifuFileUtils.deleteFile(pathFinder.getEvalScorePath(evalConfig), sourceType);
        ShifuFileUtils.deleteFile(pathFinder.getEvalPerformancePath(evalConfig), sourceType);

        // prepare special parameters and execute pig
        Map<String, String> paramsMap = new HashMap<String, String>();
        paramsMap.put(Constants.SOURCE_TYPE, sourceType.toString());
        paramsMap.put("pathEvalRawData", evalConfig.getDataSet().getDataPath());
        paramsMap.put("pathEvalNormalized", pathFinder.getEvalNormalizedPath(evalConfig));
        paramsMap.put("pathEvalScore", pathFinder.getEvalScorePath(evalConfig));
        paramsMap.put("pathEvalPerformance", pathFinder.getEvalPerformancePath(evalConfig));
        paramsMap.put("eval_set_name", evalConfig.getName());
        paramsMap.put("delimiter", evalConfig.getDataSet().getDataDelimiter());
        paramsMap.put("columnIndex", evalConfig.getPerformanceScoreSelector().trim());
        paramsMap.put("scale",
                Environment.getProperty(Constants.SHIFU_SCORE_SCALE, Integer.toString(Scorer.DEFAULT_SCORE_SCALE)));

        String pigScript = "scripts/Eval.pig";
        Map<String, String> confMap = new HashMap<String, String>();

        // max min score folder
        String maxMinScoreFolder = ShifuFileUtils
                .getFileSystemBySourceType(sourceType)
                .makeQualified(
                        new Path("tmp" + File.separator + "maxmin_score_" + System.currentTimeMillis() + "_"
                                + RANDOM.nextLong())).toString();
        confMap.put(Constants.SHIFU_EVAL_MAXMIN_SCORE_OUTPUT, maxMinScoreFolder);
        if(modelConfig.isClassification()) {
            pigScript = "scripts/EvalScore.pig";
        }
        try {
            PigExecutor.getExecutor().submitJob(modelConfig, pathFinder.getScriptPath(pigScript), paramsMap,
                    evalConfig.getDataSet().getSource(), confMap, super.pathFinder);
        } catch (IOException e) {
            throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
        } catch (Throwable e) {
            throw new RuntimeException(e);
        }

        Iterator<JobStats> iter = PigStats.get().getJobGraph().iterator();
        while(iter.hasNext()) {
            JobStats jobStats = iter.next();
            long evalRecords = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter(Constants.COUNTER_RECORDS);
            LOG.info("Total valid eval records is : {}", evalRecords);
            // If no basic record counter, check next one
            if(evalRecords == 0L) {
                continue;
            }
            this.evalRecords = evalRecords;

            long pigPosTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter(Constants.COUNTER_POSTAGS);
            long pigNegTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter(Constants.COUNTER_NEGTAGS);
            // weighted counters are stored as scaled longs; divide the scale back out
            double pigPosWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter(Constants.COUNTER_WPOSTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);
            double pigNegWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter(Constants.COUNTER_WNEGTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);

            long totalRunTime = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter(Constants.TOTAL_MODEL_RUNTIME);
            // safe to divide: evalRecords is non-zero here
            LOG.info("Avg SLA for eval model scoring is {} micro seconds", totalRunTime / evalRecords);

            double maxScore = Integer.MIN_VALUE;
            double minScore = Integer.MAX_VALUE;
            if(modelConfig.isRegression()) {
                double[] maxMinScores = locateMaxMinScoreFromFile(sourceType, maxMinScoreFolder);
                maxScore = maxMinScores[0];
                minScore = maxMinScores[1];
                LOG.info("Max score is {}, min score is {}", maxScore, minScore);
                ShifuFileUtils.deleteFile(maxMinScoreFolder, sourceType);
            }
            // only one pig job with such counters, return
            return new ScoreStatus(pigPosTags, pigNegTags, pigPosWeightTags, pigNegWeightTags, maxScore, minScore,
                    evalRecords);
        }
        return null;
    }

    /**
     * Read the max/min score pair written by the Pig job into {@code maxMinScoreFolder}.
     *
     * @param sourceType
     *            HDFS or local source type
     * @param maxMinScoreFolder
     *            folder holding one "max,min" line per reducer output
     * @return a two-element array: {maxScore, minScore}
     * @throws IOException
     *             any io exception while scanning the folder
     */
    private double[] locateMaxMinScoreFromFile(SourceType sourceType, String maxMinScoreFolder) throws IOException {
        List<Scanner> scanners = null;
        // FIX: use -Double.MAX_VALUE as the max seed. The previous Double.MIN_VALUE is the smallest
        // POSITIVE double, which would report a wrong max if all scores were negative.
        double maxScore = -Double.MAX_VALUE;
        double minScore = Double.MAX_VALUE;
        try {
            // here only works for 1 reducer
            scanners = ShifuFileUtils.getDataScanners(maxMinScoreFolder, sourceType);
            for(Scanner scanner: scanners) {
                if(scanner.hasNext()) {
                    String line = scanner.nextLine().trim();
                    String[] splits = line.split(",");
                    if(splits.length >= 2) {
                        double localMaxScore = Double.parseDouble(splits[0]);
                        if(maxScore < localMaxScore) {
                            maxScore = localMaxScore;
                        }
                        Double localMinScore = Double.parseDouble(splits[1]);
                        if(minScore > localMinScore) {
                            minScore = localMinScore;
                        }
                    }
                }
            }
        } finally {
            if(scanners != null) {
                for(Scanner scanner: scanners) {
                    if(scanner != null) {
                        scanner.close();
                    }
                }
            }
        }
        return new double[] { maxScore, minScore };
    }

    /**
     * Normalize evaluation dataset based on pig distributed solution.
     *
     * @param evalConfig
     *            eval config instance
     * @throws IOException
     *             any io exception
     */
    private void runPigNormalize(EvalConfig evalConfig) throws IOException {
        SourceType sourceType = evalConfig.getDataSet().getSource();

        // clean up output directories
        ShifuFileUtils.deleteFile(pathFinder.getEvalNormalizedPath(evalConfig), sourceType);

        // prepare special parameters and execute pig
        Map<String, String> paramsMap = new HashMap<String, String>();
        paramsMap.put(Constants.SOURCE_TYPE, sourceType.toString());
        paramsMap.put("pathEvalRawData", evalConfig.getDataSet().getDataPath());
        paramsMap.put("pathEvalNormalized", pathFinder.getEvalNormalizedPath(evalConfig));
        paramsMap.put("eval_set_name", evalConfig.getName());
        paramsMap.put("delimiter", evalConfig.getDataSet().getDataDelimiter());
        paramsMap.put("scale",
                Environment.getProperty(Constants.SHIFU_SCORE_SCALE, Integer.toString(Scorer.DEFAULT_SCORE_SCALE)));

        String pigScript = "scripts/EvalNorm.pig";

        try {
            PigExecutor.getExecutor().submitJob(modelConfig, pathFinder.getScriptPath(pigScript), paramsMap,
                    evalConfig.getDataSet().getSource());
        } catch (IOException e) {
            throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
        } catch (Throwable e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * run akka mode scoring
     *
     * @param config
     *            the name for evaluation
     * @throws IOException
     *             any io exception
     */
    private void runAkkaScore(EvalConfig config) throws IOException {
        SourceType sourceType = config.getDataSet().getSource();
        List<Scanner> scanners = ShifuFileUtils.getDataScanners(
                ShifuFileUtils.expandPath(config.getDataSet().getDataPath(), sourceType), sourceType);

        AkkaSystemExecutor.getExecutor().submitModelEvalJob(modelConfig,
                ShifuFileUtils.searchColumnConfig(config, this.columnConfigList), config, scanners);

        // FIXME A bug here in local mode, compute eval records please
        // this.evalRecords = ...;
        closeScanners(scanners);
    }

    /**
     * Create a evaluation with <code>name</code>
     *
     * @param name
     *            - the evaluation set name
     * @throws IOException
     *             any io exception
     */
    private void createNewEval(String name) throws IOException {
        EvalConfig evalConfig = modelConfig.getEvalConfigByName(name);
        if(evalConfig != null) {
            throw new ShifuException(ShifuErrorCode.ERROR_MODEL_EVALSET_ALREADY_EXIST, "EvalSet - " + name
                    + " already exists in ModelConfig. Please use another evalset name");
        }

        evalConfig = new EvalConfig();
        evalConfig.setName(name);
        evalConfig.setDataSet(modelConfig.getDataSet().cloneRawSourceData());
        // create empty <EvalSetName>Score.meta.column.names
        ShifuFileUtils.createFileIfNotExists(new Path(evalConfig.getName()
                + Constants.DEFAULT_EVALSCORE_META_COLUMN_FILE).toString(), SourceType.LOCAL);

        modelConfig.getEvals().add(evalConfig);

        try {
            saveModelConfig();
        } catch (IOException e) {
            throw new ShifuException(ShifuErrorCode.ERROR_WRITE_MODELCONFIG, e);
        }
        LOG.info("Create Eval - " + name);
    }

    /**
     * Running evaluation including scoring and performance evaluation two steps.
     *
     * <p>
     * This function will switch to pig or akka evaluation depends on the modelConfig running mode
     *
     * @param evalSetList
     *            the eval config list to run
     * @throws IOException
     *             any exception in running pig evaluation or akka evaluation
     */
    private void runEval(List<EvalConfig> evalSetList) throws IOException {
        for(EvalConfig evalConfig: evalSetList) {
            runEval(evalConfig);
        }
    }

    /**
     * Validate that the eval data set schema (from header path, or detected from the first data line) contains all
     * final-selected columns, the target column and the weight column required by the trained model.
     *
     * @param evalConfig
     *            the eval config instance
     * @throws IOException
     *             any io exception when reading header or data lines
     * @throws IllegalArgumentException
     *             if a required column is missing or header/data lengths are inconsistent
     */
    private void validateEvalColumnConfig(EvalConfig evalConfig) throws IOException {
        if(this.columnConfigList == null) {
            return;
        }

        String[] evalColumnNames = null;
        if(StringUtils.isNotBlank(evalConfig.getDataSet().getHeaderPath())) {
            // header file provided: read column names directly from it
            String delimiter = StringUtils.isBlank(evalConfig.getDataSet().getHeaderDelimiter()) ? evalConfig
                    .getDataSet().getDataDelimiter() : evalConfig.getDataSet().getHeaderDelimiter();
            evalColumnNames = CommonUtils.getHeaders(evalConfig.getDataSet().getHeaderPath(), delimiter, evalConfig
                    .getDataSet().getSource());
        } else {
            String delimiter = StringUtils.isBlank(evalConfig.getDataSet().getHeaderDelimiter()) ? evalConfig
                    .getDataSet().getDataDelimiter() : evalConfig.getDataSet().getHeaderDelimiter();
            String[] fields = CommonUtils.takeFirstLine(evalConfig.getDataSet().getDataPath(), delimiter, evalConfig
                    .getDataSet().getSource());
            // if first line contains target column name, we guess it is csv format and first line is header.
            if(StringUtils.join(fields, "").contains(modelConfig.getTargetColumnName())) {
                // first line of data meaning second line in data files excluding first header line
                String[] dataInFirstLine = CommonUtils.takeFirstTwoLines(evalConfig.getDataSet().getDataPath(),
                        delimiter, evalConfig.getDataSet().getSource())[1];
                if(dataInFirstLine != null && fields.length != dataInFirstLine.length) {
                    throw new IllegalArgumentException(
                            "Eval header length and eval data length are not consistent, please check you header setting and data set setting in eval.");
                }

                evalColumnNames = new String[fields.length];
                for(int i = 0; i < fields.length; i++) {
                    evalColumnNames[i] = CommonUtils.getRelativePigHeaderColumnName(fields[i]);
                }
                LOG.warn("No header path is provided, we will try to read first line and detect schema.");
                LOG.warn("Schema in ColumnConfig.json are named as first line of data set path.");
            } else {
                LOG.warn("No header path is provided, we will try to read first line and detect schema.");
                LOG.warn("Schema in ColumnConfig.json are named as index 0, 1, 2, 3 ...");
                LOG.warn("Please make sure weight column and tag column are also taking index as name.");
                evalColumnNames = new String[fields.length];
                for(int i = 0; i < fields.length; i++) {
                    evalColumnNames[i] = i + "";
                }
            }
        }

        Set<NSColumn> names = new HashSet<NSColumn>();
        for(String evalColumnName: evalColumnNames) {
            names.add(new NSColumn(evalColumnName));
        }

        for(ColumnConfig config: this.columnConfigList) {
            if(config.isFinalSelect() && !names.contains(new NSColumn(config.getColumnName()))) {
                throw new IllegalArgumentException("Final selected column " + config.getColumnName()
                        + " does not exist in - " + evalConfig.getDataSet().getHeaderPath());
            }
        }

        if(StringUtils.isNotBlank(evalConfig.getDataSet().getTargetColumnName())
                && !names.contains(new NSColumn(evalConfig.getDataSet().getTargetColumnName()))) {
            throw new IllegalArgumentException("Target column " + evalConfig.getDataSet().getTargetColumnName()
                    + " does not exist in - " + evalConfig.getDataSet().getHeaderPath());
        }

        if(StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName())
                && !names.contains(new NSColumn(evalConfig.getDataSet().getWeightColumnName()))) {
            throw new IllegalArgumentException("Weight column " + evalConfig.getDataSet().getWeightColumnName()
                    + " does not exist in - " + evalConfig.getDataSet().getHeaderPath());
        }
    }

    /**
     * Run evaluation per EvalConfig.
     *
     * @param evalConfig
     *            the evaluation config instance.
     * @throws IOException
     *             when any IO exception
     */
    private void runEval(EvalConfig evalConfig) throws IOException {
        // create evalset home directory firstly in local file system
        validateEvalColumnConfig(evalConfig);
        String evalSetPath = pathFinder.getEvalSetPath(evalConfig, SourceType.LOCAL);
        FileUtils.forceMkdir(new File(evalSetPath));
        syncDataToHdfs(evalConfig.getDataSet().getSource());

        switch(modelConfig.getBasic().getRunMode()) {
            case DIST:
            case MAPRED:
                runDistEval(evalConfig);
                break;
            case LOCAL:
                runAkkaEval(evalConfig);
                break;
            default:
                break;
        }
    }

    /**
     * @return true if training algorithm is GBT and the eval config asks for raw (non-probability) GBT scores
     */
    private boolean isGBTNotConvertToProb(EvalConfig evalConfig) {
        return CommonConstants.GBT_ALG_NAME.equalsIgnoreCase(modelConfig.getTrain().getAlgorithm())
                && !evalConfig.getGbtConvertToProb();
    }

    /**
     * Run distributed version of evaluation and performance review.
     *
     * @param evalConfig
     *            the evaluation instance
     * @throws IOException
     *             when any exception in delete the old tmp files
     */
    private void runDistEval(EvalConfig evalConfig) throws IOException {
        ScoreStatus ss = runDistScore(evalConfig);

        List<String> scoreMetaColumns = evalConfig.getScoreMetaColumns(modelConfig);
        if(scoreMetaColumns == null || scoreMetaColumns.isEmpty() || !modelConfig.isRegression()) {
            // if no any champion score column set, go to previous evaluation with only challendge model
            runConfusionMatrix(evalConfig, ss, isGBTNotConvertToProb(evalConfig));
            return;
        }

        // 1. Get challendge model performance
        PerformanceResult challendgeModelPerformance = runConfusionMatrix(evalConfig, ss,
                pathFinder.getEvalScorePath(evalConfig), pathFinder.getEvalPerformancePath(evalConfig), false,
                false, isGBTNotConvertToProb(evalConfig));
        List<PerformanceResult> prList = new ArrayList<PerformanceResult>();
        prList.add(challendgeModelPerformance);

        // 2. Get all champion model performance
        List<String> names = new ArrayList<String>();
        names.add(modelConfig.getBasic().getName() + "-" + evalConfig.getName());
        for(String metaScoreColumn: scoreMetaColumns) {
            if(StringUtils.isBlank(metaScoreColumn)) {
                continue;
            }
            names.add(metaScoreColumn);

            LOG.info("Model score sort for {} in eval {} is started.", metaScoreColumn, evalConfig.getName());
            ScoreStatus newScoreStatus = runDistMetaScore(evalConfig, metaScoreColumn);

            // champion score data: target/score/weight columns are at fixed indexes 0, 1, 2
            PerformanceResult championModelPerformance = runConfusionMatrix(evalConfig, newScoreStatus,
                    pathFinder.getEvalScorePath(evalConfig, metaScoreColumn),
                    pathFinder.getEvalPerformancePath(evalConfig, metaScoreColumn), false, false, 0, 1, 2);
            prList.add(championModelPerformance);
        }

        GainChart gc = new GainChart();
        boolean hasWeight = StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName());

        // 3. Compute gain chart and other eval performance files only in local.
        String htmlGainChart = pathFinder.getEvalFilePath(evalConfig.getName(), evalConfig.getName()
                + "_gainchart.html", SourceType.LOCAL);
        LOG.info("Gain chart is generated in {}.", htmlGainChart);
        gc.generateHtml(evalConfig, modelConfig, htmlGainChart, prList, names);

        String hrmlPrRoc = pathFinder.getEvalFilePath(evalConfig.getName(), evalConfig.getName() + "_prroc.html",
                SourceType.LOCAL);
        LOG.info("PR & ROC chart is generated in {}.", hrmlPrRoc);
        gc.generateHtml4PrAndRoc(evalConfig, modelConfig, hrmlPrRoc, prList, names);

        for(int i = 0; i < names.size(); i++) {
            String name = names.get(i);
            PerformanceResult pr = prList.get(i);
            String unitGainChartCsv = pathFinder.getEvalFilePath(evalConfig.getName(), name
                    + "_unit_wise_gainchart.csv", SourceType.LOCAL);
            LOG.info("Unit-wise gain chart data is generated in {} for eval {} and name {}.", unitGainChartCsv,
                    evalConfig.getName(), name);
            gc.generateCsv(evalConfig, modelConfig, unitGainChartCsv, pr.gains);

            if(hasWeight) {
                String weightedGainChartCsv = pathFinder.getEvalFilePath(evalConfig.getName(), name
                        + "_weighted_gainchart.csv", SourceType.LOCAL);
                LOG.info("Weighted gain chart data is generated in {} for eval {} and name {}.",
                        weightedGainChartCsv, evalConfig.getName(), name);
                gc.generateCsv(evalConfig, modelConfig, weightedGainChartCsv, pr.weightedGains);
            }

            String prCsvFile = pathFinder.getEvalFilePath(evalConfig.getName(), name + "_unit_wise_pr.csv",
                    SourceType.LOCAL);
            LOG.info("Unit-wise pr data is generated in {} for eval {} and name {}.", prCsvFile,
                    evalConfig.getName(), name);
            gc.generateCsv(evalConfig, modelConfig, prCsvFile, pr.pr);

            if(hasWeight) {
                String weightedPrCsvFile = pathFinder.getEvalFilePath(evalConfig.getName(), name
                        + "_weighted_pr.csv", SourceType.LOCAL);
                LOG.info("Weighted pr data is generated in {} for eval {} and name {}.", weightedPrCsvFile,
                        evalConfig.getName(), name);
                gc.generateCsv(evalConfig, modelConfig, weightedPrCsvFile, pr.weightedPr);
            }

            String rocCsvFile = pathFinder.getEvalFilePath(evalConfig.getName(), name + "_unit_wise_roc.csv",
                    SourceType.LOCAL);
            LOG.info("Unit-wise roc data is generated in {} for eval {} and name {}.", rocCsvFile,
                    evalConfig.getName(), name);
            gc.generateCsv(evalConfig, modelConfig, rocCsvFile, pr.roc);

            if(hasWeight) {
                String weightedRocCsvFile = pathFinder.getEvalFilePath(evalConfig.getName(), name
                        + "_weighted_roc.csv", SourceType.LOCAL);
                LOG.info("Weighted roc data is generated in {} for eval {} and name {}.", weightedRocCsvFile,
                        evalConfig.getName(), name);
                gc.generateCsv(evalConfig, modelConfig, weightedRocCsvFile, pr.weightedRoc);
            }

            String modelScoreGainChartCsv = pathFinder.getEvalFilePath(evalConfig.getName(), name
                    + "_modelscore_gainchart.csv", SourceType.LOCAL);
            LOG.info("Model score gain chart data is generated in {} for eval {} and name {}.",
                    modelScoreGainChartCsv, evalConfig.getName(), name);
            gc.generateCsv(evalConfig, modelConfig, modelScoreGainChartCsv, pr.modelScoreList);
        }
        LOG.info("Performance Evaluation is done for {}.", evalConfig.getName());
    }

    /**
     * Sort eval score data by a champion (meta) score column via Pig, then collect score status from counters.
     *
     * @param evalConfig
     *            the evaluation instance
     * @param metaScore
     *            the champion score column name to sort by
     * @return score status aggregated from Hadoop counters of the Pig job, or null if no job reported records
     * @throws IOException
     *             any io exception
     */
    @SuppressWarnings("deprecation")
    private ScoreStatus runDistMetaScore(EvalConfig evalConfig, String metaScore) throws IOException {
        SourceType sourceType = evalConfig.getDataSet().getSource();

        // clean up output directories
        ShifuFileUtils.deleteFile(pathFinder.getEvalScorePath(evalConfig, metaScore), sourceType);

        // prepare special parameters and execute pig
        Map<String, String> paramsMap = new HashMap<String, String>();
        paramsMap.put(Constants.SOURCE_TYPE, sourceType.toString());
        paramsMap.put("pathEvalRawData", evalConfig.getDataSet().getDataPath());
        paramsMap.put("pathSortScoreData", pathFinder.getEvalScorePath(evalConfig, metaScore));
        paramsMap.put("eval_set_name", evalConfig.getName());
        paramsMap.put("delimiter", evalConfig.getDataSet().getDataDelimiter());
        paramsMap.put("column_name", metaScore);

        String pigScript = "scripts/EvalScoreMetaSort.pig";
        Map<String, String> confMap = new HashMap<String, String>();

        // max min score folder
        String maxMinScoreFolder = ShifuFileUtils
                .getFileSystemBySourceType(sourceType)
                .makeQualified(
                        new Path("tmp" + File.separator + "maxmin_score_" + System.currentTimeMillis() + "_"
                                + RANDOM.nextLong())).toString();
        confMap.put(Constants.SHIFU_EVAL_MAXMIN_SCORE_OUTPUT, maxMinScoreFolder);
        try {
            PigExecutor.getExecutor().submitJob(modelConfig, pathFinder.getScriptPath(pigScript), paramsMap,
                    evalConfig.getDataSet().getSource(), confMap, super.pathFinder);
        } catch (IOException e) {
            throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
        } catch (Throwable e) {
            throw new RuntimeException(e);
        }

        Iterator<JobStats> iter = PigStats.get().getJobGraph().iterator();
        while(iter.hasNext()) {
            JobStats jobStats = iter.next();
            long evalRecords = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter(Constants.COUNTER_RECORDS);
            LOG.info("Total valid eval records is : {}", evalRecords);
            // If no basic record counter, check next one
            if(evalRecords == 0L) {
                continue;
            }
            this.evalRecords = evalRecords;

            long pigPosTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter(Constants.COUNTER_POSTAGS);
            long pigNegTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter(Constants.COUNTER_NEGTAGS);
            double pigPosWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter(Constants.COUNTER_WPOSTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);
            double pigNegWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter(Constants.COUNTER_WNEGTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);

            double maxScore = Integer.MIN_VALUE;
            double minScore = Integer.MAX_VALUE;
            if(modelConfig.isRegression()) {
                double[] maxMinScores = locateMaxMinScoreFromFile(sourceType, maxMinScoreFolder);
                maxScore = maxMinScores[0];
                minScore = maxMinScores[1];
                LOG.info("Max score is {}, min score is {}", maxScore, minScore);
                ShifuFileUtils.deleteFile(maxMinScoreFolder, sourceType);
            }

            long badMetaScores = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER)
                    .getCounter("BAD_META_SCORE");

            // Get score status from Counter to avoid re-computing such metrics
            LOG.info("Eval records is {}; and bad meta score is {}.", evalRecords, badMetaScores);

            return new ScoreStatus(pigPosTags, pigNegTags, pigPosWeightTags, pigNegWeightTags, maxScore, minScore,
                    evalRecords);
        }
        return null;
    }

    /**
     * Use akka to run model evaluation
     *
     * @param evalConfig
     *            the evaluation instance
     * @throws IOException
     *             the error while create data scanner for input data
     */
    private void runAkkaEval(EvalConfig evalConfig) throws IOException {
        runAkkaScore(evalConfig);
        runConfusionMatrix(evalConfig);
        runPerformance(evalConfig);
    }

    /**
     * Running the performance matrices
     *
     * @param evalSetList
     *            EvalConfig list
     * @throws IOException
     *             any io exception
     */
    private void runPerformance(List<EvalConfig> evalSetList) throws IOException {
        for(EvalConfig evalConfig: evalSetList) {
            runPerformance(evalConfig);
        }
    }

    /**
     * Running the performance matrices
     *
     * @param evalConfig
     *            the name for evaluation
     * @throws IOException
     *             any io exception
     */
    private void runPerformance(EvalConfig evalConfig) throws IOException {
        PerformanceEvaluator perfEval = new PerformanceEvaluator(modelConfig, evalConfig);
        switch(modelConfig.getBasic().getRunMode()) {
            case DIST:
            case MAPRED:
                // FIXME here, this.evalRecords is 0 in initialzation
                perfEval.review(this.evalRecords);
                break;
            case LOCAL:
            default:
                perfEval.review();
                break;
        }
    }

    /**
     * Compute confusion matrix
     *
     * @param evalSetList
     *            a List of EvalConfig
     * @throws IOException
     *             any io exception
     */
    private void runConfusionMatrix(List<EvalConfig> evalSetList) throws IOException {
        for(EvalConfig config: evalSetList) {
            runConfusionMatrix(config);
        }
    }

    /**
     * Compute confusion matrix
     *
     * @param config
     *            eval config
     * @param ss
     *            the score stats
     * @param isUseMaxMinScore
     *            whether to use max/min score from score status for bucketing
     * @return the performance result, or null in non-regression / local modes
     * @throws IOException
     *             any io exception
     */
    private PerformanceResult runConfusionMatrix(EvalConfig config, ScoreStatus ss, boolean isUseMaxMinScore)
            throws IOException {
        return runConfusionMatrix(config, ss, pathFinder.getEvalScorePath(config),
                pathFinder.getEvalPerformancePath(config, config.getDataSet().getSource()), true, true,
                isUseMaxMinScore);
    }

    /**
     * Compute confusion matrix against the given score data path, writing performance output.
     *
     * @param config
     *            eval config
     * @param ss
     *            the score stats from a distributed score job
     * @param scoreDataPath
     *            path of the score data to evaluate
     * @param evalPerformancePath
     *            output path for performance result
     * @param isPrint
     *            whether to print performance summary
     * @param isGenerateChart
     *            whether to generate chart files
     * @param isUseMaxMinScore
     *            whether to use max/min score from score status for bucketing
     * @return the performance result, or null in non-regression / local modes
     * @throws IOException
     *             any io exception
     */
    private PerformanceResult runConfusionMatrix(EvalConfig config, ScoreStatus ss, String scoreDataPath,
            String evalPerformancePath, boolean isPrint, boolean isGenerateChart, boolean isUseMaxMinScore)
            throws IOException {
        ConfusionMatrix worker = new ConfusionMatrix(modelConfig, config);
        switch(modelConfig.getBasic().getRunMode()) {
            case DIST:
            case MAPRED:
                if(modelConfig.isRegression()) {
                    return worker.bufferedComputeConfusionMatrixAndPerformance(ss.pigPosTags, ss.pigNegTags,
                            ss.pigPosWeightTags, ss.pigNegWeightTags, ss.evalRecords, ss.maxScore, ss.minScore,
                            scoreDataPath, evalPerformancePath, isPrint, isGenerateChart, isUseMaxMinScore);
                } else {
                    worker.computeConfusionMatixForMultipleClassification(this.evalRecords);
                    return null;
                }
            default:
                worker.computeConfusionMatrix();
                return null;
        }
    }

    /**
     * Compute confusion matrix with explicit target/score/weight column indexes (used for champion score data).
     *
     * @param config
     *            eval config
     * @param ss
     *            the score stats from a distributed score job
     * @param scoreDataPath
     *            path of the score data to evaluate
     * @param evalPerformancePath
     *            output path for performance result
     * @param isPrint
     *            whether to print performance summary
     * @param isGenerateChart
     *            whether to generate chart files
     * @param targetColumnIndex
     *            index of the target column in score data
     * @param scoreColumnIndex
     *            index of the score column in score data
     * @param weightColumnIndex
     *            index of the weight column in score data
     * @return the performance result, or null in non-regression / local modes
     * @throws IOException
     *             any io exception
     */
    private PerformanceResult runConfusionMatrix(EvalConfig config, ScoreStatus ss, String scoreDataPath,
            String evalPerformancePath, boolean isPrint, boolean isGenerateChart, int targetColumnIndex,
            int scoreColumnIndex, int weightColumnIndex) throws IOException {
        ConfusionMatrix worker = new ConfusionMatrix(modelConfig, config);
        switch(modelConfig.getBasic().getRunMode()) {
            case DIST:
            case MAPRED:
                if(modelConfig.isRegression()) {
                    return worker.bufferedComputeConfusionMatrixAndPerformance(ss.pigPosTags, ss.pigNegTags,
                            ss.pigPosWeightTags, ss.pigNegWeightTags, ss.evalRecords, ss.maxScore, ss.minScore,
                            scoreDataPath, evalPerformancePath, isPrint, isGenerateChart, targetColumnIndex,
                            scoreColumnIndex, weightColumnIndex, true);
                } else {
                    worker.computeConfusionMatixForMultipleClassification(this.evalRecords);
                    return null;
                }
            case LOCAL:
            default:
                worker.computeConfusionMatrix();
                return null;
        }
    }

    /**
     * Run confusion matrix
     *
     * @param config
     *            eval config
     * @throws IOException
     *             any io exception
     */
    private void runConfusionMatrix(EvalConfig config) throws IOException {
        runConfusionMatrix(config, null, false);
    }

    /**
     * Value holder for tag/weight counters and score range collected from a distributed score job.
     */
    private static class ScoreStatus {

        public long pigPosTags = 0l;

        public long pigNegTags = 0l;

        public double pigPosWeightTags = 0d;

        public double pigNegWeightTags = 0d;

        public double maxScore = Integer.MIN_VALUE;

        public double minScore = Integer.MAX_VALUE;

        public long evalRecords = 0l;

        public ScoreStatus(long pigPosTags, long pigNegTags, double pigPosWeightTags, double pigNegWeightTags,
                double maxScore, double minScore, long evalRecords) {
            this.pigPosTags = pigPosTags;
            this.pigNegTags = pigNegTags;
            this.pigPosWeightTags = pigPosWeightTags;
            this.pigNegWeightTags = pigNegWeightTags;
            this.maxScore = maxScore;
            this.minScore = minScore;
            this.evalRecords = evalRecords;
        }
    }
}