/* * Apache License * Version 2.0, January 2004 * http://www.apache.org/licenses/ * * Copyright 2013 Aurelian Tutuianu * Copyright 2014 Aurelian Tutuianu * Copyright 2015 Aurelian Tutuianu * Copyright 2016 Aurelian Tutuianu * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package rapaio.experiment.ml.eval; import rapaio.core.CoreTools; import rapaio.core.RandomSource; import rapaio.core.SamplingTools; import rapaio.data.*; import rapaio.data.filter.frame.FFShuffle; import rapaio.ml.classifier.CFit; import rapaio.ml.classifier.Classifier; import rapaio.ml.eval.Confusion; import rapaio.ml.eval.ROC; import rapaio.printer.IdeaPrinter; import rapaio.sys.WS; import rapaio.util.Pin; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.function.BiConsumer; import java.util.stream.IntStream; import static java.util.stream.Collectors.toList; import static rapaio.graphics.Plotter.*; import static rapaio.sys.WS.print; /** * @author <a href="mailto:padreati@yahoo.com">Aurelian Tutuianu</a> */ @Deprecated public class CEvaluation { public static double cv(Frame df, String classColName, Classifier c, int folds) { print("\nCrossValidation with " + folds + " folds\n"); // print("Model: \n"); // c.newInstance().printSummary(); List<List<Integer>> strata = buildStrata(df, folds, classColName); Numeric acc = Numeric.empty(); for (int i = 0; i < folds; i++) { Mapping trainMapping = Mapping.empty(); Mapping testMapping = Mapping.empty(); for (int j = 0; j < folds; j++) { if (j == i) { testMapping.addAll(strata.get(j)); } else { trainMapping.addAll(strata.get(j)); } } Frame train = MappedFrame.byRow(df, trainMapping); Frame test = MappedFrame.byRow(df, testMapping); Classifier cc = c.newInstance(); cc.train(train, classColName); CFit cp = cc.fit(test); Confusion conf = new Confusion(test.var(classColName), cp.firstClasses()); acc.addValue(conf.accuracy()); print(String.format("CV %2d: acc=%.6f, mean=%.6f, se=%.6f\n", i + 1, conf.accuracy(), CoreTools.mean(acc).value(), CoreTools.var(acc).sdValue())); } double correct = CoreTools.mean(acc).value(); print("==============\n"); print(String.format("Mean accuracy:%.6f\n", correct)); print(String.format("SE: %.6f (Standard error)\n", CoreTools.var(acc).sdValue())); return correct; } private static List<List<Integer>> buildStrata(Frame df, int folds, String classColName) { String[] dict = df.var(classColName).levels(); List<List<Integer>> rows = IntStream.range(0, dict.length).boxed().map(ArrayList<Integer>::new).collect(toList()); for (int i = 0; i < df.rowCount(); i++) { rows.get(df.index(i, classColName)).add(i); } List<Integer> shuffle = new ArrayList<>(); for (int i = 0; i < dict.length; i++) { Collections.shuffle(rows.get(i), RandomSource.getRandom()); shuffle.addAll(rows.get(i)); } List<List<Integer>> strata = new ArrayList<>(); for (int i = 0; i < folds; i++) { strata.add(new ArrayList<>()); } int fold = 0; for (int next : shuffle) { strata.get(fold).add(next); fold++; if (fold == folds) { fold = 0; } } return strata; } public static void multiCv(Frame df, String classColName, List<Classifier> classifiers, int folds) { print("CrossValidation with " + folds + " folds\n"); df = new FFShuffle().fitApply(df); double[] tacc = new double[classifiers.size()]; for (int i = 0; i < folds; i++) { Mapping trainMapping = Mapping.empty(); Mapping testMapping = Mapping.empty(); if (folds >= df.rowCount() - 1) { testMapping.add(i); for (int j = 0; j < df.rowCount(); j++) { if (j != i) { trainMapping.add(j); } } } else { for (int j = 0; j < df.rowCount(); j++) { if (j % folds == i) { testMapping.add(j); } else { trainMapping.add(j); } } } Frame train = MappedFrame.byRow(df, trainMapping); Frame test = MappedFrame.byRow(df, testMapping); for (int k = 0; k < classifiers.size(); k++) { Classifier c = classifiers.get(k).newInstance(); c.train(train, classColName); CFit cp = c.fit(test); Confusion cm = new Confusion(test.var(classColName), cp.firstClasses()); // cm.printSummary(); double acc = cm.accuracy(); tacc[k] += acc; print(String.format("CV %d, accuracy:%.6f, classifier:%s\n", i + 1, acc, c.name())); } print("-----------\n"); } for (int k = 0; k < classifiers.size(); k++) { tacc[k] /= (1. * folds); print(String.format("Mean accuracy %.6f, for classifier: %s\n", tacc[k], classifiers.get(k).name())); } } public static void bootstrapValidation(Frame df, String classColName, Classifier c, int bootstraps) { Var weights = Numeric.fill(df.rowCount(), 1.0); bootstrapValidation(df, weights, classColName, c, bootstraps, 1.0); } public static void bootstrapValidation(Frame df, Var weights, String classColName, Classifier c, int bootstraps) { bootstrapValidation(df, weights, classColName, c, bootstraps, 1.0); } public static void bootstrapValidation(Frame df, String classColName, Classifier c, int bootstraps, double p) { Var weights = Numeric.fill(df.rowCount(), 1.0d); bootstrapValidation(df, weights, classColName, c, bootstraps, p); } public static void bootstrapValidation(Frame df, Var weights, String classColName, Classifier c, int bootstraps, double p) { print(bootstraps + " bootstrap evaluation\n"); double total = 0; double count = 0; for (int i = 0; i < bootstraps; i++) { // System.out.println("get sample..."); int[] rows = SamplingTools.sampleWR(df.rowCount(), (int) (df.rowCount() * p)); // System.out.println("build train set ..."); Frame train = df.mapRows(rows); // System.out.println("build test set ..."); Frame test = df.removeRows(rows); // System.out.println("learn train set ..."); Classifier cc = c.newInstance(); cc.train(train, weights.mapRows(rows), classColName); // System.out.println("fit test cases ..."); Var classes = cc.fit(test).firstClasses(); // System.out.println("build confusion matrix ..."); Confusion cm = new Confusion(test.var(classColName), classes); cm.printSummary(); double acc = cm.accuracy(); System.out.println(String.format("bootstrap(%d) : %.6f", i + 1, acc)); total += acc; count++; System.out.flush(); } System.out.println(String.format("Average accuracy: %.6f", total / count)); } public static PlotRunResult plotRunsAcc(Frame train, Frame test, String targetVar, Classifier c, int runs, int step) { BiConsumer<Classifier, Integer> oldHook = c.runningHook(); Index r = Index.empty().withName("runs"); Numeric testAcc = Numeric.empty().withName("test"); Numeric trainAcc = Numeric.empty().withName("train"); c.withRunningHook((cs, run) -> { if (run % step != 0) { return; } r.addIndex(run); testAcc.addValue(new Confusion(test.var(targetVar), c.fit(test).firstClasses()).accuracy()); trainAcc.addValue(new Confusion(train.var(targetVar), c.fit(train).firstClasses()).accuracy()); WS.setPrinter(new IdeaPrinter()); WS.draw(plot() .lines(r, testAcc, color(1)) .lines(r, trainAcc, color(2)) ); }); c.withRuns(runs); c.train(train, targetVar); WS.println("Confusion matrix on training data set: "); Confusion trainConfusion = new Confusion(train.var(targetVar), c.fit(train).firstClasses()); trainConfusion.printSummary(); WS.println(); WS.println("Confusion matrix on test data set: "); Confusion testConfusion = new Confusion(test.var(targetVar), c.fit(test).firstClasses()); testConfusion.printSummary(); return new PlotRunResult(r, trainAcc, testAcc, testConfusion, trainConfusion); } public static PlotRunResult plotRunsRoc( Frame train, Frame test, String targetVar, String label, Classifier cc, int runs, int step, boolean alterClassifier) { Classifier c = alterClassifier ? cc : cc.newInstance(); BiConsumer<Classifier, Integer> oldHook = c.runningHook(); Index r = Index.empty().withName("runs"); Numeric testAuc = Numeric.empty().withName("test"); Numeric trainAuc = Numeric.empty().withName("train"); Pin<Double> prevAuc = new Pin<>(0.0); c.withRunningHook((cs, run) -> { if ((run % step != 0) && run != 1) { return; } r.addIndex(run); ROC roc = ROC.from(c.fit(test).firstDensity().var(label), test.var(targetVar), label); WS.draw(rocCurve(roc).title("testAuc: " + WS.formatFlex(roc.auc()) + ", run: " + run)); testAuc.addValue(roc.auc()); WS.println("testAuc: " + WS.formatLong(roc.auc()) + ", run: " + run + ", auc gain: " + WS.formatLong(roc.auc()-prevAuc.get())); prevAuc.set(roc.auc()); // trainAuc.addValue(new ROC(c.fit(train).firstDensity().var(label), train.var(targetVar), label).auc()); // WS.draw(plot() // .lines(r, testAuc, color(1)) // .title("testAuc: " + WS.formatFlex(testAuc.value(testAuc.rowCount() - 1))) // ); }); c.withRuns(runs); c.train(train, targetVar); // WS.println("Confusion matrix on training data set: "); Confusion trainConfusion = new Confusion(train.var(targetVar), c.fit(train).firstClasses()); trainConfusion.printSummary(); // WS.println(); WS.println("Confusion matrix on test data set: "); Confusion testConfusion = new Confusion(test.var(targetVar), c.fit(test).firstClasses()); testConfusion.printSummary(); return new PlotRunResult(r, trainAuc, testAuc, testConfusion, trainConfusion); } public static class PlotRunResult { public final Var runs; public final Var trainAcc; public final Var testAcc; public final Confusion testConfusion; public final Confusion trainConfusion; public PlotRunResult(Var runs, Var trainAcc, Var testAcc, Confusion testConfusion, Confusion trainConfusion) { this.runs = runs; this.trainAcc = trainAcc; this.testAcc = testAcc; this.testConfusion = testConfusion; this.trainConfusion = trainConfusion; } } }