/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.sampling;

import java.util.ArrayList;

import smile.math.Math;

/**
 * Bagging (bootstrap aggregating) improves classification by combining
 * classifiers trained on randomly generated training sets.
 *
 * @author Haifeng Li
 */
public class Bagging {
    /** The number of samples in this bag. */
    public int size;
    /**
     * Samples. The first column is the sample index and the second
     * column is the number of times that sample was drawn.
     */
    public int[][] samples;

    /**
     * Stratified sampling.
     *
     * @param k the number of classes.
     * @param y class labels.
     * @param classWeight priors of the classes. The weight of each class is
     *                    roughly the ratio of samples in that class. For
     *                    example, if there are 400 positive samples and 100
     *                    negative samples, the classWeight should be [1, 4]
     *                    (assuming label 0 is negative and label 1 is
     *                    positive).
     * @param subsample sampling rate. Samples are drawn with replacement if
     *                  it is 1.0; otherwise they are drawn without replacement.
     */
    public Bagging(int k, int[] y, int[] classWeight, double subsample) {
        int n = y.length;
        int[] sampling = new int[n];

        // Stratified sampling in case the classes are unbalanced.
        // That is, we sample each class separately.
        if (subsample == 1.0) {
            // Training samples are drawn with replacement.
            for (int l = 0; l < k; l++) {
                // Collect the indices of the samples in class l.
                int nj = 0;
                ArrayList<Integer> cj = new ArrayList<>();
                for (int i = 0; i < n; i++) {
                    if (y[i] == l) {
                        cj.add(i);
                        nj++;
                    }
                }

                // We used to do up sampling, but we switched to down sampling,
                // which seems to have better performance.
                int nl = nj / classWeight[l];
                for (int i = 0; i < nl; i++) {
                    int xi = Math.randomInt(nj);
                    sampling[cj.get(xi)] += 1;
                }
            }
        } else {
            // Training samples are drawn without replacement.
            int[] perm = new int[n];
            for (int i = 0; i < n; i++) {
                perm[i] = i;
            }
            Math.permutate(perm);

            // Count the number of samples in each class.
            int[] nc = new int[k];
            for (int i = 0; i < n; i++) {
                nc[y[i]]++;
            }

            for (int l = 0; l < k; l++) {
                // Take the first subj samples of class l in the random permutation.
                int subj = (int) Math.round(nc[l] * subsample / classWeight[l]);
                int count = 0;
                for (int i = 0; i < n && count < subj; i++) {
                    int xi = perm[i];
                    if (y[xi] == l) {
                        sampling[xi] += 1;
                        count++;
                    }
                }
            }
        }

        // Count the distinct samples and the total number of draws.
        int m = 0;
        for (int s : sampling) {
            if (s != 0) {
                m++;
                size += s;
            }
        }

        this.samples = new int[m][2];
        for (int i = 0, l = 0; i < n; i++) {
            if (sampling[i] > 0) {
                samples[l][0] = i;
                samples[l][1] = sampling[i];
                l++;
            }
        }
    }
}
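
/*
 * Usage sketch (illustrative only, not part of the original source): draw a
 * stratified bootstrap over an unbalanced two-class problem. The labels and
 * weights below are made-up values, chosen so the majority class is
 * down-sampled to the size of the minority class.
 *
 *     int[] y = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};  // 8 positive, 2 negative
 *     int[] classWeight = {1, 4};                // down-sample class 1 by 4x
 *     Bagging bag = new Bagging(2, y, classWeight, 1.0);
 *     // bag.size is the total number of draws; each row of bag.samples is
 *     // {sample index, number of times that sample was drawn}.
 *     for (int[] s : bag.samples) {
 *         System.out.println("sample " + s[0] + " drawn " + s[1] + " times");
 *     }
 */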