/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.sampling;

import java.util.ArrayList;

import smile.math.Math;

/**
 * Bagging (bootstrap aggregating) improves classification by combining
 * classifiers trained on randomly generated training sets.
 *
 * @author Haifeng Li
 */
public class Bagging {
    /** The number of samples in this bag. */
    public int size;
    /**
     * Samples. The first column is the sample index and the second
     * column is the number of times that sample was drawn.
     */
    public int[][] samples;

    /**
     * Stratified sampling.
     *
     * @param k the number of classes.
     * @param y class labels.
     * @param classWeight priors of the classes. The weight of each class is
     *                    roughly the ratio of samples in that class. For
     *                    example, if there are 400 positive samples and 100
     *                    negative samples, the classWeight should be [1, 4]
     *                    (assuming label 0 is negative and label 1 is
     *                    positive).
     * @param subsample sampling rate. Samples are drawn with replacement if
     *                  it is 1.0; otherwise they are drawn without replacement.
     */
    public Bagging(int k, int[] y, int[] classWeight, double subsample) {
        int n = y.length;
        int[] sampling = new int[n];

        // Stratified sampling in case the classes are unbalanced.
        // That is, we sample each class separately.
        if (subsample == 1.0) {
            // Training samples are drawn with replacement.
            for (int l = 0; l < k; l++) {
                // Collect the indices of the samples in class l.
                int nj = 0;
                ArrayList<Integer> cj = new ArrayList<>();
                for (int i = 0; i < n; i++) {
                    if (y[i] == l) {
                        cj.add(i);
                        nj++;
                    }
                }

                // We used to do up sampling, but we switched to down sampling,
                // which seems to have better performance.
                int nl = nj / classWeight[l];
                for (int i = 0; i < nl; i++) {
                    int xi = Math.randomInt(nj);
                    sampling[cj.get(xi)] += 1;
                }
            }
        } else {
            // Training samples are drawn without replacement.
            int[] perm = new int[n];
            for (int i = 0; i < n; i++) {
                perm[i] = i;
            }
            Math.permutate(perm);

            // Count the number of samples in each class.
            int[] nc = new int[k];
            for (int i = 0; i < n; i++) {
                nc[y[i]]++;
            }

            for (int l = 0; l < k; l++) {
                // Take the first subj samples of class l in the random permutation.
                int subj = (int) Math.round(nc[l] * subsample / classWeight[l]);
                int count = 0;
                for (int i = 0; i < n && count < subj; i++) {
                    int xi = perm[i];
                    if (y[xi] == l) {
                        sampling[xi] += 1;
                        count++;
                    }
                }
            }
        }

        // Count the distinct samples and the total number of draws.
        int m = 0;
        for (int s : sampling) {
            if (s != 0) {
                m++;
                size += s;
            }
        }

        this.samples = new int[m][2];
        for (int i = 0, l = 0; i < n; i++) {
            if (sampling[i] > 0) {
                samples[l][0] = i;
                samples[l][1] = sampling[i];
                l++;
            }
        }
    }
}
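
/*
 * Usage sketch (illustrative only, not part of the original source): draw a
 * stratified bootstrap over an unbalanced two-class problem. The labels and
 * weights below are made-up values, chosen so the majority class is
 * down-sampled to the size of the minority class.
 *
 *     int[] y = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};  // 8 positive, 2 negative
 *     int[] classWeight = {1, 4};                // down-sample class 1 by 4x
 *     Bagging bag = new Bagging(2, y, classWeight, 1.0);
 *     // bag.size is the total number of draws; each row of bag.samples is
 *     // {sample index, number of times that sample was drawn}.
 *     for (int[] s : bag.samples) {
 *         System.out.println("sample " + s[0] + " drawn " + s[1] + " times");
 *     }
 */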