/*
* This file is part of ALOE.
*
* ALOE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* ALOE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with ALOE. If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2012 SCCL, University of Washington (http://depts.washington.edu/sccl)
*/
package etc.aloe.processes;
import etc.aloe.data.LabelableItem;
import java.util.ArrayList;
import java.util.List;
/**
* Procedure for separating data items into training and test sets for cross
* validation.
*
* @author Michael Brooks <mjbrooks@uw.edu>
*/
public class CrossValidationSplit<T extends LabelableItem> {
/**
* Creates the training set for one fold of a cross-validation on the
* dataset.
*
* @param instances the data to split
* @param foldIndex 0 for the first fold, 1 for the second, ...
* @param numFolds the number of folds in the cross-validation. Must be
* greater than 1.
* @return the training set
*/
public List<T> getTrainingForFold(List<T> instances, int foldIndex, int numFolds) {
int numInstForFold, first, offset;
if (numFolds < 2) {
throw new IllegalArgumentException("Number of folds must be at least 2!");
}
if (numFolds > instances.size()) {
throw new IllegalArgumentException("Can't have more folds than instances!");
}
numInstForFold = instances.size() / numFolds;
if (foldIndex < instances.size() % numFolds) {
numInstForFold++;
offset = foldIndex;
} else {
offset = instances.size() % numFolds;
}
List<T> train = new ArrayList<T>(instances.size() - numInstForFold);
first = foldIndex * (instances.size() / numFolds) + offset;
for (int i = 0; i < first; i++) {
train.add(instances.get(i));
}
for (int i = first + numInstForFold; i < instances.size(); i++) {
train.add(instances.get(i));
}
return train;
}
/**
* Creates the test set for one fold of a cross-validation on the dataset.
*
* @param instances the data to split
* @param foldIndex 0 for the first fold, 1 for the second, ...
* @param numFolds the number of folds in the cross-validation. Must be
* greater than 1.
* @return the test set as a set
*/
public List<T> getTestingForFold(List<T> instances, int foldIndex, int numFolds) {
int numInstForFold, first, offset;
if (numFolds < 2) {
throw new IllegalArgumentException("Number of folds must be at least 2!");
}
if (numFolds > instances.size()) {
throw new IllegalArgumentException("Can't have more folds than instances!");
}
numInstForFold = instances.size() / numFolds;
if (foldIndex < instances.size() % numFolds) {
numInstForFold++;
offset = foldIndex;
} else {
offset = instances.size() % numFolds;
}
List<T> test = new ArrayList<T>(numInstForFold);
first = foldIndex * (instances.size() / numFolds) + offset;
for (int i = first; i < first + numInstForFold; i++) {
test.add(instances.get(i));
}
return test;
}
}