package com.skp.experiment.cf.als.hadoop;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.RandomUtils;
/**
 * Static helpers for k-fold cross validation over lists of string records:
 * an in-place uniform shuffle and a splitter that carves the n-th fold out
 * as the probe set while the remaining records form the training set.
 */
public class KFoldCrossValidationUtils {

  /** Random source shared by all shuffles; obtained from Mahout's {@code RandomUtils}. */
  private static final Random random = RandomUtils.getRandom();

  /**
   * Shuffles {@code values} in place with a Fisher-Yates shuffle.
   *
   * <p>The loop performs one random draw and one swap per element, so it is
   * O(N) as long as {@code get}/{@code set} are constant time, which is why
   * an {@link ArrayList} is required.
   *
   * @param values list to shuffle; modified in place
   */
  public static void randomSuffleInPlace(ArrayList<String> values) {
    // The unshuffled prefix is [0, lastIndex). Each pass moves one randomly
    // chosen element of the prefix into its final slot (lastIndex - 1) and
    // then shrinks the prefix by one.
    for (int lastIndex = values.size(); lastIndex > 0; lastIndex--) {
      int current = random.nextInt(lastIndex);
      String tmp = values.get(lastIndex - 1);
      values.set(lastIndex - 1, values.get(current));
      values.set(current, tmp);
    }
  }

  /**
   * Splits {@code values} into a training set and a probe set for fold
   * {@code nth} of a {@code k}-fold cross validation.
   *
   * <p>Fold {@code nth} covers the index range
   * {@code [floor(nth * size / k), floor((nth + 1) * size / k))}. Over
   * {@code nth = 0 .. k-1} these ranges partition the whole list and their
   * sizes differ by at most one, so no probe set is empty unless
   * {@code values.size() < k}. The input list is not modified and element
   * order is preserved in both result lists. The list is traversed once
   * with its iterator, so the split is O(N) for any {@link List}
   * implementation.
   *
   * @param values records to split
   * @param k      total number of folds; must be positive
   * @param nth    zero-based index of the fold used as probe set; must be in {@code [0, k)}
   * @return a pair whose first element is the training set and whose second
   *         element is the probe set
   * @throws IllegalArgumentException if {@code k <= 0} or {@code nth} is outside {@code [0, k)}
   */
  public static Pair<List<String>, List<String>> splitNth(List<String> values, int k, int nth) {
    if (k <= 0) {
      throw new IllegalArgumentException("k must be positive, but was " + k);
    }
    if (nth < 0 || nth >= k) {
      throw new IllegalArgumentException("nth must be in [0, " + k + "), but was " + nth);
    }

    int size = values.size();
    // long arithmetic: nth * size can exceed the int range for large inputs.
    int startIdx = (int) ((long) nth * size / k);
    int endIdx = (int) ((nth + 1L) * size / k);
    int probeSize = endIdx - startIdx;

    List<String> trainingSet = new ArrayList<String>(size - probeSize);
    List<String> probeSet = new ArrayList<String>(probeSize);

    int i = 0;
    for (String value : values) {
      if (i >= startIdx && i < endIdx) {
        // inside the n-th window: goes to the probe set
        probeSet.add(value);
      } else {
        // everything else goes to the training set
        trainingSet.add(value);
      }
      i++;
    }
    return new Pair<List<String>, List<String>>(trainingSet, probeSet);
  }
}