package hex.createframe.postprocess;
import hex.createframe.CreateFramePostprocessStep;
import water.DKV;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.ArrayUtils;
import java.util.HashMap;
import java.util.Random;
/**
 * Action to shuffle the columns of the frame.
 *
 * <p>Optionally the response column (a column named "response" or "Response") is moved to the front of the
 * frame, and optionally the shuffled columns are renamed so that their names do not reveal the original order.
 */
public class ShuffleColumnsCfps extends CreateFramePostprocessStep {
  private boolean reassignNames;
  private boolean responseFirst;

  public ShuffleColumnsCfps() {}

  /**
   * @param reassignNames If true, the columns will be renamed within each group starting with a common alpha-prefix.
   *                      I.e. if the original frame had columns [A1, A3, B2, B5] then after shuffling it may look
   *                      like this: [B1, A1, A2, B2]. In this new frame column "A1" may have been either "A1" or
   *                      "A3" in the original frame. The response column is never renamed.
   *                      If false, each vec will keep its name.
   * @param responseFirst If true, the response column ("response", or "Response" if the former is absent) will be
   *                      moved to the beginning of the frame. Otherwise it will be shuffled together with the rest
   *                      of the columns.
   */
  public ShuffleColumnsCfps(boolean reassignNames, boolean responseFirst) {
    this.reassignNames = reassignNames;
    this.responseFirst = responseFirst;
  }

  /**
   * Shuffles the columns of {@code fr} in place and stores the restructured frame with {@code DKV.put}.
   * A frame with no columns is left untouched.
   *
   * @param fr  frame whose columns are shuffled
   * @param rng random number generator passed to the shuffling routine
   */
  @Override
  public void exec(Frame fr, Random rng) {
    // Initial shuffle: idx[i] is the original position of the column that ends up at position i
    int numCols = fr.numCols();
    if (numCols == 0) {
      return;
    }
    int[] idx = ArrayUtils.seq(0, numCols);
    ArrayUtils.shuffleArray(idx, rng);

    // Move the response column to the beginning of the frame
    if (responseFirst) {
      int responseIndex = ArrayUtils.find(fr.names(), "response");
      if (responseIndex == -1) {
        responseIndex = ArrayUtils.find(fr.names(), "Response");
      }
      if (responseIndex >= 0) {
        // Swap the response with whatever column the shuffle placed first, so idx remains a permutation
        int shuffledIndex = ArrayUtils.find(idx, responseIndex);
        idx[shuffledIndex] = idx[0];
        idx[0] = responseIndex;
      }
    }

    // Construct shuffled arrays of names and vecs
    Vec[] newVecs = new Vec[numCols];
    String[] newNames = new String[numCols];
    for (int i = 0; i < numCols; ++i) {
      newVecs[i] = fr.vec(idx[i]);
      newNames[i] = fr.name(idx[i]);
    }

    // Rename columns in order to hide the fact that they were shuffled
    if (reassignNames) {
      HashMap<String, Integer> prefixCounts = new HashMap<>();
      for (int i = 0; i < numCols; i++) {
        String prefix = removeNumericSuffix(newNames[i]);
        Integer seen = prefixCounts.get(prefix);
        int count = (seen == null) ? 1 : seen + 1;
        // The counter advances even for the response column, which itself keeps its name
        prefixCounts.put(prefix, count);
        if (!isResponseName(newNames[i])) {
          newNames[i] = prefix + count;
        }
      }
    }

    // Reshape the original dataframe
    fr.restructure(newNames, newVecs);
    DKV.put(fr);
  }

  /**
   * Tells whether {@code name} is one of the column names that {@link #exec} treats as the response column.
   * Keeping this in sync with the lookup in {@code exec} guarantees that a response column moved to the front
   * is not renamed afterwards.
   */
  private static boolean isResponseName(String name) {
    return "response".equals(name) || "Response".equals(name);
  }

  /**
   * Helper function which strips the provided name from any numeric suffix in the end.
   * Equivalent to <code>name.rstrip("0123456789")</code> in Python.
   *
   * @param name the name to strip; only the ASCII digits '0'..'9' are removed
   * @return {@code name} without its trailing digits; the empty string if {@code name} consists of digits only
   */
  public static String removeNumericSuffix(String name) {
    int i = name.length();
    while (--i >= 0) {
      char ch = name.charAt(i);
      if (ch < '0' || ch > '9') {
        break;
      }
    }
    // i is now the index of the last non-digit character, or -1 if there is none
    return name.substring(0, i + 1);
  }
}