package org.talend.dataquality.datamasking.shuffling; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Random; /** * The class ShuffleColumn defines the basic common methods used in the "shuffling" functions. As with shuffling, this * technique is effective only on a large data set.<br> * DOC qzhao class global comment. */ public class ShuffleColumn { private static final int[] PRIME_NUMBERS = { 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069 }; private List<List<Integer>> numColumns = new ArrayList<List<Integer>>(); private List<Integer> partitionColumns = new ArrayList<Integer>(); private List<String> allInputColumns = new ArrayList<String>(); private Random random = new Random(); /** * Constructor without the partition choice * * @param shuffledColumns the 2D list of shuffled columns * @param allInputColumns the list of all input columns name */ public ShuffleColumn(List<List<String>> shuffledColumns, List<String> allInputColumns) { this.allInputColumns = allInputColumns; this.numColumns = getNumColumn(shuffledColumns); } /** * ShuffleColumn constructor comment. * * @param shuffledColumns the 2D list of shuffled columns * @param allInputColumns the list of all input columns name * @param partitionColumns the partitioned columns names */ public ShuffleColumn(List<List<String>> shuffledColumns, List<String> allInputColumns, List<String> partitionColumns) { this.allInputColumns = allInputColumns; this.numColumns = getNumColumn(shuffledColumns); this.partitionColumns = partitionColumns == null ? null : getPartitionIndex(partitionColumns); } /** * Comment method "shuffle". * * @param rows a list of partitioned rows */ public void shuffle(List<List<Object>> rows) { if (partitionColumns == null || partitionColumns.isEmpty()) { shuffleTable(rows); } else { shuffleColumnWithPartition(rows); } } private List<List<Integer>> getNumColumn(List<List<String>> shuffledColumns) { if (shuffledColumns.isEmpty() || shuffledColumns == null) throw new IllegalArgumentException("At least one column name should be given"); List<Integer> noDuplicated = new ArrayList<Integer>(); for (List<String> subList : shuffledColumns) { List<Integer> indexes = new ArrayList<Integer>(); for (int i = 0; i < subList.size(); i++) { int index = allInputColumns.indexOf(subList.get(i)); if (index != -1 && !noDuplicated.contains(index)) { indexes.add(index); noDuplicated.add(index); } else if (index != -1) { throw new IllegalArgumentException( "At least one column name in the shuffled columns does not match the input column names"); } else if (noDuplicated.contains(index)) { throw new IllegalArgumentException("One column can be only set in one shuffling group"); } } numColumns.add(indexes); } return numColumns; } private List<Integer> getPartitionIndex(List<String> partitionColumns) { List<Integer> list = new ArrayList<Integer>(); for (int i = 0; i < partitionColumns.size(); i++) { if (allInputColumns.contains(partitionColumns.get(i))) { int index = allInputColumns.indexOf(partitionColumns.get(i)); if (list.contains(index)) throw new IllegalArgumentException("Partitioning column should be set once"); else list.add(allInputColumns.indexOf(partitionColumns.get(i))); } else { throw new IllegalArgumentException( "At least one column name in the partition columns does not match the input column names"); } } return list; } /** * This methods shuffles the input 2D list by the give columns number.<br> * * The row indexes shift back by a random number between 1 and the input 2D list size one column by one column. Then * we find a prime number bigger than the row number. * * @param rowList a list of partitioned objects */ protected void shuffleTable(List<List<Object>> rowList) { List<Row> rows = generateRows(rowList, null); processShuffleTable(rowList, rows); rows.clear(); } private void processShuffleTable(List<List<Object>> rowList, List<Row> rows) { int size = rows.size(); List<Integer> replacements = calculateReplacementInteger(size, getPrimeNumber(size)); List<Integer> shifts = new ArrayList<Integer>(); if (numColumns.size() == 1) { adjustReplacements(replacements); for (int row = 0; row < size; row++) { for (int column : numColumns.get(0)) { rowList.get(rows.get(row).rIndex).set(column, rows.get(replacements.get(row)).rItems.get(column)); } } } else { for (int group = 0; group < numColumns.size(); group++) { int shift = getShift(shifts, size); shifts.add(shift); for (int row = 0; row < size; row++) { int replacement = replacements.get((row + shift) % size); List<Object> aux = rowList.get(rows.get(row).rIndex); List<Object> auxRow = rows.get(replacement).rItems; for (int column : numColumns.get(group)) { aux.set(column, auxRow.get(column)); } } } } } private void adjustReplacements(List<Integer> replacements) { for (int i = 0; i < replacements.size(); i++) { if (i == replacements.get(i)) { if (i != replacements.size() - 1) { replacements.set(i, replacements.get(i + 1)); replacements.set(i + 1, i); } else { replacements.set(i, replacements.get(i - 1)); replacements.set(i - 1, i); } } } } /** * Gets the shift of row index. Generally, the values in the shifts list should be unique and inferior than integer. * But when the integer is smaller than shifts' size, the method cannot guarantee the unique value in the shifts * list, which means that the shift list has the at least one value exits more than one times. * * @param shifts * @param integer * @return */ private int getShift(List<Integer> shifts, int integer) { int shift = 0; if (shifts.size() >= integer) { return random.nextInt(integer); } do { shift = random.nextInt(integer); } while (shifts.contains(shift)); return shift; } /** * * Shuffles the columns by a given group<br> * * @param rowList input table value */ protected void shuffleColumnWithPartition(List<List<Object>> rowList) { List<Row> rows = generateRows(rowList, partitionColumns); Collections.sort(rows); List<List<Row>> subRows = seperateRowsByPartition(rows); for (List<Row> subRow : subRows) { if (subRow.size() != 1) { processShuffleTable(rowList, subRow); } } } /** * Separates the list of Row object by the same group. Tow pointers are needed. The first pointer points to the * first line of the partition, then the second pointer slips down until the first line who has the different value * with the first pointer. * * @param rows the list of rows to be separated * @return a list of separated list */ private List<List<Row>> seperateRowsByPartition(List<Row> rows) { List<List<Row>> subRows = new ArrayList<List<Row>>(); int i = 0; int j = 1; do { List<Object> compared = rows.get(i).rPartition; do { List<Object> comparing = rows.get(j).rPartition; for (int k = 0; k < compared.size(); k++) { if (!compared.get(k).equals(comparing.get(k))) { subRows.add(rows.subList(i, j)); i = j; break; } } j++; if (j == rows.size()) { subRows.add(rows.subList(i, j)); i = j - 1; } } while (i != (j - 1)); } while (i != (rows.size() - 1)); return subRows; } /** * * Sets the random seed. * * @param seed a long number */ public void setRandomSeed(long seed) { this.random.setSeed(seed); } /** * Gets a prime number, prime with size. * * @param size * @return a prime number with size */ protected int getPrimeNumber(int size) { int res; do { res = PRIME_NUMBERS[random.nextInt(PRIME_NUMBERS.length)]; } while (size % res == 0); return res; } /** * This methods calculates the replaced index.<br> * The replaced index is calculated by the equation (original_index * prime_number) modulo (input_size)<br> * * @param size the input size * @param prime the prime number * @return a list of replacements */ protected List<Integer> calculateReplacementInteger(int size, int prime) { List<Integer> list = new ArrayList<Integer>(); for (int i = 0; i < size; i++) { long aux = ((i + 1L) % size) * (prime % size); list.add((int) (aux % size)); } return list; } /** * Generates a list of Row with input row values and saves the information of group. * * @param input the 2D arrays to be cloned * @param columnsPartition a list of grouped columns' indexes * @return a list of Row object */ protected List<Row> generateRows(List<List<Object>> input, List<Integer> columnsPartition) { List<Row> rows = new ArrayList<Row>(); int rIndex = 0; for (List<Object> subInput : input) { List<Object> partition = new ArrayList<Object>(); if (columnsPartition != null) { for (int cIndex : columnsPartition) { partition.add(subInput.get(cIndex)); } } rows.add(new Row(rIndex, subInput, partition)); rIndex++; } return rows; } /** * This class abstracts a Row with its index and the group items. It implements {@link Comparable} interface to * compare the value by the group items.<br> * DOC qzhao ShuffleColumnWithPartition class global comment. Detailled comment */ class Row implements Comparable<Row> { int rIndex; List<Object> rPartition = new ArrayList<Object>(); List<Object> rItems = new ArrayList<Object>(); public Row(int rIndex, List<Object> rItems, List<Object> rGroup) { super(); this.rIndex = rIndex; for (Object o : rItems) { this.rItems.add(o); } if (rGroup == null) { this.rPartition = null; } else { for (Object o : rGroup) { this.rPartition.add(o); } } } @Override public String toString() { return "( " + rIndex + " " + " rItems " + rItems + " rGroup " + rPartition + " )"; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + getOuterType().hashCode(); result = prime * result + ((rPartition == null) ? 0 : rPartition.hashCode()); result = prime * result + rIndex; result = prime * result + ((rItems == null) ? 0 : rItems.hashCode()); return result; } @Override public boolean equals(Object o) { if (!(o instanceof Row)) { return false; } Row r = (Row) o; if (r.rIndex != rIndex || r.rPartition.size() != rPartition.size()) { return false; } for (int i = 0; i < rPartition.size(); i++) { if (!rPartition.get(i).equals(r.rPartition.get(i))) { return false; } } return true; } @Override public int compareTo(Row r) { int limit = Math.min(rPartition.size(), r.rPartition.size()); int cmp = Integer.MIN_VALUE; for (int i = 0; i < limit; i++) { cmp = ((String) rPartition.get(i)).compareTo((String) r.rPartition.get(i)); if (cmp != 0) { return cmp; } } return cmp; } Object getItem(int index) { return rItems.get(index); } private ShuffleColumn getOuterType() { return ShuffleColumn.this; } } }