package org.talend.dataquality.datamasking.shuffling;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
/**
 * Tests for column shuffling performed by {@link ShufflingService} together with
 * {@link ShufflingHandler}.
 * <p>
 * Every test feeds rows into a service, marks the input as finished and then inspects the groups
 * of rows that were delivered to the result queue handed to the handler.
 */
public class ShuffleColumnTest {

    /** Position of the "id" column in {@link #allColumns}. */
    private static final int ID_INDEX = 0;

    /** Position of the "first_name" column in {@link #allColumns}. */
    private static final int FIRST_NAME_INDEX = 1;

    /** Position of the "email" column in {@link #allColumns}. */
    private static final int EMAIL_INDEX = 3;

    private String file5000 = "Shuffling_test_data_5000.csv";

    private String file50000 = "Shuffling_test_data_50000.csv";

    private String file1000Compared = "Shuffling_test_data_1000 _result.csv";

    // Filled in generateData() but not read by any test of this class.
    private static List<Integer> data = new ArrayList<Integer>();

    private static GenerateData generation = new GenerateData();

    /** Groups of column names passed to the service as the columns to shuffle. */
    private static List<List<String>> columns = new ArrayList<List<String>>();

    /** All column names of the CSV test files, in file order. */
    private static List<String> allColumns = Arrays
            .asList(new String[] { "id", "first_name", "last_name", "email", "gender", "birth", "city", "zip_code", "country" });

    /**
     * Populates the shared fixtures once for the whole class: the (unused) integer list and the
     * three column groups {id, first_name}, {email} and {city, zip_code}.
     */
    @BeforeClass
    public static void generateData() {
        for (int i = 0; i < 14; i++) {
            data.add(i);
        }
        columns.add(Arrays.asList("id", "first_name"));
        columns.add(Arrays.asList("email"));
        columns.add(Arrays.asList("city", "zip_code"));
    }

    /**
     * Shuffles the demo file with a fixed seed and a separation size of 10, then compares the first
     * three columns of the first two delivered groups with the reference file.
     */
    @Test
    public void testBufferDemo() {
        String file = "demo_test.csv";
        String fileCompared = "demo_test _result.csv";
        int separationSize = 10;

        List<List<String>> shuffledColumns = new ArrayList<List<String>>();
        shuffledColumns.add(Arrays.asList("id"));
        shuffledColumns.add(Arrays.asList("fn", "ln"));
        List<String> demoColumns = Arrays.asList("id", "fn", "ln", "City", "Addr", "Country");

        Queue<List<List<Object>>> result = new ConcurrentLinkedQueue<List<List<Object>>>();
        ShufflingService service = new ShufflingService(shuffledColumns, demoColumns);
        attachHandler(service, result, separationSize);
        service.setRandomSeed(77);

        List<List<Object>> fileData = generation.getTableValue(file);
        service.setRows(fileData);
        service.setHasFinished(true);

        List<List<Object>> fileDataCompared = generation.getTableValue(fileCompared);
        for (int i = 0; i < 2; i++) {
            // poll() (not peek()) so that the second iteration really looks at the second group.
            List<List<Object>> rows = result.poll();
            Assert.assertNotNull("expected at least 2 shuffled groups, missing group " + i, rows);
            for (int j = 0; j < rows.size(); j++) {
                List<Object> shuffled = rows.get(j);
                List<Object> expected = fileDataCompared.get(i * separationSize + j);
                for (int k = 0; k < 3; k++) {
                    assertEquals(expected.get(k).toString().trim(), shuffled.get(k).toString());
                }
            }
        }
    }

    /**
     * Checks that {@code ((i + 1) * prime) % size} never maps an index onto itself and never
     * becomes negative for any {@code i} in {@code [0, size)}.
     */
    @Test
    public void testReplacementBigInteger() {
        int size = 23000000;
        int prime = 198491329;
        // i is a long so that (i + 1) * prime is computed in 64 bits.
        for (long i = 0; i < size; i++) {
            int result = (int) (((i + 1) * prime) % size);
            if (result == i || result < 0) {
                fail("index " + i + " is mapped to an identical or negative position: " + result);
            }
        }
    }

    /**
     * Shuffles 1,000,000 sequential ids with a separation size of 10,000 and checks every
     * delivered group: there must be exactly {@code size / partition} groups and each value must
     * stay inside the range of its own group.
     */
    @Test
    public void testOneColumnBigInteger() {
        int partition = 10000;
        int size = 1000000;

        Queue<List<List<Object>>> result = shuffleSequentialIds(size, partition);

        // Fixed bound: result.size() shrinks with every poll() and would skip half of the groups.
        int groupCount = size / partition;
        Assert.assertEquals(groupCount, result.size());
        for (int i = 0; i < groupCount; i++) {
            assertGroupWithinRange(result.poll(), partition * i, partition * (i + 1));
        }
    }

    /**
     * Same as {@link #testOneColumnBigInteger()} but with a row count that is not a multiple of
     * the separation size, so that one additional, smaller group is expected at the end.
     */
    @Test
    @Ignore
    public void testOneColumnBigIntegerHasModulo() {
        int partition = 100000;
        int size = 10000999;

        Queue<List<List<Object>>> result = shuffleSequentialIds(size, partition);

        int fullGroups = size / partition;
        Assert.assertEquals(fullGroups + 1, result.size());
        for (int i = 0; i < fullGroups; i++) {
            assertGroupWithinRange(result.poll(), partition * i, partition * (i + 1));
        }
        // The remaining rows form the last group.
        assertGroupWithinRange(result.poll(), partition * fullGroups, size);
    }

    /**
     * Shuffles the 1000-line file with a fixed seed, compares the first four columns of every
     * shuffled row with the reference file and verifies the shuffle invariants.
     */
    @Test
    public void testshuffleColumnsData1000() throws InterruptedException {
        Queue<List<List<Object>>> result = new ConcurrentLinkedQueue<List<List<Object>>>();
        ShufflingService service = new ShufflingService(columns, allColumns);
        service.setRandomSeed(77);
        attachHandler(service, result, 100000);

        List<List<Object>> fileData = generation.getTableValue(GenerateData.SHUFFLING_DATA_PATH);
        List<List<Object>> fileDataCompared = generation.getTableValue(file1000Compared);
        long time1 = System.currentTimeMillis();
        service.setRows(fileData);
        long time2 = System.currentTimeMillis();
        service.setHasFinished(true);
        System.out.println("1000 line generation time " + (time2 - time1));
        Assert.assertEquals(1, result.size());

        List<List<Object>> shuffledRows = drainRows(result);
        // Compare the shuffling results' positions with the reference file.
        for (int i = 0; i < shuffledRows.size(); i++) {
            List<Object> shuffled = shuffledRows.get(i);
            List<Object> expected = fileDataCompared.get(i);
            for (int j = 0; j < 4; j++) {
                assertEquals(expected.get(j).toString().trim(), shuffled.get(j).toString());
            }
        }
        assertShuffleInvariants(fileData, shuffledRows);
    }

    /**
     * Shuffles the 5000-line file (no fixed seed) and verifies the shuffle invariants.
     */
    @Test
    public void testshuffleColumnsData5000() throws InterruptedException {
        Queue<List<List<Object>>> result = new ConcurrentLinkedQueue<List<List<Object>>>();
        ShufflingService service = new ShufflingService(columns, allColumns);
        attachHandler(service, result, 100000);

        List<List<Object>> fileData = generation.getTableValue(file5000);
        long time1 = System.currentTimeMillis();
        service.setRows(fileData);
        long time2 = System.currentTimeMillis();
        service.setHasFinished(true);
        // NOTE(review): pause kept from the original; the other tests read the queue without
        // waiting - confirm whether it is still required.
        Thread.sleep(100);
        System.out.println("5000 line generation time " + (time2 - time1));
        Assert.assertEquals(1, result.size());

        assertShuffleInvariants(fileData, drainRows(result));
    }

    /**
     * Shuffles the 50000-line file (no fixed seed), verifies the shuffle invariants and prints how
     * long generation and verification took.
     */
    @Test
    @Ignore
    public void testshuffleColumnsData50000() {
        Queue<List<List<Object>>> result = new ConcurrentLinkedQueue<List<List<Object>>>();
        ShufflingService service = new ShufflingService(columns, allColumns);
        attachHandler(service, result, 100000);

        List<List<Object>> fileData = generation.getTableValue(file50000);
        long time1 = System.currentTimeMillis();
        service.setRows(fileData);
        long time2 = System.currentTimeMillis();
        service.setHasFinished(true);
        System.out.println("50000 line generation time " + (time2 - time1));
        Assert.assertEquals(1, result.size());
        System.out.println("result size " + result.size());

        long time3 = System.currentTimeMillis();
        assertShuffleInvariants(fileData, drainRows(result));
        long time4 = System.currentTimeMillis();
        System.out.println("50000 line verification time " + (time4 - time3));
    }

    /**
     * Creates a handler writing into {@code result}, registers it on {@code service} and sets the
     * separation size, in that order.
     *
     * @param service the service to configure
     * @param result the queue handed to the handler
     * @param separationSize the value passed to {@code setSeperationSize}
     */
    private static void attachHandler(ShufflingService service, Queue<List<List<Object>>> result, int separationSize) {
        ShufflingHandler handler = new ShufflingHandler(service, result);
        service.setShufflingHandler(handler);
        service.setSeperationSize(separationSize);
    }

    /**
     * Feeds the ids {@code "0"} to {@code "size - 1"}, one single-column row at a time, into a
     * service that shuffles the "id" column, then marks the input as finished.
     *
     * @param size the number of rows to add
     * @param partition the separation size configured on the service
     * @return the queue that was handed to the handler
     */
    private static Queue<List<List<Object>>> shuffleSequentialIds(int size, int partition) {
        List<String> idColumn = new ArrayList<String>();
        idColumn.add("id");
        List<List<String>> shuffledColumns = new ArrayList<List<String>>();
        shuffledColumns.add(idColumn);

        Queue<List<List<Object>>> result = new ConcurrentLinkedQueue<List<List<Object>>>();
        ShufflingService service = new ShufflingService(shuffledColumns, idColumn);
        attachHandler(service, result, partition);
        for (int i = 0; i < size; i++) {
            List<Object> row = Arrays.asList((Object) (i + ""));
            service.addOneRow(row);
        }
        service.setHasFinished(true);
        return result;
    }

    /**
     * Asserts that the first column of every row parses to an integer inside
     * {@code [lowInclusive, highExclusive)} and differs from the row's position in the group.
     *
     * @param rows one group taken from the result queue; must not be {@code null}
     * @param lowInclusive smallest value allowed in the group
     * @param highExclusive first value that is no longer allowed in the group
     */
    private static void assertGroupWithinRange(List<List<Object>> rows, int lowInclusive, int highExclusive) {
        Assert.assertNotNull("missing shuffled group for range [" + lowInclusive + ", " + highExclusive + ")", rows);
        for (int position = 0; position < rows.size(); position++) {
            int item = Integer.parseInt(rows.get(position).get(0).toString());
            // the partition is good
            Assert.assertTrue(item < highExclusive);
            Assert.assertTrue(item >= lowInclusive);
            // the position changes
            // NOTE(review): position is relative to the group, so this can only fail for the group
            // starting at 0; kept exactly as in the original checks.
            Assert.assertTrue(item != position);
        }
    }

    /**
     * Removes every group from the queue and returns their rows as one list, in delivery order.
     *
     * @param result the queue filled by the handler; empty afterwards
     * @return all rows of all groups
     */
    private static List<List<Object>> drainRows(Queue<List<List<Object>>> result) {
        List<List<Object>> allRows = new ArrayList<List<Object>>();
        List<List<Object>> group;
        // Drain until empty: a loop bounded by result.size() would stop half way through.
        while ((group = result.poll()) != null) {
            allRows.addAll(group);
        }
        return allRows;
    }

    /**
     * Collects the values found at {@code index} in every row.
     *
     * @param rows the table to read
     * @param index the column position
     * @return the column values, in row order
     */
    private static List<Object> columnValues(List<List<Object>> rows, int index) {
        List<Object> values = new ArrayList<Object>(rows.size());
        for (List<Object> row : rows) {
            values.add(row.get(index));
        }
        return values;
    }

    /**
     * Verifies the relation between the original rows and the shuffled rows:
     * <ul>
     * <li>both tables have the same number of rows,</li>
     * <li>every original email and first name is still present in the shuffled table,</li>
     * <li>at each position the id or the email differs from the original row,</li>
     * <li>every original id is still present and still paired with its original first name.</li>
     * </ul>
     *
     * @param original the rows that were given to the service
     * @param shuffled the rows delivered by the handler
     */
    private static void assertShuffleInvariants(List<List<Object>> original, List<List<Object>> shuffled) {
        assertEquals("shuffling must keep the number of rows", original.size(), shuffled.size());

        List<Object> originalIds = columnValues(original, ID_INDEX);
        List<Object> originalFirstNames = columnValues(original, FIRST_NAME_INDEX);
        List<Object> originalEmails = columnValues(original, EMAIL_INDEX);
        List<Object> shuffledIds = columnValues(shuffled, ID_INDEX);
        List<Object> shuffledFirstNames = columnValues(shuffled, FIRST_NAME_INDEX);
        List<Object> shuffledEmails = columnValues(shuffled, EMAIL_INDEX);

        for (int i = 0; i < original.size(); i++) {
            Object originalId = originalIds.get(i);
            Object originalFirstName = originalFirstNames.get(i);
            Object originalEmail = originalEmails.get(i);

            // test whether all email addresses are retained
            Assert.assertTrue("email lost by shuffling: " + originalEmail, shuffledEmails.contains(originalEmail));
            // test whether all names are retained (looked up from the ORIGINAL column; the former
            // check compared the shuffled column with itself and could never fail)
            Assert.assertTrue("first name lost by shuffling: " + originalFirstName,
                    shuffledFirstNames.contains(originalFirstName));

            // test whether the row changed: the id or the email must differ at this position
            // NOTE(review): the original comment says "have all changed" but the check uses "||";
            // the weaker check is kept - confirm the guarantee the service gives.
            Assert.assertTrue("row " + i + " kept both its id and its email",
                    !originalId.equals(shuffledIds.get(i)) || !originalEmail.equals(shuffledEmails.get(i)));

            // test whether the id and first name's relation is retained
            int shuffledIdIndex = shuffledIds.indexOf(originalId);
            Assert.assertTrue("id lost by shuffling: " + originalId, shuffledIdIndex >= 0);
            assertEquals("first name no longer attached to id " + originalId, originalFirstName,
                    shuffledFirstNames.get(shuffledIdIndex));
        }
    }
}