package org.talend.dataquality.datamasking.shuffling;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Exercises {@link ShufflingService} on a 1000-row fixture with a separation size of 1000 and checks
 * the shuffled output partition by partition.
 */
public class ShuffleColumnWithPartitionTest {

    /** Name of the CSV fixture loaded through {@code GenerateData.getTableValue}. */
    private static final String FILE_1000 = "Shuffling_test_data_1000.csv";

    // Column positions; they follow the order of the names in allColumns.
    private static final int ID_COLUMN = 0;

    private static final int FIRST_NAME_COLUMN = 1;

    private static final int EMAIL_COLUMN = 3;

    private static final int CITY_COLUMN = 6;

    private static final int ZIP_CODE_COLUMN = 7;

    /** Column names passed to the service as its third constructor argument (filled in prepareData). */
    private static final List<String> group = new ArrayList<String>();

    /** Column groups passed to the service as its first constructor argument (filled in prepareData). */
    private static final List<List<String>> numColumn = new ArrayList<List<String>>();

    /** Every column name of the fixture, in file order. */
    private static final List<String> allColumns = Arrays.asList("id", "first_name", "last_name", "email", "gender",
            "birth", "city", "zip_code", "country");

    private static final GenerateData generator = new GenerateData();

    /**
     * Fills the shared configuration once for the whole class: the grouping columns (city, zip_code,
     * country) and the two column groups ({id, first_name} and {email}).
     */
    @BeforeClass
    public static void prepareData() {
        group.add("city");
        group.add("zip_code");
        group.add("country");

        numColumn.add(Arrays.asList("id", "first_name"));
        numColumn.add(Arrays.asList("email"));
    }

    /**
     * Tests by the partitions.<br>
     * <ul>
     *
     * <li>Partition runs well :
     * <ul>
     * <li>id is in the range of the partition</li>
     * <li>email's original index is in the range of the partition</li>
     * <li>the city and zip code do not move</li>
     * </ul>
     * </li>
     *
     * <li>Integration of data :
     * <ul>
     * <li>id and first name keep their original correspondence</li>
     * <li>email exists in the list</li>
     * </ul>
     * </li>
     *
     * <li>Shuffling quality :
     * <ul>
     * <li>of the id group (id and first name) and the email, at least one has changed its original
     * position (see the review note on that assertion: it is currently always satisfied)</li>
     * </ul>
     * </li>
     *
     * </ul>
     *
     * @throws InterruptedException presumably if the shuffling is interrupted while rows are processed
     */
    @Test
    public void testPartition1000() throws InterruptedException {
        List<List<Object>> fileData = generator.getTableValue(FILE_1000);
        int partition = 1000;
        Queue<List<List<Object>>> result = new ConcurrentLinkedQueue<List<List<Object>>>();

        ShufflingService service = new ShufflingService(numColumn, allColumns, group);
        ShufflingHandler handler = new ShufflingHandler(service, result);
        service.setShufflingHandler(handler);
        service.setSeperationSize(partition);

        long time1 = System.currentTimeMillis();
        service.setRows(fileData);
        long time2 = System.currentTimeMillis();
        service.setHasFinished(true);
        System.out.println("1000 line generation time " + (time2 - time1));

        Assert.assertEquals(1, result.size());

        int partitionCount = fileData.size() / partition;
        for (int i = 0; i < partitionCount; i++) {
            List<List<Object>> subRows = result.poll();
            // Fail with a readable message instead of a NullPointerException when a partition is missing.
            Assert.assertNotNull("No shuffled rows available for partition " + i, subRows);
            Assert.assertEquals(partition, subRows.size());
            assertPartitionShuffled(fileData, subRows, i, partition);
        }
    }

    /**
     * Verifies one shuffled partition against the slice of the original data it was built from.
     *
     * @param fileData the original rows of the whole file
     * @param subRows the shuffled rows of the partition under test
     * @param partitionIndex zero-based index of the partition
     * @param partition the number of rows per partition
     */
    private static void assertPartitionShuffled(List<List<Object>> fileData, List<List<Object>> subRows,
            int partitionIndex, int partition) {
        int offset = partition * partitionIndex;
        int size = subRows.size();

        // Emails of the original rows that belong to this partition, in original order.
        List<String> originalEmails = new ArrayList<String>(size);
        for (int row = 0; row < size; row++) {
            originalEmails.add(cell(fileData.get(offset + row), EMAIL_COLUMN));
        }

        for (int row = 0; row < size; row++) {
            List<Object> shuffled = subRows.get(row);

            // Partition runs well: id is in the range of the partition
            int idS = Integer.parseInt(cell(shuffled, ID_COLUMN));
            Assert.assertTrue(idS >= offset + 1);
            Assert.assertTrue(idS < offset + partition + 1);

            // Partition runs well: email's original index is in the range of the partition
            // && Integration of data: email exists in the list
            String emailS = cell(shuffled, EMAIL_COLUMN);
            Assert.assertTrue(originalEmails.contains(emailS));

            // Zero-based position, inside the partition, of the original row carrying this id
            // (the test relies on ids being numbered consecutively from 1).
            int idO = idS - offset - 1;
            List<Object> originalOfId = fileData.get(offset + idO);

            // Partition runs well: the city and zip code do not move
            Assert.assertEquals(cell(originalOfId, CITY_COLUMN), cell(shuffled, CITY_COLUMN));
            Assert.assertEquals(cell(originalOfId, ZIP_CODE_COLUMN), cell(shuffled, ZIP_CODE_COLUMN));

            // Integration of data: id and first name keep their original correspondence
            Assert.assertEquals(cell(originalOfId, FIRST_NAME_COLUMN), cell(shuffled, FIRST_NAME_COLUMN));

            // Shuffling quality: of the id group and the email, at least one has changed its
            // original position.
            // NOTE(review): idS is the 1-based id and idO its 0-based position, so "idS != idO" is
            // always true and this assertion can never fail. The documented intent looks like
            // "idO != row" (the id group moved) -- TODO confirm that the shuffler guarantees this for
            // every row before tightening the check.
            String emailO = originalEmails.get(row);
            Assert.assertTrue(idS != idO || !emailS.equals(emailO));
        }
    }

    /** Returns the string form of the value stored at {@code column} in {@code row}. */
    private static String cell(List<Object> row, int column) {
        return row.get(column).toString();
    }
}