// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.duplicating;
import static org.junit.Assert.assertEquals;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.junit.Test;
/**
 * Tests the record counts produced by {@link AbstractDuplicator} for each supported distribution name.
 * <p>
 * The row1Struct and row2Struct classes simulate the usage in components.
 */
public class AbstractDuplicatorTest {

    private static final String CITY_NAME = "CITY_NAME"; //$NON-NLS-1$

    private static final double grp_size_expectation = 5;

    private static final double duplicate_percentage = 0.7;

    private static final String[] distroNames = { "BERNOULLI", "POISSON", "GEOMETRIC" }; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$

    /** Number of input records fed to a duplicator by {@link #getCountResult(AbstractDuplicator)}. */
    private static final int original_count = 1000;

    /** Expected value of result[0]; asserted to be the same for every distribution. */
    private static final int expectedDupCount = 334;

    /** Expected value of result[1], indexed like {@link #distroNames}. */
    private static final int[] expectedDupCountSum = { 1693, 1680, 1706 };

    /** Expected value of result[2], indexed like {@link #distroNames}. */
    private static final int[] expectedCountSum = { 2359, 2346, 2372 };

    /**
     * Test the record count after duplication for the three types of distributions with a fixed random seed.
     * <p>
     * The row1Struct and row2Struct classes simulate the usage in components.
     */
    @Test
    public void testDistro() {
        for (int i = 0; i < distroNames.length; i++) {
            AbstractDuplicator<row1Struct, row2Struct> duplicator = newSeededDuplicator(distroNames[i],
                    AllDataqualitySamplingTests.RANDOM_SEED);
            int[] result = getCountResult(duplicator);
            assertEquals(expectedDupCount, result[0]);
            assertEquals(expectedDupCountSum[i], result[1]);
            assertEquals(expectedCountSum[i], result[2]);
        }
    }

    /**
     * Test that a run without an explicit seed can be reproduced.
     * <p>
     * For each distribution, a duplicator is created without a seed and run; the seed is then read back from its
     * random generator and used to create a second duplicator, whose counts must equal those of the first run.
     */
    @Test
    public void testDistroWithoutSeed() {
        for (int i = 0; i < distroNames.length; i++) {
            AbstractDuplicator<row1Struct, row2Struct> duplicator = newUnseededDuplicator(distroNames[i]);
            int[] result = getCountResult(duplicator);

            // Recover the seed the first duplicator ended up using so that the run can be replayed.
            Random random = duplicator.getRandom();
            long seed = ((RandomWrapper) random).getSeed();

            AbstractDuplicator<row1Struct, row2Struct> duplicator2 = newSeededDuplicator(distroNames[i], seed);
            int[] result2 = getCountResult(duplicator2);
            assertEquals(result[0], result2[0]);
            assertEquals(result[1], result2[1]);
            assertEquals(result[2], result2[2]);
        }
    }

    /**
     * Feeds {@link #original_count} records (ids 1..original_count, all with the same city) through the duplicator
     * and summarizes the sizes of the returned lists.
     *
     * @param duplicator the duplicator under test
     * @return a three-element array: [0] the number of inputs whose result list size is not 1, [1] the sum of the
     * result list sizes for those inputs, [2] the sum of all result list sizes
     */
    private int[] getCountResult(AbstractDuplicator<row1Struct, row2Struct> duplicator) {
        row1Struct[] testers = new row1Struct[original_count];
        for (int j = 0; j < original_count; j++) {
            row1Struct struct = new row1Struct();
            struct.id = j + 1;
            struct.city = CITY_NAME;
            testers[j] = struct;
        }

        int countSum = 0;
        int dupCount = 0;
        int dupCountSum = 0;
        for (row1Struct tester : testers) {
            List<row2Struct> res = duplicator.process(tester);
            int size = res.size();
            // A single-row result contributes exactly 1, so every result adds its size to the total.
            countSum += size;
            if (size != 1) {
                dupCount++;
                dupCountSum += size;
            }
        }
        return new int[] { dupCount, dupCountSum, countSum };
    }

    /**
     * Creates a duplicator for the given distribution using an explicit random seed.
     */
    private AbstractDuplicator<row1Struct, row2Struct> newSeededDuplicator(String distroName, long seed) {
        return new AbstractDuplicator<row1Struct, row2Struct>(grp_size_expectation, duplicate_percentage, distroName,
                seed) {

            @Override
            protected row2Struct generateOutput(row1Struct v, boolean isOriginal) {
                return toOutput(v, isOriginal);
            }
        };
    }

    /**
     * Creates a duplicator for the given distribution without passing a seed to the constructor.
     */
    private AbstractDuplicator<row1Struct, row2Struct> newUnseededDuplicator(String distroName) {
        return new AbstractDuplicator<row1Struct, row2Struct>(grp_size_expectation, duplicate_percentage, distroName) {

            @Override
            protected row2Struct generateOutput(row1Struct v, boolean isOriginal) {
                return toOutput(v, isOriginal);
            }
        };
    }

    /**
     * Copies the input row into an output row and sets ORIGINAL_MARK to {@code isOriginal}, so that the mark is
     * true for the original record and false for its generated duplicates.
     */
    private static row2Struct toOutput(row1Struct v, boolean isOriginal) {
        row2Struct tmpStruct = new row2Struct();
        tmpStruct.id = v.id;
        tmpStruct.city = v.city;
        tmpStruct.ORIGINAL_MARK = isOriginal;
        return tmpStruct;
    }
}
/**
 * Input row used by the duplicator tests: an id and a city name held in public fields with matching getters.
 */
class row1Struct {

    /** Record identifier; null until assigned. */
    public Integer id;

    /** City name; null until assigned. */
    public String city;

    /**
     * @return the current value of {@link #id}
     */
    public Integer getId() {
        return id;
    }

    /**
     * @return the current value of {@link #city}
     */
    public String getCity() {
        return city;
    }

    /**
     * @return the id and the city separated by an arrow; unset fields are rendered as "null"
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(id);
        text.append(" -> "); //$NON-NLS-1$
        text.append(city);
        return text.toString();
    }
}
/**
 * Output row used by the duplicator tests: the id and city of the input row plus a boolean mark, all held in
 * public fields with matching getters.
 */
class row2Struct {

    /** Record identifier; null until assigned. */
    public Integer id;

    /** City name; null until assigned. */
    public String city;

    /** Mark attached to the row by the test's output generator; null until assigned. */
    public Boolean ORIGINAL_MARK;

    /**
     * @return the current value of {@link #id}
     */
    public Integer getId() {
        return id;
    }

    /**
     * @return the current value of {@link #city}
     */
    public String getCity() {
        return city;
    }

    /**
     * @return the current value of {@link #ORIGINAL_MARK}
     */
    public Boolean getORIGINAL_MARK() {
        return ORIGINAL_MARK;
    }

    /**
     * @return the id, the city and the mark separated by arrows; unset fields are rendered as "null"
     */
    @Override
    public String toString() {
        final String arrow = " -> "; //$NON-NLS-1$
        StringBuilder text = new StringBuilder();
        text.append(id).append(arrow).append(city);
        text.append(arrow).append(ORIGINAL_MARK);
        return text.toString();
    }
}