package org.talend.dataquality.semantic.statistics;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.Set;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.talend.dataquality.common.inference.Analyzer;
import org.talend.dataquality.common.inference.Analyzers;
import org.talend.dataquality.common.inference.ValueQualityStatistics;
import org.talend.dataquality.common.inference.Analyzers.Result;
import org.talend.dataquality.semantic.index.utils.DictionaryGenerationSpec;
import org.talend.dataquality.semantic.index.utils.SemanticDictionaryGenerator;
import org.talend.dataquality.semantic.recognizer.CategoryRecognizerBuilder;
public class SemanticQualityAnalyzerPerformanceTest {
private static CategoryRecognizerBuilder builder;
private static int RECORD_LINES_NUMBER = 500000;
private static final String BIG_FILE_PATH = "src/test/resources/org/talend/dataquality/semantic/statistics/validation_big_file.csv";
private static final List<String[]> RECORDS_CRM_CUST = getRecords("validation_big_file.csv");
private static final DictionaryGenerationSpec[] EXPECTED_CATEGORIES_DICT = new DictionaryGenerationSpec[] { //
DictionaryGenerationSpec.AIRPORT_CODE, //
DictionaryGenerationSpec.CIVILITY, //
DictionaryGenerationSpec.CONTINENT, //
DictionaryGenerationSpec.COUNTRY, //
DictionaryGenerationSpec.COUNTRY_CODE_ISO3, //
DictionaryGenerationSpec.MONTH, //
DictionaryGenerationSpec.US_COUNTY, //
DictionaryGenerationSpec.FR_COMMUNE, //
DictionaryGenerationSpec.FR_DEPARTEMENT, //
DictionaryGenerationSpec.LANGUAGE //
};
private static final long[][] EXPECTED_VALIDITY_COUNT_DICT = new long[][] { //
new long[] { 9944, 0, 0 }, //
new long[] { 9944, 0, 0 }, //
new long[] { 9944, 0, 0 }, //
new long[] { 9944, 0, 0 }, //
new long[] { 9944, 0, 0 }, //
new long[] { 9944, 0, 0 }, //
new long[] { 9943, 1, 0 }, //
new long[] { 9943, 0, 0 }, //
new long[] { 9943, 0, 0 }, //
new long[] { 9943, 0, 0 }, //
};
@BeforeClass
public static void setupBuilder() throws URISyntaxException {
final URI ddPath = SemanticQualityAnalyzerPerformanceTest.class.getResource(CategoryRecognizerBuilder.DEFAULT_DD_PATH)
.toURI();
final URI kwPath = SemanticQualityAnalyzerPerformanceTest.class.getResource(CategoryRecognizerBuilder.DEFAULT_KW_PATH)
.toURI();
builder = CategoryRecognizerBuilder.newBuilder() //
.ddPath(ddPath) //
.kwPath(kwPath) //
.lucene();
}
@Test
@Ignore
public void testSemanticQualityAnalyzerWithDictionaryCategory() {
String[] a = new String[EXPECTED_CATEGORIES_DICT.length];
for (int i = 0; i < EXPECTED_CATEGORIES_DICT.length; i++) {
a[i] = EXPECTED_CATEGORIES_DICT[i].getCategoryName();
}
testAnalysis(RECORDS_CRM_CUST, a, EXPECTED_VALIDITY_COUNT_DICT);
}
public void testAnalysis(List<String[]> records, String[] expectedCategories, long[][] expectedValidityCount) {
Analyzer<Result> analyzers = Analyzers.with(//
new SemanticQualityAnalyzer(builder, expectedCategories)//
);
long time = System.currentTimeMillis();
for (String[] record : records) {
analyzers.analyze(record);
}
final List<Result> result = analyzers.getResult();
System.out.println("Result = " + (System.currentTimeMillis() - time) + " ms");
assertEquals(expectedCategories.length, result.size());
// Composite result assertions (there should be a DataType and a SemanticType)
for (Result columnResult : result) {
assertNotNull(columnResult.get(ValueQualityStatistics.class));
}
// Semantic validation assertions
for (int i = 0; i < expectedCategories.length; i++) {
final ValueQualityStatistics stats = result.get(i).get(ValueQualityStatistics.class);
// System.out.println("new long[] {" + stats.getValidCount() + ", " + stats.getInvalidCount() + ", "
// + stats.getEmptyCount() + "}, //");
assertEquals("Unexpected valid count on column " + i, expectedValidityCount[i][0], stats.getValidCount());
assertEquals("Unexpected invalid count on column " + i, expectedValidityCount[i][1], stats.getInvalidCount());
assertEquals("Unexpected empty count on column " + i, expectedValidityCount[i][2], stats.getEmptyCount());
assertEquals("Unexpected unknown count on column " + i, 0, stats.getUnknownCount());
}
}
private static List<String[]> getRecords(String path) {
List<String[]> records = new ArrayList<String[]>();
try {
Reader reader = new FileReader(SemanticQualityAnalyzerPerformanceTest.class.getResource(path).getPath());
CSVFormat csvFormat = CSVFormat.DEFAULT.withDelimiter(';');
Iterable<CSVRecord> csvRecords = csvFormat.parse(reader);
for (CSVRecord csvRecord : csvRecords) {
String[] values = new String[csvRecord.size()];
for (int i = 0; i < csvRecord.size(); i++) {
values[i] = csvRecord.get(i);
}
records.add(values);
}
} catch (IOException e) {
e.printStackTrace();
}
return records;
}
// To generate a bigger validation_big_file.csv if necessary
public static void main(String[] args) {
try {
final String resourcePath = SemanticDictionaryGenerator.class.getResource(".").getFile();
final String projectRoot = new File(resourcePath).getParentFile().getParentFile().getParentFile().getParentFile()
.getParentFile().getParentFile().getParentFile().getParentFile().getPath() + File.separator;
File f = new File(projectRoot + BIG_FILE_PATH);
CSVPrinter writer = new CSVPrinter(new FileWriter(f), CSVFormat.DEFAULT.withDelimiter(';'));
List<String[]> records = new ArrayList<>();
Random randomGenerator = new Random();
for (int i = 0; i < RECORD_LINES_NUMBER; i++) {
records.add(new String[EXPECTED_CATEGORIES_DICT.length]);
}
for (int j = 0; j < EXPECTED_CATEGORIES_DICT.length; j++) {
List<String> file = getFile(EXPECTED_CATEGORIES_DICT[j]);
for (int i = 0; i < RECORD_LINES_NUMBER; i++) {
records.get(i)[j] = file.get(randomGenerator.nextInt(file.size()));
}
}
for (String[] record : records)
writer.printRecord(record);
} catch (IOException e) {
e.printStackTrace();
}
}
public static List<String> getFile(DictionaryGenerationSpec spec) throws IOException {
Reader reader = new FileReader(
SemanticQualityAnalyzerPerformanceTest.class.getResource("../index/utils/" + spec.getSourceFile()).getPath());
CSVFormat csvFormat = CSVFormat.DEFAULT.withDelimiter(spec.getCsvConfig().getDelimiter());
if (spec.getCsvConfig().isWithHeader()) {
csvFormat = csvFormat.withFirstRecordAsHeader();
}
// collect values
Iterable<CSVRecord> records = csvFormat.parse(reader);
List<Set<String>> valueSetList = SemanticDictionaryGenerator.getDictionaryForCategory(records, spec);
List<String> result = new ArrayList<>();
for (Set<String> valueSet : valueSetList) {
if (valueSet.iterator().hasNext())
result.add(valueSet.iterator().next());
}
return result;
}
}