package org.talend.dataquality.statistics.semantic;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.talend.dataquality.common.inference.Analyzer;
import org.talend.dataquality.common.inference.Analyzers;
import org.talend.dataquality.common.inference.Analyzers.Result;
import org.talend.dataquality.semantic.recognizer.CategoryRecognizerBuilder;
import org.talend.dataquality.semantic.statistics.SemanticAnalyzer;
import org.talend.dataquality.semantic.statistics.SemanticType;
import org.talend.dataquality.statistics.type.DataTypeAnalyzer;
import org.talend.dataquality.statistics.type.DataTypeEnum;
import org.talend.dataquality.statistics.type.DataTypeOccurences;
public class BigFileAnalyzerPerformanceTest {
private static Logger log = LoggerFactory.getLogger(BigFileAnalyzerPerformanceTest.class);
private static CategoryRecognizerBuilder builder;
private static final List<String[]> RECORDS_BIG_FILE = getRecords("big_file.csv");
private static final DataTypeEnum[] EXPECTED_DATA_TYPE = new DataTypeEnum[] { //
DataTypeEnum.INTEGER, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.INTEGER, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.DOUBLE,
DataTypeEnum.DOUBLE, DataTypeEnum.INTEGER, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.INTEGER, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.INTEGER,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.INTEGER, DataTypeEnum.INTEGER,
DataTypeEnum.INTEGER, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.INTEGER, DataTypeEnum.INTEGER,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.DATE, DataTypeEnum.DATE,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.INTEGER, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.INTEGER, DataTypeEnum.STRING,
DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, };
// Semantic types assertions
private static final String[] EXPECTED_SEMANTIC_DOMAIN = new String[] { //
"FR_POSTAL_CODE", "", "", "", "", "FR_POSTAL_CODE", "", "", "", "COUNTRY", // 10
"", "", "", "", "", "", "", "", "", "", // 20
"", "", "", "", "", "", "", "", "", "", // 30
"", "", "", "", "", "", "", "", "", "", // 40
"", "", "", "", "", "", "", "", "", "", // 50
"", "", "", "", "", "", "", "", "", "", // 60
"", "", "", "", "", "", "", "", "", "", // 70
"", "", "", "", "", "", "", "", "", "", // 80
"", "", "", "", "", "", "", "", "", "", // 90
"", "", "", "", "", "", "", "", "", "", // 100
"", "", "", "", "", "", "", "", "", "", // 110
"", "", "", "", "", "", "", "", "", "", // 120
"", "", "", "", "", "", "", "", "", "", // 130
"", "", "", "", "", "", "", "", "", "", // 140
"", "", "", "", "", "", "", "", "", "", // 150
"", "", "", "", "", "", "", "", "", "", // 160
"", "", "", "", "", "", "", "", "", "", // 170
"", "", "", "", "", "", "", "", "", "", // 180
"", "", "", "", "", "", "", "", "", "", // 190
"", "", "", "", "", "", "", "", "", "", // 200
"", "", "", "", "", "", "", "", "", "", // 210
"", "", "", "", "", "", "", "", "", "", // 220
"", "", "", "", "", "", "", "", "", "", // 230
"", "", "", "", "", "", "", "", "", "", // 240
"", "", "", "", "", "", "", "", "", "", // 250
"", "", "", "", "", "", "", "", "", "", // 260
"", "", "", "", "", "", "", "", "", "", // 270
"", "", "", "", "", "", "", "", "", "", // 280
"", "", "", "", "", "", "", "", "", };
@BeforeClass
public static void setupBuilder() throws URISyntaxException {
final URI ddPath = BigFileAnalyzerPerformanceTest.class.getResource(CategoryRecognizerBuilder.DEFAULT_DD_PATH).toURI();
final URI kwPath = BigFileAnalyzerPerformanceTest.class.getResource(CategoryRecognizerBuilder.DEFAULT_KW_PATH).toURI();
builder = CategoryRecognizerBuilder.newBuilder() //
.ddPath(ddPath) //
.kwPath(kwPath) //
.lucene();
}
private Analyzer<Result> setupBaselineAnalyzers(DataTypeEnum[] types) {
// Analysis.QUALITY, Analysis.CARDINALITY, Analysis.TYPE, Analysis.FREQUENCY, Analysis.PATTERNS,
// Analysis.SEMANTIC
return Analyzers.with(//
// new DataTypeQualityAnalyzer(types), //
// new CardinalityAnalyzer(), //
new DataTypeAnalyzer(), //
// new DataTypeFrequencyAnalyzer(), //
// new CompositePatternFrequencyAnalyzer(types), //
new SemanticAnalyzer(builder) //
);
}
@Test
@Ignore
public void testBaselineAnalysis() {
Analyzer<Result> analyzers = setupBaselineAnalyzers(EXPECTED_DATA_TYPE);
String[] firstRecord = RECORDS_BIG_FILE.get(0);
analyzers.analyze(firstRecord);
final long begin = System.currentTimeMillis();
for (int i = 0; i < RECORDS_BIG_FILE.size(); i++) {
if ((i + 1) % 1000 == 0) {
System.out.println(i + 1);
}
final String[] record = RECORDS_BIG_FILE.get(i);
analyzers.analyze(record);
}
final List<Analyzers.Result> result = analyzers.getResult();
final long end = System.currentTimeMillis();
log.info("The analyses took " + (end - begin) + " ms.");
// Composite result assertions (there should be a DataType and a SemanticType)
for (Analyzers.Result columnResult : result) {
assertNotNull(columnResult.get(DataTypeOccurences.class));
assertNotNull(columnResult.get(SemanticType.class));
}
assertEquals(EXPECTED_DATA_TYPE.length, result.size());
// Data type assertions
for (int i = 0; i < result.size(); i++) {
// System.out.println("DataTypeEnum." + result.get(i).get(DataTypeOccurences.class).getSuggestedType() + ", ");
assertEquals("Unexpected DataType on column " + i, EXPECTED_DATA_TYPE[i],
result.get(i).get(DataTypeOccurences.class).getSuggestedType());
}
assertEquals(EXPECTED_SEMANTIC_DOMAIN.length, result.size());
for (int i = 0; i < result.size(); i++) {
System.out.print("\"" + result.get(i).get(SemanticType.class).getSuggestedCategory() + "\", ");
if ((i + 1) % 10 == 0) {
System.out.println("// " + (i + 1));
}
assertEquals("Unexpected SemanticType on column " + i, EXPECTED_SEMANTIC_DOMAIN[i],
result.get(i).get(SemanticType.class).getSuggestedCategory());
}
}
private static List<String[]> getRecords(String path) {
List<String[]> records = new ArrayList<String[]>();
try {
Reader reader = new FileReader(BigFileAnalyzerPerformanceTest.class.getResource(path).getPath());
CSVFormat csvFormat = CSVFormat.DEFAULT.withDelimiter(';').withFirstRecordAsHeader();
Iterable<CSVRecord> csvRecords = csvFormat.parse(reader);
for (CSVRecord csvRecord : csvRecords) {
String[] values = new String[csvRecord.size()];
for (int i = 0; i < csvRecord.size(); i++) {
values[i] = csvRecord.get(i);
}
records.add(values);
}
} catch (IOException e) {
e.printStackTrace();
}
return records;
}
}