package org.talend.dataquality.statistics.semantic;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.lang.management.ManagementFactory;
import java.lang.management.ThreadMXBean;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.talend.dataquality.common.inference.Analyzer;
import org.talend.dataquality.common.inference.Analyzers;
import org.talend.dataquality.common.inference.Analyzers.Result;
import org.talend.dataquality.semantic.classifier.SemanticCategoryEnum;
import org.talend.dataquality.semantic.recognizer.CategoryRecognizerBuilder;
import org.talend.dataquality.semantic.statistics.SemanticAnalyzer;
import org.talend.dataquality.semantic.statistics.SemanticType;
import org.talend.dataquality.statistics.cardinality.CardinalityAnalyzer;
import org.talend.dataquality.statistics.frequency.DataTypeFrequencyAnalyzer;
import org.talend.dataquality.statistics.frequency.pattern.CompositePatternFrequencyAnalyzer;
import org.talend.dataquality.statistics.numeric.quantile.QuantileAnalyzer;
import org.talend.dataquality.statistics.numeric.summary.SummaryAnalyzer;
import org.talend.dataquality.statistics.quality.DataTypeQualityAnalyzer;
import org.talend.dataquality.statistics.text.TextLengthAnalyzer;
import org.talend.dataquality.statistics.type.DataTypeAnalyzer;
import org.talend.dataquality.statistics.type.DataTypeEnum;
import org.talend.dataquality.statistics.type.DataTypeOccurences;
/**
 * Performance smoke tests that run groups of analyzers over the records of
 * {@code Card_Exceptions_Preparation.csv} and fail when the CPU time consumed by the current thread exceeds a
 * fixed budget. The baseline test additionally checks the data type and semantic category suggested for each
 * column.
 */
public class AnalyzerPerformanceTest {

    private static final Logger log = LoggerFactory.getLogger(AnalyzerPerformanceTest.class);

    /** Recognizer builder shared by all tests; initialised once in {@link #setupBuilder()}. */
    private static CategoryRecognizerBuilder builder;

    /** Data rows of the CSV fixture (the header row is skipped), loaded once when the class is initialised. */
    private static final List<String[]> records_card_exceptions = getRecords("Card_Exceptions_Preparation.csv");

    /** Expected data type of each of the 20 columns of the fixture, in column order. */
    private final DataTypeEnum[] types_card_exceptions = new DataTypeEnum[] { //
            DataTypeEnum.INTEGER, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, //
            DataTypeEnum.INTEGER, DataTypeEnum.STRING, DataTypeEnum.INTEGER, DataTypeEnum.INTEGER, DataTypeEnum.TIME, //
            DataTypeEnum.INTEGER, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.STRING, //
            DataTypeEnum.INTEGER, DataTypeEnum.STRING, DataTypeEnum.DATE, DataTypeEnum.DATE, DataTypeEnum.STRING,//
    };

    /**
     * Builds the shared Lucene-backed {@link CategoryRecognizerBuilder} from the default dictionary and keyword
     * index locations resolved on the test classpath.
     *
     * @throws URISyntaxException if a resource URL cannot be converted to a URI
     */
    @BeforeClass
    public static void setupBuilder() throws URISyntaxException {
        final URI ddPath = AnalyzerPerformanceTest.class.getResource(CategoryRecognizerBuilder.DEFAULT_DD_PATH).toURI();
        final URI kwPath = AnalyzerPerformanceTest.class.getResource(CategoryRecognizerBuilder.DEFAULT_KW_PATH).toURI();
        builder = CategoryRecognizerBuilder.newBuilder() //
                .ddPath(ddPath) //
                .kwPath(kwPath) //
                .lucene();
    }

    /**
     * Creates the composite analyzer used by the baseline test.
     *
     * @param types the data type of each column, passed to the quality and pattern analyzers
     * @return a composite of the quality, cardinality, type, frequency, pattern and semantic analyzers
     */
    private Analyzer<Result> setupBaselineAnalyzers(DataTypeEnum[] types) {
        // Analysis.QUALITY, Analysis.CARDINALITY, Analysis.TYPE, Analysis.FREQUENCY, Analysis.PATTERNS,
        // Analysis.SEMANTIC
        return Analyzers.with(//
                new DataTypeQualityAnalyzer(types), //
                new CardinalityAnalyzer(), //
                new DataTypeAnalyzer(), //
                new DataTypeFrequencyAnalyzer(), //
                new CompositePatternFrequencyAnalyzer(types), //
                new SemanticAnalyzer(builder) //
        );
    }

    /**
     * Creates the composite analyzer used by the advanced test.
     *
     * @return a composite of the text length, quantile and summary analyzers
     */
    private Analyzer<Result> setupAdvancedAnalyzers() {
        // Analysis.LENGTH, Analysis.QUANTILES, Analysis.SUMMARY, Analysis.HISTOGRAM
        // NOTE(review): no histogram analyzer is actually instantiated below.
        return Analyzers.with(//
                new TextLengthAnalyzer(), //
                new QuantileAnalyzer(types_card_exceptions), //
                new SummaryAnalyzer(types_card_exceptions) //
        );
    }

    /**
     * Runs the baseline analyzers over every record, checks that the CPU time stays under 1.5e10 ns and verifies
     * the suggested data type and semantic category of each column.
     */
    @Test
    public void testBaselineAnalysis() {
        final Analyzer<Result> analyzers = setupBaselineAnalyzers(types_card_exceptions);

        // Analyze one record outside the timed section. The analyzer is not reset afterwards, so the first
        // record is fed twice in total. NOTE(review): presumably a warm-up so one-off initialisation is not
        // billed to the measurement - confirm.
        analyzers.analyze(records_card_exceptions.get(0));

        final ThreadMXBean mxBean = ManagementFactory.getThreadMXBean();
        final long cpuBefore = mxBean.getCurrentThreadCpuTime();
        for (String[] record : records_card_exceptions) {
            analyzers.analyze(record);
        }
        // Result retrieval is deliberately part of the timed section.
        final List<Analyzers.Result> result = analyzers.getResult();
        final long elapsed = mxBean.getCurrentThreadCpuTime() - cpuBefore;

        log.info("baseline analysis took {} CPU time.", elapsed);
        // getCurrentThreadCpuTime() reports nanoseconds, so the budget is 15 seconds of CPU time.
        assertTrue("baseline analysis took " + elapsed + " CPU time, which is slower than expected.", elapsed < 1.5e10);

        assertEquals(types_card_exceptions.length, result.size());

        // Composite result assertions (there should be a DataType and a SemanticType)
        for (Analyzers.Result columnResult : result) {
            assertNotNull(columnResult.get(DataTypeOccurences.class));
            assertNotNull(columnResult.get(SemanticType.class));
        }

        // Data type assertions
        for (int i = 0; i < types_card_exceptions.length; i++) {
            assertEquals("Unexpected DataType on column " + i, types_card_exceptions[i],
                    result.get(i).get(DataTypeOccurences.class).getSuggestedType());
        }

        // Semantic types assertions: only column 1 is expected to get a category, all others stay empty.
        final String[] expectedCategories = new String[] { "", //
                SemanticCategoryEnum.US_STATE_CODE.getId(), //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "", //
                "" //
        };
        for (int i = 0; i < expectedCategories.length; i++) {
            assertEquals("Unexpected SemanticType on column " + i, expectedCategories[i],
                    result.get(i).get(SemanticType.class).getSuggestedCategory());
        }
    }

    /**
     * Runs the advanced analyzers over every record and checks that the CPU time stays under 7e8 ns.
     * Currently disabled through {@code @Ignore}.
     */
    @Test
    @Ignore
    public void testAdvancedAnalysis() {
        final Analyzer<Result> analyzers = setupAdvancedAnalyzers();

        final ThreadMXBean mxBean = ManagementFactory.getThreadMXBean();
        final long cpuBefore = mxBean.getCurrentThreadCpuTime();
        for (String[] record : records_card_exceptions) {
            analyzers.analyze(record);
        }
        // The result itself is not inspected; the call is kept because it is part of the timed work.
        analyzers.getResult();
        final long elapsed = mxBean.getCurrentThreadCpuTime() - cpuBefore;

        log.info("advanced analysis took {} CPU time.", elapsed);
        // getCurrentThreadCpuTime() reports nanoseconds, so the budget is 0.7 seconds of CPU time.
        assertTrue("advanced analysis took " + elapsed + " CPU time, which is slower than expected.", elapsed < 7e8);
    }

    /**
     * Loads a semicolon-delimited CSV resource, resolved relative to this class, into memory. The first record
     * is treated as the header and is not returned.
     *
     * @param path resource name passed to {@link Class#getResource(String)}
     * @return one {@code String[]} per data row, holding the row's values in column order
     * @throws IllegalStateException if the resource is missing or cannot be read; a broken fixture would
     *         otherwise silently produce an empty data set and meaningless measurements
     */
    private static List<String[]> getRecords(String path) {
        final URL resource = AnalyzerPerformanceTest.class.getResource(path);
        if (resource == null) {
            throw new IllegalStateException("Test resource not found: " + path);
        }

        final List<String[]> records = new ArrayList<>();
        final CSVFormat csvFormat = CSVFormat.DEFAULT.withDelimiter(';').withFirstRecordAsHeader();
        // Going through the URI (rather than URL.getPath()) decodes escaped characters such as %20, so the
        // fixture can be opened from directories whose name contains spaces. try-with-resources closes the
        // reader even when parsing fails.
        try (Reader reader = new FileReader(new File(resource.toURI()))) {
            for (CSVRecord csvRecord : csvFormat.parse(reader)) {
                final String[] values = new String[csvRecord.size()];
                for (int i = 0; i < values.length; i++) {
                    values[i] = csvRecord.get(i);
                }
                records.add(values);
            }
        } catch (IOException | URISyntaxException e) {
            throw new IllegalStateException("Unable to read test resource: " + path, e);
        }
        return records;
    }
}