// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.statistics.quality;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.talend.dataquality.common.inference.ValueQualityStatistics;
import org.talend.dataquality.semantic.api.CategoryRegistryManager;
import org.talend.dataquality.semantic.classifier.SemanticCategoryEnum;
import org.talend.dataquality.semantic.recognizer.CategoryRecognizerBuilder;
import org.talend.dataquality.semantic.statistics.SemanticQualityAnalyzer;
import org.talend.dataquality.statistics.type.DataTypeEnum;
/**
 * Tests for {@link ValueQualityAnalyzer}.
 * <p>
 * The tests feed records to a {@link ValueQualityAnalyzer} built from a {@link DataTypeQualityAnalyzer} and a
 * {@link SemanticQualityAnalyzer}, then check the aggregated per-column statistics it reports.
 */
public class ValueQualityAnalyzerTest {

    private static final Logger LOGGER = LoggerFactory.getLogger(ValueQualityAnalyzerTest.class);

    private static final String LOCAL_CATEGORY_REGISTRY_PATH = "target/test_registry";

    /** Field separator used by {@link #getRecords(InputStream)}. */
    private static final String DEFAULT_SEPARATOR = ";";

    /** Points the category registry at a test-local path before any test runs. */
    @BeforeClass
    public static void setCategoryRegistryPath() {
        CategoryRegistryManager.setLocalRegistryPath(LOCAL_CATEGORY_REGISTRY_PATH);
    }

    /** Resets the category registry once all tests of this class have run. */
    @AfterClass
    public static void tearDown() throws IOException {
        CategoryRegistryManager.getInstance().reset();
    }

    /**
     * Reads every line of the stream and splits it into fields on the given separator, keeping empty fields.
     * The stream is always closed before this method returns.
     *
     * @param inputStream the stream to read; must not be null
     * @param separator the whole-string separator between fields
     * @return one String array per line, in reading order
     * @throws IllegalArgumentException if {@code inputStream} is null
     * @throws RuntimeException wrapping the {@link IOException} if the stream cannot be read
     */
    public static List<String[]> getRecords(InputStream inputStream, String separator) {
        if (inputStream == null) {
            throw new IllegalArgumentException("Input stream cannot be null.");
        }
        try {
            final List<String[]> records = new ArrayList<String[]>();
            // NOTE(review): this overload presumably decodes with the platform default charset - confirm whether an
            // explicit charset should be passed for the test data files.
            for (String line : IOUtils.readLines(inputStream)) {
                records.add(StringUtils.splitByWholeSeparatorPreserveAllTokens(line, separator));
            }
            return records;
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            try {
                inputStream.close();
            } catch (IOException e) {
                // Best effort: a failed close must not hide the records (or the read error) produced above.
                LOGGER.warn("Unable to close the input stream", e);
            }
        }
    }

    /**
     * Same as {@link #getRecords(InputStream, String)} with ";" as the separator.
     *
     * @param inputStream the stream to read; must not be null
     * @return one String array per line, in reading order
     */
    public static List<String[]> getRecords(InputStream inputStream) {
        return getRecords(inputStream, DEFAULT_SEPARATOR);
    }

    /**
     * Builds a Lucene-mode recognizer builder from the default dictionary and keyword paths, resolved as resources
     * relative to this class.
     */
    private CategoryRecognizerBuilder createCategoryRecognizerBuilder() throws URISyntaxException {
        final URI ddPath = this.getClass().getResource(CategoryRecognizerBuilder.DEFAULT_DD_PATH).toURI();
        final URI kwPath = this.getClass().getResource(CategoryRecognizerBuilder.DEFAULT_KW_PATH).toURI();
        return CategoryRecognizerBuilder.newBuilder() //
                .ddPath(ddPath) //
                .kwPath(kwPath) //
                .lucene();
    }

    /**
     * Closes the analyzer and logs any failure.
     *
     * @param analyzer the analyzer to close
     * @param failOnError whether a close failure should fail the running test; pass false when the test body has
     * already thrown, so that the original failure is the one reported
     */
    private static void closeAnalyzer(ValueQualityAnalyzer analyzer, boolean failOnError) {
        try {
            analyzer.close();
        } catch (Exception e) {
            LOGGER.error(e.getMessage(), e);
            if (failOnError) {
                fail("Unable to close the analyzer: " + e.getMessage());
            }
        }
    }

    /**
     * With every column's semantic type set to UNKNOWN, the aggregated valid/invalid/empty counts must equal the
     * ones reported by the data type quality analyzer alone.
     */
    @Test
    public void testValueQualityAnalyzerWithoutSemanticQuality() throws URISyntaxException {
        final DataTypeEnum[] dataTypes = new DataTypeEnum[] { DataTypeEnum.INTEGER, DataTypeEnum.STRING,
                DataTypeEnum.STRING, DataTypeEnum.STRING, DataTypeEnum.DATE, DataTypeEnum.STRING, DataTypeEnum.DATE,
                DataTypeEnum.INTEGER, DataTypeEnum.DOUBLE };
        final DataTypeQualityAnalyzer dataTypeQualityAnalyzer = new DataTypeQualityAnalyzer(dataTypes);

        // One UNKNOWN semantic type per data type column.
        final String[] semanticTypes = new String[dataTypes.length];
        for (int i = 0; i < semanticTypes.length; i++) {
            semanticTypes[i] = SemanticCategoryEnum.UNKNOWN.name();
        }
        final SemanticQualityAnalyzer semanticQualityAnalyzer = new SemanticQualityAnalyzer(
                createCategoryRecognizerBuilder(), semanticTypes);

        final ValueQualityAnalyzer valueQualityAnalyzer = new ValueQualityAnalyzer(dataTypeQualityAnalyzer,
                semanticQualityAnalyzer);
        boolean completed = false;
        try {
            valueQualityAnalyzer.init();

            final List<String[]> records = getRecords(this.getClass().getResourceAsStream("../data/customers_100.csv"));
            for (String[] record : records) {
                valueQualityAnalyzer.analyze(record);
            }

            for (int i = 0; i < semanticTypes.length; i++) {
                final ValueQualityStatistics dataTypeQualityResult = dataTypeQualityAnalyzer.getResult().get(i);
                final ValueQualityStatistics aggregatedResult = valueQualityAnalyzer.getResult().get(i);

                assertEquals("unexpected ValidCount on Column " + i, dataTypeQualityResult.getValidCount(),
                        aggregatedResult.getValidCount());
                assertEquals("unexpected InvalidCount on Column " + i, dataTypeQualityResult.getInvalidCount(),
                        aggregatedResult.getInvalidCount());
                assertEquals("unexpected EmptyCount on Column " + i, dataTypeQualityResult.getEmptyCount(),
                        aggregatedResult.getEmptyCount());
            }
            completed = true;
        } finally {
            // Always release the analyzer; only report a close failure if the test body itself succeeded.
            closeAnalyzer(valueQualityAnalyzer, completed);
        }
    }

    /**
     * With semantic types UNKNOWN / US_STATE_CODE / CITY, checks the aggregated valid, empty, invalid and unknown
     * counts as well as the collected invalid and unknown values for each of the three columns.
     */
    @Test
    public void testValueQualityAnalyzerWithSemanticQuality() throws URISyntaxException {
        final String[][] records = { //
                { "1", "UT", "Bonn" }, //
                { "2", "MN", "Suresnes" }, //
                { "3", "MO", "Beijing" }, //
                { "4", "", "Washington" }, //
                { "5", "IL", "Tokyo" }, //
                { "6", "ORZ", "" }, //
                { "7", " ", "CityA" }, //
                { "8", "LOL", "CityB" } };

        final int[] expectedValidCount = { 8, 4, 7 };
        final int[] expectedEmptyCount = { 0, 2, 1 };
        final int[] expectedInvalidCount = { 0, 2, 0 };
        final int[] expectedUnknownCount = { 0, 0, 0 };

        // Only the state code column is expected to report invalid values.
        final Set<String> invalidStateCodes = new HashSet<String>();
        invalidStateCodes.add("LOL");
        invalidStateCodes.add("ORZ");
        final List<Set<String>> expectedInvalidValues = new ArrayList<Set<String>>();
        expectedInvalidValues.add(new HashSet<String>());
        expectedInvalidValues.add(invalidStateCodes);
        expectedInvalidValues.add(new HashSet<String>());

        // No column is expected to report unknown values.
        final List<Set<String>> expectedUnknownValues = new ArrayList<Set<String>>();
        expectedUnknownValues.add(new HashSet<String>());
        expectedUnknownValues.add(new HashSet<String>());
        expectedUnknownValues.add(new HashSet<String>());

        final DataTypeQualityAnalyzer dataTypeQualityAnalyzer = new DataTypeQualityAnalyzer(
                new DataTypeEnum[] { DataTypeEnum.INTEGER, DataTypeEnum.STRING, DataTypeEnum.STRING });
        final String[] semanticTypes = new String[] { SemanticCategoryEnum.UNKNOWN.name(),
                SemanticCategoryEnum.US_STATE_CODE.name(), SemanticCategoryEnum.CITY.name() };
        final SemanticQualityAnalyzer semanticQualityAnalyzer = new SemanticQualityAnalyzer(
                createCategoryRecognizerBuilder(), semanticTypes);

        final ValueQualityAnalyzer valueQualityAnalyzer = new ValueQualityAnalyzer(dataTypeQualityAnalyzer,
                semanticQualityAnalyzer);
        boolean completed = false;
        try {
            valueQualityAnalyzer.init();

            for (String[] record : records) {
                valueQualityAnalyzer.analyze(record);
            }

            for (int i = 0; i < expectedInvalidValues.size(); i++) {
                final ValueQualityStatistics aggregatedResult = valueQualityAnalyzer.getResult().get(i);

                assertEquals("unexpected ValidCount on Column " + i, expectedValidCount[i],
                        aggregatedResult.getValidCount());
                assertEquals("unexpected EmptyCount on Column " + i, expectedEmptyCount[i],
                        aggregatedResult.getEmptyCount());
                assertEquals("unexpected InvalidCount on Column " + i, expectedInvalidCount[i],
                        aggregatedResult.getInvalidCount());
                assertEquals("unexpected InvalidValues on Column " + i, expectedInvalidValues.get(i),
                        aggregatedResult.getInvalidValues());
                assertEquals("unexpected UnknownCount on Column " + i, expectedUnknownCount[i],
                        aggregatedResult.getUnknownCount());
                assertEquals("unexpected UnknownValues on Column " + i, expectedUnknownValues.get(i),
                        aggregatedResult.getUnknownValues());
            }
            completed = true;
        } finally {
            // Always release the analyzer; only report a close failure if the test body itself succeeded.
            closeAnalyzer(valueQualityAnalyzer, completed);
        }
    }
}