package org.talend.dataquality.semantic.statistics;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.junit.BeforeClass;
import org.junit.Test;
import org.talend.dataquality.common.inference.Analyzer;
import org.talend.dataquality.common.inference.Analyzers;
import org.talend.dataquality.common.inference.Analyzers.Result;
import org.talend.dataquality.common.inference.ValueQualityStatistics;
import org.talend.dataquality.semantic.classifier.SemanticCategoryEnum;
import org.talend.dataquality.semantic.recognizer.CategoryFrequency;
import org.talend.dataquality.semantic.recognizer.CategoryRecognizerBuilder;

/**
 * Runs a {@link SemanticAnalyzer} together with a {@link SemanticQualityAnalyzer} over CSV fixtures and checks,
 * column by column, the suggested category and the valid / invalid / empty counts.
 */
public class SemanticQualityAnalyzerTest {

    private static CategoryRecognizerBuilder builder;

    // Fixtures are parsed once, when the class is initialized; a broken fixture therefore fails class loading.
    private static final List<String[]> RECORDS_CRM_CUST = getRecords("crm_cust.csv");

    private static final List<String[]> RECORDS_CREDIT_CARDS = getRecords("credit_card_number_samples.csv");

    private static final List<String[]> RECORDS_PHONES = getRecords("phone_number.csv");

    /** Expected suggested category per column of crm_cust.csv. */
    private static final String[] EXPECTED_CATEGORIES_DICT = new String[] { //
            "", //
            "CIVILITY", //
            "FIRST_NAME", //
            "LAST_NAME", //
            "COUNTRY_CODE_ISO3", //
            "ADDRESS_LINE", //
            "FR_POSTAL_CODE", //
            "CITY", //
            "", //
            "EMAIL", //
            "", //
            "", //
    };

    /** Per column of crm_cust.csv: { valid, invalid, empty }. */
    private static final long[][] EXPECTED_VALIDITY_COUNT_DICT = new long[][] { //
            new long[] { 1000, 0, 0 }, //
            new long[] { 1000, 0, 0 }, //
            new long[] { 1000, 0, 0 }, //
            new long[] { 1000, 0, 0 }, //
            new long[] { 990, 10, 0 }, //
            new long[] { 1000, 0, 0 }, //
            new long[] { 996, 4, 0 }, //
            new long[] { 1000, 0, 0 }, //
            new long[] { 518, 0, 482 }, //
            new long[] { 996, 4, 0 }, //
            new long[] { 1000, 0, 0 }, //
            new long[] { 1000, 0, 0 }, //
    };

    /** Expected suggested category per column of credit_card_number_samples.csv. */
    private static final String[] EXPECTED_CATEGORIES_REGEX = new String[] { //
            "", //
            "VISA_CARD", //
            "DATA_URL", //
    };

    /** Only index 0 of each row is read, as the expected occurrence count of the suggested category. */
    private static final long[][] EXPECTED_VALIDITY_COUNT_REGEX_FOR_DISCOVERY = new long[][] { //
            new long[] { 30, 0, 0 }, //
            new long[] { 20, 10, 0 }, //
            new long[] { 30, 0, 0 }, //
    };

    /** Per column of credit_card_number_samples.csv: { valid, invalid, empty }. */
    private static final long[][] EXPECTED_VALIDITY_COUNT_REGEX_FOR_VALIDATION = new long[][] { //
            new long[] { 30, 0, 0 }, //
            new long[] { 20, 10, 0 }, //
            new long[] { 19, 11, 0 }, //
    };

    /** Single column of phone_number.csv: { valid, invalid, empty }. */
    private static final long[][] EXPECTED_VALIDITY_COUNT_PHONE = new long[][] { //
            new long[] { 11, 0, 0 } };

    /**
     * Builds the shared recognizer builder from the default dictionary and keyword resources, in lucene mode.
     *
     * @throws URISyntaxException if a resource URL cannot be converted to a URI
     */
    @BeforeClass
    public static void setupBuilder() throws URISyntaxException {
        final URI ddPath = SemanticQualityAnalyzerTest.class.getResource(CategoryRecognizerBuilder.DEFAULT_DD_PATH).toURI();
        final URI kwPath = SemanticQualityAnalyzerTest.class.getResource(CategoryRecognizerBuilder.DEFAULT_KW_PATH).toURI();
        builder = CategoryRecognizerBuilder.newBuilder() //
                .ddPath(ddPath) //
                .kwPath(kwPath) //
                .lucene();
    }

    @Test
    public void testSemanticQualityAnalyzerWithDictionaryCategory() {
        testAnalysis(RECORDS_CRM_CUST, EXPECTED_CATEGORIES_DICT, EXPECTED_VALIDITY_COUNT_DICT, EXPECTED_VALIDITY_COUNT_DICT);
    }

    @Test
    public void testSemanticQualityAnalyzerWithRegexCategory() {
        testAnalysis(RECORDS_CREDIT_CARDS, EXPECTED_CATEGORIES_REGEX, EXPECTED_VALIDITY_COUNT_REGEX_FOR_DISCOVERY,
                EXPECTED_VALIDITY_COUNT_REGEX_FOR_VALIDATION);
    }

    @Test
    public void testSemanticQualityAnalyzerWithPhoneCategory() {
        testAnalysis(RECORDS_PHONES, new String[] { "PHONE" }, EXPECTED_VALIDITY_COUNT_PHONE, EXPECTED_VALIDITY_COUNT_PHONE);
    }

    /**
     * Feeds every record to a composite analyzer (semantic discovery + semantic quality) and asserts the result of
     * each column.
     *
     * @param records the rows to analyze, one {@code String[]} per row
     * @param expectedCategories the expected suggested category per column; also passed to the quality analyzer
     * @param expectedValidityCountForDiscovery per column, index 0 is the expected occurrence count of the
     * suggested category
     * @param expectedValidityCountForValidation per column, the expected { valid, invalid, empty } counts
     */
    public void testAnalysis(List<String[]> records, String[] expectedCategories, long[][] expectedValidityCountForDiscovery,
            long[][] expectedValidityCountForValidation) {
        Analyzer<Result> analyzers = Analyzers.with(//
                new SemanticAnalyzer(builder), //
                new SemanticQualityAnalyzer(builder, expectedCategories)//
        );

        for (String[] record : records) {
            analyzers.analyze(record);
        }
        final List<Analyzers.Result> result = analyzers.getResult();

        assertEquals(expectedCategories.length, result.size());

        // Composite result assertions (each column must carry a SemanticType and a ValueQualityStatistics)
        for (Analyzers.Result columnResult : result) {
            assertNotNull(columnResult.get(SemanticType.class));
            assertNotNull(columnResult.get(ValueQualityStatistics.class));
        }

        // Semantic types assertions
        for (int i = 0; i < expectedCategories.length; i++) {
            final SemanticType stats = result.get(i).get(SemanticType.class);
            // To regenerate the expected categories:
            // System.out.println("\"" + stats.getSuggestedCategory() + "\", //");
            assertEquals("Unexpected SemanticType on column " + i, expectedCategories[i], stats.getSuggestedCategory());

            for (CategoryFrequency cf : stats.getCategoryToCount().keySet()) {
                if (!expectedCategories[i].equals(cf.getCategoryId())) {
                    continue;
                }
                // The occurrence count is only checked when the category is a known enum entry whose
                // getCompleteness() is true.
                final SemanticCategoryEnum cat = SemanticCategoryEnum.getCategoryById(cf.getCategoryId());
                if (cat != null && cat.getCompleteness()) {
                    assertEquals("Unexpected SemanticType occurence on column " + i, expectedValidityCountForDiscovery[i][0],
                            cf.getCount());
                }
            }
        }

        // Semantic validation assertions
        for (int i = 0; i < expectedCategories.length; i++) {
            final ValueQualityStatistics stats = result.get(i).get(ValueQualityStatistics.class);
            // To regenerate the expected counts:
            // System.out.println("new long[] {" + stats.getValidCount() + ", " + stats.getInvalidCount() + ", "
            // + stats.getEmptyCount() + "}, //");
            assertEquals("Unexpected valid count on column " + i, expectedValidityCountForValidation[i][0],
                    stats.getValidCount());
            assertEquals("Unexpected invalid count on column " + i, expectedValidityCountForValidation[i][1],
                    stats.getInvalidCount());
            assertEquals("Unexpected empty count on column " + i, expectedValidityCountForValidation[i][2],
                    stats.getEmptyCount());
            assertEquals("Unexpected unknown count on column " + i, 0, stats.getUnknownCount());
        }
    }

    /**
     * Loads a semicolon-delimited CSV resource located relative to this class, skipping the header row.
     *
     * @param path the resource name, resolved with {@link Class#getResource(String)}
     * @return one {@code String[]} per data row, in file order
     * @throws IllegalStateException if the resource is missing or cannot be read, so that a broken fixture fails
     * immediately instead of yielding an empty data set
     */
    private static List<String[]> getRecords(String path) {
        final URL resource = SemanticQualityAnalyzerTest.class.getResource(path);
        if (resource == null) {
            throw new IllegalStateException("Test resource not found on classpath: " + path);
        }

        final List<String[]> records = new ArrayList<String[]>();
        final CSVFormat csvFormat = CSVFormat.DEFAULT.withDelimiter(';').withFirstRecordAsHeader();
        // Going through the URI decodes escaped characters (e.g. %20) that URL.getPath() would leave in the file name.
        // NOTE(review): FileReader decodes with the platform default charset, as before; the fixture encoding is
        // not visible from here, so it is deliberately left unchanged - confirm and pin it explicitly if known.
        try (Reader reader = new FileReader(new File(resource.toURI()))) {
            final Iterable<CSVRecord> csvRecords = csvFormat.parse(reader);
            for (CSVRecord csvRecord : csvRecords) {
                final String[] values = new String[csvRecord.size()];
                for (int i = 0; i < values.length; i++) {
                    values[i] = csvRecord.get(i);
                }
                records.add(values);
            }
        } catch (IOException | URISyntaxException e) {
            throw new IllegalStateException("Unable to load test records from " + path, e);
        }
        return records;
    }
}