// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.statistics.frequency.pattern;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.util.*;
import java.util.Map.Entry;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.talend.dataquality.statistics.frequency.AbstractFrequencyAnalyzer;
import org.talend.dataquality.statistics.frequency.recognition.DateTimePatternRecognizer;
import org.talend.dataquality.statistics.quality.DataTypeQualityAnalyzer;
import org.talend.dataquality.statistics.type.DataTypeEnum;
/**
 * Tests for {@link CompositePatternFrequencyAnalyzer}: pattern extraction for single values, pattern frequency
 * tables for one and two columns, and custom date/time patterns registered either on a
 * {@link DateTimePatternRecognizer} or on a {@link DataTypeQualityAnalyzer}.
 */
public class CompositePatternFrequencyAnalyzerTest {

    /** Analyzer shared by the frequency-table tests; re-created before every test by {@link #setUp()}. */
    AbstractFrequencyAnalyzer<PatternFrequencyStatistics> patternFreqAnalyzer;

    @Before
    public void setUp() throws Exception {
        patternFreqAnalyzer = new CompositePatternFrequencyAnalyzer();
    }

    @After
    public void tearDown() throws Exception {
        // Nothing to release.
    }

    /**
     * Checks the pattern set computed for single values mixing ASCII, East Asian characters and dates.
     */
    @Test
    public void testAsciiAndAsiaChars() {
        CompositePatternFrequencyAnalyzer analyzer = new CompositePatternFrequencyAnalyzer();

        Set<String> mixedScriptPatterns = analyzer.getValuePatternSet("abcd1234ィゥェ中国");
        Assert.assertEquals(Collections.singleton("aaaa9999ィゥェ中国"), mixedScriptPatterns);

        Set<String> isoDatePatterns = analyzer.getValuePatternSet("2008-01-01");
        Assert.assertEquals(Collections.singleton("yyyy-MM-dd"), isoDatePatterns);

        Set<String> nonDatePatterns = analyzer.getValuePatternSet("2008-1月-01");
        Assert.assertEquals(Collections.singleton("9999-9月-99"), nonDatePatterns);
    }

    /**
     * Analyzes a single column and verifies the leading entries of its pattern frequency table.
     */
    @Test
    public void testAnalyze() {
        String[] data = new String[] { "John", "", "123Code", "111", "Zhao", "2015-08-20", "2012-02-12", "12/2/99", "Hois",
                "2001年" };
        for (String value : data) {
            patternFreqAnalyzer.analyze(value);
        }

        assertFirstColumnFrequencies(patternFreqAnalyzer.getResult().get(0).getTopK(10));
    }

    /**
     * Analyzes two columns at once and verifies the pattern frequency table of each column.
     */
    @Test
    public void testAnalyzerTwoColumns() {
        String[][] data = new String[][] { { "John", "filx" }, { "", "a" }, { "123Code", "3649273" }, { "111", "100" },
                { "Zhao", "silL" }, { "2015-08-20", "2015-08-21" }, { "2012-02-12", "2022-9-12" }, { "12/2/99", "12/2/99" },
                { "Hois", "*^2lii" }, { "2001年", "4445-" } };
        for (String[] row : data) {
            patternFreqAnalyzer.analyze(row);
        }

        Map<String, Long> firstColumn = patternFreqAnalyzer.getResult().get(0).getTopK(10);
        Map<String, Long> secondColumn = patternFreqAnalyzer.getResult().get(1).getTopK(10);

        assertFirstColumnFrequencies(firstColumn);

        // NOTE(review): no rank-based check is made for the second column. An earlier rank-0 check here
        // (expecting "yyyy-M-d" with a count of 2) could never run, because the rank counter was shared with the
        // first column's loop and never reset. It was dropped rather than enabled since that expectation is
        // unverified.
        boolean knownPatternSeen = false;
        for (Entry<String, Long> entry : secondColumn.entrySet()) {
            String pattern = entry.getKey();
            if ("9999999".equals(pattern) || "a".equals(pattern) || "d/M/yy".equals(pattern)) {
                Assert.assertEquals(Long.valueOf(1), entry.getValue());
                knownPatternSeen = true;
            }
        }
        Assert.assertTrue(knownPatternSeen);
    }

    /**
     * Registers a custom date/time pattern on the recognizer and verifies that it becomes the first entry of the
     * pattern frequency table.
     */
    @Test
    public void testCustomDatePatternAnalyzer() {
        final String[] data = new String[] { "11/19/07 2:54", "7/6/09 16:46", "2015-08-20", "2012-02-12", "2/8/15 15:57",
                "4/15/11 4:24", "2001年", "12:00.000000 1?1?7" };

        // Register the customized pattern before the recognizer is handed to the analyzer.
        DateTimePatternRecognizer datetimePatternAnalyzer = new DateTimePatternRecognizer();
        datetimePatternAnalyzer.addCustomDateTimePattern("M/d/yy H:m");

        CompositePatternFrequencyAnalyzer patternAnalyzer = new CompositePatternFrequencyAnalyzer(
                Collections.singletonList(datetimePatternAnalyzer));
        patternAnalyzer.init();
        for (String value : data) {
            patternAnalyzer.analyze(value);
        }
        patternAnalyzer.end();

        Map<String, Long> freqTable = patternAnalyzer.getResult().get(0).getTopK(10);
        Iterator<Entry<String, Long>> entries = freqTable.entrySet().iterator();
        if (!entries.hasNext()) {
            fail("no entry");
        }
        // Four of the input values have the shape of the custom pattern.
        assertEntry("M/d/yy H:m", 4L, entries.next());
    }

    /**
     * Verifies valid/invalid counts of a DATE quality analyzer when a custom date/time pattern is registered,
     * using two different custom patterns against the same data.
     */
    @Test
    public void testCustomDateTypeQualityAnalyzer() {
        // Some values match the custom pattern registered on the analyzer; per the assertions below,
        // "2012-02-12" is valid without any matching custom pattern and "2*8*15 15:57" is the only invalid value.
        final String[] data = new String[] { "11/19/07 AM 2:54", "7/6/09 PM 16:46", "2/8/15 PM 15:57", "2*8*15 15:57",
                "2012-02-12" };

        DataTypeQualityAnalyzer qualityAnalyzer = new DataTypeQualityAnalyzer(DataTypeEnum.DATE);
        qualityAnalyzer.addCustomDateTimePattern("M/d/yy a H:m");
        qualityAnalyzer.init();
        for (String value : data) {
            qualityAnalyzer.analyze(value);
        }
        qualityAnalyzer.end();

        assertTrue(qualityAnalyzer.getResult().size() > 0);
        Assert.assertEquals(5, qualityAnalyzer.getResult().get(0).getCount()); // Count
        Assert.assertEquals(4, qualityAnalyzer.getResult().get(0).getValidCount()); // Valid count
        // Invalid values
        Set<String> invalidValues = qualityAnalyzer.getResult().get(0).getInvalidValues();
        Assert.assertEquals(1, invalidValues.size());
        Assert.assertEquals("2*8*15 15:57", invalidValues.iterator().next());

        // A fresh analyzer with a different custom pattern: now only "2012-02-12" and "2*8*15 15:57" are valid,
        // and the three values that matched the previous custom pattern become invalid.
        DataTypeQualityAnalyzer qualityAnalyzer2 = new DataTypeQualityAnalyzer(DataTypeEnum.DATE);
        qualityAnalyzer2.addCustomDateTimePattern("M*d*yy H:m");
        qualityAnalyzer2.init();
        for (String value : data) {
            qualityAnalyzer2.analyze(value);
        }
        qualityAnalyzer2.end();

        Assert.assertEquals(5, qualityAnalyzer2.getResult().get(0).getCount()); // Count
        Assert.assertEquals(2, qualityAnalyzer2.getResult().get(0).getValidCount()); // Valid count
        Set<String> invalidValues2 = qualityAnalyzer2.getResult().get(0).getInvalidValues();
        Assert.assertEquals(3, invalidValues2.size());
        Assert.assertTrue(invalidValues2.contains("11/19/07 AM 2:54"));
        Assert.assertTrue(invalidValues2.contains("7/6/09 PM 16:46"));
        Assert.assertTrue(invalidValues2.contains("2/8/15 PM 15:57"));
    }

    /**
     * Verifies the frequency table expected for the first test column: the first entry is "Aaaa" with a count
     * of 3, the second is "yyyy-MM-dd" with a count of 2, and "999Aaaa", wherever it appears, has a count of 1.
     *
     * @param freqTable pattern frequencies in the iteration order returned by {@code getTopK}
     */
    private static void assertFirstColumnFrequencies(Map<String, Long> freqTable) {
        Assert.assertFalse("frequency table must not be empty", freqTable.isEmpty());
        int rank = 0;
        for (Entry<String, Long> entry : freqTable.entrySet()) {
            if (rank == 0) {
                assertEntry("Aaaa", 3L, entry);
            } else if (rank == 1) {
                assertEntry("yyyy-MM-dd", 2L, entry);
            }
            if ("999Aaaa".equals(entry.getKey())) {
                Assert.assertEquals(Long.valueOf(1), entry.getValue());
            }
            rank++;
        }
    }

    /**
     * Asserts that a frequency-table entry holds the given pattern and count.
     *
     * @param expectedPattern expected entry key
     * @param expectedCount expected entry value
     * @param entry entry to check
     */
    private static void assertEntry(String expectedPattern, long expectedCount, Entry<String, Long> entry) {
        Assert.assertEquals(expectedPattern, entry.getKey());
        Assert.assertEquals(Long.valueOf(expectedCount), entry.getValue());
    }
}