// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.statistics.semantic;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.net.URI;
import java.util.List;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.talend.dataquality.common.inference.Analyzer;
import org.talend.dataquality.common.inference.Analyzers;
import org.talend.dataquality.semantic.classifier.SemanticCategoryEnum;
import org.talend.dataquality.semantic.recognizer.CategoryRecognizerBuilder;
import org.talend.dataquality.semantic.statistics.SemanticAnalyzer;
import org.talend.dataquality.semantic.statistics.SemanticType;
import org.talend.dataquality.statistics.type.DataTypeAnalyzer;
import org.talend.dataquality.statistics.type.DataTypeEnum;
import org.talend.dataquality.statistics.type.DataTypeOccurences;
/**
 * Tests a composite analyzer created with {@code Analyzers.with(...)} from a {@link DataTypeAnalyzer} and a
 * {@link SemanticAnalyzer}: every analyzed column must expose both a {@link DataTypeOccurences} and a
 * {@link SemanticType} result.
 */
public class CompositeAnalyzerTest extends SemanticStatisticsTestBase {

    /** Number of columns the test expects in the {@code employee_100.csv} fixture. */
    private static final int EXPECTED_COLUMN_COUNT = 18;

    /** Composite analyzer under test; assigned in {@link #setUp()} and ended in {@link #tearDown()}. */
    Analyzer<Analyzers.Result> analyzer = null;

    /**
     * Builds the composite analyzer from a data type analyzer and a semantic analyzer whose category recognizer is
     * configured with the builder's default DD and KW resource paths.
     *
     * @throws Exception if a resource location cannot be converted to a {@link URI}
     */
    @Before
    public void setUp() throws Exception {
        final URI ddPath = this.getClass().getResource(CategoryRecognizerBuilder.DEFAULT_DD_PATH).toURI();
        final URI kwPath = this.getClass().getResource(CategoryRecognizerBuilder.DEFAULT_KW_PATH).toURI();
        final CategoryRecognizerBuilder builder = CategoryRecognizerBuilder.newBuilder() //
                .ddPath(ddPath) //
                .kwPath(kwPath) //
                .lucene();
        analyzer = Analyzers.with(new DataTypeAnalyzer(), new SemanticAnalyzer(builder));
    }

    /**
     * Ends the analyzer created by {@link #setUp()}.
     *
     * @throws Exception if ending the analyzer fails
     */
    @After
    public void tearDown() throws Exception {
        // JUnit still runs @After methods when @Before throws; in that case the analyzer was never assigned and
        // calling end() would only add a secondary NullPointerException on top of the real failure.
        if (analyzer != null) {
            analyzer.end();
        }
    }

    /**
     * Analyzes every record of {@code employee_100.csv} and verifies, column by column, the suggested data type and
     * the suggested semantic category.
     */
    @Test
    public void testDataTypeAndSemantic() {
        final List<String[]> records = getRecords(SemanticStatisticsTestBase.class.getResourceAsStream("employee_100.csv"));
        for (String[] record : records) {
            analyzer.analyze(record);
        }
        final List<Analyzers.Result> result = analyzer.getResult();
        assertEquals("Unexpected number of analyzed columns", EXPECTED_COLUMN_COUNT, result.size());

        // Composite result assertions (there should be a DataType and a SemanticType)
        for (int i = 0; i < result.size(); i++) {
            final Analyzers.Result columnResult = result.get(i);
            assertNotNull("Missing data type result for column " + i, columnResult.get(DataTypeOccurences.class));
            assertNotNull("Missing semantic type result for column " + i, columnResult.get(SemanticType.class));
        }

        // Data type assertions: one expected type per column, in column order
        final DataTypeEnum[] expectedTypes = new DataTypeEnum[] { DataTypeEnum.INTEGER, // 0
                DataTypeEnum.STRING, // 1
                DataTypeEnum.STRING, // 2
                DataTypeEnum.STRING, // 3
                DataTypeEnum.STRING, // 4
                DataTypeEnum.INTEGER, // 5
                DataTypeEnum.STRING, // 6
                DataTypeEnum.INTEGER, // 7
                DataTypeEnum.INTEGER, // 8
                DataTypeEnum.DATE, // 9
                DataTypeEnum.DATE, // 10
                DataTypeEnum.STRING, // 11
                DataTypeEnum.DOUBLE, // 12
                DataTypeEnum.INTEGER, // 13
                DataTypeEnum.STRING, // 14
                DataTypeEnum.STRING, // 15
                DataTypeEnum.STRING, // 16
                DataTypeEnum.STRING // 17
        };
        for (int i = 0; i < expectedTypes.length; i++) {
            assertEquals("Unexpected data type for column " + i, expectedTypes[i],
                    result.get(i).get(DataTypeOccurences.class).getSuggestedType());
        }

        // Semantic types assertions: an empty string is expected where no category is suggested
        final String[] expectedCategories = new String[] { "", // 0
                "", // 1
                SemanticCategoryEnum.FIRST_NAME.getId(), // 2
                SemanticCategoryEnum.FIRST_NAME.getId(), // 3
                "", // 4
                "", // 5
                "", // 6
                "", // 7
                "", // 8
                "", // 9
                "", // 10
                "", // 11
                "", // 12
                "", // 13
                "", // 14
                "", // 15
                SemanticCategoryEnum.GENDER.getId(), // 16
                "" // 17
        };
        for (int i = 0; i < expectedCategories.length; i++) {
            assertEquals("Unexpected semantic category for column " + i, expectedCategories[i],
                    result.get(i).get(SemanticType.class).getSuggestedCategory());
        }
    }
}