// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.semantic.statistics;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.junit.Before;
import org.junit.Test;
import org.talend.dataquality.common.inference.Analyzer;
import org.talend.dataquality.common.inference.Analyzers;
import org.talend.dataquality.common.inference.Analyzers.Result;
import org.talend.dataquality.semantic.classifier.SemanticCategoryEnum;
import org.talend.dataquality.semantic.recognizer.CategoryRecognizerBuilder;
/**
 * Tests for {@link SemanticAnalyzer}: category suggestion on a small multi-column data set and the
 * effect of {@link SemanticAnalyzer#setLimit(int)} on the suggested category of a single column.
 */
public class SemanticAnalyzerTest {

    /** Single-column records fed to the analyzer in {@link #testSetLimit()}. */
    static final List<String[]> TEST_RECORDS = Arrays.<String[]> asList( //
            new String[] { "CHAT" }, //
            new String[] { "United States" }, //
            new String[] { "France" });

    /** Six-column records fed to the analyzer in {@link #testTagada()}. */
    static final List<String[]> TEST_RECORDS_TAGADA = Arrays.<String[]> asList( //
            new String[] { "1", "Lennon", "John", "40", "10/09/1940", "false" }, //
            new String[] { "2", "Bowie", "David", "67", "01/08/1947", "true" });

    /**
     * Expected suggested category per column of {@link #TEST_RECORDS_TAGADA}, by column index. Only
     * five entries are listed, so the sixth column of the records is not verified.
     */
    static final List<String> EXPECTED_CATEGORY_TAGADA = Arrays.asList( //
            "", //
            SemanticCategoryEnum.LAST_NAME.name(), //
            SemanticCategoryEnum.FIRST_NAME.name(), //
            "", //
            "");

    /** Recognizer builder shared by the tests; recreated before each test by {@link #setUp()}. */
    private CategoryRecognizerBuilder builder;

    /**
     * Builds a Lucene-based recognizer builder from the default dictionary and keyword resources
     * resolved on the test class path.
     *
     * @throws Exception if a resource URL cannot be converted to a {@link URI}
     */
    @Before
    public void setUp() throws Exception {
        final URI ddPath = this.getClass().getResource(CategoryRecognizerBuilder.DEFAULT_DD_PATH).toURI();
        final URI kwPath = this.getClass().getResource(CategoryRecognizerBuilder.DEFAULT_KW_PATH).toURI();
        builder = CategoryRecognizerBuilder.newBuilder() //
                .ddPath(ddPath) //
                .kwPath(kwPath) //
                .lucene();
    }

    /**
     * Analyzes {@link #TEST_RECORDS_TAGADA} and checks the suggested category of each column listed
     * in {@link #EXPECTED_CATEGORY_TAGADA}.
     */
    @Test
    public void testTagada() {
        final SemanticAnalyzer semanticAnalyzer = new SemanticAnalyzer(builder);
        final Analyzer<Result> analyzer = Analyzers.with(semanticAnalyzer);
        analyzer.init();
        for (String[] record : TEST_RECORDS_TAGADA) {
            analyzer.analyze(record);
        }
        analyzer.end();

        // Fetch the results once instead of on every loop iteration.
        final List<Result> results = analyzer.getResult();
        for (int i = 0; i < EXPECTED_CATEGORY_TAGADA.size(); i++) {
            final Result result = results.get(i);
            // Fail explicitly rather than silently skipping the check when no semantic type was produced.
            assertTrue("No SemanticType found for column " + i, result.exist(SemanticType.class));
            final SemanticType semanticType = result.get(SemanticType.class);
            assertEquals("Unexpected Category.", EXPECTED_CATEGORY_TAGADA.get(i), semanticType.getSuggestedCategory());
        }
    }

    /**
     * Checks that changing the analyzer limit changes the category suggested for
     * {@link #TEST_RECORDS}. NOTE(review): the expectations below suggest the limit caps the number
     * of analyzed records and that 0 means "no limit" - confirm against SemanticAnalyzer.
     */
    @Test
    public void testSetLimit() {
        final SemanticAnalyzer semanticAnalyzer = new SemanticAnalyzer(builder);

        semanticAnalyzer.setLimit(0);
        assertEquals("Unexpected Category.", SemanticCategoryEnum.COUNTRY.getId(), getSuggestedCategorys(semanticAnalyzer));

        semanticAnalyzer.setLimit(1);
        assertEquals("Unexpected Category.", SemanticCategoryEnum.ANIMAL.getId(), getSuggestedCategorys(semanticAnalyzer));

        semanticAnalyzer.setLimit(3);
        assertEquals("Unexpected Category.", SemanticCategoryEnum.COUNTRY.getId(), getSuggestedCategorys(semanticAnalyzer));
    }

    /**
     * Runs the given analyzer over {@link #TEST_RECORDS} and returns the category suggested for the
     * first (and only) column.
     *
     * @param semanticAnalyzer the analyzer to run; it is re-initialized through {@code init()} before use
     * @return the suggested category of column 0
     */
    private String getSuggestedCategorys(SemanticAnalyzer semanticAnalyzer) {
        final Analyzer<Result> analyzer = Analyzers.with(semanticAnalyzer);
        analyzer.init();
        for (String[] record : TEST_RECORDS) {
            analyzer.analyze(record);
        }
        analyzer.end();

        final Result result = analyzer.getResult().get(0);
        // A missing semantic type is a test failure in its own right, not a null category.
        assertTrue("No SemanticType found for column 0", result.exist(SemanticType.class));
        return result.get(SemanticType.class).getSuggestedCategory();
    }
}