// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.semantic.statistics;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.NotImplementedException;
import org.talend.dataquality.common.inference.Analyzer;
import org.talend.dataquality.common.inference.ResizableList;
import org.talend.dataquality.semantic.recognizer.CategoryFrequency;
import org.talend.dataquality.semantic.recognizer.CategoryRecognizer;
import org.talend.dataquality.semantic.recognizer.CategoryRecognizerBuilder;
/**
* Semantic type infer executor. <br>
*
* @see Analyzer
*
*/
public class SemanticAnalyzer implements Analyzer<SemanticType> {
private static final long serialVersionUID = 6808620909722453108L;
private final ResizableList<SemanticType> results = new ResizableList<>(SemanticType.class);
private final Map<Integer, CategoryRecognizer> columnIdxToCategoryRecognizer = new HashMap<>();
private final CategoryRecognizerBuilder builder;
// Threshold of handle to be run. since the semantic inferring will require
// more time than expected, we may only want to run the handle method on a
// sample with small size. Default value is 10000.
private int limit = 10000;
private int currentCount = 0;
public SemanticAnalyzer(CategoryRecognizerBuilder builder) {
this(builder, 10000);
}
public SemanticAnalyzer(CategoryRecognizerBuilder builder, int limit) {
this.builder = builder;
this.limit = limit;
builder.initIndex();
}
/**
* Set the maximum of records this semantic analyzer is expected to process. Any value <= 0 is considered as
* "no limit". A value of 1 will only analyze first call to {@link #analyze(String...)}.
*
* @param limit A integer that indicate the maximum number of record this analyzer should process.
*/
public void setLimit(int limit) {
this.limit = limit;
}
@Override
public void init() {
currentCount = 0;
columnIdxToCategoryRecognizer.clear();
results.clear();
builder.initIndex();
}
/**
* Analyze the record by guessing the data semantic type.
*/
@Override
public boolean analyze(String... record) {
results.resize(record.length);
resizeCategoryRecognizer(record);
if (currentCount < limit || limit <= 0) {
for (int i = 0; i < record.length; i++) {
CategoryRecognizer categoryRecognizer = columnIdxToCategoryRecognizer.get(i);
if (categoryRecognizer == null) {
throw new RuntimeException("CategoryRecognizer is null for record and i=" + i + " " + Arrays.asList(record));
} else {
categoryRecognizer.process(record[i]);
}
}
currentCount++;
}
return true;
}
private void resizeCategoryRecognizer(String[] record) {
if (columnIdxToCategoryRecognizer.size() > 0) {
// already resized
return;
}
for (int idx = 0; idx < record.length; idx++) {
try {
CategoryRecognizer recognizer = builder.build();
columnIdxToCategoryRecognizer.put(idx, recognizer);
} catch (IOException e) {
throw new IllegalArgumentException("Unable to configure category recognizer with builder.", e);
}
}
}
@Override
public void end() {
// do nothing
}
/**
* Get a list of guessed semantic type with type {{@link SemanticType}
*/
@Override
public List<SemanticType> getResult() {
for (Integer colIdx : columnIdxToCategoryRecognizer.keySet()) {
Collection<CategoryFrequency> result = columnIdxToCategoryRecognizer.get(colIdx).getResult();
for (CategoryFrequency semCategory : result) {
results.get(colIdx).increment(semCategory, semCategory.getCount());
}
}
return results;
}
@Override
public Analyzer<SemanticType> merge(Analyzer<SemanticType> another) {
throw new NotImplementedException("Merge function is not implemented.");
}
@Override
public void close() throws Exception {
for (CategoryRecognizer catRecognizer : columnIdxToCategoryRecognizer.values()) {
catRecognizer.end();
}
}
}