// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.semantic.statistics;
import java.io.IOException;
import java.util.*;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import org.talend.dataquality.common.inference.Analyzer;
import org.talend.dataquality.common.inference.QualityAnalyzer;
import org.talend.dataquality.common.inference.ResizableList;
import org.talend.dataquality.common.inference.ValueQualityStatistics;
import org.talend.dataquality.semantic.api.CategoryRegistryManager;
import org.talend.dataquality.semantic.classifier.ISubCategoryClassifier;
import org.talend.dataquality.semantic.classifier.impl.DataDictFieldClassifier;
import org.talend.dataquality.semantic.model.CategoryType;
import org.talend.dataquality.semantic.model.DQCategory;
import org.talend.dataquality.semantic.recognizer.CategoryRecognizer;
import org.talend.dataquality.semantic.recognizer.CategoryRecognizerBuilder;
import org.talend.dataquality.semantic.recognizer.LFUCache;
/**
* created by talend on 2015-07-28 Detailled comment.
*
*/
public class SemanticQualityAnalyzer extends QualityAnalyzer<ValueQualityStatistics, String[]> {
private static final long serialVersionUID = -5951511723860660263L;
private static final Logger LOG = Logger.getLogger(SemanticQualityAnalyzer.class);
private final ResizableList<ValueQualityStatistics> results = new ResizableList<>(ValueQualityStatistics.class);
private final Map<String, LFUCache<String, Boolean>> knownValidationCategoryCache = new HashMap<>();
private ISubCategoryClassifier regexClassifier;
private ISubCategoryClassifier dataDictClassifier;
private CategoryRegistryManager crm;
private final CategoryRecognizerBuilder builder;
public SemanticQualityAnalyzer(CategoryRecognizerBuilder builder, String[] types, boolean isStoreInvalidValues) {
this.isStoreInvalidValues = isStoreInvalidValues;
this.builder = builder;
setTypes(types);
init();
}
public SemanticQualityAnalyzer(CategoryRecognizerBuilder builder, String... types) {
this(builder, types, false);
}
@Override
public void init() {
try {
final CategoryRecognizer categoryRecognizer = builder.build();
regexClassifier = categoryRecognizer.getUserDefineClassifier();
dataDictClassifier = categoryRecognizer.getDataDictFieldClassifier();
crm = categoryRecognizer.getCrm();
} catch (IOException e) {
LOG.error(e, e);
}
results.clear();
}
@Override
public void setStoreInvalidValues(boolean isStoreInvalidValues) {
this.isStoreInvalidValues = isStoreInvalidValues;
}
/**
* @deprecated use {@link #analyze(String...)}
* <p>
* TODO remove this method later
*
* Analyze record of Array of string type, this method is used in scala library which not support parameterized
* array type.
*
* @param record
* @return
*/
@Deprecated
public boolean analyzeArray(String[] record) {
return analyze(record);
}
/**
* TODO use String[] as parameter for this method.
*/
@Override
public boolean analyze(String... record) {
if (record == null) {
results.resize(0);
return true;
}
results.resize(record.length);
for (int i = 0; i < record.length; i++) {
String semanticType = getTypes()[i];
final String value = record[i];
final ValueQualityStatistics valueQuality = results.get(i);
if (value == null || value.trim().length() == 0) {
valueQuality.incrementEmpty();
} else {
analyzeValue(semanticType, value, valueQuality);
}
}
return true;
}
private void analyzeValue(String catName, String value, ValueQualityStatistics valueQuality) {
DQCategory category = null;
for (String id : CategoryRegistryManager.getInstance().getCategoryIds()) {
DQCategory tmp = CategoryRegistryManager.getInstance().getCategoryMetadataById(id);
if (catName.equals(tmp.getName())) {
category = tmp;
break;
}
}
if (category == null) {
valueQuality.incrementValid();
return;
}
if (category.getCompleteness() != null && category.getCompleteness().booleanValue()) {
if (isValid(category, category.getType(), value)) {
valueQuality.incrementValid();
} else {
valueQuality.incrementInvalid();
processInvalidValue(valueQuality, value);
}
} else {
valueQuality.incrementValid();
}
}
private boolean isValid(DQCategory category, CategoryType catType, String value) {
LFUCache<String, Boolean> categoryCache = knownValidationCategoryCache.get(category.getId());
if (categoryCache == null) {
categoryCache = new LFUCache<String, Boolean>(10, 1000, 0.01f);
knownValidationCategoryCache.put(category.getId(), categoryCache);
} else {
final Boolean isValid = categoryCache.get(value);
if (isValid != null) {
return isValid;
}
}
boolean validCat = false;
switch (catType) {
case REGEX:
validCat = regexClassifier.validCategories(value, category, null);
break;
case DICT:
validCat = dataDictClassifier.validCategories(value, category, null);
break;
case COMPOUND:
Map<CategoryType, Set<DQCategory>> children = getChildrenCategories(category.getId());
validCat = regexClassifier.validCategories(value, category, children.get(CategoryType.REGEX));
if (!validCat)
validCat = dataDictClassifier.validCategories(value, category, children.get(CategoryType.DICT));
break;
default:
break;
}
categoryCache.put(value, validCat);
return validCat;
}
private void processInvalidValue(ValueQualityStatistics valueQuality, String invalidValue) {
if (isStoreInvalidValues) {
valueQuality.appendInvalidValue(invalidValue);
}
}
/**
* For the validation of a COMPOUND category, we only have to valid the leaves children categories.
* This methods find the DICT children categories and the REGEX children categories.
*
* @param id, the category from we search the children
* @return the DICT children categories and the REGEX children categories with a map.
*/
private Map<CategoryType, Set<DQCategory>> getChildrenCategories(String id) {
Deque<String> catToSee = new ArrayDeque<>();
Set<String> catAlreadySeen = new HashSet<>();
Map<CategoryType, Set<DQCategory>> children = new HashMap<>();
children.put(CategoryType.REGEX, new HashSet<DQCategory>());
children.put(CategoryType.DICT, new HashSet<DQCategory>());
catToSee.add(id);
String currentCategory;
while (!catToSee.isEmpty()) {
currentCategory = catToSee.pop();
DQCategory dqCategory = crm.getCategoryMetadataById(currentCategory);
if (dqCategory != null)
if (!CollectionUtils.isEmpty(dqCategory.getChildren())) {
for (DQCategory child : dqCategory.getChildren()) {
if (!catAlreadySeen.contains(child.getId())) {
catAlreadySeen.add(child.getId());
catToSee.add(child.getId());
}
}
} else if (!currentCategory.equals(id)) {
children.get(dqCategory.getType()).add(dqCategory);
}
}
return children;
}
@Override
public void end() {
// do some finalized thing at here.
}
@Override
public List<ValueQualityStatistics> getResult() {
return results;
}
@Override
public Analyzer<ValueQualityStatistics> merge(Analyzer<ValueQualityStatistics> analyzer) {
int idx = 0;
SemanticQualityAnalyzer mergedValueQualityAnalyze = new SemanticQualityAnalyzer(this.builder, getTypes());
((ResizableList<ValueQualityStatistics>) mergedValueQualityAnalyze.getResult()).resize(results.size());
for (ValueQualityStatistics qs : results) {
ValueQualityStatistics mergedStats = mergedValueQualityAnalyze.getResult().get(idx);
ValueQualityStatistics anotherStats = analyzer.getResult().get(idx);
mergedStats.setValidCount(qs.getValidCount() + anotherStats.getValidCount());
mergedStats.setInvalidCount(qs.getInvalidCount() + anotherStats.getInvalidCount());
mergedStats.setEmptyCount(qs.getEmptyCount() + anotherStats.getEmptyCount());
if (!qs.getInvalidValues().isEmpty()) {
mergedStats.getInvalidValues().addAll(qs.getInvalidValues());
}
if (!anotherStats.getInvalidValues().isEmpty()) {
mergedStats.getInvalidValues().addAll(anotherStats.getInvalidValues());
}
idx++;
}
return mergedValueQualityAnalyze;
}
@Override
public void close() throws Exception {
((DataDictFieldClassifier) dataDictClassifier).closeIndex();
}
}