// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.statistics.frequency.pattern;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.talend.dataquality.common.inference.ResizableList;
import org.talend.dataquality.statistics.frequency.AbstractFrequencyAnalyzer;
import org.talend.dataquality.statistics.frequency.AbstractFrequencyStatistics;
import org.talend.dataquality.statistics.frequency.recognition.AbstractPatternRecognizer;
import org.talend.dataquality.statistics.frequency.recognition.DateTimePatternRecognizer;
import org.talend.dataquality.statistics.frequency.recognition.EmptyPatternRecognizer;
import org.talend.dataquality.statistics.frequency.recognition.LatinExtendedCharPatternRecognizer;
import org.talend.dataquality.statistics.frequency.recognition.RecognitionResult;
import org.talend.dataquality.statistics.type.DataTypeEnum;
/**
* Compute the pattern frequency tables.<br>
* This class is a composite analyzer that it will automatically attribute a character to the correct pattern group.
*
* @since 1.3.3
* @author mzhao
*
*/
public class CompositePatternFrequencyAnalyzer extends AbstractFrequencyAnalyzer<PatternFrequencyStatistics> {
private static final long serialVersionUID = -4658709249927616622L;
private List<AbstractPatternRecognizer> patternFreqRecognizers = new ArrayList<AbstractPatternRecognizer>();
private DataTypeEnum[] types; // types of columns
public CompositePatternFrequencyAnalyzer() {
this(new DataTypeEnum[] {});
}
public CompositePatternFrequencyAnalyzer(DataTypeEnum[] types) {
patternFreqRecognizers.add(new EmptyPatternRecognizer());
patternFreqRecognizers.add(new DateTimePatternRecognizer());
patternFreqRecognizers.add(new LatinExtendedCharPatternRecognizer());
this.types = types;
}
public CompositePatternFrequencyAnalyzer(List<AbstractPatternRecognizer> analyzerList) {
this(analyzerList, new DataTypeEnum[] {});
}
public CompositePatternFrequencyAnalyzer(List<AbstractPatternRecognizer> analyzerList, DataTypeEnum[] types) {
patternFreqRecognizers.addAll(analyzerList);
this.types = types;
}
@Override
public boolean analyze(String... record) {
if (record == null) {
return true;
}
if (freqTableStatistics == null || freqTableStatistics.isEmpty()) {
initFreqTableList(record.length);
}
for (int i = 0; i < record.length; i++) {
AbstractFrequencyStatistics freqStats = freqTableStatistics.get(i);
if (types.length > 0) {
analyzeField(record[i], freqStats, types[i]);
} else {
analyzeField(record[i], freqStats, null);
}
}
return true;
}
protected void analyzeField(String field, AbstractFrequencyStatistics freqStats, DataTypeEnum type) {
for (String pattern : getValuePatternSet(field, type)) {
freqStats.add(pattern);
}
}
@Override
protected void analyzeField(String field, AbstractFrequencyStatistics freqStats) {
for (String pattern : getValuePatternSet(field)) {
freqStats.add(pattern);
}
}
/**
* Recognize the string and return the pattern of the string with a boolean indicating the pattern replacement is
* complete if true ,false otherwise.
*
* @param originalValue the string to be replaced by its pattern string
* @return the recognition result bean.
*/
Set<String> getValuePatternSet(String originalValue) {
return getValuePatternSet(originalValue, null);
}
/**
* Recognize the string and return the pattern of the string with a boolean indicating the pattern replacement is
* complete if true ,false otherwise.
*
* @param originalValue the string to be replaced by its pattern string
* @param type the data type
* @return the recognition result bean.
*/
Set<String> getValuePatternSet(String originalValue, DataTypeEnum type) {
Set<String> resultSet = new HashSet<String>();
String patternString = originalValue;
for (AbstractPatternRecognizer recognizer : patternFreqRecognizers) {
RecognitionResult result = recognizer.recognize(patternString, type);
resultSet = result.getPatternStringSet();
if (result.isComplete()) {
break;
} else {
if (!resultSet.isEmpty()) {
patternString = resultSet.iterator().next();
}
}
}
// value is not recognized completely.
return resultSet;
}
@Override
protected void initFreqTableList(int size) {
List<PatternFrequencyStatistics> freqTableList = new ArrayList<>();
for (int i = 0; i < size; i++) {
PatternFrequencyStatistics freqTable = new PatternFrequencyStatistics();
freqTable.setAlgorithm(algorithm);
freqTableList.add(freqTable);
}
freqTableStatistics = new ResizableList<>(freqTableList);
}
}