// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.statistics.frequency.recognition;
import java.util.Collections;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.talend.dataquality.statistics.type.DataTypeEnum;
/**
* * Recognize ascii characters given predefined list of Ascii characters and its pattern mappings.
*
* @since 1.3.0
* @author mzhao
*/
public class LatinExtendedCharPatternRecognizer extends AbstractPatternRecognizer {
public static final String CHARS_TO_REPLACE = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞß0123456789";
public static final String REPLACEMENT_CHARS = "aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAAaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA9999999999";
private Pattern charsPattern = Pattern.compile("[a-z|A-Z|à-ÿ|À-ß]");
@Override
public RecognitionResult recognize(String stringToRecognize, DataTypeEnum type) {
RecognitionResult result = new RecognitionResult();
if (StringUtils.isEmpty(stringToRecognize)) {
result.setResult(Collections.singleton(stringToRecognize), false);
return result;
}
boolean isComplete = true;
StringBuilder sb = new StringBuilder();
int n = stringToRecognize.length();
for (int i = 0; i < n; i++) {
char c = stringToRecognize.charAt(i);
int pos = CHARS_TO_REPLACE.indexOf(c);
if (pos > -1) {
sb.append(REPLACEMENT_CHARS.charAt(pos));
} else {
sb.append(c);
isComplete = false;
}
}
result.setResult(Collections.singleton(sb.toString()), isComplete);
return result;
}
/**
* Whether the patternString contains the predefined alpha character.
*
* @param patternString
* @return
*/
public boolean containsAlphabetic(String patternString) {
return charsPattern.matcher(patternString).find();
}
@Override
public Set<String> getValuePattern(String originalValue) {
RecognitionResult result = recognize(originalValue);
return result.getPatternStringSet();
}
}