/**
* <copyright> </copyright>
*
* $Id$
*/
package org.talend.dataquality.indicators.impl;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.codec.language.Soundex;
import org.apache.log4j.Logger;
import org.eclipse.emf.common.notify.Notification;
import org.eclipse.emf.ecore.EClass;
import org.eclipse.emf.ecore.impl.ENotificationImpl;
import org.talend.commons.utils.WorkspaceUtils;
import org.talend.dataquality.PluginConstant;
import org.talend.dataquality.helpers.IndicatorHelper;
import org.talend.dataquality.indicators.IndicatorsPackage;
import org.talend.dataquality.indicators.SoundexFreqIndicator;
import org.talend.dataquality.indicators.mapdb.StandardDBName;
import org.talend.utils.collections.MapValueSorter;
/**
* <!-- begin-user-doc --> An implementation of the model object '<em><b>Soundex Freq Indicator</b></em>'. <!--
* end-user-doc -->
* <p>
* The following features are implemented:
* <ul>
* <li>{@link org.talend.dataquality.indicators.impl.SoundexFreqIndicatorImpl#getValueToDistinctFreq <em>Value To Distinct Freq</em>}</li>
* </ul>
* </p>
*
* @generated
*/
public class SoundexFreqIndicatorImpl extends FrequencyIndicatorImpl implements SoundexFreqIndicator {
/**
* The default value of the '{@link #getValueToDistinctFreq() <em>Value To Distinct Freq</em>}' attribute. <!--
* begin-user-doc --> <!-- end-user-doc -->
*
* @see #getValueToDistinctFreq()
* @generated
* @ordered
*/
protected static final HashMap<Object, Long> VALUE_TO_DISTINCT_FREQ_EDEFAULT = null;
/**
* The cached value of the '{@link #getValueToDistinctFreq() <em>Value To Distinct Freq</em>}' attribute. <!--
* begin-user-doc --> <!-- end-user-doc -->
*
* @see #getValueToDistinctFreq()
* @generated
* @ordered
*/
protected HashMap<Object, Long> valueToDistinctFreq = VALUE_TO_DISTINCT_FREQ_EDEFAULT;
/**
* Per-soundex-group bookkeeping that {@link #handle(Object)} fills while {@code isUsedMapDBMode()} is true.
* The key is the soundex code of the handled value (the {@code null} key is used for {@code null} data). Each
* value is a four-element list: [0] input value kept for the group, [1] soundex code, [2] distinct count,
* [3] duplicate count. The map is (re)created in {@link #reset()} through {@code initValueForDBMap(...)}.
*/
protected Map<Object, List<Object>> soundexFreqMap = null;
/** Soundex encoder used by {@link #handle(Object)} to compute the group key of each value. */
private Soundex soundex = new Soundex();
/** Class logger (log4j). */
private static Logger log = Logger.getLogger(SoundexFreqIndicatorImpl.class);
/**
* <!-- begin-user-doc --> <!-- end-user-doc -->
* @generated
*/
protected SoundexFreqIndicatorImpl() {
super();
}
/**
 * <!-- begin-user-doc --> Returns the EMF meta-class literal describing this indicator type.
 * <!-- end-user-doc -->
 *
 * @return {@code IndicatorsPackage.Literals.SOUNDEX_FREQ_INDICATOR}
 * @generated
 */
@Override
protected EClass eStaticClass() {
    final EClass metaClass = IndicatorsPackage.Literals.SOUNDEX_FREQ_INDICATOR;
    return metaClass;
}
/**
 * <!-- begin-user-doc --> Returns the cached value-to-distinct-frequency map; it is {@code null} until it
 * has been set. <!-- end-user-doc -->
 *
 * @return the current map, possibly {@code null}
 * @generated
 */
@Override
public HashMap<Object, Long> getValueToDistinctFreq() {
    return this.valueToDistinctFreq;
}
/**
 * <!-- begin-user-doc --> Replaces the value-to-distinct-frequency map and, when notifications are
 * required, fires a {@code SET} notification carrying the old and new maps. <!-- end-user-doc -->
 *
 * @param newValueToDistinctFreq the map to store, may be {@code null}
 * @generated
 */
@Override
public void setValueToDistinctFreq(HashMap<Object, Long> newValueToDistinctFreq) {
    final HashMap<Object, Long> previous = valueToDistinctFreq;
    valueToDistinctFreq = newValueToDistinctFreq;
    if (!eNotificationRequired()) {
        return;
    }
    eNotify(new ENotificationImpl(this, Notification.SET,
            IndicatorsPackage.SOUNDEX_FREQ_INDICATOR__VALUE_TO_DISTINCT_FREQ, previous, valueToDistinctFreq));
}
/**
 * <!-- begin-user-doc --> Looks up the distinct count recorded for the given value. <!-- end-user-doc -->
 *
 * @param dataValue the key to look up
 * @return the recorded distinct count, or {@code 0} when the map is {@code null}, empty, or has no entry
 * for {@code dataValue}
 * @generated NOT getDistinctCount(Object dataValue)
 */
@Override
public Long getDistinctCount(Object dataValue) {
    // MOD xqliu 2009-07-01 bug 7068: tolerate a missing or empty map
    if (this.valueToDistinctFreq == null || this.valueToDistinctFreq.size() == 0) {
        return 0L;
    }
    final Long recorded = this.valueToDistinctFreq.get(dataValue);
    if (recorded == null) {
        return 0L;
    }
    return recorded;
}
/**
 * <!-- begin-user-doc --> Reflective getter: serves the value-to-distinct-frequency feature locally and
 * delegates every other feature id to the superclass. <!-- end-user-doc -->
 *
 * @generated
 */
@Override
public Object eGet(int featureID, boolean resolve, boolean coreType) {
    if (featureID == IndicatorsPackage.SOUNDEX_FREQ_INDICATOR__VALUE_TO_DISTINCT_FREQ) {
        return getValueToDistinctFreq();
    }
    return super.eGet(featureID, resolve, coreType);
}
/**
 * <!-- begin-user-doc --> Reflective setter: stores the value-to-distinct-frequency feature locally and
 * delegates every other feature id to the superclass. <!-- end-user-doc -->
 *
 * @generated
 */
@SuppressWarnings("unchecked")
@Override
public void eSet(int featureID, Object newValue) {
    if (featureID == IndicatorsPackage.SOUNDEX_FREQ_INDICATOR__VALUE_TO_DISTINCT_FREQ) {
        setValueToDistinctFreq((HashMap<Object, Long>) newValue);
        return;
    }
    super.eSet(featureID, newValue);
}
/**
 * <!-- begin-user-doc --> Reflective unset: restores the value-to-distinct-frequency feature to its
 * default and delegates every other feature id to the superclass. <!-- end-user-doc -->
 *
 * @generated
 */
@Override
public void eUnset(int featureID) {
    if (featureID == IndicatorsPackage.SOUNDEX_FREQ_INDICATOR__VALUE_TO_DISTINCT_FREQ) {
        setValueToDistinctFreq(VALUE_TO_DISTINCT_FREQ_EDEFAULT);
        return;
    }
    super.eUnset(featureID);
}
/**
 * <!-- begin-user-doc --> Reflective "is set" check: the value-to-distinct-frequency feature counts as
 * set when it differs from its default; every other feature id is delegated to the superclass.
 * <!-- end-user-doc -->
 *
 * @generated
 */
@Override
public boolean eIsSet(int featureID) {
    if (featureID != IndicatorsPackage.SOUNDEX_FREQ_INDICATOR__VALUE_TO_DISTINCT_FREQ) {
        return super.eIsSet(featureID);
    }
    if (VALUE_TO_DISTINCT_FREQ_EDEFAULT == null) {
        return valueToDistinctFreq != null;
    }
    return !VALUE_TO_DISTINCT_FREQ_EDEFAULT.equals(valueToDistinctFreq);
}
/**
 * <!-- begin-user-doc --> Returns the superclass description followed by the value-to-distinct-frequency
 * map; proxies return the superclass description unchanged. <!-- end-user-doc -->
 *
 * @generated
 */
@Override
public String toString() {
    if (eIsProxy()) {
        return super.toString();
    }
    return new StringBuilder(super.toString())
            .append(" (valueToDistinctFreq: ")
            .append(valueToDistinctFreq)
            .append(')')
            .toString();
}
/**
 * Stores the rows returned by the soundex frequency SQL query.
 * <p>
 * Each row is expected to have four columns: value, soundex, count(*), count(distinct). The value column
 * becomes the key of both result maps; count(*) goes to the value-to-frequency map and count(distinct) to
 * the value-to-distinct-frequency map.
 * <p>
 * When no row is returned, BOTH maps are replaced by empty maps so that no distinct counts from an
 * earlier run survive next to an empty frequency map.
 * <p>
 * MOD scorreia 2009-02-09 storeSqlResults(List<Object[]> objects)
 *
 * @param objects the result rows, must not be {@code null}
 * @return {@code true} when the rows were stored (or there were none), {@code false} when
 * {@code checkResults} rejects them or a row does not have exactly four columns
 * @see org.talend.dataquality.indicators.impl.FrequencyIndicatorImpl#storeSqlResults(java.util.List)
 */
@Override
public boolean storeSqlResults(List<Object[]> objects) {
    // handle case when no row is returned because there is no value.
    if (objects.isEmpty()) {
        if (log.isInfoEnabled()) {
            log.info("Query for soundex frequency table did not return any result. " + "Check the options of this indicator.");
        }
        this.setValueToFreq(new HashMap<Object, Long>());
        // keep the distinct map consistent with the (now empty) frequency map
        this.setValueToDistinctFreq(new HashMap<Object, Long>());
        return true;
    } // else we got some values

    final int nbColumns = 4;
    // columns of result set are: value, soundex, count(*) , count(distinct)
    if (!checkResults(objects, nbColumns)) {
        return false;
    }

    // --- count the distinct records
    HashMap<Object, Long> mapVal2DistinctFreq = new HashMap<Object, Long>();
    HashMap<Object, Long> mapVal2Freq = new HashMap<Object, Long>();
    final boolean debug = log.isDebugEnabled();
    // only built when debug logging is on; single-threaded use, so StringBuilder is sufficient
    final StringBuilder matrix = debug ? new StringBuilder() : null;
    for (Object[] value2freq : objects) {
        if (value2freq.length != nbColumns) {
            log.error("Problem with result for Frequency indicator");
            return false;
        }
        Object value = getValueFields(value2freq);
        // MOD gdbu 2011-4-14 bug 18975: convert through the helper instead of Long.valueOf(String.valueOf(..))
        Long freq = IndicatorHelper.getLongFromObject(value2freq[nbColumns - 2]);
        mapVal2Freq.put(value, freq);
        Long distinctFreq = IndicatorHelper.getLongFromObject(value2freq[nbColumns - 1]);
        mapVal2DistinctFreq.put(value, distinctFreq);
        if (debug) {
            matrix.append("\n").append("\"").append(value).append("\"").append(",").append(freq);
        }
    }
    if (debug) {
        log.debug(matrix);
    }
    this.setValueToFreq(mapVal2Freq);
    this.setValueToDistinctFreq(mapVal2DistinctFreq);
    return true;
}
/**
 * Extracts the key of a result row: the first column (the value) of the four-column row.
 *
 * @param value2freq one result row; asserted to have exactly four columns
 * @return the first element of the row
 * @see org.talend.dataquality.indicators.impl.FrequencyIndicatorImpl#getValueFields(java.lang.Object[])
 */
@Override
protected Object getValueFields(Object[] value2freq) {
    final int expectedColumns = 4;
    assert value2freq.length == expectedColumns;
    final Object valueColumn = value2freq[0];
    return valueColumn;
}
/**
 * Feeds one data value into the indicator.
 * <p>
 * In MapDB mode the value is first recorded in {@link #soundexFreqMap}, whose entries are four-element
 * lists: [0] input value, [1] soundex code, [2] distinct count, [3] duplicate count. This bookkeeping
 * runs BEFORE {@code super.handle(data)} because it consults {@code getMapForFreq()} to decide whether
 * the value has been seen already (presumably the superclass records the value there; verify against
 * {@code FrequencyIndicatorImpl}).
 * <p>
 * A value whose text the soundex encoder rejects with {@link IllegalArgumentException} is logged and
 * left out of the soundex grouping instead of aborting the run, matching what
 * {@code soundexForJavaEngine()} does; it is still passed to {@code super.handle(data)}.
 *
 * @param data the value to handle, may be {@code null}
 * @return the result of {@code super.handle(data)}
 */
@Override
public boolean handle(Object data) {
    if (isUsedMapDBMode()) {
        if (data == null) {
            countNullInSoundexMap();
        } else {
            countValueInSoundexMap(data);
        }
    }
    boolean returnValue = super.handle(data);
    this.mustStoreRow = false;
    return returnValue;
}

/** Records one more {@code null} value under the {@code null} key of {@link #soundexFreqMap}. */
private void countNullInSoundexMap() {
    List<Object> valueList = soundexFreqMap.get(null);
    if (valueList == null) {
        valueList = new ArrayList<Object>();
        valueList.add(null);// input value
        valueList.add(null);// soundex value
        valueList.add(0);// distinct count
        valueList.add(1);// duplicate count
    } else {
        // null already present: duplicate count +1
        incrementSoundexCounter(valueList, 3);
    }
    soundexFreqMap.put(null, valueList);
}

/** Records one non-null value under its soundex code in {@link #soundexFreqMap}. */
private void countValueInSoundexMap(Object data) {
    final String text = data.toString();
    String soundexValue;
    try {
        soundexValue = soundex.soundex(text);
    } catch (IllegalArgumentException ex) {
        // the encoder cannot map some character of this value: skip it for the soundex grouping only
        log.warn("Soundex algorithm does not support the characters: " + text);
        return;
    }
    List<Object> valueList = soundexFreqMap.get(soundexValue);
    if (valueList == null) {
        valueList = new ArrayList<Object>();
        valueList.add(data);// input value
        valueList.add(soundexValue);// soundex value
        valueList.add(1);// distinct count
        valueList.add(0);// duplicate count
    } else if (!getMapForFreq().containsKey(data)) {
        // known soundex code but new input value: distinct count +1
        incrementSoundexCounter(valueList, 2);
        // the group keeps the greatest input value (string order) as its key
        if (valueList.get(0).toString().compareTo(text) < 0) {
            valueList.set(0, data);
        }
    } else {
        // known soundex code and known input value: duplicate count +1
        incrementSoundexCounter(valueList, 3);
    }
    soundexFreqMap.put(soundexValue, valueList);
}

/**
 * Adds one to the counter stored at {@code index}. Counters are read back through {@code toString()}
 * because the list holds a mix of {@code Integer} and {@code Long} values.
 */
private static void incrementSoundexCounter(List<Object> valueList, int index) {
    long current = Long.parseLong(valueList.get(index).toString());
    valueList.set(index, Long.valueOf(current + 1L));
}
/*
* Java-engine post-processing (added by qiongli 2010-6-22, bug 13654).
*
* Groups the keys of getValueToFreq() by their soundex code and then replaces both result maps:
* - valueToFreq: one entry per soundex group, holding the summed frequencies of the group's values;
* - valueToDistinctFreq: one entry per soundex group, holding the number of distinct values in the group.
* The key of a group is its greatest value in String.compareTo order. A null value is never grouped and is
* stored under WorkspaceUtils.NULL_FIELD. Values rejected by the soundex encoder are logged and left out.
*/
protected void soundexForJavaEngine() {
Iterator<Object> iterator = this.getValueToFreq().keySet().iterator();
Soundex sd = new Soundex();
HashMap<Object, Long> disctinctVfMap = new HashMap<Object, Long>();
// work list of {value text, soundex code, distinct count (kept as a decimal string)}
List<String[]> valueToFreqLs = new ArrayList<String[]>();
// --- phase 1: compute the soundex code of every key
while (iterator.hasNext()) {
String array[] = new String[3];
Object obj = iterator.next();
if (obj == null) {
array[0] = null;
array[2] = String.valueOf(0);// distinct count: a null value does not count as a distinct value
} else {
array[0] = obj.toString();
array[2] = String.valueOf(1);// distinct count: the value itself
}
try {
array[1] = sd.soundex(array[0]);// soundex value
} catch (IllegalArgumentException ex) {
// the encoder rejected this value: report it and leave it out of the results
log.warn("Soundex algorithm do not support the charactors: " + array[0]);
continue;
}
valueToFreqLs.add(array);
}
String foreArray[] = null;
String afterArray[] = null;
HashMap<Object, Long> vfMap = new HashMap<Object, Long>();
// --- phase 2: merge every later entry sharing the soundex code of entry i into entry i.
// remove duplicate key of valueToFreqLs, and get total count every key for valueToFreq.
for (int i = 0; i < valueToFreqLs.size(); i++) {
foreArray = valueToFreqLs.get(i);
// MOD qiongli 7-21: the null value is reported under NULL_FIELD and never merged with other entries
if (foreArray[0] == null) {
disctinctVfMap.put(WorkspaceUtils.NULL_FIELD, Long.valueOf(foreArray[2]));
vfMap.put(WorkspaceUtils.NULL_FIELD, getMapForFreq().get(foreArray[0]));
continue;
}
// start the group with the frequency of its first value
vfMap.put(foreArray[0], getMapForFreq().get(foreArray[0]));
for (int j = i + 1; j < valueToFreqLs.size(); j++) {
afterArray = valueToFreqLs.get(j);
if (afterArray[0] == null) {
continue;
}
if (afterArray[1].equals(foreArray[1])) {
// same soundex code: one more distinct value in the group
foreArray[2] = Long.valueOf(foreArray[2]).intValue() + 1 + "";
// List.remove(Object) on an array element matches by identity (arrays do not override equals);
// j is stepped back so the element that slides into position j is examined next
valueToFreqLs.remove(afterArray);
j--;
// group frequency = frequency accumulated so far + frequency of the merged value
Long newLong = Long.valueOf(vfMap.get(foreArray[0]).longValue()
+ getMapForFreq().get(afterArray[0]).longValue());
// drop the entry under the old key before the key is possibly replaced below
vfMap.remove(foreArray[0]);
// same as function max in sql engine: keep the greater of the two values as the group key
if (foreArray[0].compareTo(afterArray[0]) < 0) {
foreArray[0] = afterArray[0];
}
vfMap.put(foreArray[0], newLong);
}
}
// the group is complete: record its distinct count under its final key
disctinctVfMap.put(foreArray[0], Long.valueOf(foreArray[2]));
}
setValueToFreq(vfMap);
setValueToDistinctFreq(disctinctVfMap);
}
/**
 * Computes the soundex result in MapDB mode: picks at most topN groups out of {@link #soundexFreqMap},
 * ranked by their distinct count, and publishes them through {@code setValueToFreq} and
 * {@code setValueToDistinctFreq}.
 * <p>
 * The ranking is kept in three parallel arrays; the extra slot at index topN is scratch space for the
 * entry being pushed down. An entry pushed past the last slot is dropped from both result maps.
 *
 * @param asc when {@code true} a group with a greater distinct count displaces a ranked group; when
 * {@code false} a group with a smaller distinct count does
 */
protected void computeSoundexFreqByMapDB(boolean asc) {
    final int topN = (parameters != null) ? parameters.getTopN() : PluginConstant.DEFAULT_TOP_N;
    final int scratch = topN;
    final HashMap<Object, Long> freqByKey = new HashMap<Object, Long>();
    final HashMap<Object, Long> distinctByKey = new HashMap<Object, Long>();
    final Object[] rankedKeys = new Object[topN + 1];// input value
    final Long[] rankedDistinct = new Long[topN + 1];
    final Long[] rankedTotals = new Long[topN + 1];// number of items contained in the soundex group

    for (List<Object> group : soundexFreqMap.values()) {
        Long candidateDistinct = Long.valueOf(group.get(2).toString());
        Object candidateKey = group.get(0);
        // items in the group = duplicate count + distinct count
        Long candidateTotal = Long.valueOf(group.get(3).toString()) + candidateDistinct;

        // find out TopN elements
        for (int slot = 0; slot < topN; slot++) {
            if (rankedDistinct[slot] == null) {
                // free slot: the candidate settles here
                rankedDistinct[slot] = candidateDistinct;
                rankedKeys[slot] = candidateKey;
                rankedTotals[slot] = candidateTotal;
                freqByKey.put(rankedKeys[slot], rankedTotals[slot]);
                distinctByKey.put(rankedKeys[slot], rankedDistinct[slot]);
                break;
            }
            final boolean displaces = asc ? candidateDistinct > rankedDistinct[slot]
                    : candidateDistinct < rankedDistinct[slot];
            if (!displaces) {
                continue;
            }
            // park the current occupant in the scratch slot
            rankedDistinct[scratch] = rankedDistinct[slot];
            rankedTotals[scratch] = rankedTotals[slot];
            rankedKeys[scratch] = rankedKeys[slot];
            // the candidate takes the slot; the occupant leaves the result maps
            rankedDistinct[slot] = candidateDistinct;
            rankedTotals[slot] = candidateTotal;
            freqByKey.remove(rankedKeys[slot]);
            distinctByKey.remove(rankedKeys[slot]);
            rankedKeys[slot] = candidateKey;
            freqByKey.put(rankedKeys[slot], rankedTotals[slot]);
            distinctByKey.put(rankedKeys[slot], rankedDistinct[slot]);
            // the displaced occupant becomes the candidate for the following slots
            candidateKey = rankedKeys[scratch];
            candidateDistinct = rankedDistinct[scratch];
            candidateTotal = rankedTotals[scratch];
        }
    }
    setValueToFreq(freqByKey);
    setValueToDistinctFreq(distinctByKey);
}
/**
 * Finishes the computation once all values have been handled.
 * <p>
 * In MapDB mode the work is delegated to {@code computeSoundexFreqByMapDB(true)}. Otherwise the soundex
 * groups are built by {@code soundexForJavaEngine()}, ordered by distinct frequency, and the
 * value-to-frequency map is reduced to the topN groups.
 *
 * @return always {@code true}
 * @see org.talend.dataquality.indicators.impl.FrequencyIndicatorImpl#finalizeComputation()
 */
@Override
public boolean finalizeComputation() {
    final int topN = (parameters != null) ? parameters.getTopN() : PluginConstant.DEFAULT_TOP_N;
    if (isUsedMapDBMode()) {
        computeSoundexFreqByMapDB(true);
        return true;
    }

    soundexForJavaEngine();
    final MapValueSorter sorter = new MapValueSorter();
    final List<Object> byDistinctFreq = sorter.sortMap(this.valueToDistinctFreq, false);
    final List<Object> topKeys = getOrderElements(byDistinctFreq, topN, false);
    final HashMap<Object, Long> topFrequencies = new HashMap<Object, Long>();
    for (Object key : topKeys) {
        topFrequencies.put(key, valueToFreq.get(key));
    }
    this.setValueToFreq(topFrequencies);
    return true;
}
/**
 * ADD qiongli 2010-6-22, bug 13654. For the java engine: takes keys already ordered by
 * valueToDistinctFreq and, among keys with the same distinct frequency, orders them by valueToFreq.
 * <p>
 * Note that {@code sortedMap} is modified in place: keys are swapped while ties are resolved.
 *
 * @param sortedMap the keys ordered by distinct frequency
 * @param bottomN the selection stops as soon as this many keys have been collected
 * @param asc {@code true} to put the smaller frequency first among ties, {@code false} to put the
 * greater frequency first
 * @return the selected keys, in order
 */
protected List<Object> getOrderElements(final List<Object> sortedMap, int bottomN, boolean asc) {
    final List<Object> selected = new ArrayList<Object>();
    for (int pos = 0; pos < sortedMap.size(); pos++) {
        Object best = sortedMap.get(pos);
        for (int later = pos + 1; later < sortedMap.size(); later++) {
            final Object challenger = sortedMap.get(later);
            // only keys tied on distinct frequency compete on plain frequency
            if (!this.valueToDistinctFreq.get(best).equals(this.valueToDistinctFreq.get(challenger))) {
                continue;
            }
            final long bestFreq = valueToFreq.get(best);
            final long challengerFreq = valueToFreq.get(challenger);
            final boolean challengerWins = asc ? bestFreq > challengerFreq : bestFreq < challengerFreq;
            if (challengerWins) {
                // swap the two keys in the list and carry on with the winner
                sortedMap.set(later, best);
                sortedMap.set(pos, challenger);
                best = challenger;
            }
        }
        selected.add(best);
        if (selected.size() == bottomN) {
            break;
        }
    }
    return selected;
}
/**
 * Resets the indicator. In MapDB mode the soundex bookkeeping map is cleared (when present) and then
 * re-created through {@code initValueForDBMap} before the superclass reset runs.
 *
 * @return the result of {@code super.reset()}
 * @see org.talend.dataquality.indicators.impl.FrequencyIndicatorImpl#reset()
 */
@Override
public boolean reset() {
    if (!isUsedMapDBMode()) {
        return super.reset();
    }
    if (soundexFreqMap != null) {
        soundexFreqMap.clear();
    }
    soundexFreqMap = initValueForDBMap(StandardDBName.computeProcess.name());
    return super.reset();
}
} // SoundexFreqIndicatorImpl