// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.survivorship.services;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import org.talend.survivorship.model.Attribute;
import org.talend.survivorship.model.DataSet;
/**
 * This is the frequency service to use in rules.
 * <p>
 * For a given column it counts how often each value occurs among the alive attributes of the dataset, and keeps the
 * resulting counts, the highest count and the second highest count until {@link #init()} is called.
 */
public class FrequencyService extends AbstractService {

    /** Per column: value -> number of occurrences, as computed by {@link #putAttributeValues(String, boolean)}. */
    protected HashMap<String, HashMap<Object, Integer>> frequencyMaps = new HashMap<>();

    /** Per column: the highest occurrence count (0 when no value was counted). */
    protected HashMap<String, Integer> maxOccurence = new HashMap<>();

    /** Per column: the highest occurrence count strictly below the maximum (0 when there is none). */
    protected HashMap<String, Integer> secondMaxOccurence = new HashMap<>();

    /** Per column: the ignoreBlanks flag that was used by the last call to putAttributeValues. */
    private final Map<String, Boolean> ignoreBlanksUsed = new HashMap<>();

    /**
     * FrequencyService constructor.
     *
     * @param dataset the dataset handed to the parent service; its attributes are read when frequencies are computed
     */
    public FrequencyService(DataSet dataset) {
        super(dataset);
    }

    /**
     * Put attribute values into the frequencyMap of a given column.
     * <p>
     * Any frequencies previously stored for the column are replaced. Only attributes that are alive and whose value
     * is not null are counted. The maximum and second maximum occurrence counts of the column are updated as well.
     *
     * @param column the column whose attribute values are counted
     * @param ignoreBlanks when true, values whose trimmed string form is empty are not counted
     * @return the freshly computed map from value to number of occurrences for the column
     */
    public HashMap<Object, Integer> putAttributeValues(String column, boolean ignoreBlanks) {
        // Always start from an empty map so that repeated calls never accumulate counts.
        HashMap<Object, Integer> valueToFreq = new HashMap<>();
        frequencyMaps.put(column, valueToFreq);
        // Remember which flag produced this cache entry so the getters can detect a mismatch.
        ignoreBlanksUsed.put(column, Boolean.valueOf(ignoreBlanks));

        for (Attribute attr : dataset.getAttributesByColumn(column)) {
            if (!attr.isAlive()) {
                continue;
            }
            Object value = attr.getValue();
            if (value == null || (ignoreBlanks && "".equals(value.toString().trim()))) { //$NON-NLS-1$
                continue;
            }
            Integer previous = valueToFreq.get(value);
            valueToFreq.put(value, previous == null ? 1 : previous + 1);
        }

        int max = 0;
        int second = 0;
        for (int freq : valueToFreq.values()) {
            if (freq > max) {
                second = max;
                max = freq;
            } else if (freq < max && freq > second) {
                // Strictly below max: ties with the maximum must not become the second maximum.
                second = freq;
            }
        }
        maxOccurence.put(column, max);
        secondMaxOccurence.put(column, second);
        return valueToFreq;
    }

    /**
     * Retrieve the most common value of a given column.
     *
     * @param column the column to inspect
     * @param ignoreBlanks when true, blank values are excluded from the frequencies
     * @return every value whose occurrence count equals the maximum of the column; empty when nothing was counted
     */
    public HashSet<Object> getMostCommonValue(String column, boolean ignoreBlanks) {
        HashMap<Object, Integer> valueToFreq = getFrequencies(column, ignoreBlanks);
        return collectValuesWithCount(valueToFreq, maxOccurence.get(column));
    }

    /**
     * Retrieve the second most common value of a given column.
     *
     * @param column the column to inspect
     * @param ignoreBlanks when true, blank values are excluded from the frequencies
     * @return every value whose occurrence count equals the second maximum of the column; empty when all counted
     * values share the same count or nothing was counted
     */
    public HashSet<Object> getSecondMostCommonValue(String column, boolean ignoreBlanks) {
        HashMap<Object, Integer> valueToFreq = getFrequencies(column, ignoreBlanks);
        return collectValuesWithCount(valueToFreq, secondMaxOccurence.get(column));
    }

    /**
     * Return the cached frequencies of a column, computing them when they are missing or when they were recorded
     * with a different ignoreBlanks flag than the one requested.
     */
    private HashMap<Object, Integer> getFrequencies(String column, boolean ignoreBlanks) {
        HashMap<Object, Integer> valueToFreq = frequencyMaps.get(column);
        Boolean flagUsed = ignoreBlanksUsed.get(column);
        // A cache entry without a recorded flag was not produced by putAttributeValues of this class; keep trusting it.
        boolean flagMismatch = flagUsed != null && flagUsed.booleanValue() != ignoreBlanks;
        if (valueToFreq == null || flagMismatch) {
            valueToFreq = putAttributeValues(column, ignoreBlanks);
        }
        return valueToFreq;
    }

    /**
     * Collect the keys of the given map whose count equals the wanted count; empty when the wanted count is unknown.
     */
    private HashSet<Object> collectValuesWithCount(HashMap<Object, Integer> valueToFreq, Integer wanted) {
        HashSet<Object> matches = new HashSet<>();
        if (wanted == null) {
            // No occurrence count recorded for the column: avoid a NullPointerException on unboxing.
            return matches;
        }
        for (Map.Entry<Object, Integer> entry : valueToFreq.entrySet()) {
            Integer count = entry.getValue();
            if (count != null && count.intValue() == wanted.intValue()) {
                matches.add(entry.getKey());
            }
        }
        return matches;
    }

    /*
     * (non-Javadoc)
     *
     * @see org.talend.survivorship.services.AbstractService#init()
     */
    @Override
    public void init() {
        frequencyMaps.clear();
        maxOccurence.clear();
        secondMaxOccurence.clear();
        ignoreBlanksUsed.clear();
    }
}