// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.survivorship.model;
import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.talend.survivorship.action.ISurvivoredAction;
import org.talend.survivorship.action.handler.CRCRHandler;
import org.talend.survivorship.action.handler.FunctionParameter;
import org.talend.survivorship.action.handler.HandlerParameter;
import org.talend.survivorship.services.CompletenessService;
import org.talend.survivorship.services.FrequencyService;
import org.talend.survivorship.services.NumberService;
import org.talend.survivorship.services.StringService;
import org.talend.survivorship.services.TimeService;
import org.talend.survivorship.utils.ChainNodeMap;
/**
* Collection of a group of data. This class will be instantiated and insert into the rule engine as global variable.
*/
public class DataSet {
private List<Record> recordList;
private List<Column> columnList;
private ChainNodeMap chainMap;
private HashMap<String, Object> survivorMap;
protected HashMap<String, Integer> survivorIndexMap;
private List<HashSet<String>> conflictList;
private List<Column> columnOrder;
private HashSet<String> conflictsOfSurvivor;
private FrequencyService fs;
private StringService ss;
private TimeService ts;
private CompletenessService cs;
private NumberService ns;
private SoftReference<HashMap<String, List<Integer>>> conflictDataMap = new SoftReference<>(
new HashMap<String, List<Integer>>());
/**
* DataSet constructor.
*
* @param columns
* @param input
*/
public DataSet(List<Column> columns) {
columnList = columns;
recordList = new ArrayList<>();
survivorMap = new HashMap<>();
conflictList = new ArrayList<>();
conflictsOfSurvivor = new HashSet<>();
chainMap = new ChainNodeMap();
survivorIndexMap = new HashMap<>();
initServices();
}
/**
* DataSet constructor.
*
* @param columns
* @param input
*/
protected DataSet(List<Column> columns, List<Record> recordList) {
this(columns);
this.recordList = recordList;
}
public void initData(Object[][] input) {
for (int j = 0; j < columnList.size(); j++) {
columnList.get(j).init();
}
for (int i = 0; i < input.length; i++) {
conflictList.add(new HashSet<String>());
Record rec = new Record();
rec.setId(i);
Attribute attribute;
for (int j = 0; j < columnList.size(); j++) {
Column col = columnList.get(j);
attribute = new Attribute(rec, col, input[i][j]);
rec.putAttribute(col.getName(), attribute);
col.putAttribute(rec, attribute);
}
recordList.add(rec);
}
}
private void initServices() {
// only data is keep use same one
fs = new FrequencyService(this);
ss = new StringService(this);
ts = new TimeService(this);
cs = new CompletenessService(this);
ns = new NumberService(this);
}
public void reset() {
recordList.clear();
survivorMap.clear();
survivorIndexMap.clear();
conflictList.clear();
conflictsOfSurvivor.clear();
fs.init();
ss.init();
ts.init();
cs.init();
ns.init();
for (Column col : columnList) {
col.init();
}
}
/**
* Retrieve all attributes of a column.
*
* @param columnName
* @return
*/
public Collection<Attribute> getAttributesByColumn(String columnName) {
for (Column col : columnList) {
if (col.getName().equals(columnName)) {
return col.getAttributes();
}
}
return null;
}
/**
* Gets the columnList.
*
* @return
*/
public List<Column> getColumnList() {
return columnList;
}
/**
* Survive an attribute by record number and column name.
*
* @param recNum
* @param colName
* @deprecated this method is kept for backward compatibility of existing rules
*/
@Deprecated
public void survive(int recNum, String colName) {
Record record = recordList.get(recNum);
Attribute attribute = record.getAttribute(colName);
if (attribute.isAlive()) {
attribute.setSurvived(true);
}
}
/**
* Survive an attribute by record number and column name.
*
* @param recNum
* @param colName
* @param ruleName
*/
public void survive(int recNum, String colName, String ruleName) {
Record record = recordList.get(recNum);
Attribute attribute = record.getAttribute(colName);
if (attribute.isAlive()) {
Column column = attribute.getColumn();
// TDQ-12742 when there are 2 or more rules on a column, one rule can work only if the previous one does
// not producer any survivor
if (column.getSurvivingRuleName() == null || ruleName.equals(column.getSurvivingRuleName())) {
String columnName = column.getName();
// conflict generated
CRCRHandler crcrHandler = (CRCRHandler) this.chainMap.get(columnName);
// If we don't do that maybe we can store conflict data number form here
if (crcrHandler == null) {
for (ConflictRuleDefinition ruleDef : column.getConflictResolveList()) {
ISurvivoredAction action = ruleDef.getFunction().getAction();
Column refColumn = record.getAttribute(ruleDef.getReferenceColumn()).getColumn();
Column tarColumn = column;
String expression = ruleDef.getOperation();
String cRuleName = ruleDef.getRuleName();
boolean isIgnoreBlank = ruleDef.isIgnoreBlanks();
String fillColumn = ruleDef.getFillColumn();
boolean isDealDup = ruleDef.isDuplicateSurCheck();
FunctionParameter functionParameter = new FunctionParameter(action, expression, isIgnoreBlank, isDealDup);
CRCRHandler newCrcrHandler = new CRCRHandler(new HandlerParameter(this, refColumn, tarColumn, cRuleName,
this.getColumnIndexMap(), fillColumn, functionParameter));
if (crcrHandler == null) {
this.chainMap.put(columnName, newCrcrHandler);
}
crcrHandler = crcrHandler == null ? newCrcrHandler
: (CRCRHandler) crcrHandler.linkSuccessor(newCrcrHandler);
}
}
// store conflict data
if (crcrHandler != null) {
this.addConfDataIndex(columnName, recNum);
}
// modify this attribute after conflict resolve
attribute.setSurvived(true);
column.setSurvivingRuleName(ruleName);
}
}
}
/**
* Create by zshen Get a mapping between column name and column index
*
* @return a mapping map between column name and column index
*/
private Map<String, Integer> getColumnIndexMap() {
Map<String, Integer> columnIndexMap = new HashMap<>();
int index = 0;
for (Column col : this.columnList) {
columnIndexMap.put(col.getName(), index++);
}
return columnIndexMap;
}
/**
* Survive an attribute if another attribute is still alive.
*
* @param recNum
* @param col
* @param aliveField
*/
public void surviveByAliveField(int recNum, String col, String aliveField) {
Record record = recordList.get(recNum);
Attribute attribute = record.getAttribute(col);
Attribute another = record.getAttribute(aliveField);
if (attribute.isAlive() && another.isAlive()) {
attribute.setSurvived(true);
}
}
/**
* Eliminate an attribute.
*
* @param recNum
* @param col
*/
public void eliminate(int recNum, String col) {
Record record = recordList.get(recNum);
Attribute attribute = record.getAttribute(col);
if (attribute.isAlive()) {
attribute.setAlive(false);
}
}
/**
* Compute all the attributes to see if they are alive.
*/
public void finalizeComputation() {
for (Record record : recordList) {
for (Column col : getColumnOrder() == null ? this.getColumnList() : getColumnOrder()) {
Attribute a = record.getAttribute(col.getName());
if (a.isSurvived()) {
// defause case first one
conflictRecord(col, a);
}
}
}
if (getColumnOrder() == null) {
return;
}
for (Column nextCol : getColumnOrder()) {
String nextColName = nextCol.getName();
if (conflictsOfSurvivor.contains(nextColName)) {
String conflictCol = nextColName;
CRCRHandler crcrHandler = (CRCRHandler) this.chainMap.get(conflictCol);
List<Integer> conflictDataIndexList = this.getConflictDataIndexList(conflictCol);
if (crcrHandler != null && conflictDataIndexList != null) {
crcrHandler.handleRequest();
}
if (crcrHandler != null) {
SurvivedResult survivoredRowNum = crcrHandler.getSurvivoredRowNum();
if (survivoredRowNum != null) {
Attribute attribute = recordList.get(survivoredRowNum.getRowNum())
.getAttribute(survivoredRowNum.getColumnName());
Object survivedVlaue = attribute.getValue();
if (crcrHandler.getHandlerParameter().isDealDup() && checkDupSurValue(survivedVlaue)) {
survivedVlaue = crcrHandler.getNonDupResult(survivedVlaue);
}
survivorMap.put(conflictCol, survivedVlaue);
survivorIndexMap.put(conflictCol, survivoredRowNum.getRowNum());
}
}
}
}
}
/**
* Create by zshen check whether new value has been exist in the survivorMap.
*
* @param value The new value
* @return true when exist duplicate else false
*/
private boolean checkDupSurValue(Object value) {
Iterator<Object> iterator = survivorMap.values().iterator();
while (iterator.hasNext()) {
if (value.equals(iterator.next())) {
return true;
}
}
return false;
}
/**
* Create by zshen record conflict resolved result
*
* @param col The column current result is come from
* @param a the value which should be record by survived value
*/
private void conflictRecord(Column col, Attribute a) {
// default case get first one
if (survivorMap.get(col.getName()) == null) {
survivorMap.put(col.getName(), a.getValue());
survivorIndexMap.put(col.getName(), a.getRecordID());
} else {
survivorIndexMap.remove(col.getName());
Object survivor = survivorMap.get(col.getName());
if (a.getValue() != null && !a.getValue().equals(survivor)) {
HashSet<String> desc = conflictList.get(a.getRecordID());
desc.add(col.getName());
conflictsOfSurvivor.add(col.getName());
}
}
}
/**
* determine if a value is the most common one of a given column. Used only in rules.
*
* @param var the value which need to be check
* @param column the column which var belong
* @return true when var is the most common else false
*/
public boolean isMostCommon(Object var, String column, boolean ignoreBlanks) {
if (var == null) {
return false;
}
if (fs.getMostCommonValue(column, ignoreBlanks).contains(var)) {
return true;
}
return false;
}
/**
* determine if a record is the most complete. Used only in rules.
*
* @param var The input data
* @param column The column which input data belong
* @return true if it is esle false
*/
public boolean isMostComplete(int recNum) {
if (cs.getMostCompleteRecNumList().contains(recNum)) {
return true;
}
return false;
}
/**
* determine if a value is the longest one of a given column. Used only in rules.
*
* @param var The value which need to be check
* @param column The column which input data belong
* @return True if it is esle false
*/
public boolean isLongest(Object var, String column, boolean ignoreBlanks) {
if (var == null) {
return false;
}
return ss.isLongestValue(var, column, ignoreBlanks);
}
/**
* determine if a value is the shortest one of a given column. Used only in rules.
*
* @param var The value which need to be check
* @param column The column which input data belong
* @return True if it is esle false
*/
public boolean isShortest(Object var, String column, boolean ignoreBlanks) {
if (var == null) {
return false;
}
return ss.isShortestValue(var, column, ignoreBlanks);
}
/**
* determine if a value is the latest one of a given column. Used only in rules.
*
* @param var The value which need to be check
* @param column The column which input data belong
* @return true If it is esle false
*/
public boolean isLatest(Object var, String column) {
if (var == null) {
return false;
}
return ts.isLatestValue(var, column);
}
/**
* determine if a value is the earliest one of a given column. Used only in rules.
*
* @param var The value which need to be check
* @param column The column which input data belong
* @return true If it is esle false
*/
public boolean isEarliest(Object var, String column) {
if (var == null) {
return false;
}
return ts.isEarliestValue(var, column);
}
/**
* determine if a value is the largest one of a given column. Used only in rules.
*
* @param var The value which need to be check
* @param column The column which input data belong
* @return true If it is esle false
*/
public boolean isLargest(Object var, String column) {
if (var == null) {
return false;
}
return ns.isLargestValue(var, column);
}
/**
* determine if a value is the smallest one of a given column. Used only in rules.
*
* @param var The value which need to be check
* @param column The column which input data belong
* @return true If it is esle false
*/
public boolean isSmallest(Object var, String column) {
if (var == null) {
return false;
}
return ns.isSmallestValue(var, column);
}
/**
* Getter for survivorMap.
*
* @return the survivorMap
*/
public HashMap<String, Object> getSurvivorMap() {
return survivorMap;
}
/**
* Getter for conflictList.
*
* @return the conflictList
*/
public List<HashSet<String>> getConflictList() {
return conflictList;
}
/**
* Getter for conflictsOfSurvivor.
*
* @return the conflictsOfSurvivor
*/
public HashSet<String> getConflictsOfSurvivor() {
return conflictsOfSurvivor;
}
/**
* Getter for recordList
*
* @return
*/
public List<Record> getRecordList() {
return recordList;
}
/**
* Getter for conflictDataMap.
*
* @return the conflictDataMap
*/
public SoftReference<HashMap<String, List<Integer>>> getConflictDataMap() {
return this.conflictDataMap;
}
/**
*
* Get all of conflict data
*
* @param colName the name of column which conflict data come from
* @return The list of conflict data
*/
public List<Integer> getConflictDataIndexList(String colName) {
Map<String, List<Integer>> tempConflictDataMap = this.getConflictDataMap().get();
if (tempConflictDataMap != null) {
return tempConflictDataMap.get(colName);
} else {
return null;
}
}
/**
*
* Create by zshen store conflict row num of every column.
*
* @param colName The name of column which generate conflict
* @param index The row number of conflic generate
*/
public void addConfDataIndex(String colName, Integer index) {
Map<String, List<Integer>> tempConflictDataMap = this.getConflictDataMap().get();
if (tempConflictDataMap != null) {
List<Integer> indexList = tempConflictDataMap.get(colName);
if (indexList == null) {
indexList = new ArrayList<>();
tempConflictDataMap.put(colName, indexList);
}
indexList.add(index);
} else {
return;
}
}
/**
*
* Create by zshen create subset of current dataset by speciala column name.
*
* @param colName The column name of new subset
* @return a sebset of current dataset not new data be created in the process
*/
public SubDataSet createSubDataSet(String colName) {
List<Integer> conflictDataIndexList = this.getConflictDataIndexList(colName);
if (conflictDataIndexList == null) {
return null;
}
return new SubDataSet(this, conflictDataIndexList);
}
/**
* Getter for survivorIndexMap.
*
* @return the survivorIndexMap
*/
public HashMap<String, Integer> getSurvivorIndexMap() {
return this.survivorIndexMap;
}
/**
* Getter for columnOrder.
*
* @return the columnOrder
*/
public List<Column> getColumnOrder() {
return this.columnOrder;
}
/**
* Sets the columnOrder.
*
* @param columnOrder the columnOrder to set
*/
public void setColumnOrder(List<Column> columnOrder) {
this.columnOrder = columnOrder;
}
}