// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dq.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.talend.commons.exception.BusinessException;
import org.talend.core.model.metadata.builder.connection.ConnectionPackage;
import org.talend.core.model.metadata.builder.connection.MetadataColumn;
import org.talend.core.model.metadata.types.JavaTypesManager;
import org.talend.cwm.management.i18n.Messages;
import org.talend.dataquality.PluginConstant;
import org.talend.dataquality.indicators.columnset.RecordMatchingIndicator;
import org.talend.dataquality.record.linkage.constant.AttributeMatcherType;
import org.talend.dataquality.record.linkage.constant.RecordMatcherType;
import org.talend.dataquality.record.linkage.grouping.AnalysisMatchRecordGrouping;
import org.talend.dataquality.record.linkage.grouping.IRecordGrouping;
import org.talend.dataquality.record.linkage.grouping.swoosh.SurvivorShipAlgorithmParams;
import org.talend.dataquality.record.linkage.grouping.swoosh.SurvivorShipAlgorithmParams.SurvivorshipFunction;
import org.talend.dataquality.record.linkage.grouping.swoosh.SurvivorshipUtils;
import org.talend.dataquality.record.linkage.record.CombinedRecordMatcher;
import org.talend.dataquality.record.linkage.record.IRecordMatcher;
import org.talend.dataquality.record.linkage.utils.MatchAnalysisConstant;
import org.talend.dataquality.record.linkage.utils.SurvivorShipAlgorithmEnum;
import org.talend.dataquality.rules.AppliedBlockKey;
import org.talend.dataquality.rules.BlockKeyDefinition;
import org.talend.dataquality.rules.DefaultSurvivorshipDefinition;
import org.talend.dataquality.rules.KeyDefinition;
import org.talend.dataquality.rules.MatchKeyDefinition;
import org.talend.dataquality.rules.MatchRule;
import org.talend.dataquality.rules.RulesPackage;
import org.talend.dataquality.rules.SurvivorshipKeyDefinition;
import org.talend.dq.helper.CustomAttributeMatcherHelper;
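/**
 * Utility functions used by the match analysis: building the result column schema, the match key and blocking key
 * maps, the escaped join/split of record values, and the survivorship parameters of the record grouping.
 */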
public class AnalysisRecordGroupingUtils {
public static final String ESCAPE_CHARACTER = "\\"; //$NON-NLS-1$
/**
 * Builds the complete column schema: the columns of the given map followed by one freshly created column for each
 * of {@code MatchAnalysisConstant.BLOCK_KEY}, {@code GID}, {@code GRP_SIZE}, {@code MASTER}, {@code SCORE},
 * {@code GRP_QUALITY} and {@code ATTRIBUTE_SCORES}, in that order.
 *
 * @param columnMap the analysed columns; only the keys are used. May be {@code null}.
 * @return the schema as an array, or an empty array when {@code columnMap} is {@code null}
 */
public static MetadataColumn[] getCompleteColumnSchema(Map<MetadataColumn, String> columnMap) {
    if (columnMap == null) {
        return new MetadataColumn[0];
    }
    // Start with the analysed columns, in the iteration order of the map keys.
    List<MetadataColumn> schema = new ArrayList<MetadataColumn>(columnMap.keySet());
    String[] technicalColumnNames = { MatchAnalysisConstant.BLOCK_KEY, MatchAnalysisConstant.GID,
            MatchAnalysisConstant.GRP_SIZE, MatchAnalysisConstant.MASTER, MatchAnalysisConstant.SCORE,
            MatchAnalysisConstant.GRP_QUALITY, MatchAnalysisConstant.ATTRIBUTE_SCORES };
    for (String technicalColumnName : technicalColumnNames) {
        MetadataColumn technicalColumn = ConnectionPackage.eINSTANCE.getConnectionFactory().createMetadataColumn();
        technicalColumn.setName(technicalColumnName);
        schema.add(technicalColumn);
    }
    return schema.toArray(new MetadataColumn[schema.size()]);
}
/**
 * Builds the column schema without the block key column (used when the match chart is refreshed with the refresh
 * button): the columns of the given map followed by one freshly created column for each of
 * {@code MatchAnalysisConstant.GID}, {@code GRP_SIZE}, {@code MASTER}, {@code SCORE}, {@code GRP_QUALITY} and
 * {@code ATTRIBUTE_SCORES}, in that order.
 *
 * @param columnMap the analysed columns; only the keys are used. May be {@code null}.
 * @return the schema as an array, or an empty array when {@code columnMap} is {@code null}
 */
public static MetadataColumn[] getCompleteColumnSchemaWithoutBlockKey(Map<MetadataColumn, String> columnMap) {
    if (columnMap == null) {
        return new MetadataColumn[0];
    }
    // Start with the analysed columns, in the iteration order of the map keys.
    List<MetadataColumn> schema = new ArrayList<MetadataColumn>(columnMap.keySet());
    String[] technicalColumnNames = { MatchAnalysisConstant.GID, MatchAnalysisConstant.GRP_SIZE,
            MatchAnalysisConstant.MASTER, MatchAnalysisConstant.SCORE, MatchAnalysisConstant.GRP_QUALITY,
            MatchAnalysisConstant.ATTRIBUTE_SCORES };
    for (String technicalColumnName : technicalColumnNames) {
        MetadataColumn technicalColumn = ConnectionPackage.eINSTANCE.getConnectionFactory().createMetadataColumn();
        technicalColumn.setName(technicalColumnName);
        schema.add(technicalColumn);
    }
    return schema.toArray(new MetadataColumn[schema.size()]);
}
/**
 * Creates the match key map for a match key that needs no jar: delegates to the overload taking a jar path and
 * passes {@code null} for it.
 *
 * @param column name of the column the match key applies to
 * @param algoType name of the attribute matcher type
 * @param algoParameter parameter of the matching algorithm
 * @param confidentWeight confidence weight of the match key
 * @param attrThreshold threshold of the attribute
 * @param columnIndexMap the analysed columns mapped to their index
 * @param matchInterval match threshold of the record
 * @param attributeName name of the attribute
 * @param handleNull how null values are handled
 * @return the match key map
 */
public static Map<String, String> getMatchKeyMap(String column, String algoType, String algoParameter, int confidentWeight,
        double attrThreshold, Map<MetadataColumn, String> columnIndexMap, double matchInterval, String attributeName,
        String handleNull) {
    // The jar path is null when the matcher's algorithm is not of type "custom".
    final String noJarPath = null;
    return getMatchKeyMap(column, algoType, algoParameter, confidentWeight, attrThreshold, columnIndexMap, matchInterval,
            attributeName, handleNull, noJarPath);
}
/**
 * Creates the match key map for a match key without a match key name and without a tokenization type: delegates
 * to the most complete overload and passes {@code null} for both.
 *
 * @param column name of the column the match key applies to
 * @param algoType name of the attribute matcher type
 * @param algoParameter parameter of the matching algorithm
 * @param confidentWeight confidence weight of the match key
 * @param attrThreshold threshold of the attribute
 * @param columnIndexMap the analysed columns mapped to their index
 * @param matchInterval match threshold of the record
 * @param attributeName name of the attribute
 * @param handleNull how null values are handled
 * @param jarPath path of the jar of a custom matcher, or {@code null}
 * @return the match key map
 */
public static Map<String, String> getMatchKeyMap(String column, String algoType, String algoParameter, int confidentWeight,
        double attrThreshold, Map<MetadataColumn, String> columnIndexMap, double matchInterval, String attributeName,
        String handleNull, String jarPath) {
    final String noMatchKeyName = null;
    // NOTE: passing null as the last parameter (tokenization type) still needs to be checked by junit.
    final String noTokenizationType = null;
    return getMatchKeyMap(column, algoType, algoParameter, confidentWeight, attrThreshold, columnIndexMap, matchInterval,
            attributeName, noMatchKeyName, handleNull, jarPath, noTokenizationType);
}
/**
 * Creates the match key map, including the match key name and the tokenization type. The map is keyed by the
 * {@link IRecordGrouping} constants.
 *
 * @param column name of the column the match key applies to; the index of the first column of
 * {@code columnIndexMap} with this name is stored under {@code COLUMN_IDX} (nothing is stored when no column
 * matches)
 * @param algoType name of an {@link AttributeMatcherType} constant
 * @param algoParameter parameter of the matching algorithm, stored under {@code CUSTOMER_MATCH_CLASS}
 * @param confidentWeight confidence weight of the match key
 * @param attrThreshold threshold of the attribute
 * @param columnIndexMap the analysed columns mapped to their index
 * @param matchInterval value stored under {@code RECORD_MATCH_THRESHOLD}
 * @param attributeName name of the attribute
 * @param matchKeyName name of the match key
 * @param handleNull how null values are handled
 * @param jarPath path of the jar of a custom matcher, or {@code null}
 * @param tokenizationType the tokenization type, or {@code null}
 * @return the match key map
 */
public static Map<String, String> getMatchKeyMap(String column, String algoType, String algoParameter, int confidentWeight,
        double attrThreshold, Map<MetadataColumn, String> columnIndexMap, double matchInterval, String attributeName,
        String matchKeyName, String handleNull, String jarPath, String tokenizationType) {
    Map<String, String> keyMap = new HashMap<String, String>();
    // Only the first column carrying the requested name contributes its index.
    for (Map.Entry<MetadataColumn, String> indexedColumn : columnIndexMap.entrySet()) {
        if (indexedColumn.getKey().getName().equals(column)) {
            keyMap.put(IRecordGrouping.COLUMN_IDX, indexedColumn.getValue());
            break;
        }
    }
    AttributeMatcherType matcherType = AttributeMatcherType.valueOf(algoType);
    keyMap.put(IRecordGrouping.MATCHING_TYPE, matcherType.name());
    keyMap.put(IRecordGrouping.CUSTOMER_MATCH_CLASS, algoParameter);
    keyMap.put(IRecordGrouping.CONFIDENCE_WEIGHT, String.valueOf(confidentWeight));
    keyMap.put(IRecordGrouping.ATTRIBUTE_THRESHOLD, String.valueOf(attrThreshold));
    keyMap.put(IRecordGrouping.RECORD_MATCH_THRESHOLD, String.valueOf(matchInterval));
    keyMap.put(IRecordGrouping.ATTRIBUTE_NAME, attributeName);
    keyMap.put(IRecordGrouping.MATCH_KEY_NAME, matchKeyName);
    keyMap.put(IRecordGrouping.HANDLE_NULL, handleNull);
    keyMap.put(IRecordGrouping.TOKENIZATION_TYPE, tokenizationType);
    keyMap.put(IRecordGrouping.JAR_PATH, jarPath);
    return keyMap;
}
/**
 * Creates the blocking key map, keyed by the {@link MatchAnalysisConstant} constants {@code PRECOLUMN},
 * {@code PRE_ALGO}, {@code PRE_VALUE}, {@code KEY_ALGO}, {@code KEY_VALUE}, {@code POST_ALGO} and
 * {@code POST_VALUE}.
 *
 * @param column the column of the blocking key
 * @param preAlgo the pre-algorithm
 * @param preAlgValue the value of the pre-algorithm
 * @param algorithm the key algorithm
 * @param algorithmValue the value of the key algorithm
 * @param postAlgo the post-algorithm
 * @param postAlgoValue the value of the post-algorithm
 * @return the blocking key map
 */
public static Map<String, String> getBlockingKeyMap(String column, String preAlgo, String preAlgValue, String algorithm,
        String algorithmValue, String postAlgo, String postAlgoValue) {
    // keys[i] is the map key under which values[i] is stored.
    String[] keys = { MatchAnalysisConstant.PRECOLUMN, MatchAnalysisConstant.PRE_ALGO, MatchAnalysisConstant.PRE_VALUE,
            MatchAnalysisConstant.KEY_ALGO, MatchAnalysisConstant.KEY_VALUE, MatchAnalysisConstant.POST_ALGO,
            MatchAnalysisConstant.POST_VALUE };
    String[] values = { column, preAlgo, preAlgValue, algorithm, algorithmValue, postAlgo, postAlgoValue };
    Map<String, String> blockingKey = new HashMap<String, String>();
    for (int i = 0; i < keys.length; i++) {
        blockingKey.put(keys[i], values[i]);
    }
    return blockingKey;
}
/**
 * Joins the string array into a single string. Inside each element every occurrence of the escape character is
 * doubled and every occurrence of the separator is prefixed with the escape character, so that the elements can be
 * recovered with {@link #split(String, String, String)}.
 * <p>
 * A {@code null} element is written as the text {@code null} (unchanged historical behaviour). A {@code null} or
 * empty array yields an empty string.
 *
 * @param array the values to join; may be {@code null}
 * @param separator the separator written between the elements, recommended: {@code |}. Must not be {@code null}.
 * @param escapeCharacter the escape sequence, recommended: {@code \} (see {@link #ESCAPE_CHARACTER}). Must not be
 * {@code null}.
 * @return the joined string, never {@code null}
 */
public static String join(String[] array, String separator, String escapeCharacter) {
    if (array == null || array.length == 0) {
        return ""; //$NON-NLS-1$
    }
    String doubleEscapeCharacter = escapeCharacter + escapeCharacter;
    String escapedSeparator = escapeCharacter + separator;
    StringBuilder joined = new StringBuilder();
    for (int i = 0; i < array.length; i++) {
        // Write the separator between elements instead of appending it everywhere and stripping the last one.
        if (i > 0) {
            joined.append(separator);
        }
        String element = array[i];
        if (element == null) {
            // Appends the text "null", exactly as the string concatenation of the previous implementation did.
            joined.append(String.valueOf(element));
            continue;
        }
        String escaped = element;
        // The escape character must be doubled BEFORE the separator is escaped, otherwise the escape characters
        // introduced for the separator would be doubled as well.
        if (!escapeCharacter.isEmpty()) {
            escaped = escaped.replace(escapeCharacter, doubleEscapeCharacter);
        }
        if (!separator.isEmpty()) {
            escaped = escaped.replace(separator, escapedSeparator);
        }
        joined.append(escaped);
    }
    return joined.toString();
}
/**
 * Splits a string produced by {@link #join(String[], String, String)} back into its elements. The string is
 * scanned once from left to right: a doubled escape character yields one escape character, an escaped separator
 * yields a literal separator, and an unescaped separator ends the current element.
 * <p>
 * Unlike the former regex based implementation, an element ending with the escape character is no longer merged
 * with the following element, and trailing empty elements are kept, so the result has exactly as many elements as
 * the array that was joined (an empty string yields one empty element).
 *
 * @param string the joined string; may be {@code null}
 * @param separator the separator used when joining, recommended: {@code |}. Must not be {@code null}.
 * @param escapeCharacter the escape sequence used when joining, recommended: {@code \}. Must not be {@code null}.
 * @return the unescaped elements; an empty array when {@code string} is {@code null}
 */
public static String[] split(String string, String separator, String escapeCharacter) {
    if (string == null) {
        return new String[0];
    }
    int separatorLength = separator.length();
    int escapeLength = escapeCharacter.length();
    int length = string.length();
    List<String> elements = new ArrayList<String>();
    StringBuilder current = new StringBuilder();
    int pos = 0;
    while (pos < length) {
        if (escapeLength > 0 && string.startsWith(escapeCharacter, pos)) {
            int afterEscape = pos + escapeLength;
            if (string.startsWith(escapeCharacter, afterEscape)) {
                // Doubled escape character: one literal escape character.
                current.append(escapeCharacter);
                pos = afterEscape + escapeLength;
            } else if (separatorLength > 0 && string.startsWith(separator, afterEscape)) {
                // Escaped separator: a literal separator inside the element.
                current.append(separator);
                pos = afterEscape + separatorLength;
            } else {
                // Lone escape character (not produced by join): keep it as it is.
                current.append(escapeCharacter);
                pos = afterEscape;
            }
        } else if (separatorLength > 0 && string.startsWith(separator, pos)) {
            // Unescaped separator: the current element is complete.
            elements.add(current.toString());
            current.setLength(0);
            pos += separatorLength;
        } else {
            current.append(string.charAt(pos));
            pos++;
        }
    }
    elements.add(current.toString());
    return elements.toArray(new String[elements.size()]);
}
/**
 * Resets the applied blocking keys of the built-in match rule definition. By default for an analysis, the applied
 * blocking key is the key coming from the key generation definition: when at least one block key definition
 * exists, a single applied block key whose column and name are {@code PluginConstant.BLOCK_KEY} is added. This
 * will be refined when the user needs to define the applied blocking key manually.
 *
 * @param recordMatchingIndicator the indicator whose applied block keys are replaced
 */
public static void createAppliedBlockKeyByGenKey(RecordMatchingIndicator recordMatchingIndicator) {
    List<AppliedBlockKey> appliedKeys = recordMatchingIndicator.getBuiltInMatchRuleDefinition().getAppliedBlockKeys();
    appliedKeys.clear();
    List<BlockKeyDefinition> generationKeys = recordMatchingIndicator.getBuiltInMatchRuleDefinition().getBlockKeys();
    if (generationKeys == null || generationKeys.isEmpty()) {
        // No key generation definition: the applied keys stay empty.
        return;
    }
    AppliedBlockKey generatedKey = RulesPackage.eINSTANCE.getRulesFactory().createAppliedBlockKey();
    generatedKey.setColumn(PluginConstant.BLOCK_KEY);
    generatedKey.setName(PluginConstant.BLOCK_KEY);
    appliedKeys.add(generatedKey);
}
/**
 * Gets the block key schema of the given record matching indicator: one map per blocking key.
 * <p>
 * An applied block key whose column is {@code PluginConstant.BLOCK_KEY} is expanded into one map per block key
 * definition of the built-in match rule definition (with the algorithm parameters of that definition). Any other
 * applied block key yields a map holding only its column under {@code MatchAnalysisConstant.PRECOLUMN}.
 *
 * @param recordMatchingIndicator the indicator holding the built-in match rule definition
 * @return the block key schema, possibly empty
 */
public static List<Map<String, String>> getBlockKeySchema(RecordMatchingIndicator recordMatchingIndicator) {
    List<AppliedBlockKey> appliedKeys = recordMatchingIndicator.getBuiltInMatchRuleDefinition().getAppliedBlockKeys();
    List<Map<String, String>> schema = new ArrayList<Map<String, String>>();
    for (AppliedBlockKey appliedKey : appliedKeys) {
        String column = appliedKey.getColumn();
        if (!StringUtils.equals(PluginConstant.BLOCK_KEY, column)) {
            // The column is used as it is.
            Map<String, String> columnOnly = new HashMap<String, String>();
            columnOnly.put(MatchAnalysisConstant.PRECOLUMN, column);
            schema.add(columnOnly);
            continue;
        }
        // Customized block keys are defined: take the parameters of each of them.
        for (BlockKeyDefinition definition : recordMatchingIndicator.getBuiltInMatchRuleDefinition().getBlockKeys()) {
            schema.add(new HashMap<String, String>(getCustomizedBlockKeyParameter(definition, definition.getColumn())));
        }
    }
    return schema;
}
/**
 * Creates the blocking key map of a block key definition from the type and parameters of its pre-algorithm,
 * algorithm and post-algorithm.
 *
 * @param blockKeydef the block key definition to read the algorithms from
 * @param column the column stored in the map
 * @return the blocking key map, see {@link #getBlockingKeyMap(String, String, String, String, String, String, String)}
 */
private static Map<String, String> getCustomizedBlockKeyParameter(BlockKeyDefinition blockKeydef, String column) {
    return getBlockingKeyMap(column,
            blockKeydef.getPreAlgorithm().getAlgorithmType(),
            blockKeydef.getPreAlgorithm().getAlgorithmParameters(),
            blockKeydef.getAlgorithm().getAlgorithmType(),
            blockKeydef.getAlgorithm().getAlgorithmParameters(),
            blockKeydef.getPostAlgorithm().getAlgorithmType(),
            blockKeydef.getPostAlgorithm().getAlgorithmParameters());
}
/**
 * Converts every match rule of the built-in match rule definition into a list of match key maps (ordered by
 * column index) and adds it to the record grouping. The column indices of the first rule with match keys are
 * remembered; each later rule is passed to {@code refineMatcherWithDummy} together with those indices before it
 * is added.
 *
 * @param columnMap the analysed columns mapped to their index
 * @param recordMatchingIndicator the indicator holding the match rules
 * @param analysisMatchRecordGrouping the grouping receiving the rule matchers
 * @throws BusinessException when a match key has no column (null or empty)
 */
public static void setRuleMatcher(Map<MetadataColumn, String> columnMap, RecordMatchingIndicator recordMatchingIndicator,
        AnalysisMatchRecordGrouping analysisMatchRecordGrouping) throws BusinessException {
    List<MatchRule> rules = recordMatchingIndicator.getBuiltInMatchRuleDefinition().getMatchRules();
    // Indices of all the columns used by the rules handled so far.
    List<String> knownColumnIndices = new ArrayList<String>();
    for (MatchRule rule : rules) {
        if (rule == null) {
            continue;
        }
        List<Map<String, String>> ruleMatchKeys = new ArrayList<Map<String, String>>();
        List<String> ruleColumnIndices = new ArrayList<String>();
        for (MatchKeyDefinition keyDefinition : rule.getMatchKeys()) {
            // A match key without any column cannot be executed: stop here.
            if (StringUtils.isEmpty(keyDefinition.getColumn())) {
                BusinessException noColumnException = new BusinessException();
                noColumnException.setAdditonalMessage(Messages.getString("MatchAnalysisExecutor.NoColumnInMatchKey", //$NON-NLS-1$
                        keyDefinition.getName()));
                throw noColumnException;
            }
            String algorithmType = keyDefinition.getAlgorithm().getAlgorithmType();
            Map<String, String> keyMap = getMatchKeyMap(columnMap, rule, keyDefinition, algorithmType);
            addMatchKeyOrderbyColumnIdx(ruleMatchKeys, keyMap);
            ruleColumnIndices.add(keyMap.get(IRecordGrouping.COLUMN_IDX));
        }
        if (!knownColumnIndices.isEmpty()) {
            refineMatcherWithDummy(analysisMatchRecordGrouping, knownColumnIndices, ruleColumnIndices, ruleMatchKeys);
        } else {
            // Nothing known yet: remember the column indices of this rule, in match key order.
            for (Map<String, String> keyMap : ruleMatchKeys) {
                knownColumnIndices.add(keyMap.get(IRecordGrouping.COLUMN_IDX));
            }
        }
        analysisMatchRecordGrouping.addRuleMatcher(ruleMatchKeys);
    }
}
/**
 * Selects the record linkage algorithm of the grouping and initializes it. When the built-in match rule
 * definition names {@code RecordMatcherType.simpleVSRMatcher}, that algorithm is used; otherwise
 * {@code RecordMatcherType.T_SwooshAlgorithm} is used, the original input column size is set to the number of
 * analysed columns plus one, and the survivorship parameters are created and set after the initialization.
 *
 * @param columnMap the analysed columns mapped to their index
 * @param recordMatchingIndicator the indicator holding the built-in match rule definition
 * @param analysisMatchRecordGrouping the grouping to initialize
 * @throws InstantiationException
 * @throws IllegalAccessException
 * @throws ClassNotFoundException
 */
public static void initialMatchGrouping(Map<MetadataColumn, String> columnMap,
        RecordMatchingIndicator recordMatchingIndicator, AnalysisMatchRecordGrouping analysisMatchRecordGrouping)
        throws InstantiationException, IllegalAccessException, ClassNotFoundException {
    boolean simpleVsrSelected = recordMatchingIndicator.getBuiltInMatchRuleDefinition().getRecordLinkageAlgorithm()
            .equals(RecordMatcherType.simpleVSRMatcher.name());
    if (simpleVsrSelected) {
        analysisMatchRecordGrouping.setRecordLinkAlgorithm(RecordMatcherType.simpleVSRMatcher);
        analysisMatchRecordGrouping.initialize();
        return;
    }
    analysisMatchRecordGrouping.setRecordLinkAlgorithm(RecordMatcherType.T_SwooshAlgorithm);
    analysisMatchRecordGrouping.setOrginalInputColumnSize(columnMap.size() + 1);
    analysisMatchRecordGrouping.initialize();
    // The survivorship parameters read the matchers of the grouping, so they are created after initialize().
    analysisMatchRecordGrouping.setSurvivorShipAlgorithmParams(
            createSurvivorShipAlgorithmParams(analysisMatchRecordGrouping, recordMatchingIndicator, columnMap));
}
/**
 * Creates the survivorship parameters of the record grouping from the built-in match rule definition.
 * <p>
 * The result holds: one survivorship function per survivorship key definition; the default survivorship functions
 * of the analysed columns (chosen by the talend type of the column); the combined record matcher of the grouping;
 * and, for each match rule of the grouping, the array of survivorship functions of its match keys, stored under
 * the matcher found at the same index in the combined record matcher.
 *
 * @param analysisMatchRecordGrouping the grouping providing the combined record matcher and the match rules
 * @param recordMatchingIndicator the indicator holding the survivorship definitions
 * @param columnMap the analysed columns mapped to their index
 * @return the survivorship parameters
 */
public static SurvivorShipAlgorithmParams createSurvivorShipAlgorithmParams(
AnalysisMatchRecordGrouping analysisMatchRecordGrouping, RecordMatchingIndicator recordMatchingIndicator,
Map<MetadataColumn, String> columnMap) {
SurvivorShipAlgorithmParams survivorShipAlgorithmParams = new SurvivorShipAlgorithmParams();
// Survivorship functions: one per survivorship key definition, identified by the name of the definition.
List<SurvivorshipKeyDefinition> survivorshipKeyDefs = recordMatchingIndicator.getBuiltInMatchRuleDefinition()
.getSurvivorshipKeys();
List<SurvivorshipFunction> survFunctions = new ArrayList<SurvivorshipFunction>();
for (SurvivorshipKeyDefinition survDef : survivorshipKeyDefs) {
SurvivorshipFunction func = survivorShipAlgorithmParams.new SurvivorshipFunction();
func.setSurvivorShipKey(survDef.getName());
func.setParameter(survDef.getFunction().getAlgorithmParameters());
func.setSurvivorShipAlgoEnum(SurvivorShipAlgorithmEnum.getTypeBySavedValue(survDef.getFunction().getAlgorithmType()));
survFunctions.add(func);
}
survivorShipAlgorithmParams
.setSurviorShipAlgos(survFunctions.toArray(new SurvivorshipFunction[survivorshipKeyDefs.size()]));
// Set default survivorship functions: for each analysed column, the first default definition matching the data
// type of the column is used.
List<DefaultSurvivorshipDefinition> defSurvDefs = recordMatchingIndicator.getBuiltInMatchRuleDefinition()
.getDefaultSurvivorshipDefinitions();
Map<Integer, SurvivorshipFunction> defaultSurvRules = new HashMap<Integer, SurvivorshipFunction>();
for (MetadataColumn metaColumn : columnMap.keySet()) {
String dataTypeName = metaColumn.getTalendType();
for (DefaultSurvivorshipDefinition defSurvDef : defSurvDefs) {
// the column's data type start with id_, so need to add id_ ahead of the default survivorship's data
// type before judging if they are equal
if (StringUtils.equals(dataTypeName, "id_" + defSurvDef.getDataType())) { //$NON-NLS-1$
putNewSurvFunc(columnMap, survivorShipAlgorithmParams, defaultSurvRules, metaColumn, defSurvDef);
break;
// A definition for "Number" applies to every column whose type JavaTypesManager reports as a number.
} else if (StringUtils.equals(defSurvDef.getDataType(), "Number") && JavaTypesManager.isNumber(dataTypeName)) { //$NON-NLS-1$
putNewSurvFunc(columnMap, survivorShipAlgorithmParams, defaultSurvRules, metaColumn, defSurvDef);
break;
}
}// End for: if no func defined, then the value will be taken from one of the records in a group (1st
// one ).
}
survivorShipAlgorithmParams.setDefaultSurviorshipRules(defaultSurvRules);
// Set the record matcher
CombinedRecordMatcher combinedRecordMatcher = analysisMatchRecordGrouping.getCombinedRecordMatcher();
survivorShipAlgorithmParams.setRecordMatcher(combinedRecordMatcher);
// Build, for each match rule, the survivorship functions of its match keys (one array slot per match key).
Map<IRecordMatcher, SurvivorshipFunction[]> survAlgos = new HashMap<IRecordMatcher, SurvivorshipFunction[]>();
SurvivorshipFunction[] survFuncs = survivorShipAlgorithmParams.getSurviorShipAlgos();
Map<Integer, SurvivorshipFunction> colIdx2DefaultSurvFunc = survivorShipAlgorithmParams.getDefaultSurviorshipRules();
// Starts at -1 because it is incremented at the top of each iteration, also for the null rules that are skipped.
int matchRuleIdx = -1;
List<List<Map<String, String>>> multiRules = analysisMatchRecordGrouping.getMultiMatchRules();
for (List<Map<String, String>> matchrule : multiRules) {
matchRuleIdx++;
if (matchrule == null) {
continue;
}
SurvivorshipFunction[] surFuncsInMatcher = new SurvivorshipFunction[matchrule.size()];
int idx = 0;
for (Map<String, String> mkDef : matchrule) {
String matcherType = mkDef.get(IRecordGrouping.MATCHING_TYPE);
if (AttributeMatcherType.DUMMY.name().equals(matcherType)) {
// Find the func from default survivorship rule.
surFuncsInMatcher[idx] = colIdx2DefaultSurvFunc.get(Integer.valueOf(mkDef.get(IRecordGrouping.COLUMN_IDX)));
if (surFuncsInMatcher[idx] == null) {
// Use CONCATENATE by default if not specified .
surFuncsInMatcher[idx] = survivorShipAlgorithmParams.new SurvivorshipFunction();
surFuncsInMatcher[idx].setSurvivorShipAlgoEnum(SurvivorShipAlgorithmEnum.CONCATENATE);
// MOD TDQ-11774 set a default parameter
surFuncsInMatcher[idx].setParameter(SurvivorshipUtils.DEFAULT_CONCATENATE_PARAMETER);
}
} else {
// Find the func from existing survivorship rule list: the one whose survivorship key equals the name of
// the match key. The slot stays null when there is none.
for (SurvivorshipFunction survFunc : survFuncs) {
String keyName = mkDef.get(IRecordGrouping.MATCH_KEY_NAME);
if (keyName.equals(survFunc.getSurvivorShipKey())) {
surFuncsInMatcher[idx] = survFunc;
break;
}
}
}
idx++;
}
// Add the funcs to a specific record matcher. NOTE that the index of matcher must be coincidence to the
// index of match rule.
survAlgos.put(combinedRecordMatcher.getMatchers().get(matchRuleIdx), surFuncsInMatcher);
}
survivorShipAlgorithmParams.setSurvivorshipAlgosMap(survAlgos);
return survivorShipAlgorithmParams;
}
/**
 * Creates a new survivorship function from the given default survivorship definition (parameter and algorithm
 * type) and stores it in {@code defaultSurvRules} under the index of the given column.
 *
 * @param columnMap the analysed columns mapped to their index
 * @param survivorShipAlgorithmParams the parameters instance owning the new function
 * @param defaultSurvRules the map receiving the new function, keyed by column index
 * @param metaColumn the column the function applies to
 * @param defSurvDef the default survivorship definition to read from
 */
private static void putNewSurvFunc(Map<MetadataColumn, String> columnMap,
        SurvivorShipAlgorithmParams survivorShipAlgorithmParams, Map<Integer, SurvivorshipFunction> defaultSurvRules,
        MetadataColumn metaColumn, DefaultSurvivorshipDefinition defSurvDef) {
    SurvivorshipFunction defaultFunction = survivorShipAlgorithmParams.new SurvivorshipFunction();
    defaultFunction.setParameter(defSurvDef.getFunction().getAlgorithmParameters());
    defaultFunction.setSurvivorShipAlgoEnum(
            SurvivorShipAlgorithmEnum.getTypeBySavedValue(defSurvDef.getFunction().getAlgorithmType()));
    // The column map stores the index as text.
    Integer columnIndex = Integer.valueOf(columnMap.get(metaColumn));
    defaultSurvRules.put(columnIndex, defaultFunction);
}
/**
 * Aligns the current rule matcher with the rule matchers already added to the grouping, so that all of them hold a
 * match key for the same columns. Missing columns are filled with a match key of type
 * {@code AttributeMatcherType.DUMMY}.
 *
 * @param analysisMatchRecordGrouping the grouping holding the rule matchers added so far
 * @param allColumnIndice the column indices known so far; the new ones of the current rule are appended
 * @param currentColumnIndice the column indices of the current rule
 * @param currentRuleMatcher the match keys of the current rule; dummy match keys may be inserted
 */
private static void refineMatcherWithDummy(AnalysisMatchRecordGrouping analysisMatchRecordGrouping,
        List<String> allColumnIndice, List<String> currentColumnIndice, List<Map<String, String>> currentRuleMatcher) {
    List<List<Map<String, String>>> previousRules = analysisMatchRecordGrouping.getMultiMatchRules();
    // A column seen for the first time in the current rule gets a dummy match key in every previous rule.
    for (Map<String, String> currentKey : currentRuleMatcher) {
        String columnIndex = currentKey.get(IRecordGrouping.COLUMN_IDX);
        if (allColumnIndice.contains(columnIndex)) {
            continue;
        }
        allColumnIndice.add(columnIndex);
        // The same dummy map instance is shared by all the previous rules.
        Map<String, String> dummyKey = newDummyMatchKey(columnIndex);
        for (List<Map<String, String>> previousRule : previousRules) {
            addMatchKeyOrderbyColumnIdx(previousRule, dummyKey);
        }
    }
    if (previousRules == null || previousRules.isEmpty()) {
        return;
    }
    // A column of the previous rules that the current rule does not use gets a dummy match key in the current
    // rule. Index 0 is safe because all the previous matchers have the same size and the same match keys.
    for (Map<String, String> previousKey : previousRules.get(0)) {
        String columnIndex = previousKey.get(IRecordGrouping.COLUMN_IDX);
        if (!currentColumnIndice.contains(columnIndex)) {
            addMatchKeyOrderbyColumnIdx(currentRuleMatcher, newDummyMatchKey(columnIndex));
        }
    }
}

/**
 * Creates a match key map of type {@code AttributeMatcherType.DUMMY} for the given column index.
 *
 * @param columnIndex the column index stored in the map
 * @return a new map holding the column index and the dummy matching type
 */
private static Map<String, String> newDummyMatchKey(String columnIndex) {
    Map<String, String> dummyKey = new HashMap<String, String>();
    dummyKey.put(IRecordGrouping.COLUMN_IDX, columnIndex);
    dummyKey.put(IRecordGrouping.MATCHING_TYPE, AttributeMatcherType.DUMMY.name());
    return dummyKey;
}
/**
 * Creates the match key map of a match key definition. A match key of type {@code AttributeMatcherType.CUSTOM}
 * gets the full jar path computed from its algorithm parameters and no tokenization type; any other match key gets
 * no jar path and the tokenization type of the definition.
 *
 * @param columnMap the analysed columns mapped to their index
 * @param matcher the match rule owning the match key (provides the match interval)
 * @param matchDef the match key definition
 * @param algorithmType the algorithm type of the match key definition
 * @return the match key map
 */
private static Map<String, String> getMatchKeyMap(Map<MetadataColumn, String> columnMap, MatchRule matcher,
        MatchKeyDefinition matchDef, String algorithmType) {
    boolean customMatcher = AttributeMatcherType.get(algorithmType) == AttributeMatcherType.CUSTOM;
    String algorithmParameters = matchDef.getAlgorithm().getAlgorithmParameters();
    String jarPath = customMatcher ? CustomAttributeMatcherHelper.getFullJarPath(algorithmParameters) : null;
    String tokenizationType = customMatcher ? null : matchDef.getTokenizationType();
    // The column name is used both as the column to look up and as the attribute name.
    return getMatchKeyMap(matchDef.getColumn(), algorithmType, algorithmParameters, matchDef.getConfidenceWeight(),
            matchDef.getThreshold(), columnMap, matcher.getMatchInterval(), matchDef.getColumn(), matchDef.getName(),
            matchDef.getHandleNull(), jarPath, tokenizationType);
}
/**
 * Inserts the match key into the list so that the list stays ordered by ascending column index: the key is placed
 * before the first existing key with a greater column index, or at the end when there is none.
 *
 * @param currentRuleMatcher the match keys of a rule, ordered by column index
 * @param matchKeyMap the match key to insert
 */
private static void addMatchKeyOrderbyColumnIdx(List<Map<String, String>> currentRuleMatcher, Map<String, String> matchKeyMap) {
    for (int position = 0; position < currentRuleMatcher.size(); position++) {
        int existingColumnIndex = Integer.parseInt(currentRuleMatcher.get(position).get(IRecordGrouping.COLUMN_IDX));
        int newColumnIndex = Integer.parseInt(matchKeyMap.get(IRecordGrouping.COLUMN_IDX));
        if (existingColumnIndex > newColumnIndex) {
            currentRuleMatcher.add(position, matchKeyMap);
            return;
        }
    }
    // No existing key has a greater column index (or the list is empty): append.
    currentRuleMatcher.add(matchKeyMap);
}
}