// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dq.analysis.match;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.talend.commons.exception.BusinessException;
import org.talend.core.model.metadata.builder.connection.MetadataColumn;
import org.talend.dataquality.indicators.columnset.BlockKeyIndicator;
import org.talend.dataquality.indicators.columnset.RecordMatchingIndicator;
import org.talend.dataquality.matchmerge.Attribute;
import org.talend.dataquality.matchmerge.Record;
import org.talend.dataquality.record.linkage.constant.RecordMatcherType;
import org.talend.dataquality.record.linkage.genkey.AbstractGenerateKey;
import org.talend.dataquality.record.linkage.grouping.AnalysisMatchRecordGrouping;
import org.talend.dataquality.record.linkage.grouping.MatchGroupResultConsumer;
import org.talend.dataquality.record.linkage.grouping.swoosh.AnalysisSwooshMatchRecordGrouping;
import org.talend.dataquality.record.linkage.grouping.swoosh.RichRecord;
import org.talend.dataquality.record.linkage.utils.MatchAnalysisConstant;
import org.talend.dq.analysis.AnalysisRecordGroupingUtils;

/**
 * For each record: find its related block, and use this block's matching to do the match.
 * Used for running the match analysis.
 * <p>
 * Records are pulled one by one from the supplied iterator. A block key is generated for every record, and the
 * record is handed to the {@link OneBlockMatching} instance registered for that key (created on first use). Once
 * the iterator is exhausted, every block is ended and the distribution of block sizes is stored in the
 * {@link BlockKeyIndicator}.
 */
public class BlockAndMatchManager {

    /** Block key -> the matching instance handling all records that share this key. */
    private final Map<String, OneBlockMatching> blockMatchMap = new HashMap<String, OneBlockMatching>();

    // Used to read each record from the data source
    private final Iterator<Record> resultIterator;

    private final MatchGroupResultConsumer matchResultConsumer;

    private final RecordMatchingIndicator recordMatchingIndicator;

    private final BlockKeyIndicator blockKeyIndicator;

    private final Map<MetadataColumn, String> columnMap;

    private final AbstractGenerateKey blockKeyGenerator = new AbstractGenerateKey();

    /** Column name -> column index (kept as a string, exactly as it is given in {@link #columnMap}). */
    private final Map<String, String> colName2IndexMap = new HashMap<String, String>();

    /** Not final: it is (re)assigned by the public {@link #initBlockKeyDefinitions()}. */
    private List<Map<String, String>> blockKeyDefinition;

    /**
     * Creates the manager and initializes the block key definitions.
     *
     * @param resultIterator iterator over the records to match; elements are cast to {@link RichRecord} in
     * {@link #run()}
     * @param matchResultConsumer consumer passed to the grouping created for each block
     * @param columnMap maps each column to its index; the index string is parsed with
     * {@link Integer#parseInt(String)} when a block key is computed
     * @param recordMatchingIndicator indicator providing the match rule definition and the block key schema
     * @param blockKeyIndicator indicator which receives the block size frequencies at the end of {@link #run()}
     */
    public BlockAndMatchManager(Iterator<Record> resultIterator, MatchGroupResultConsumer matchResultConsumer,
            Map<MetadataColumn, String> columnMap, RecordMatchingIndicator recordMatchingIndicator,
            BlockKeyIndicator blockKeyIndicator) {
        this.resultIterator = resultIterator;
        this.matchResultConsumer = matchResultConsumer;
        this.recordMatchingIndicator = recordMatchingIndicator;
        this.blockKeyIndicator = blockKeyIndicator;
        this.columnMap = columnMap;
        // entrySet() avoids a second lookup for every key
        for (Map.Entry<MetadataColumn, String> entry : columnMap.entrySet()) {
            colName2IndexMap.put(entry.getKey().getName(), entry.getValue());
        }
        initBlockKeyDefinitions();
    }

    /**
     * Creates the applied block key on the record matching indicator and loads the block key schema used to
     * generate the block key of each record.
     */
    public void initBlockKeyDefinitions() {
        // By default for analysis, the applied blocking key will be the key from key generation definition. This
        // will be refined when there is a need to define the applied blocking key manually by user later.
        AnalysisRecordGroupingUtils.createAppliedBlockKeyByGenKey(recordMatchingIndicator);
        blockKeyDefinition = AnalysisRecordGroupingUtils.getBlockKeySchema(recordMatchingIndicator);
    }

    /**
     * Reads all records from the iterator, groups each of them inside its own block, then ends all the blocks.
     * {@code null} records are skipped.
     *
     * @throws BusinessException if the grouping of a block cannot be created, fed or ended
     */
    public void run() throws BusinessException {
        while (this.resultIterator.hasNext()) {
            RichRecord record = (RichRecord) this.resultIterator.next();
            if (record == null) {
                continue;
            }
            // get the block key of this record(which block this record belongs)
            String blockKey = getBlockKey(record);
            OneBlockMatching matchInOneBlock = this.blockMatchMap.get(blockKey);
            if (matchInOneBlock == null) {
                // create a new matching for this new block to do the matching
                matchInOneBlock = createBlockKeyManager(blockKey);
            }
            // do group in the same block
            matchInOneBlock.run(record);
        }
        // end all
        endAll();
    }

    /**
     * end all blocks matching, and set each block size into the block indicator
     *
     * @throws BusinessException if ending one of the blocks fails
     */
    private void endAll() throws BusinessException {
        TreeMap<Object, Long> blockSize2Freq = new TreeMap<Object, Long>();
        // only the values are needed, so there is no need to go through the keys
        for (OneBlockMatching oneBlockMatching : this.blockMatchMap.values()) {
            oneBlockMatching.end();
            Long blockSize = Long.valueOf(oneBlockMatching.getBlockSize());
            Long freq = blockSize2Freq.get(blockSize);
            if (freq == null) {
                freq = 0L;
            }
            blockSize2Freq.put(blockSize, freq + 1);
        }
        blockKeyIndicator.setBlockSize2frequency(blockSize2Freq);
    }

    /**
     * create the processing manager of one block and register it under the given block key
     *
     * @param blockKey the key of the new block
     * @return the newly created matching of this block
     * @throws BusinessException if the grouping algorithm of the block cannot be initialized
     */
    private OneBlockMatching createBlockKeyManager(String blockKey) throws BusinessException {
        OneBlockMatching matchInOneBlock = new OneBlockMatching();
        this.blockMatchMap.put(blockKey, matchInOneBlock);
        return matchInOneBlock;
    }

    /**
     * get the block key of one record. The generated key is also appended to the record as a new attribute
     * labelled {@link MatchAnalysisConstant#BLOCK_KEY}.
     *
     * @param record the record to compute the key for; it is modified (one attribute is added)
     * @return the generated block key
     * @throws NumberFormatException if a column index of the column map is not a parsable integer
     */
    private String getBlockKey(RichRecord record) {
        List<Attribute> attributes = record.getAttributes();
        Map<String, String> columnValueMap = new HashMap<String, String>();
        for (Map.Entry<String, String> entry : colName2IndexMap.entrySet()) {
            int index = Integer.parseInt(entry.getValue());
            columnValueMap.put(entry.getKey(), attributes.get(index).getValue());
        }
        String blockkey = blockKeyGenerator.getGenKey(blockKeyDefinition, columnValueMap);
        // add the block key into the record, as its last attribute
        Attribute attribute = new Attribute(MatchAnalysisConstant.BLOCK_KEY, attributes.size());
        attribute.setValue(blockkey);
        attributes.add(attribute);
        return blockkey;
    }

    /**
     * the match grouping within one block.
     */
    class OneBlockMatching {

        private final AnalysisMatchRecordGrouping matchRecordGrouping;

        /** Number of records successfully grouped in this block. */
        private long blockSize = 0;

        /**
         * Creates the grouping of this block: a swoosh grouping when the built-in match rule definition selects
         * the swoosh algorithm, the default grouping otherwise.
         *
         * @throws BusinessException if the grouping cannot be initialized
         */
        public OneBlockMatching() throws BusinessException {
            String algorithm = recordMatchingIndicator.getBuiltInMatchRuleDefinition().getRecordLinkageAlgorithm();
            // compare from the constant side so that an unset algorithm falls back to the default grouping
            // instead of failing with a NullPointerException
            if (RecordMatcherType.T_SwooshAlgorithm.name().equals(algorithm)) {
                matchRecordGrouping = new AnalysisSwooshMatchRecordGrouping(matchResultConsumer);
            } else {
                matchRecordGrouping = new AnalysisMatchRecordGrouping(matchResultConsumer);
            }
            initGrouping(matchRecordGrouping);
        }

        /**
         * Init the algorithm
         *
         * @param recordGrouping the grouping to configure
         * @throws BusinessException wrapping the reflection failure raised while initializing the grouping
         */
        private void initGrouping(AnalysisMatchRecordGrouping recordGrouping) throws BusinessException {
            AnalysisRecordGroupingUtils.setRuleMatcher(columnMap, recordMatchingIndicator, recordGrouping);
            try {
                AnalysisRecordGroupingUtils.initialMatchGrouping(columnMap, recordMatchingIndicator, recordGrouping);
            } catch (InstantiationException e1) {
                // keep the original exception as the cause so that the failure can be diagnosed
                throw new BusinessException(e1);
            } catch (IllegalAccessException e1) {
                throw new BusinessException(e1);
            } catch (ClassNotFoundException e1) {
                throw new BusinessException(e1);
            }
        }

        /**
         * Group on one record, and count it in the size of this block when the grouping succeeded.
         *
         * @param currentRecord the record to add to this block
         * @throws BusinessException wrapping the I/O failure or the interruption raised by the grouping
         */
        public void run(RichRecord currentRecord) throws BusinessException {
            try {
                matchRecordGrouping.doGroup(currentRecord);
                blockSize++;
            } catch (IOException e) {
                throw new BusinessException(e);
            } catch (InterruptedException e) {
                // restore the interrupt flag: it is cleared when InterruptedException is thrown
                Thread.currentThread().interrupt();
                throw new BusinessException(e);
            }
        }

        /**
         * Ends the grouping of this block.
         *
         * @throws BusinessException wrapping the I/O failure or the interruption raised by the grouping
         */
        public void end() throws BusinessException {
            try {
                matchRecordGrouping.end();
            } catch (IOException e) {
                throw new BusinessException(e);
            } catch (InterruptedException e) {
                // restore the interrupt flag: it is cleared when InterruptedException is thrown
                Thread.currentThread().interrupt();
                throw new BusinessException(e);
            }
        }

        /**
         * @return the number of records grouped in this block so far
         */
        public long getBlockSize() {
            return blockSize;
        }
    }
}