// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dq.analysis;

import static org.junit.Assert.*;

import java.net.URL;
import java.util.Map;

import org.eclipse.core.runtime.NullProgressMonitor;
import org.junit.Before;
import org.junit.Test;
import org.talend.core.model.metadata.builder.connection.ConnectionPackage;
import org.talend.core.model.metadata.builder.connection.DelimitedFileConnection;
import org.talend.core.model.metadata.builder.connection.MetadataColumn;
import org.talend.core.model.metadata.builder.connection.MetadataTable;
import org.talend.cwm.helper.TaggedValueHelper;
import org.talend.dataquality.analysis.Analysis;
import org.talend.dataquality.analysis.AnalysisContext;
import org.talend.dataquality.analysis.AnalysisPackage;
import org.talend.dataquality.analysis.AnalysisParameters;
import org.talend.dataquality.analysis.AnalysisResult;
import org.talend.dataquality.indicators.columnset.BlockKeyIndicator;
import org.talend.dataquality.indicators.columnset.ColumnsetPackage;
import org.talend.dataquality.indicators.columnset.RecordMatchingIndicator;
import org.talend.dataquality.record.linkage.constant.AttributeMatcherType;
import org.talend.dataquality.record.linkage.utils.BlockingKeyPreAlgorithmEnum;
import org.talend.dataquality.rules.AlgorithmDefinition;
import org.talend.dataquality.rules.BlockKeyDefinition;
import org.talend.dataquality.rules.MatchKeyDefinition;
import org.talend.dataquality.rules.MatchRule;
import org.talend.dataquality.rules.MatchRuleDefinition;
import org.talend.dataquality.rules.RulesPackage;
import org.talend.dq.helper.UnitTestBuildHelper;

/**
 * Tests {@link MatchAnalysisExecutor} against the "match_test_data" delimited file resource, using a single
 * Levenshtein match key on the "name" column and, in the last scenario, an additional blocking key on "country".
 *
 * created by zhao on Aug 28, 2013
 */
public class MatchAnalysisExecutorTest {

    private DelimitedFileConnection delimitedFileconnection = null;

    private MetadataTable metadataTable = null;

    private MetadataColumn name = null;

    /**
     * Creates a fresh delimited file connection before each test.
     */
    @Before
    public void setUp() throws Exception {
        delimitedFileconnection = ConnectionPackage.eINSTANCE.getConnectionFactory().createDelimitedFileConnection();
    }

    /**
     * Test method for
     * {@link org.talend.dq.analysis.MatchAnalysisExecutor#execute(org.talend.dataquality.analysis.Analysis)}.
     *
     * Runs four scenarios in sequence on the same analysis, changing only the match interval, the group quality
     * threshold and the presence of a blocking key.
     */
    @SuppressWarnings("nls")
    @Test
    public void testExecute() {
        MatchAnalysisExecutor matchAnalysisExecutor = new MatchAnalysisExecutor();
        Analysis analysis = AnalysisPackage.eINSTANCE.getAnalysisFactory().createAnalysis();
        AnalysisContext context = AnalysisPackage.eINSTANCE.getAnalysisFactory().createAnalysisContext();
        analysis.setContext(context);
        AnalysisParameters params = AnalysisPackage.eINSTANCE.getAnalysisFactory().createAnalysisParameters();
        analysis.setParameters(params);
        TaggedValueHelper.setTaggedValue(analysis, TaggedValueHelper.PREVIEW_ROW_NUMBER, String.valueOf(100));

        context.setConnection(delimitedFileconnection);
        URL fileUrl = this.getClass().getResource("match_test_data"); //$NON-NLS-1$
        // Fail here with a clear message instead of somewhere inside the helper when the fixture is missing.
        assertNotNull("Test resource 'match_test_data' was not found next to the test class", fileUrl);
        metadataTable = UnitTestBuildHelper.getDefault().initFileConnection(fileUrl, delimitedFileconnection);
        this.name = UnitTestBuildHelper.getDefault().initColumns(context, this.metadataTable);

        // Scenario 1
        // - Match key: name, no block key, levenshtein attribute algorithm. groupQualityThreshold = 0.9d,
        // matchInterval = 0.95d .
        double groupQualityThreshold = 0.9d;
        double matchInterval = 0.95d;
        assertScenario1(matchAnalysisExecutor, analysis, name, "name", groupQualityThreshold, matchInterval);

        // Scenario 2
        // - Same to scenario 1, EXCEPT matchInterval = 0.8d .
        matchInterval = 0.8d;
        assertScenario2(matchAnalysisExecutor, analysis, name, "name", groupQualityThreshold, matchInterval);

        // Scenario 3
        // - Same to scenario 2, EXCEPT groupQualityThreshold = 0.95d.
        groupQualityThreshold = 0.95d;
        assertScenario3(matchAnalysisExecutor, analysis, name, "name", groupQualityThreshold, matchInterval);

        // Scenario 4
        // - Same to scenario 3, EXCEPT a new blocking key = country.
        assertScenario4(matchAnalysisExecutor, analysis, name, "name", groupQualityThreshold, matchInterval);
    }

    /**
     * Scenario 1: match key on the name column only, no blocking key.
     *
     * @param matchAnalysisExecutor the executor under test
     * @param analysis the analysis to execute; its results are replaced by {@link #executeAnalysis}
     * @param name the column set as the analyzed element of the match indicator
     * @param nameVar the value used for both the match key name and the match key column
     * @param groupQualityThreshold the group quality threshold of the match rule definition
     * @param matchInterval the match interval of the match rule
     */
    private void assertScenario1(MatchAnalysisExecutor matchAnalysisExecutor, Analysis analysis, MetadataColumn name,
            String nameVar, double groupQualityThreshold, double matchInterval) {
        RecordMatchingIndicator matchIndicator = createNameMatchIndicator(name, nameVar, groupQualityThreshold,
                matchInterval);
        executeAnalysis(matchAnalysisExecutor, analysis, matchIndicator);

        // Assert group size and frequency.
        Map<Object, Long> size2Frequency = matchIndicator.getGroupSize2groupFrequency();
        assertGroupFrequency(size2Frequency, 4, 1L); // For 4 -> "seb"
        assertGroupFrequency(size2Frequency, 1, 4L); // For 1 -> "Sebastião","babass","nico","nicola"
        assertGroupFrequency(size2Frequency, 2, 3L); // For 2 -> "sebas","nicolas","nigula"

        // Assert row count, matched records and suspect records.
        assertRecordCounts(matchIndicator, 14L, 10L, 0L);
    }

    /**
     * Scenario 2: same as scenario 1 but executed with a different match interval (see the caller).
     *
     * @param matchAnalysisExecutor the executor under test
     * @param analysis the analysis to execute; its results are replaced by {@link #executeAnalysis}
     * @param name the column set as the analyzed element of the match indicator
     * @param nameVar the value used for both the match key name and the match key column
     * @param groupQualityThreshold the group quality threshold of the match rule definition
     * @param matchInterval the match interval of the match rule
     */
    private void assertScenario2(MatchAnalysisExecutor matchAnalysisExecutor, Analysis analysis, MetadataColumn name,
            String nameVar, double groupQualityThreshold, double matchInterval) {
        RecordMatchingIndicator matchIndicator = createNameMatchIndicator(name, nameVar, groupQualityThreshold,
                matchInterval);
        executeAnalysis(matchAnalysisExecutor, analysis, matchIndicator);

        // Assert group size and frequency.
        Map<Object, Long> size2Frequency = matchIndicator.getGroupSize2groupFrequency();
        assertGroupFrequency(size2Frequency, 4, 1L); // For 4 -> "seb"
        assertGroupFrequency(size2Frequency, 1, 3L); // For 1 -> "Sebastião","babass","nico"
        assertGroupFrequency(size2Frequency, 3, 1L); // For 3 -> "nicolas"("nicola")
        assertGroupFrequency(size2Frequency, 2, 2L); // For 2 -> "sebas","nigula"

        // Assert row count, matched records and suspect records.
        assertRecordCounts(matchIndicator, 14L, 11L, 0L);
    }

    /**
     * Scenario 3: same as scenario 2 but executed with a different group quality threshold (see the caller).
     *
     * @param matchAnalysisExecutor the executor under test
     * @param analysis the analysis to execute; its results are replaced by {@link #executeAnalysis}
     * @param name the column set as the analyzed element of the match indicator
     * @param nameVar the value used for both the match key name and the match key column
     * @param groupQualityThreshold the group quality threshold of the match rule definition
     * @param matchInterval the match interval of the match rule
     */
    private void assertScenario3(MatchAnalysisExecutor matchAnalysisExecutor, Analysis analysis, MetadataColumn name,
            String nameVar, double groupQualityThreshold, double matchInterval) {
        RecordMatchingIndicator matchIndicator = createNameMatchIndicator(name, nameVar, groupQualityThreshold,
                matchInterval);
        executeAnalysis(matchAnalysisExecutor, analysis, matchIndicator);

        // Assert group size and frequency.
        Map<Object, Long> size2Frequency = matchIndicator.getGroupSize2groupFrequency();
        assertGroupFrequency(size2Frequency, 4, 1L); // For 4 -> "seb"
        assertGroupFrequency(size2Frequency, 1, 3L); // For 1 -> "Sebastião","babass","nico"
        assertGroupFrequency(size2Frequency, 3, 1L); // For 3 -> "nicolas"("nicola")
        assertGroupFrequency(size2Frequency, 2, 2L); // For 2 -> "sebas","nigula"

        // Assert row count, matched records and suspect records.
        // Suspect 3 -> "nicolas"("nicola"), group score: 0.9 < 0.95
        assertRecordCounts(matchIndicator, 14L, 8L, 3L);
    }

    /**
     * Scenario 4: same match rule as scenario 3, plus a blocking key on the "country" column.
     *
     * @param matchAnalysisExecutor the executor under test
     * @param analysis the analysis to execute; its results are replaced by {@link #executeAnalysis}
     * @param name the column set as the analyzed element of the match indicator
     * @param nameVar the value used for both the match key name and the match key column
     * @param groupQualityThreshold the group quality threshold of the match rule definition
     * @param matchInterval the match interval of the match rule
     */
    @SuppressWarnings("nls")
    private void assertScenario4(MatchAnalysisExecutor matchAnalysisExecutor, Analysis analysis, MetadataColumn name,
            String nameVar, double groupQualityThreshold, double matchInterval) {
        // Match key: name, levenshtein attribute algorithm.
        RecordMatchingIndicator matchIndicator = createNameMatchIndicator(name, nameVar, groupQualityThreshold,
                matchInterval);

        // Add a blocking key: country
        BlockKeyDefinition blockKeyDef = RulesPackage.eINSTANCE.getRulesFactory().createBlockKeyDefinition();
        AlgorithmDefinition algoDef = RulesPackage.eINSTANCE.getRulesFactory().createAlgorithmDefinition();
        algoDef.setAlgorithmType(AttributeMatcherType.EXACT.name());
        blockKeyDef.setAlgorithm(algoDef);
        blockKeyDef.setColumn("country");
        blockKeyDef.setName("country");
        // Pre and post algorithms are both set to the NON_ALGO component value.
        AlgorithmDefinition dummyAlgoPre = RulesPackage.eINSTANCE.getRulesFactory().createAlgorithmDefinition();
        dummyAlgoPre.setAlgorithmType(BlockingKeyPreAlgorithmEnum.NON_ALGO.getComponentValueName());
        blockKeyDef.setPreAlgorithm(dummyAlgoPre);
        AlgorithmDefinition dummyAlgoPost = RulesPackage.eINSTANCE.getRulesFactory().createAlgorithmDefinition();
        dummyAlgoPost.setAlgorithmType(BlockingKeyPreAlgorithmEnum.NON_ALGO.getComponentValueName());
        blockKeyDef.setPostAlgorithm(dummyAlgoPost);
        matchIndicator.getBuiltInMatchRuleDefinition().getBlockKeys().add(blockKeyDef);

        executeAnalysis(matchAnalysisExecutor, analysis, matchIndicator);

        // Assert group size and frequency.
        Map<Object, Long> size2Frequency = matchIndicator.getGroupSize2groupFrequency();
        // For 1 -> FR(4)"babass","Sebastião","nicolas","nigula" CN(2)"nigula","nico"
        assertGroupFrequency(size2Frequency, 1, 6L);
        assertGroupFrequency(size2Frequency, 2, 2L); // For 2 -> FR(1)"sebas", CN(1)"nicolas"("nicola")
        assertGroupFrequency(size2Frequency, 4, 1L); // For 4 -> FR(4)"seb"

        // Assert row count, matched records and suspect records.
        // Matched 6 -> FR 4*"seb", FR 2 *"sebas"
        // Suspect 2 -> CN "nicolas"("nicola"), group score: 0.9 < 0.95
        assertRecordCounts(matchIndicator, 14L, 6L, 2L);
    }

    /**
     * Creates a record matching indicator whose analyzed element is the given column and whose built-in match rule
     * definition holds one Levenshtein match key (see {@link #createMatchIndicatorWithOneMathRule}).
     *
     * @param name the column set as the analyzed element
     * @param nameVar the value used for both the match key name and the match key column
     * @param groupQualityThreshold the group quality threshold of the match rule definition
     * @param matchInterval the match interval of the match rule
     * @return the configured indicator, not yet attached to any analysis
     */
    private RecordMatchingIndicator createNameMatchIndicator(MetadataColumn name, String nameVar,
            double groupQualityThreshold, double matchInterval) {
        RecordMatchingIndicator matchIndicator = ColumnsetPackage.eINSTANCE.getColumnsetFactory()
                .createRecordMatchingIndicator();
        matchIndicator.setAnalyzedElement(name);
        createMatchIndicatorWithOneMathRule(nameVar, matchIndicator, groupQualityThreshold, matchInterval);
        return matchIndicator;
    }

    /**
     * Asserts how many groups of the given size were found.
     *
     * The map is looked up with the decimal string form of the group size. A missing entry is reported as an
     * assertion failure rather than surfacing as a NullPointerException from auto-unboxing.
     *
     * @param size2Frequency the group-size to group-frequency map read from the indicator
     * @param groupSize the group size to look up
     * @param expectedFrequency the expected number of groups of that size
     */
    @SuppressWarnings("nls")
    private static void assertGroupFrequency(Map<Object, Long> size2Frequency, int groupSize, long expectedFrequency) {
        Long actualFrequency = size2Frequency.get(String.valueOf(groupSize));
        assertNotNull("No frequency recorded for group size " + groupSize, actualFrequency);
        assertEquals("Unexpected number of groups of size " + groupSize, expectedFrequency,
                actualFrequency.longValue());
    }

    /**
     * Asserts the row count, matched record count and suspect record count of the indicator.
     *
     * @param matchIndicator the indicator to inspect after execution
     * @param expectedRows the expected value of {@code getCount()}
     * @param expectedMatched the expected value of {@code getMatchedRecordCount()}
     * @param expectedSuspect the expected value of {@code getSuspectRecordCount()}
     */
    @SuppressWarnings("nls")
    private static void assertRecordCounts(RecordMatchingIndicator matchIndicator, long expectedRows,
            long expectedMatched, long expectedSuspect) {
        // Assigning to long locals keeps the comparison primitive whatever integral type the getters return.
        long actualRows = matchIndicator.getCount();
        long actualMatched = matchIndicator.getMatchedRecordCount();
        long actualSuspect = matchIndicator.getSuspectRecordCount();
        assertEquals("Unexpected row count", expectedRows, actualRows);
        assertEquals("Unexpected matched record count", expectedMatched, actualMatched);
        assertEquals("Unexpected suspect record count", expectedSuspect, actualSuspect);
    }

    /**
     * Attaches a fresh result (holding the match indicator and a new block key indicator) to the analysis and
     * executes it with a {@link NullProgressMonitor}.
     *
     * @param matchAnalysisExecutor the executor under test
     * @param analysis the analysis to execute; any previous results are replaced
     * @param matchIndicator the match indicator to add to the new results
     */
    private void executeAnalysis(MatchAnalysisExecutor matchAnalysisExecutor, Analysis analysis,
            RecordMatchingIndicator matchIndicator) {
        BlockKeyIndicator blockKeyIndicator = ColumnsetPackage.eINSTANCE.getColumnsetFactory().createBlockKeyIndicator();

        AnalysisResult anaResult = AnalysisPackage.eINSTANCE.getAnalysisFactory().createAnalysisResult();
        anaResult.setResultMetadata(AnalysisPackage.eINSTANCE.getAnalysisFactory().createExecutionInformations());
        analysis.setResults(anaResult);
        analysis.getResults().getIndicators().add(matchIndicator);
        analysis.getResults().getIndicators().add(blockKeyIndicator);
        matchAnalysisExecutor.setMonitor(new NullProgressMonitor());
        matchAnalysisExecutor.execute(analysis);
    }

    /**
     * Installs on the indicator a built-in match rule definition made of one match rule ("match rule 1") with one
     * Levenshtein match key of confidence weight 1.
     *
     * @param nameVar the value used for both the match key name and the match key column
     * @param matchIndicator the indicator that receives the match rule definition
     * @param groupQualityThreshold the group quality threshold of the match rule definition
     * @param matchInterval the match interval of the match rule
     */
    @SuppressWarnings("nls")
    private void createMatchIndicatorWithOneMathRule(String nameVar, RecordMatchingIndicator matchIndicator,
            double groupQualityThreshold, double matchInterval) {
        MatchRuleDefinition matchRuleDefinition = RulesPackage.eINSTANCE.getRulesFactory().createMatchRuleDefinition();
        matchRuleDefinition.setMatchGroupQualityThreshold(groupQualityThreshold);

        MatchRule matchRule = RulesPackage.eINSTANCE.getRulesFactory().createMatchRule();
        matchRule.setMatchInterval(matchInterval);
        matchRule.setName("match rule 1");

        MatchKeyDefinition matchkeyDef = RulesPackage.eINSTANCE.getRulesFactory().createMatchKeyDefinition();
        matchkeyDef.setName(nameVar);
        matchkeyDef.setColumn(nameVar);
        AlgorithmDefinition algoDef = RulesPackage.eINSTANCE.getRulesFactory().createAlgorithmDefinition();
        algoDef.setAlgorithmType(AttributeMatcherType.LEVENSHTEIN.name());
        matchkeyDef.setAlgorithm(algoDef);
        matchkeyDef.setConfidenceWeight(1);

        matchRule.getMatchKeys().add(matchkeyDef);
        matchRuleDefinition.getMatchRules().add(matchRule);
        matchIndicator.setBuiltInMatchRuleDefinition(matchRuleDefinition);
    }
}