package com.thinkbiganalytics.ingest;
/*-
* #%L
* thinkbig-nifi-core-processors
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.klarna.hiverunner.HiveShell;
import com.klarna.hiverunner.StandaloneHiveRunner;
import com.klarna.hiverunner.annotations.HiveProperties;
import com.klarna.hiverunner.annotations.HiveRunnerSetup;
import com.klarna.hiverunner.annotations.HiveSQL;
import com.klarna.hiverunner.config.HiveRunnerConfig;
import com.thinkbiganalytics.hive.util.HiveUtils;
import com.thinkbiganalytics.util.ColumnSpec;
import com.thinkbiganalytics.util.PartitionBatch;
import com.thinkbiganalytics.util.PartitionSpec;
import org.apache.commons.collections4.MapUtils;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@RunWith(StandaloneHiveRunner.class)
public class TableMergeSyncSupportTest {
/**
* Explicit test class configuration of the HiveRunner runtime. See {@link HiveRunnerConfig} for further details.
*/
@HiveRunnerSetup
public final HiveRunnerConfig CONFIG = new HiveRunnerConfig() {{
setHiveExecutionEngine("mr");
}};
private final String sourceSchema = "emp_sr";
private final String sourceTable = "employee_valid";
private final String targetSchema = "emp_sr";
private final String targetTable = "employee";
private final String targetTableNP = "employee_np";
private final String processingPartition = "20160119074340";
private final PartitionSpec spec = new PartitionSpec("country|string|country\nyear|int|year(hired)");
private final PartitionSpec specNP = new PartitionSpec("");
/**
* Cater for all the parameters in the script that we want to test. Note that the "hadoop.tmp.dir" is one of the dirs defined by the test harness
*/
@HiveProperties
public Map<String, String> hiveProperties = MapUtils.putAll(new HashMap<String, String>(), new Object[]{
"MY.HDFS.DIR", "${hadoop.tmp.dir}",
"my.schema", "bar",
});
private TableMergeSyncSupport mergeSyncSupport;
/**
* Define the script files under test. The files will be loaded in the given order. <p/> The HiveRunner instantiate and inject the HiveShell
*/
@HiveSQL(files = {
"hive-test-support/create_table.sql"
}, encoding = "UTF-8")
private HiveShell hiveShell;
@Before
public void setupSupport() throws SQLException {
this.mergeSyncSupport = new HiveShellTableMergeSyncSupport(hiveShell);
mergeSyncSupport.enableDynamicPartitions();
}
@Test
public void testPartitionBatches() {
List<PartitionBatch> batches = fetchPartitionBatches();
assertTrue(batches.size() == 4);
}
private List<PartitionBatch> fetchPartitionBatches() {
List<PartitionBatch> vBatch = new Vector<>();
String sql = spec.toDistinctSelectSQL(sourceSchema, sourceTable, processingPartition);
List<Object[]> results = hiveShell.executeStatement(sql);
for (Object[] vals : results) {
vBatch.add(new PartitionBatch((Long) vals[2], spec, new String[]{vals[0].toString(), vals[1].toString()}));
}
return vBatch;
}
private List<String> fetchEmployees(String targetSchema, String targetTable) {
return hiveShell.executeQuery("select * from " + HiveUtils.quoteIdentifier(targetSchema, targetTable));
}
@Test
/**
* Tests the sync function
*/
public void testSyncWithPartitions() throws Exception {
doTestSync(targetSchema, targetTable, spec);
}
@Test
/**
* Tests the sync function
*/
public void testSyncNonPartitioned() throws Exception {
doTestSync(targetSchema, targetTableNP, specNP);
}
private void doTestSync(String targetSchema, String targetTable, PartitionSpec spec) throws SQLException {
mergeSyncSupport.doSync(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition);
List<String> results = fetchEmployees(targetSchema, targetTable);
assertEquals(4, results.size());
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') ( `id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values "
+ "(100,'1',"
+ "'Bruce',"
+ "'ABC',"
+ "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
mergeSyncSupport.doSync(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition);
results = fetchEmployees(targetSchema, targetTable);
assertEquals(5, results.size());
}
@Test
/**
* Tests the merge with empty target table
*/
public void testMergePKWithEmptyTargetTable() throws Exception {
List<String> results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(0, results.size());
ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
// Call merge
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), processingPartition, columnSpecs);
// We should have 4 records
results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(4, results.size());
// Merge with same source should leave us with 4 records
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), processingPartition, columnSpecs);
// We should have 4 records
results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(4, results.size());
// Should update 1 and add 1
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074350') ( `id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values "
+ "(1,'1',"
+ "'NEW VALUE',"
+ "'ABC',"
+ "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074350') ( `id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values "
+ "(10010,'1',"
+ "'Bruce',"
+ "'ABC',"
+ "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
// Call merge
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), "20160119074350", columnSpecs);
// We should have 4 records
results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(5, results.size());
}
@Test
/**
* Tests the merge partition with empty target table
*/
public void testMergePartitionPKWithEmptyTargetTable() throws Exception {
List<String> results = fetchEmployees(targetSchema, targetTable);
assertEquals(0, results.size());
ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
// Call merge
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
// We should have 4 records
results = fetchEmployees(targetSchema, targetTable);
assertEquals(4, results.size());
}
@Test
/**
* Tests the merge partition without dedupe and the merge partition with dedupe
*/
public void testMergePartitionPK() throws Exception {
// Insert one record to start
hiveShell.execute(
"insert into emp_sr.employee partition(country='USA',year=2015) ( `id`, `timestamp`,`name`,`company`,`zip`,`phone`,`email`, `hired`) values (1,'1','Sally','OLD VALUE','94550',"
+ "'555-1212',"
+ "'sally@acme.org','2015-01-01');");
doTestMergePK(targetSchema, targetTable, spec);
}
@Test
/**
* Tests the merge partition without dedupe and the merge partition with dedupe
*/
public void testMergePartitionMovingPartitionPK() throws Exception {
doTestMergePKWithDifferentPartitions(targetSchema, targetTable, spec);
}
@Test
/**
* Tests the merge partition without dedupe and the merge partition with dedupe
*/
public void testMergePartition() throws Exception {
// Insert one record to start
hiveShell.execute(
"insert into emp_sr.employee partition(country='USA',year=2015) (`id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`, `processing_dttm`) values (60,'1','Billy',"
+ "'ABC','94550',"
+ "'555-1212',"
+ "'billy@acme.org','2015-01-01','20150119974340');");
// Validate one record initial test condition
List<String> results = fetchEmployees(targetSchema, targetTable);
assertEquals(1, results.size());
// Call merge
mergeSyncSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, false);
// We should have 5 records 4 from the sourceTable and 1 existing
results = fetchEmployees(targetSchema, targetTable);
assertEquals(5, results.size());
// Now create a duplicate record and ensure we don't see it twice the final table
hiveShell.execute("insert into emp_sr.employee partition(country='Canada',year=2016) (`id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`processing_dttm`) "
+ "values (100, '1', 'Bruce','ABC','94550','555-1212','bruce@acme.org','2016-01-01','20150119974340');");
mergeSyncSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, "20160119974350", true);
results = fetchEmployees(targetSchema, targetTable);
assertEquals(7, results.size());
verifyUnique(results);
}
@Test
/**
* Tests the merge partition without dedupe and the merge partition with dedupe
*/
public void testMergePartitionNoProcessingDttm() throws Exception {
String targetTable = "employeepd";
// Insert one record to start
hiveShell.execute(
"insert into emp_sr.employeepd partition(country='USA',year=2015) (`id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`) values (60,'1','Billy',"
+ "'ABC','94550',"
+ "'555-1212',"
+ "'billy@acme.org','2015-01-01');");
// Validate one record initial test condition
List<String> results = fetchEmployees(targetSchema, targetTable);
assertEquals(1, results.size());
// Call merge
mergeSyncSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, false);
// We should have 5 records 4 from the sourceTable and 1 existing
results = fetchEmployees(targetSchema, targetTable);
assertEquals(5, results.size());
// Now create a duplicate record and ensure we don't see it twice the final table
hiveShell.execute("insert into emp_sr.employeepd partition(country='Canada',year=2016) (`id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`) "
+ "values (100, '1', 'Bruce','ABC','94550','555-1212','bruce@acme.org','2016-01-01');");
mergeSyncSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, "20160119974350", true);
results = fetchEmployees(targetSchema, targetTable);
assertEquals(7, results.size());
verifyUnique(results);
}
// Verify no duplicates exist in the table
private void verifyUnique(List<String> results) {
HashSet<String> existing = new HashSet<>();
for (String r : results) {
assertFalse(existing.contains(r));
existing.add(r);
}
}
@Test
/**
* Tests the merge partition without dedupe and the merge partition with dedupe
*/
public void testMergeNonPartitioned() throws Exception {
// Insert one record to start
hiveShell.execute(
"insert into emp_sr.employee_np (`id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`, `country`) values (60, '1', 'Billy',"
+ "'ABC',"
+ "'94550',"
+ "'555-1212',"
+ "'billy@acme.org','2015-01-01', 'USA');");
List<String> results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(1, results.size());
// Call merge without dedupe
mergeSyncSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, specNP, processingPartition, false);
// We should have 5 records 4 from the sourceTable and 1 existing
results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(5, results.size());
// Now create a duplicate record and ensure we don't see it in the final table
hiveShell.execute("insert into emp_sr.employee_np (`id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`, `country`) "
+ "values (100, '1', 'Bruce','ABC','94550','555-1212','bruce@acme.org','2016-01-01', 'Canada');");
mergeSyncSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, specNP, "20160119974350", true);
results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(7, results.size());
verifyUnique(results);
}
@Test
/**
* Tests the merge partition without dedupe and the merge partition with dedupe
*/
public void testMergeNonPartitionedWithProcessingDttm() throws Exception {
String targetTableNP = "employeepd_np";
// Insert one record to start
hiveShell.execute(
"insert into emp_sr.employeepd_np (`id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`, `country`, `processing_dttm`) values (60, '1', 'Billy',"
+ "'ABC',"
+ "'94550',"
+ "'555-1212',"
+ "'billy@acme.org','2015-01-01', 'USA', '20150119974350');");
List<String> results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(1, results.size());
// Call merge without dedupe
mergeSyncSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, specNP, processingPartition, false);
// We should have 5 records 4 from the sourceTable and 1 existing
results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(5, results.size());
// Now create a duplicate record and ensure we don't see it in the final table
hiveShell.execute("insert into emp_sr.employeepd_np (`id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`, `country`, `processing_dttm`) "
+ "values (100, '1', 'Bruce','ABC','94550','555-1212','bruce@acme.org','2016-01-01', 'Canada', '20150119974350');");
mergeSyncSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, specNP, "20160119974350", true);
results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(7, results.size());
verifyUnique(results);
}
@Test
/**
* Test Rolling Sync.
*/
public void testRollingSync() throws Exception {
List<String> results = fetchEmployees(targetSchema, targetTable);
assertEquals(0, results.size());
doTestRollingSyncMerge(processingPartition);
//Target table is empty. All 4 records should be inserted.
results = fetchEmployees(targetSchema, targetTable);
assertEquals(4, results.size());
//update existing partition
String job1 = "20110119074340";
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='" + job1 + "') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (101,'Harry',"
+ "'ABC',"
+ "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
doTestRollingSyncMerge(job1);
//Target table should still have 4 records. Partition Canada/2016 had one record before merge. It should now have 1 updated record.
results = fetchEmployees(targetSchema, targetTable);
assertEquals(4, results.size());
//Record Jen is gone and replaced by Hary
assertFalse(results.stream().anyMatch(x -> x.contains("Jen")));
assertTrue(results.stream().anyMatch(x -> x.contains("Harry")));
//add new existing partition
String job2 = "20120119074340";
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='" + job2 + "') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (101,'Flora',"
+ "'ABC',"
+ "'94550','555-1212','harry@acme.org','2017-01-01','France');");
doTestRollingSyncMerge(job2);
//Target table should now have 5 records. Partition France/2017 has new data. No other partitions are disturbed.
results = fetchEmployees(targetSchema, targetTable);
assertEquals(5, results.size());
}
private void doTestRollingSyncMerge(String processingPartition) throws SQLException {
mergeSyncSupport.doRollingSync(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition);
}
private void doTestMergeNoProcessingDttm(String targetTable, PartitionSpec spec) {
List<String> results = fetchEmployees(targetSchema, targetTable);
assertEquals(1, results.size());
// Call merge
mergeSyncSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, false);
// We should have 5 records 4 from the sourceTable and 1 existing
results = fetchEmployees(targetSchema, targetTable);
assertEquals(5, results.size());
// Run merge with dedupe and should get the following two additional results. The result should not include any duplicates in the target table.
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119974340') ( `id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) "
+ "values (100, '1', 'Bruce','ABC','94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119974340') ( `id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) "
+ "values (101, '1','Harry','ABC','94550','555-1212','harry@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee partition(country='Canada',year=2016) (`id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`processing_dttm`) "
+ "values (100, '1', 'Bruce','ABC','94550','555-1212','bruce@acme.org','2016-01-01','20150119974340');");
mergeSyncSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, "20160119974340", true);
results = fetchEmployees(targetSchema, targetTable);
assertEquals(7, results.size());
// Verify no duplicates exist in the table
HashSet<String> existing = new HashSet<>();
for (String r : results) {
assertFalse(existing.contains(r));
existing.add(r);
}
}
private void doTestMergePK(String targetSchema, String targetTable, PartitionSpec spec) {
List<String> results = fetchEmployees(targetSchema, targetTable);
assertEquals(1, results.size());
ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
// Call merge
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
// We should have 4 records
results = fetchEmployees(targetSchema, targetTable);
assertEquals(4, results.size());
assertFalse("Should not have old valur", results.stream().anyMatch(s -> s.contains("OLD")));
// Run merge with dedupe and should get the following two additional results. The result should not include any duplicates in the target table.
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (100,'Bruce',"
+ "'OLD',"
+ "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (101,'Harry',"
+ "'OLD',"
+ "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
results = fetchEmployees(targetSchema, targetTable);
assertEquals(6, results.size());
// Verify no duplicates exist in the table
HashSet<String> existing = new HashSet<>();
for (String r : results) {
assertFalse(existing.contains(r));
existing.add(r);
}
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (100,'Bruce',"
+ "'ABC',"
+ "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (101,'Harry',"
+ "'ABC',"
+ "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (102,'Buddy',"
+ "'ABC',"
+ "'94550','555-1212','buddy@acme.org','2016-01-01','Canada');");
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, "20160119074540", columnSpecs);
results = fetchEmployees(targetSchema, targetTable);
assertEquals(7, results.size());
existing = new HashSet<>();
for (String r : results) {
assertFalse(existing.contains(r));
existing.add(r);
}
assertFalse("Should not have old valur", results.stream().anyMatch(s -> s.contains("OLD")));
}
/*
Test ability to strip records that match the ID but are in a different partition than the newer record
*/
private void doTestMergePKWithDifferentPartitions(String targetSchema, String targetTable, PartitionSpec spec) {
// Insert one record to start
hiveShell.execute(
"insert into emp_sr.employee partition(country='USA',year=2012) ( `id`, `timestamp`,`name`,`company`,`zip`,`phone`,`email`, `hired`) values (1,'1','Sally','OLD VALUE','94550',"
+ "'555-1212',"
+ "'sally@acme.org','2012-01-01');");
hiveShell.execute(
"insert into emp_sr.employee partition(country='USA',year=2012) ( `id`, `timestamp`,`name`,`company`,`zip`,`phone`,`email`, `hired`) values (1002,'1','Jimbo','VALUE','94550',"
+ "'555-1212',"
+ "'sally@acme.org','2012-01-01');");
hiveShell.execute(
"insert into emp_sr.employee partition(country='USA',year=2015) ( `id`, `timestamp`,`name`,`company`,`zip`,`phone`,`email`, `hired`) values (1000,'1','Jill','ORIG','94550',"
+ "'555-1212',"
+ "'sally@acme.org','2015-01-01');");
hiveShell.execute(
"insert into emp_sr.employee partition(country='USA',year=2013) ( `id`, `timestamp`,`name`,`company`,`zip`,`phone`,`email`, `hired`) values (2,'1','Bill','OLD VALUE','94550',"
+ "'555-1212',"
+ "'sally@acme.org','2013-01-01');");
hiveShell.execute(
"insert into emp_sr.employee partition(country='USA',year=2013) ( `id`, `timestamp`,`name`,`company`,`zip`,`phone`,`email`, `hired`) values (3,'1','Ray','OLD VALUE','94550',"
+ "'555-1212',"
+ "'sally@acme.org','2013-01-01');");
hiveShell.execute(
"insert into emp_sr.employee partition(country='USA',year=2013) ( `id`, `timestamp`,`name`,`company`,`zip`,`phone`,`email`, `hired`) values (1001,'1','Fred','VALUE','94550',"
+ "'555-1212',"
+ "'sally@acme.org','2013-01-01');");
List<String> results = fetchEmployees(targetSchema, targetTable);
assertEquals(6, results.size());
ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
// Call merge
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
// We should have 6 records
results = fetchEmployees(targetSchema, targetTable);
assertEquals(6, results.size());
assertFalse("Should not have old value", results.stream().anyMatch(s -> s.contains("OLD")));
// Run merge with dedupe and should get the following two additional results. The result should not include any duplicates in the target table.
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (100,'Bruce',"
+ "'OLD',"
+ "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (101,'Harry',"
+ "'OLD',"
+ "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
results = fetchEmployees(targetSchema, targetTable);
assertEquals(8, results.size());
// Verify no duplicates exist in the table
HashSet<String> existing = new HashSet<>();
for (String r : results) {
assertFalse(existing.contains(r));
existing.add(r);
}
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (100,'Bruce',"
+ "'ABC',"
+ "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (101,'Harry',"
+ "'ABC',"
+ "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (102,'Buddy',"
+ "'ABC',"
+ "'94550','555-1212','buddy@acme.org','2016-01-01','Canada');");
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, "20160119074540", columnSpecs);
results = fetchEmployees(targetSchema, targetTable);
assertEquals(9, results.size());
existing = new HashSet<>();
for (String r : results) {
assertFalse(existing.contains(r));
existing.add(r);
}
assertFalse("Should not have old value", results.stream().anyMatch(s -> s.contains("OLD")));
}
}