package gobblin.source.extractor.extract; import gobblin.configuration.ConfigurationKeys; import gobblin.configuration.State; import gobblin.configuration.WorkUnitState; import gobblin.source.extractor.DataRecordException; import gobblin.source.extractor.exception.HighWatermarkException; import gobblin.source.extractor.exception.RecordCountException; import gobblin.source.extractor.exception.SchemaException; import gobblin.source.extractor.partition.Partition; import gobblin.source.extractor.partition.Partitioner; import gobblin.source.extractor.watermark.Predicate; import gobblin.source.extractor.watermark.WatermarkPredicate; import gobblin.source.extractor.watermark.WatermarkType; import gobblin.source.workunit.WorkUnit; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.hadoop.yarn.webapp.hamlet.HamletSpec; import org.testng.Assert; import org.testng.annotations.Test; /** * Unit tests for {@link QueryBasedExtractor} */ public class QueryBasedExtractorTest { @Test public void testDataPullUpperBoundsRemovedInLastWorkUnit() { int totalCount = 5; ArrayList<DataRecord> records = this.generateRecords(totalCount); WorkUnit workUnit = WorkUnit.createEmpty(); workUnit.setProp(Partition.IS_LAST_PARTIITON, true); workUnit.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT"); WorkUnitState workUnitState = new WorkUnitState(workUnit, new State()); workUnitState.setId("testDataPullUpperBoundsRemovedInLastWorkUnit"); TestQueryBasedExtractor testExtractor = new TestQueryBasedExtractor(workUnitState, records); testExtractor.setRangePredicates(1, 3); this.verify(testExtractor, totalCount); } @Test public void testDataPullUpperBoundsNotRemovedInLastWorkUnit() { int totalCount = 5; ArrayList<DataRecord> records = this.generateRecords(totalCount); WorkUnit workUnit = WorkUnit.createEmpty(); WorkUnitState workUnitState = new WorkUnitState(workUnit, new State()); workUnitState.setId("testDataPullUpperBoundsNotRemovedInLastWorkUnit"); // It's not a last work unit TestQueryBasedExtractor testExtractor = new TestQueryBasedExtractor(workUnitState, records); testExtractor.setRangePredicates(1, 3); this.verify(testExtractor, 3); // It's a last work unit but user specifies high watermark workUnit.setProp(Partition.IS_LAST_PARTIITON, true); workUnit.setProp(Partition.HAS_USER_SPECIFIED_HIGH_WATERMARK, true); testExtractor.reset(); testExtractor.setRangePredicates(1, 3); this.verify(testExtractor, 3); // It's a last work unit but it has WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY on record workUnit.removeProp(Partition.HAS_USER_SPECIFIED_HIGH_WATERMARK); workUnit.setProp(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY, "3"); testExtractor.reset(); testExtractor.setRangePredicates(1, 3); this.verify(testExtractor, 3); } private ArrayList<DataRecord> generateRecords(int count) { ArrayList<DataRecord> records = new ArrayList<>(); while (count > 0) { records.add(new DataRecord(count, count)); count--; } return records; } private void verify(TestQueryBasedExtractor testExtractor, int expectedCount) { int actualCount = 0; try { while (testExtractor.readRecord(null) != null) { actualCount++; } } catch (Exception e) { Assert.fail("There should not incur any exception"); } Assert.assertEquals(actualCount, expectedCount, "Expect " + expectedCount + " records!"); } private class TestQueryBasedExtractor extends QueryBasedExtractor<ArrayList, DataRecord> { private final ArrayList<DataRecord> records; private long previousActualHwmValue; TestQueryBasedExtractor(WorkUnitState workUnitState, ArrayList<DataRecord> records) { super(workUnitState); this.records = records; previousActualHwmValue = -1; } void setRangePredicates(long lwmValue, long hwmValue) { WatermarkPredicate watermark = new WatermarkPredicate("timeStamp", WatermarkType.SIMPLE); predicateList.add(watermark.getPredicate(this, lwmValue, ">=", Predicate.PredicateType.LWM)); predicateList.add(watermark.getPredicate(this, hwmValue, "<=", Predicate.PredicateType.HWM)); } void reset() { previousActualHwmValue = -1; predicateList.clear(); setFetchStatus(true); } @Override public void extractMetadata(String schema, String entity, WorkUnit workUnit) throws SchemaException, IOException { } @Override public long getMaxWatermark(String schema, String entity, String watermarkColumn, List<Predicate> snapshotPredicateList, String watermarkSourceFormat) throws HighWatermarkException { return 0; } @Override public long getSourceCount(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws RecordCountException { return records.size(); } @Override public Iterator<DataRecord> getRecordSet(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws DataRecordException, IOException { if (records == null || predicateList == null) { // No new data to pull return null; } long lwmValue = -1; long hwmValue = Long.MAX_VALUE; long actualHwmValue = -1; // Adjust watermarks from predicate list for (Predicate predicate: predicateList) { if (predicate.getType() == Predicate.PredicateType.HWM) { hwmValue = predicate.value; } if (predicate.getType() == Predicate.PredicateType.LWM) { lwmValue = predicate.value; } } ArrayList<DataRecord> filteredRecords = new ArrayList<>(); for (DataRecord record : records) { if (record.timeStamp <= previousActualHwmValue) { // The record has been pulled previously continue; } if (record.timeStamp >= lwmValue && record.timeStamp <= hwmValue) { // Make a copy filteredRecords.add(new DataRecord(record.value, record.timeStamp)); // Mark actual high watermark if (record.timeStamp > actualHwmValue) { actualHwmValue = record.timeStamp; } } } if (filteredRecords.isEmpty()) { return null; } previousActualHwmValue = actualHwmValue; return filteredRecords.iterator(); } @Override public String getWatermarkSourceFormat(WatermarkType watermarkType) { return null; } @Override public String getHourPredicateCondition(String column, long value, String valueFormat, String operator) { return null; } @Override public String getDatePredicateCondition(String column, long value, String valueFormat, String operator) { return null; } @Override public String getTimestampPredicateCondition(String column, long value, String valueFormat, String operator) { return null; } @Override public void setTimeOut(int timeOut) { } @Override public Map<String, String> getDataTypeMap() { return null; } @Override public void closeConnection() throws Exception { } @Override public Iterator<DataRecord> getRecordSetFromSourceApi(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws IOException { try { return getRecordSet(schema, entity, workUnit, predicateList); } catch (DataRecordException e) { e.printStackTrace(); return null; } } } private class DataRecord { int value; long timeStamp; DataRecord(int value, long timeStamp) { this.value = value; this.timeStamp = timeStamp; } } }