/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.source.extractor.extract;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.MDC;
import com.google.common.annotations.VisibleForTesting;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import lombok.extern.slf4j.Slf4j;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.WorkUnitState;
import gobblin.source.extractor.DataRecordException;
import gobblin.source.extractor.Extractor;
import gobblin.source.extractor.exception.ExtractPrepareException;
import gobblin.source.extractor.exception.HighWatermarkException;
import gobblin.source.extractor.exception.RecordCountException;
import gobblin.source.extractor.exception.SchemaException;
import gobblin.source.extractor.partition.Partition;
import gobblin.source.extractor.schema.ArrayDataType;
import gobblin.source.extractor.schema.DataType;
import gobblin.source.extractor.schema.EnumDataType;
import gobblin.source.extractor.schema.MapDataType;
import gobblin.source.extractor.utils.Utils;
import gobblin.source.extractor.watermark.Predicate;
import gobblin.source.extractor.watermark.WatermarkPredicate;
import gobblin.source.extractor.watermark.WatermarkType;
import gobblin.source.workunit.WorkUnit;
/**
 * A common base implementation of {@link Extractor} for query-based sources.
 *
 * @param <S> type of schema
 * @param <D> type of data record
 */
@Slf4j
public abstract class QueryBasedExtractor<S, D> implements Extractor<S, D>, ProtocolSpecificLayer<S, D> {
// Shared Gson instance; used by convertDataType to turn a data type object into a JsonObject.
private static final Gson GSON = new Gson();
// State of the work unit being extracted; every configuration property in this class is read from it.
protected final WorkUnitState workUnitState;
// Work unit obtained from workUnitState.getWorkunit() in the constructor.
protected final WorkUnit workUnit;
// Value of ConfigurationKeys.SOURCE_ENTITY, read once in the constructor.
private final String entity;
// Value of ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA, read once in the constructor.
private final String schema;
// Partition deserialized from the work unit; supplies the low/high watermarks and their inclusiveness.
private final Partition partition;
// While true, readRecord() keeps pulling; switched to false when there is nothing (more) to read.
private boolean fetchStatus = true;
// Schema returned by getSchema(); only assigned through setOutputSchema().
private S outputSchema;
// Set by build(): the count reported by getSourceCount(), or -1 when count calculation is skipped.
private long sourceRecordCount = 0;
// High watermark of the current pull; set by build() or setHighWatermark() and published in close().
private long highWatermark;
// Iterator over the current record set; null until the first pull (see isInitialPull()).
private Iterator<D> iterator;
// Not read or written in this class; presumably populated by subclasses -- TODO confirm.
protected final List<String> columnList = new ArrayList<>();
// Predicates passed to the record-set, count and iterator calls; filled by setRangePredicates().
@VisibleForTesting
protected final List<Predicate> predicateList = new ArrayList<>();
/**
 * @return the output schema previously stored through {@link #setOutputSchema}, or null if none was set
 */
private S getOutputSchema() {
return this.outputSchema;
}
/**
 * Stores the schema that {@link #getSchema()} will return.
 *
 * @param outputSchema schema of the extracted records
 */
protected void setOutputSchema(S outputSchema) {
this.outputSchema = outputSchema;
}
/**
 * @return the source record count computed by {@link #build()} (0 before build, -1 if counting was skipped)
 */
private long getSourceRecordCount() {
return this.sourceRecordCount;
}
/**
 * @return true while {@link #readRecord} should keep pulling records
 */
public boolean getFetchStatus() {
return this.fetchStatus;
}
/**
 * @param fetchStatus false to make subsequent {@link #readRecord} calls return null immediately
 */
public void setFetchStatus(boolean fetchStatus) {
this.fetchStatus = fetchStatus;
}
/**
 * Overrides the high watermark that {@link #close()} writes to the work unit state.
 *
 * @param highWatermark new high watermark value
 */
public void setHighWatermark(long highWatermark) {
this.highWatermark = highWatermark;
}
/**
 * @return the current fetch status, obtained through {@link #getFetchStatus()}
 */
private boolean isPullRequired() {
return getFetchStatus();
}
/**
 * @return true if no record iterator has been obtained yet (the iterator field is still null)
 */
protected boolean isInitialPull() {
return this.iterator == null;
}
public QueryBasedExtractor(WorkUnitState workUnitState) {
this.workUnitState = workUnitState;
this.workUnit = this.workUnitState.getWorkunit();
this.schema = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA);
this.entity = this.workUnitState.getProp(ConfigurationKeys.SOURCE_ENTITY);
partition = Partition.deserialize(workUnit);
MDC.put("tableName", getWorkUnitName());
}
/**
 * Builds the display name of this work unit in the form {@code [schema_entity_suffix]}.
 *
 * <p>{@code schema} and {@code entity} are the stripped values of the corresponding properties
 * (empty when absent). {@code suffix} is the part of the work unit state id that follows its
 * second-to-last underscore; it is left empty when the id does not contain two underscores at
 * positions greater than zero.
 *
 * @return the bracketed work unit name
 */
private String getWorkUnitName() {
  String schemaName =
      StringUtils.stripToEmpty(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA));
  String entityName = StringUtils.stripToEmpty(this.workUnitState.getProp(ConfigurationKeys.SOURCE_ENTITY));
  StringBuilder name = new StringBuilder().append("[").append(schemaName).append("_").append(entityName).append("_");

  String id = this.workUnitState.getId();
  int lastSeparator = id.lastIndexOf('_');
  if (lastSeparator > 0) {
    // Search only the part of the id that precedes the last underscore.
    int previousSeparator = id.lastIndexOf('_', lastSeparator - 1);
    if (previousSeparator > 0) {
      name.append(id.substring(previousSeparator + 1));
    }
  }
  return name.append("]").toString();
}
/**
 * Returns the next record of the extract, pulling a new record set from the source when needed.
 *
 * <p>On the first call the upper-bound predicates are removed if
 * {@code shouldRemoveDataPullUpperBounds()} allows it, and the first record set is fetched. When
 * the current record set is exhausted the next one is fetched eagerly; a {@code null} record set
 * (on the first or any later pull) marks the end of the data and turns the fetch status off.
 *
 * @param reuse unused
 * @return the next record, or {@code null} when there are no more records to read
 * @throws DataRecordException if fetching or iterating the records fails
 * @throws IOException declared by the interface; failures inside the pull are wrapped in
 *         {@link DataRecordException}
 */
@Override
public D readRecord(@Deprecated D reuse) throws DataRecordException, IOException {
  if (!this.isPullRequired()) {
    log.info("No more records to read");
    return null;
  }
  D nextElement = null;
  try {
    if (isInitialPull()) {
      log.info("Initial pull");
      if (shouldRemoveDataPullUpperBounds()) {
        this.removeDataPullUpperBounds();
      }
      this.iterator = this.getIterator();
    }
    if (this.iterator == null) {
      // A null record set means "no data", exactly as for the follow-up pulls below. Without this
      // guard the hasNext() call would fail with a NullPointerException on every invocation.
      log.info("No record set returned by the source");
      this.setFetchStatus(false);
      return null;
    }
    if (this.iterator.hasNext()) {
      nextElement = this.iterator.next();
      if (!this.iterator.hasNext()) {
        // Current record set is exhausted: fetch the next one now so the end of data is detected early.
        log.debug("Getting next pull");
        this.iterator = this.getIterator();
        if (this.iterator == null) {
          this.setFetchStatus(false);
        }
      }
    }
  } catch (Exception e) {
    throw new DataRecordException("Failed to get records using rest api; error - " + e.getMessage(), e);
  }
  return nextElement;
}
/**
 * Decides whether the upper bounds of the data pull may be dropped for this work unit, so that
 * data generated in the source between query creation and query execution is picked up too.
 *
 * <p>Upper bounds are removed only when all of the following hold:
 * <ul>
 *   <li>{@code SOURCE_QUERYBASED_ALLOW_REMOVE_UPPER_BOUNDS} is not disabled (default: enabled);</li>
 *   <li>this is the last partition;</li>
 *   <li>the user did not specify a high watermark and no actual high watermark is recorded in
 *       the work unit state.</li>
 * </ul>
 *
 * <p>Note: removing the upper bounds may produce some duplicate data between runs.
 *
 * @return true if the upper bounds should be removed
 */
private boolean shouldRemoveDataPullUpperBounds() {
  boolean removalAllowed =
      this.workUnitState.getPropAsBoolean(ConfigurationKeys.SOURCE_QUERYBASED_ALLOW_REMOVE_UPPER_BOUNDS, true);
  if (!removalAllowed || !partition.isLastPartition()) {
    return false;
  }
  // A high watermark given by the user or recorded earlier pins the upper bound.
  boolean highWatermarkPinned = partition.getHasUserSpecifiedHighWatermark()
      || this.workUnitState.getProp(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY) != null;
  return !highWatermarkPinned;
}
/**
 * Drops every predicate of type {@code HWM} from {@code predicateList}, logging each removed
 * condition.
 */
private void removeDataPullUpperBounds() {
  log.info("Removing data pull upper bound for last work unit");
  for (Iterator<Predicate> cursor = predicateList.iterator(); cursor.hasNext();) {
    Predicate candidate = cursor.next();
    if (candidate.getType() != Predicate.PredicateType.HWM) {
      continue;
    }
    log.info("Remove predicate: " + candidate.condition);
    cursor.remove();
  }
}
/**
 * Fetches the next record set from the source.
 *
 * <p>Uses the source specific API when {@code SOURCE_QUERYBASED_IS_SPECIFIC_API_ACTIVE} is
 * {@code true}, and the protocol specific API otherwise.
 *
 * @return iterator over the fetched records, as returned by the selected API
 * @throws DataRecordException if the selected API fails to fetch the records
 * @throws IOException if the selected API reports an I/O failure
 */
private Iterator<D> getIterator() throws DataRecordException, IOException {
  boolean useSourceApi = Boolean.parseBoolean(
      this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_SPECIFIC_API_ACTIVE));
  return useSourceApi
      ? this.getRecordSetFromSourceApi(this.schema, this.entity, this.workUnit, this.predicateList)
      : this.getRecordSet(this.schema, this.entity, this.workUnit, this.predicateList);
}
/**
 * Returns the record count determined by {@link #build()}.
 *
 * @return the source record count; 0 before {@code build()} ran, -1 if count calculation was skipped
 */
@Override
public long getExpectedRecordCount() {
  return this.sourceRecordCount;
}
/**
 * Returns the schema (metadata) describing the extracted records.
 *
 * @return the schema stored through {@code setOutputSchema}, or null if none was stored
 */
@Override
public S getSchema() {
  return this.outputSchema;
}
/**
 * Returns the high watermark of the current pull.
 *
 * @return the high watermark set by {@code build()} or {@code setHighWatermark(long)}
 */
@Override
public long getHighWatermark() {
  return highWatermark;
}
/**
 * Publishes the current high watermark to the work unit state and closes the source connection.
 *
 * <p>A failure while closing the connection is logged and not propagated.
 */
@Override
public void close() {
  final long finalHighWatermark = this.highWatermark;
  log.info("Updating the current state high water mark with " + finalHighWatermark);
  this.workUnitState.setActualHighWatermark(new LongWatermark(finalHighWatermark));

  try {
    this.closeConnection();
  } catch (Exception e) {
    // Best effort: the watermark is already recorded, so a close failure must not fail the task.
    log.error("Failed to close the extractor", e);
  }
}
/**
 * Tells whether this extract is configured as a full dump.
 *
 * @return true if the {@code EXTRACT_IS_FULL_KEY} property is set to "true" (case-insensitive)
 */
public boolean isFullDump() {
  String fullDumpFlag = this.workUnitState.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY);
  return Boolean.parseBoolean(fullDumpFlag);
}
/**
 * Prepares the extractor: sets the connection timeout, extracts the metadata, determines the
 * high watermark, installs the range predicates and computes the source record count.
 *
 * <p>When a watermark column ({@code EXTRACT_DELTA_FIELDS_KEY}) is configured, the high watermark
 * is taken from {@code getLatestWatermark} for the last partition and from the partition itself
 * otherwise. When the record count turns out to be 0 the fetch status is switched off so that
 * {@code readRecord} returns immediately.
 *
 * @return this extractor
 * @throws ExtractPrepareException if any step inside the try block fails; the original exception
 *         is kept as the cause
 */
public Extractor<S, D> build() throws ExtractPrepareException {
String watermarkColumn = this.workUnitState.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
long lwm = partition.getLowWatermark();
long hwm = partition.getHighWatermark();
log.info("Low water mark: " + lwm + "; and High water mark: " + hwm);
// A blank watermark type is passed on as null.
// NOTE(review): valueOf runs outside the try block, so an unknown type name surfaces as an
// IllegalArgumentException rather than an ExtractPrepareException.
WatermarkType watermarkType;
if (StringUtils.isBlank(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE))) {
watermarkType = null;
} else {
watermarkType = WatermarkType
.valueOf(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE).toUpperCase());
}
log.info("Source Entity is " + this.entity);
try {
this.setTimeOut(
this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_CONN_TIMEOUT, ConfigurationKeys.DEFAULT_CONN_TIMEOUT));
this.extractMetadata(this.schema, this.entity, this.workUnit);
// Watermark handling is skipped entirely when no watermark column is configured.
if (StringUtils.isNotBlank(watermarkColumn)) {
if (partition.isLastPartition()) {
// Get a more accurate high watermark from the source
long adjustedHighWatermark = this.getLatestWatermark(watermarkColumn, watermarkType, lwm, hwm);
log.info("High water mark from source: " + adjustedHighWatermark);
// If the source reports a finer high watermark, then consider the same as runtime high watermark.
// Else, consider the low watermark as high water mark(with no delta).i.e, don't move the pointer
if (adjustedHighWatermark == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
adjustedHighWatermark = getLowWatermarkWithNoDelta(lwm);
}
this.highWatermark = adjustedHighWatermark;
} else {
// Not the last partition: keep the partition's own high watermark.
this.highWatermark = hwm;
}
log.info("High water mark for the current run: " + highWatermark);
// The predicates use the high watermark chosen above, not the partition's original value.
this.setRangePredicates(watermarkColumn, watermarkType, lwm, highWatermark);
}
// if it is set to true, skip count calculation and set source count to -1
if (!Boolean.valueOf(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SKIP_COUNT_CALC))) {
this.sourceRecordCount = this.getSourceCount(this.schema, this.entity, this.workUnit, this.predicateList);
} else {
log.info("Skip count calculation");
this.sourceRecordCount = -1;
}
// Only an exact count of 0 disables reading; the skipped-count marker (-1) does not.
if (this.sourceRecordCount == 0) {
log.info("Record count is 0; Setting fetch status to false to skip readRecord()");
this.setFetchStatus(false);
}
} catch (SchemaException e) {
throw new ExtractPrepareException("Failed to get schema for this object; error - " + e.getMessage(), e);
} catch (HighWatermarkException e) {
throw new ExtractPrepareException("Failed to get high watermark; error - " + e.getMessage(), e);
} catch (RecordCountException e) {
throw new ExtractPrepareException("Failed to get record count; error - " + e.getMessage(), e);
} catch (Exception e) {
// Catch-all for everything else raised during preparation.
throw new ExtractPrepareException("Failed to prepare the extract build; error - " + e.getMessage(), e);
}
return this;
}
/**
 * Moves the given low watermark back by the delta that {@link WatermarkPredicate} reports for
 * the configured watermark type ({@code TIMESTAMP} when none is configured).
 *
 * <p>For {@code SIMPLE} watermarks the delta is subtracted from the number. For every other type
 * the value is read as a {@code yyyyMMddHHmmss} date, shifted back by the delta in seconds, and
 * formatted back into the same pattern.
 *
 * @param lwm low watermark value
 * @return the shifted low watermark, or {@code DEFAULT_WATERMARK_VALUE} if {@code lwm} is that value
 */
private long getLowWatermarkWithNoDelta(long lwm) {
  if (lwm == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
    return ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
  }

  String configuredType =
      this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "TIMESTAMP");
  WatermarkType wmType = WatermarkType.valueOf(configuredType.toUpperCase());
  int deltaNum = new WatermarkPredicate(wmType).getDeltaNumForNextWatermark();

  if (wmType == WatermarkType.SIMPLE) {
    return lwm - deltaNum;
  }

  Date lowWatermarkDate = Utils.toDate(lwm, "yyyyMMddHHmmss");
  String shifted = Utils.dateToString(Utils.addSecondsToDate(lowWatermarkDate, -deltaNum), "yyyyMMddHHmmss");
  return Long.parseLong(shifted);
}
/**
 * Determines the latest watermark available in the source for the given range.
 *
 * <p>Unless {@code SOURCE_QUERYBASED_SKIP_HIGH_WATERMARK_CALC} is {@code true}, the source is
 * asked for the maximum watermark restricted by the low and high watermark predicates
 * (predicates that resolve to {@code null} are left out). When the calculation is skipped the
 * given high watermark is returned unchanged.
 *
 * @param watermarkColumn name of the watermark column
 * @param watermarkType watermark type
 * @param lwmValue low watermark value
 * @param hwmValue high watermark value
 * @return the maximum watermark reported by the source, or {@code hwmValue} when skipped
 * @throws HighWatermarkException if the source fails to compute the maximum watermark
 * @throws IOException if the source reports an I/O failure
 */
private long getLatestWatermark(String watermarkColumn, WatermarkType watermarkType, long lwmValue, long hwmValue)
    throws HighWatermarkException, IOException {
  boolean skipCalculation = Boolean.parseBoolean(
      this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SKIP_HIGH_WATERMARK_CALC));
  if (skipCalculation) {
    return hwmValue;
  }

  log.info("Getting high watermark");
  WatermarkPredicate watermark = new WatermarkPredicate(watermarkColumn, watermarkType);
  String lowerOperator = partition.isLowWatermarkInclusive() ? ">=" : ">";
  // The last partition always includes its upper bound.
  String upperOperator = (partition.isLastPartition() || partition.isHighWatermarkInclusive()) ? "<=" : "<";
  Predicate lowerBound = watermark.getPredicate(this, lwmValue, lowerOperator, Predicate.PredicateType.LWM);
  Predicate upperBound = watermark.getPredicate(this, hwmValue, upperOperator, Predicate.PredicateType.HWM);

  List<Predicate> bounds = new ArrayList<>();
  if (lowerBound != null) {
    bounds.add(lowerBound);
  }
  if (upperBound != null) {
    bounds.add(upperBound);
  }
  return this.getMaxWatermark(this.schema, this.entity, watermarkColumn, bounds,
      watermark.getWatermarkSourceFormat(this));
}
/**
 * Adds the low and high watermark predicates for the watermark column to the predicate list
 * and, for hourly extracts with a configured hour column, the matching pair for that column.
 *
 * @param watermarkColumn name of the column used as watermark
 * @param watermarkType watermark type
 * @param lwmValue low watermark value
 * @param hwmValue high watermark value
 */
private void setRangePredicates(String watermarkColumn, WatermarkType watermarkType, long lwmValue, long hwmValue) {
  log.debug("Getting range predicates");
  final String lowerOperator = partition.isLowWatermarkInclusive() ? ">=" : ">";
  // The last partition always includes its upper bound.
  final boolean upperInclusive = partition.isLastPartition() || partition.isHighWatermarkInclusive();
  final String upperOperator = upperInclusive ? "<=" : "<";

  WatermarkPredicate watermark = new WatermarkPredicate(watermarkColumn, watermarkType);
  this.addPredicates(watermark.getPredicate(this, lwmValue, lowerOperator, Predicate.PredicateType.LWM));
  this.addPredicates(watermark.getPredicate(this, hwmValue, upperOperator, Predicate.PredicateType.HWM));

  boolean hourlyExtract =
      Boolean.parseBoolean(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_HOURLY_EXTRACT));
  if (!hourlyExtract) {
    return;
  }
  String hourColumn = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_HOUR_COLUMN);
  if (StringUtils.isBlank(hourColumn)) {
    return;
  }
  WatermarkPredicate hourlyWatermark = new WatermarkPredicate(hourColumn, WatermarkType.HOUR);
  this.addPredicates(hourlyWatermark.getPredicate(this, lwmValue, lowerOperator, Predicate.PredicateType.LWM));
  this.addPredicates(hourlyWatermark.getPredicate(this, hwmValue, upperOperator, Predicate.PredicateType.HWM));
}
/**
 * Appends the given predicate to the predicate list; a {@code null} predicate is ignored.
 *
 * @param predicate predicate to add, may be null
 */
private void addPredicates(Predicate predicate) {
  if (predicate == null) {
    return;
  }
  this.predicateList.add(predicate);
}
/**
 * Checks whether a column is one of the configured watermark columns.
 *
 * <p>The comparison ignores case independently of the default locale, and ignores whitespace
 * around the column name and around each comma-separated entry, so a configuration such as
 * {@code "col1, col2"} matches {@code col2}.
 *
 * @param watermarkColumn comma-separated list of watermark column names, may be blank
 * @param columnName column name to search for, may be null
 * @return true if {@code columnName} is part of the watermark columns; false otherwise, including
 *         when either argument is null or {@code watermarkColumn} is blank
 */
protected boolean isWatermarkColumn(String watermarkColumn, String columnName) {
  if (columnName == null || StringUtils.isBlank(watermarkColumn)) {
    return false;
  }
  String wanted = columnName.trim();
  for (String candidate : watermarkColumn.split(",")) {
    if (candidate.trim().equalsIgnoreCase(wanted)) {
      return true;
    }
  }
  return false;
}
/**
 * Tells whether more than one watermark column is configured.
 *
 * @param watermarkColumn comma-separated list of watermark column names, may be blank
 * @return true if splitting the list on commas yields more than one entry; false otherwise
 */
protected boolean hasMultipleWatermarkColumns(String watermarkColumn) {
  // Letter case has no influence on the number of comma-separated entries.
  return !StringUtils.isBlank(watermarkColumn) && watermarkColumn.split(",").length > 1;
}
/**
 * Finds the 1-based position of a column within the configured primary key columns.
 *
 * <p>The comparison ignores case independently of the default locale, and ignores whitespace
 * around the column name and around each comma-separated entry, so a configuration such as
 * {@code "id, region"} reports position 2 for {@code region}.
 *
 * @param primarykeyColumn comma-separated list of primary key column names, may be blank
 * @param columnName column name to search for, may be null
 * @return the 1-based position of the first matching entry, or 0 if the column is not a primary
 *         key column, either argument is null, or {@code primarykeyColumn} is blank
 */
protected int getPrimarykeyIndex(String primarykeyColumn, String columnName) {
  if (columnName == null || StringUtils.isBlank(primarykeyColumn)) {
    return 0;
  }
  String wanted = columnName.trim();
  String[] keyColumns = primarykeyColumn.split(",");
  for (int position = 0; position < keyColumns.length; position++) {
    if (keyColumns[position].trim().equalsIgnoreCase(wanted)) {
      return position + 1;
    }
  }
  return 0;
}
/**
 * Checks whether a column belongs to the given metadata columns.
 *
 * <p>When the check is disabled through
 * {@code SOURCE_QUERYBASED_IS_METADATA_COLUMN_CHECK_ENABLED}, every column is accepted. Otherwise
 * the column name is trimmed and lower-cased before the lookup, so {@code columnList} has to
 * contain lower-case names for a match to be found.
 *
 * @param columnName column name to search for, may be null
 * @param columnList list of metadata column names, may be null
 * @return true if the check is disabled or the column is found in {@code columnList}; false
 *         otherwise, including when the check is enabled and either argument is null
 */
protected boolean isMetadataColumn(String columnName, List<String> columnList) {
  boolean isColumnCheckEnabled = Boolean.parseBoolean(
      this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_METADATA_COLUMN_CHECK_ENABLED,
          ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_IS_METADATA_COLUMN_CHECK_ENABLED));
  if (!isColumnCheckEnabled) {
    return true;
  }
  // A missing name or list cannot match; report "not found" instead of failing with an NPE.
  if (columnName == null || columnList == null) {
    return false;
  }
  return columnList.contains(columnName.trim().toLowerCase());
}
/**
 * Converts a source data type into its JSON representation.
 *
 * <p>The source type is looked up in {@code getDataTypeMap()}; unmapped types fall back to
 * {@code "string"}. Map and array types carry {@code elementType}, enum types carry the column
 * name and {@code enumSymbols}; every other type carries only its name.
 *
 * @param columnName name of the column (used for enum types)
 * @param type source data type to look up
 * @param elementType data type of the elements (used for map and array types)
 * @param enumSymbols enum symbols (used for enum types)
 * @return the converted data type as a JSON object
 */
protected JsonObject convertDataType(String columnName, String type, String elementType, List<String> enumSymbols) {
  String mappedType = this.getDataTypeMap().get(type);
  final String dataType = (mappedType != null) ? mappedType : "string";

  final DataType convertedDataType;
  switch (dataType) {
    case "map":
      convertedDataType = new MapDataType(dataType, elementType);
      break;
    case "array":
      convertedDataType = new ArrayDataType(dataType, elementType);
      break;
    case "enum":
      convertedDataType = new EnumDataType(dataType, columnName, enumSymbols);
      break;
    default:
      convertedDataType = new DataType(dataType);
      break;
  }
  String json = GSON.toJson(convertedDataType);
  return GSON.fromJson(json, JsonObject.class).getAsJsonObject();
}
/**
 * Tells whether the given predicate list holds at least one predicate.
 *
 * @param predicateList list of predicates, may be null
 * @return true if the list is non-null and non-empty; false otherwise
 */
protected boolean isPredicateExists(List<Predicate> predicateList) {
  return predicateList != null && !predicateList.isEmpty();
}
}