package com.thinkbiganalytics.ingest;

/*-
 * #%L
 * thinkbig-nifi-core-processors
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.thinkbiganalytics.hive.util.HiveUtils;
import com.thinkbiganalytics.util.ColumnSpec;
import com.thinkbiganalytics.util.PartitionBatch;
import com.thinkbiganalytics.util.PartitionSpec;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Arrays;
import java.util.List;
import java.util.Vector;
import java.util.stream.Collectors;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

/**
 * Merges or syncs data from a source table into a target table, deduplicating records and applying the partition strategy of the target table.
 * Sync completely replaces the contents of the target table with the contents of the source. Merge appends the source data to the target table,
 * adhering to partitions if they are defined. If dedupe is specified, duplicate records are stripped.
 */
public class TableMergeSyncSupport implements Serializable {

    public static Logger logger = LoggerFactory.getLogger(TableMergeSyncSupport.class);
    protected Connection conn;

    public TableMergeSyncSupport(Connection conn) {
        Validate.notNull(conn);
        this.conn = conn;
    }

    /**
     * Sets several Hive parameters to enable dynamic partitions.
     */
    public void enableDynamicPartitions() {
        doExecuteSQL("set hive.exec.dynamic.partition=true");
        doExecuteSQL("set hive.exec.dynamic.partition.mode=nonstrict");
        // Required for ORC and Parquet
        doExecuteSQL("set hive.optimize.index.filter=false");
    }

    /**
     * Sets the list of configurations given as name=value string pairs.
     */
    public void setHiveConf(String[] configurations) {
        for (String conf : configurations) {
            doExecuteSQL("set " + conf);
        }
    }
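    /*
     * Usage sketch (hypothetical JDBC URL, schema and table names; illustrative only, not part of this class):
     *
     *   Connection conn = DriverManager.getConnection("jdbc:hive2://localhost:10000/default");
     *   TableMergeSyncSupport support = new TableMergeSyncSupport(conn);
     *   support.enableDynamicPartitions();
     *   support.setHiveConf(new String[] {"hive.exec.max.dynamic.partitions=2000"});
     *   // partitionSpec is constructed elsewhere to match the target table's partition strategy
     *   support.doMerge("source_db", "employee_valid", "target_db", "employee", partitionSpec, "20170620100000", true);
     */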
    /**
     * Performs a sync, replacing all data in the target table. A temporary table is created with the new data, the old table is dropped, and the temporary
     * table is renamed to become the new table. This causes a very brief lapse for consumers between the drop and the rename.
     *
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param partitionSpec      the partition specification
     * @param feedPartitionValue the source processing partition value
     */
    public void doSync(@Nonnull final String sourceSchema, @Nonnull final String sourceTable,
                       @Nonnull final String targetSchema, @Nonnull final String targetTable,
                       @Nonnull final PartitionSpec partitionSpec, @Nonnull final String feedPartitionValue) throws SQLException {
        // Validate input parameters
        Validate.notEmpty(sourceSchema);
        Validate.notEmpty(sourceTable);
        Validate.notEmpty(targetSchema);
        Validate.notEmpty(targetTable);
        Validate.notNull(partitionSpec);
        Validate.notNull(feedPartitionValue);

        // Extract the existing HDFS location of the data
        String refTableLocation = extractTableLocation(targetSchema, targetTable);

        // 1. Create a temporary "sync" table for storing our latest snapshot
        String syncTableLocation = deriveSyncTableLocation(targetTable, refTableLocation);
        String syncTable = createSyncTable(targetSchema, targetTable, syncTableLocation);

        // 2. Populate the temporary "sync" table
        final String[] selectFields = getSelectFields(sourceSchema, sourceTable, targetSchema, syncTable, partitionSpec);
        final String syncSQL = partitionSpec.isNonPartitioned()
                               ? generateSyncNonPartitionQuery(selectFields, sourceSchema, sourceTable, targetSchema, syncTable, feedPartitionValue)
                               : generateSyncDynamicPartitionQuery(selectFields, partitionSpec, sourceSchema, sourceTable, targetSchema, syncTable, feedPartitionValue);
        doExecuteSQL(syncSQL);

        // 3. Drop the target table. Since it is made managed, dropping it also removes the old data
        dropTable(targetSchema, targetTable);

        // 4. Rename the sync table
        renameTable(targetSchema, syncTable, targetTable);
    }
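    /*
     * Example of the sync flow above (hypothetical names): for a target table "target_db"."employee" stored at
     * hdfs://host/warehouse/target_db/employee, step 1 creates a table such as "employee_1497911974386" at
     * hdfs://host/warehouse/target_db/employee_1497911974386, step 2 loads it from the source feed partition,
     * and steps 3-4 drop the old table and rename the new one into place.
     */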
    /**
     * Performs a rolling sync, overwriting only the target partitions that are present in the source feed partition.
     *
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param partitionSpec      the partition specification
     * @param feedPartitionValue the source processing partition value
     */
    public void doRollingSync(@Nonnull final String sourceSchema, @Nonnull final String sourceTable,
                              @Nonnull final String targetSchema, @Nonnull final String targetTable,
                              @Nonnull final PartitionSpec partitionSpec, @Nonnull final String feedPartitionValue) throws SQLException {
        // Validate input parameters
        Validate.notEmpty(sourceSchema);
        Validate.notEmpty(sourceTable);
        Validate.notEmpty(targetSchema);
        Validate.notEmpty(targetTable);
        Validate.notNull(partitionSpec);
        Validate.notNull(feedPartitionValue);

        List<PartitionBatch> batches = createPartitionBatches(partitionSpec, sourceSchema, sourceTable, feedPartitionValue);
        final String[] selectFields = getSelectFields(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec);
        final String syncSQL = generateRollingSyncQuery(selectFields, partitionSpec, sourceSchema, sourceTable, targetSchema, targetTable, batches, feedPartitionValue);
        doExecuteSQL(syncSQL);
    }

    private String generateRollingSyncQuery(String[] selectFields, PartitionSpec partitionSpec, String sourceSchema, String sourceTable,
                                            String targetSchema, String targetTable, List<PartitionBatch> batches, String feedPartitionValue) {
        final String selectSQL = StringUtils.join(selectFields, ",");
        String partitionWhere = targetPartitionsWhereClause(batches, true);
        // Something went horribly wrong if there are no partitions
        Validate.notEmpty(partitionWhere);
        return "insert overwrite table " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " " + partitionSpec.toDynamicPartitionSpec()
               + " select " + selectSQL + "," + partitionSpec.toDynamicSelectSQLSpec()
               + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable)
               + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue)
               + " and (" + partitionWhere + ")";
    }
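    /*
     * Illustrative rolling-sync statement (hypothetical tables and a single "country" partition column):
     *
     *   insert overwrite table `target_db`.`employee` partition (country)
     *   select `id`,`name`, country from `source_db`.`employee_valid`
     *   where processing_dttm = '20170620100000' and ((country = 'USA') or (country = 'Canada'))
     *
     * The exact column list and partition expressions come from getSelectFields and the PartitionSpec.
     */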
    /**
     * Performs a merge, inserting data from the source table into the target table.
     *
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param partitionSpec      the partition specification
     * @param feedPartitionValue the source processing partition value
     * @param shouldDedupe       whether to perform dedupe during merge
     * @return the partition batches of the source table, or {@code null} if the target is non-partitioned or dedupe is disabled
     */
    public List<PartitionBatch> doMerge(@Nonnull final String sourceSchema, @Nonnull final String sourceTable,
                                        @Nonnull final String targetSchema, @Nonnull final String targetTable,
                                        @Nonnull final PartitionSpec partitionSpec, @Nonnull final String feedPartitionValue,
                                        final boolean shouldDedupe) {
        // Validate input parameters
        Validate.notEmpty(sourceSchema);
        Validate.notEmpty(sourceTable);
        Validate.notEmpty(targetSchema);
        Validate.notEmpty(targetTable);
        Validate.notNull(partitionSpec);
        Validate.notNull(feedPartitionValue);

        List<PartitionBatch> batches = null;
        final String[] selectFields = getSelectFields(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec);
        final String sql;
        if (partitionSpec.isNonPartitioned()) {
            if (shouldDedupe) {
                if (hasProcessingDttm(selectFields)) {
                    sql = generateMergeNonPartitionQueryWithDedupe(selectFields, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue);
                } else {
                    sql = generateMergeNonPartitionQueryWithDedupeNoProcessingDttm(selectFields, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue);
                }
            } else {
                sql = generateMergeNonPartitionQuery(selectFields, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue);
            }
        } else {
            if (shouldDedupe) {
                batches = createPartitionBatches(partitionSpec, sourceSchema, sourceTable, feedPartitionValue);
                // Newer tables with processing_dttm in the target will always be unique, so they require additional handling
                if (hasProcessingDttm(selectFields)) {
                    sql = generateMergeWithDedupePartitionQuery(selectFields, partitionSpec, batches, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue);
                } else {
                    sql = generateMergeWithDedupePartitionQueryNoProcessingDttm(selectFields, partitionSpec, batches, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue);
                }
            } else {
                sql = generateMergeWithPartitionQuery(selectFields, partitionSpec, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue);
            }
        }
        doExecuteSQL(sql);
        return batches;
    }

    private boolean hasProcessingDttm(String[] selectFields) {
        return Arrays.asList(selectFields).stream().anyMatch(v -> ("`processing_dttm`".equals(v)));
    }
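    /*
     * Query selection in doMerge, for reference:
     *
     *   non-partitioned + dedupe + processing_dttm column  -> generateMergeNonPartitionQueryWithDedupe
     *   non-partitioned + dedupe, no processing_dttm       -> generateMergeNonPartitionQueryWithDedupeNoProcessingDttm
     *   non-partitioned, no dedupe                         -> generateMergeNonPartitionQuery
     *   partitioned + dedupe + processing_dttm column      -> generateMergeWithDedupePartitionQuery
     *   partitioned + dedupe, no processing_dttm           -> generateMergeWithDedupePartitionQueryNoProcessingDttm
     *   partitioned, no dedupe                             -> generateMergeWithPartitionQuery
     */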
    /**
     * Updates any rows matching the same primary key, otherwise inserts the value into the appropriate partition.
     *
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param partitionSpec      the partition specification
     * @param feedPartitionValue the source processing partition value
     * @param columnSpecs        the columns to join on
     */
    public void doPKMerge(@Nonnull final String sourceSchema, @Nonnull final String sourceTable,
                          @Nonnull final String targetSchema, @Nonnull final String targetTable,
                          @Nonnull final PartitionSpec partitionSpec, @Nonnull final String feedPartitionValue,
                          @Nonnull final ColumnSpec[] columnSpecs) {
        // Validate input parameters
        Validate.notEmpty(sourceSchema);
        Validate.notEmpty(sourceTable);
        Validate.notEmpty(targetSchema);
        Validate.notEmpty(targetTable);
        Validate.notNull(partitionSpec);
        Validate.notNull(feedPartitionValue);
        Validate.notEmpty(columnSpecs);

        final String[] selectFields = getSelectFields(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec);
        final String sql = partitionSpec.isNonPartitioned()
                           ? generatePKMergeNonPartitionQuery(selectFields, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue, columnSpecs)
                           : generatePKMergePartitionQuery(selectFields, partitionSpec, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue, columnSpecs);
        doExecuteSQL(sql);
    }

    /**
     * Creates a new table like the reference table at the new location.
     *
     * @param schema            the schema or database name of the reference table
     * @param table             the name of the reference table
     * @param syncTableLocation the HDFS location for the new table
     * @return the new table name
     */
    private String createSyncTable(@Nonnull final String schema, @Nonnull final String table, @Nonnull final String syncTableLocation) throws SQLException {
        final String syncTable = table + "_" + System.currentTimeMillis();
        final String createSQL = "create table " + HiveUtils.quoteIdentifier(schema, syncTable)
                                 + " like " + HiveUtils.quoteIdentifier(schema, table)
                                 + " location " + HiveUtils.quoteString(syncTableLocation);
        doExecuteSQL(createSQL);
        return syncTable;
    }

    /**
     * Drops the table, removing the data.
     *
     * @param schema the schema or database name containing the table
     * @param table  the name of the table
     */
    public void dropTable(@Nonnull final String schema, @Nonnull final String table) {
        // Make the table managed so that dropping it also removes the old data
        String makeManagedSQL = "alter table " + HiveUtils.quoteIdentifier(schema, table) + " SET TBLPROPERTIES ('EXTERNAL'='FALSE')";
        doExecuteSQL(makeManagedSQL);
        String sql = "DROP TABLE " + HiveUtils.quoteIdentifier(schema, table);
        doExecuteSQL(sql);
    }
    /**
     * Renames the specified table.
     *
     * @param schema  the schema or database name containing the tables
     * @param oldName the name of the table to be renamed
     * @param newName the new name for the table
     */
    public void renameTable(@Nonnull final String schema, @Nonnull final String oldName, @Nonnull final String newName) {
        final String sql = "alter table " + HiveUtils.quoteIdentifier(schema, oldName) + " RENAME TO " + HiveUtils.quoteIdentifier(schema, newName);
        doExecuteSQL(sql);
    }

    /**
     * Creates a new HDFS location for the target data.
     *
     * @param table       the name of the table
     * @param oldLocation the old location
     * @return the new HDFS location
     */
    private String deriveSyncTableLocation(String table, String oldLocation) {
        String[] parts = oldLocation.split("/");
        parts[parts.length - 1] = table + "_" + System.currentTimeMillis();
        return StringUtils.join(parts, "/");
    }

    /**
     * Extracts the HDFS location of the table data.
     *
     * @param schema the schema or database name
     * @param table  the table name
     * @return the HDFS location of the table data
     */
    private String extractTableLocation(@Nonnull final String schema, @Nonnull final String table) throws SQLException {
        doExecuteSQL("use " + HiveUtils.quoteIdentifier(schema));
        try (final Statement st = conn.createStatement()) {
            ResultSet rs = doSelectSQL(st, "show table extended like " + HiveUtils.quoteIdentifier(table));
            while (rs.next()) {
                String value = rs.getString(1);
                if (value.startsWith("location:")) {
                    return value.substring(9);
                }
            }
        }
        throw new RuntimeException("Unable to identify HDFS location property of table [" + table + "]");
    }

    /**
     * Generates a sync query for inserting from the source table into the target table with no partitions.
     *
     * @param selectFields       the list of fields in the select clause of the source table
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param feedPartitionValue the source processing partition value
     * @return the sql string
     */
    protected String generateSyncNonPartitionQuery(@Nonnull final String[] selectFields, @Nonnull final String sourceSchema, @Nonnull final String sourceTable,
                                                   @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue) {
        final String selectSQL = StringUtils.join(selectFields, ",");
        return "insert overwrite table " + HiveUtils.quoteIdentifier(targetSchema, targetTable)
               + " select " + selectSQL
               + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable)
               + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue);
    }
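    /*
     * Illustrative output of generateSyncNonPartitionQuery (hypothetical tables and columns):
     *
     *   insert overwrite table `target_db`.`employee`
     *   select `id`,`name`,`processing_dttm` from `source_db`.`employee_valid`
     *   where processing_dttm = '20170620100000'
     */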
    /**
     * Generates a merge query for inserting data from the source table into the target table, appending to any partitions.
     *
     * @param selectFields       the list of fields in the select clause of the source table
     * @param spec               the partition specification
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param feedPartitionValue the source processing partition value
     * @return the sql string
     */
    protected String generateMergeWithPartitionQuery(@Nonnull final String[] selectFields, @Nonnull final PartitionSpec spec, @Nonnull final String sourceSchema,
                                                     @Nonnull final String sourceTable, @Nonnull final String targetSchema, @Nonnull final String targetTable,
                                                     @Nonnull final String feedPartitionValue) {
        final String selectSQL = StringUtils.join(selectFields, ",");
        return "insert into table " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " " + spec.toDynamicPartitionSpec()
               + " select " + selectSQL + "," + spec.toDynamicSelectSQLSpec()
               + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable) + " "
               + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue);
    }

    /**
     * Produces a where clause that limits to the impacted partitions of the target table.
     *
     * @param batches          a list of partition batches
     * @param useSourceColumns whether to use the source columns or the target columns
     * @return a where clause sql string, or {@code null} if there are no batches
     */
    private String targetPartitionsWhereClause(List<PartitionBatch> batches, boolean useSourceColumns) {
        List<String> targetPartitionsItems = new Vector<>();
        for (PartitionBatch batch : batches) {
            String column = useSourceColumns ? batch.getPartitionSpec().toSourceSQLWhere(batch.getPartitionValues())
                                             : batch.getPartitionSpec().toTargetSQLWhere(batch.getPartitionValues());
            targetPartitionsItems.add("(" + column + ")");
        }
        return (targetPartitionsItems.size() == 0 ? null : StringUtils.join(targetPartitionsItems.toArray(new String[0]), " or "));
    }
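    /*
     * Example: for two partition batches over a single "country" partition column, targetPartitionsWhereClause
     * produces a clause along the lines of
     *
     *   (country = 'USA') or (country = 'Canada')
     *
     * The exact column expressions come from PartitionSpec.toSourceSQLWhere / toTargetSQLWhere.
     */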
    /**
     * Generates a merge query for inserting data from the source table into the target table, appending to any partitions. The batch identifier
     * processing_dttm is used to determine whether a record is new, so only new, distinct records are inserted.
     *
     * @param selectFields       the list of fields in the select clause of the source table
     * @param spec               the partition specification
     * @param batches            the partitions of the source table
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param feedPartitionValue the source processing partition value
     * @return the sql string
     */
    protected String generateMergeWithDedupePartitionQuery(@Nonnull final String[] selectFields, @Nonnull final PartitionSpec spec, @Nonnull final List<PartitionBatch> batches,
                                                           @Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String targetSchema,
                                                           @Nonnull final String targetTable, @Nonnull final String feedPartitionValue) {
        // Strip processing_dttm from the distinct fields since it will always be different
        String[] distinctSelectFields = Arrays.asList(selectFields).stream().filter(v -> !("`processing_dttm`".equals(v))).collect(Collectors.toList()).toArray(new String[0]);
        final String selectAggregateSQL = StringUtils.join(distinctSelectFields, ",") + ", min(processing_dttm) processing_dttm, " + spec.toPartitionSelectSQL();
        final String groupBySQL = StringUtils.join(distinctSelectFields, ",") + "," + spec.toPartitionSelectSQL();
        final String selectSQL = StringUtils.join(selectFields, ",");
        final String targetPartitionWhereClause = targetPartitionsWhereClause(batches, false);

        final StringBuilder sb = new StringBuilder();
        sb.append("insert into table ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" ")
            .append(spec.toDynamicPartitionSpec())
            .append("select ").append(selectAggregateSQL).append(" from (")
            .append(" select ").append(selectSQL).append(",").append(spec.toDynamicSelectSQLSpec())
            .append(" from ").append(HiveUtils.quoteIdentifier(sourceSchema, sourceTable)).append(" ")
            .append(" where ")
            .append(" processing_dttm = ").append(HiveUtils.quoteString(feedPartitionValue))
            .append(" union all ")
            .append(" select ").append(selectSQL).append(",").append(spec.toPartitionSelectSQL())
            .append(" from ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" ");

        if (targetPartitionWhereClause != null) {
            sb.append(" where (").append(targetPartitionWhereClause).append(")");
        }
        sb.append(") t group by " + groupBySQL).append(" having min(processing_dttm) = ").append(HiveUtils.quoteString(feedPartitionValue));

        return sb.toString();
    }
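    /*
     * Illustrative shape of the dedupe merge statement above (hypothetical tables, columns `id`,`name`, partition column country):
     *
     *   insert into table `target_db`.`employee` partition (country)
     *   select `id`,`name`, min(processing_dttm) processing_dttm, country from (
     *     select `id`,`name`,`processing_dttm`, country from `source_db`.`employee_valid` where processing_dttm = '20170620100000'
     *     union all
     *     select `id`,`name`,`processing_dttm`, country from `target_db`.`employee` where ((country = 'USA'))
     *   ) t group by `id`,`name`,country having min(processing_dttm) = '20170620100000'
     *
     * Rows already present in the target keep their original (earlier) processing_dttm, so the HAVING clause filters them out.
     */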
    /**
     * Generates a merge query that overwrites the impacted partitions of the target table with the distinct union of the source records and the existing target records.
     *
     * @param selectFields       the list of fields in the select clause of the source table
     * @param spec               the partition specification
     * @param batches            the partitions of the source table
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param feedPartitionValue the source processing partition value
     * @return the sql string
     */
    protected String generateMergeWithDedupePartitionQueryNoProcessingDttm(@Nonnull final String[] selectFields, @Nonnull final PartitionSpec spec,
                                                                           @Nonnull final List<PartitionBatch> batches, @Nonnull final String sourceSchema,
                                                                           @Nonnull final String sourceTable, @Nonnull final String targetSchema,
                                                                           @Nonnull final String targetTable, @Nonnull final String feedPartitionValue) {
        final String selectSQL = StringUtils.join(selectFields, ",");
        final String targetPartitionWhereClause = targetPartitionsWhereClause(batches, false);

        final StringBuilder sb = new StringBuilder();
        sb.append("insert overwrite table ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" ")
            .append(spec.toDynamicPartitionSpec())
            .append("select DISTINCT ").append(selectSQL).append(",").append(spec.toPartitionSelectSQL()).append(" from (")
            .append(" select ").append(selectSQL).append(",").append(spec.toDynamicSelectSQLSpec())
            .append(" from ").append(HiveUtils.quoteIdentifier(sourceSchema, sourceTable)).append(" ")
            .append(" where ")
            .append(" processing_dttm = ").append(HiveUtils.quoteString(feedPartitionValue))
            .append(" union all ")
            .append(" select ").append(selectSQL).append(",").append(spec.toPartitionSelectSQL())
            .append(" from ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" ");

        if (targetPartitionWhereClause != null) {
            sb.append(" where (").append(targetPartitionWhereClause).append(")");
        }
        sb.append(") t");

        return sb.toString();
    }

    /**
     * Generates a dynamic partition sync query for overwriting the target table from the source table, adhering to partitions.
     *
     * @param selectFields       the list of fields in the select clause of the source table
     * @param spec               the partition specification
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param feedPartitionValue the source processing partition value
     * @return the sql string
     */
    protected String generateSyncDynamicPartitionQuery(@Nonnull final String[] selectFields, @Nonnull final PartitionSpec spec,
                                                       @Nonnull final String sourceSchema, @Nonnull final String sourceTable,
                                                       @Nonnull final String targetSchema, @Nonnull final String targetTable,
                                                       @Nonnull final String feedPartitionValue) {
        final String selectSQL = StringUtils.join(selectFields, ",");
        return "insert overwrite table " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " " + spec.toDynamicPartitionSpec()
               + " select " + selectSQL + "," + spec.toDynamicSelectSQLSpec()
               + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable)
               + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue);
    }
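    /*
     * Illustrative output of generateSyncDynamicPartitionQuery (hypothetical tables, partition column country):
     *
     *   insert overwrite table `target_db`.`employee` partition (country)
     *   select `id`,`name`, country from `source_db`.`employee_valid`
     *   where processing_dttm = '20170620100000'
     */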
    /**
     * Generates a query for merging from the source table into a non-partitioned target table, deduplicating with the processing_dttm batch identifier.
     *
     * @param selectFields       the list of fields in the select clause of the source table
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param feedPartitionValue the source processing partition value
     * @return the sql string
     */
    protected String generateMergeNonPartitionQueryWithDedupe(@Nonnull final String[] selectFields, @Nonnull final String sourceSchema, @Nonnull final String sourceTable,
                                                              @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue) {
        String[] distinctSelectFields = Arrays.asList(selectFields).stream().filter(v -> !("`processing_dttm`".equals(v))).collect(Collectors.toList()).toArray(new String[0]);
        final String selectAggregateSQL = StringUtils.join(distinctSelectFields, ",") + ", min(processing_dttm) processing_dttm";
        final String selectSQL = StringUtils.join(selectFields, ",");
        final String groupBySQL = StringUtils.join(distinctSelectFields, ",");

        return "insert into table " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " "
               + "select " + selectAggregateSQL + " from ("
               + " select " + selectSQL
               + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable)
               + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue) + " "
               + " union all "
               + " select " + selectSQL
               + " from " + HiveUtils.quoteIdentifier(targetSchema, targetTable)
               + ") x group by " + groupBySQL
               + " having min(processing_dttm) = " + HiveUtils.quoteString(feedPartitionValue);
    }

    /**
     * Generates a query for merging from the source table into a non-partitioned target table, deduplicating when the tables have no processing_dttm column.
     *
     * @param selectFields       the list of fields in the select clause of the source table
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param feedPartitionValue the source processing partition value
     * @return the sql string
     */
    protected String generateMergeNonPartitionQueryWithDedupeNoProcessingDttm(@Nonnull final String[] selectFields, @Nonnull final String sourceSchema,
                                                                              @Nonnull final String sourceTable, @Nonnull final String targetSchema,
                                                                              @Nonnull final String targetTable, @Nonnull final String feedPartitionValue) {
        final String selectSQL = StringUtils.join(selectFields, ",");

        return "insert overwrite table " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " "
               + "select " + selectSQL + " from ("
               + " select " + selectSQL
               + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable)
               + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue) + " "
               + " union all "
               + " select " + selectSQL
               + " from " + HiveUtils.quoteIdentifier(targetSchema, targetTable)
               + ") x group by " + selectSQL;
    }
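    /*
     * Illustrative output of generateMergeNonPartitionQueryWithDedupe (hypothetical tables, columns `id`,`name`):
     *
     *   insert into table `target_db`.`employee`
     *   select `id`,`name`, min(processing_dttm) processing_dttm from (
     *     select `id`,`name`,`processing_dttm` from `source_db`.`employee_valid` where processing_dttm = '20170620100000'
     *     union all
     *     select `id`,`name`,`processing_dttm` from `target_db`.`employee`
     *   ) x group by `id`,`name` having min(processing_dttm) = '20170620100000'
     */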
    /**
     * Generates a query for merging from the source table into a non-partitioned target table without deduplication.
     *
     * @param selectFields       the list of fields in the select clause of the source table
     * @param sourceSchema       the schema or database name of the source table
     * @param sourceTable        the source table name
     * @param targetSchema       the schema or database name of the target table
     * @param targetTable        the target table name
     * @param feedPartitionValue the source processing partition value
     * @return the sql string
     */
    protected String generateMergeNonPartitionQuery(@Nonnull final String[] selectFields, @Nonnull final String sourceSchema, @Nonnull final String sourceTable,
                                                    @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue) {
        final String selectSQL = StringUtils.join(selectFields, ",");
        return "insert into " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " "
               + " select " + selectSQL
               + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable)
               + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue);
    }

    /**
     * Generates a query to merge two tables without partitions on a primary key.
     *
     * @param selectFields       the list of fields in the select clause of the source table
     * @param sourceSchema       the name of the source table schema or database
     * @param sourceTable        the source table
     * @param targetSchema       the name of the target table schema or database
     * @param targetTable        the target table
     * @param feedPartitionValue the partition of the source table to use
     * @param columnSpecs        the column specifications
     * @return the sql
     */
    protected String generatePKMergeNonPartitionQuery(@Nonnull final String[] selectFields, @Nonnull final String sourceSchema, @Nonnull final String sourceTable,
                                                      @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue,
                                                      @Nonnull final ColumnSpec[] columnSpecs) {
        // Include alias
        String selectSQL = StringUtils.join(selectFields, ",");
        String[] selectFieldsWithAlias = selectFieldsForAlias(selectFields, "a");
        String selectSQLWithAlias = StringUtils.join(selectFieldsWithAlias, ",");

        String joinOnClause = ColumnSpec.toPrimaryKeyJoinSQL(columnSpecs, "a", "b");
        String[] primaryKeys = ColumnSpec.toPrimaryKeys(columnSpecs);
        String anyPK = primaryKeys[0];

        String sbSourceQuery = "select " + selectSQL + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable)
                               + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue);

        // The first select finds all records in the source (valid) table
        // The second select finds all records in the target that should be preserved
        return "insert overwrite table " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " "
               + "select " + selectSQL + " from ("
               + " select " + selectSQL
               + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable) + " a"
               + " where "
               + " a.processing_dttm = " + HiveUtils.quoteString(feedPartitionValue)
               + " union "
               + " select " + selectSQLWithAlias
               + " from " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " a left outer join (" + sbSourceQuery + ") b "
               + " on (" + joinOnClause + ")"
               + " where "
               + " (b." + anyPK + " is null)) t";
    }
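    /*
     * Illustrative output of generatePKMergeNonPartitionQuery (hypothetical tables, primary key `id`):
     *
     *   insert overwrite table `target_db`.`employee`
     *   select `id`,`name` from (
     *     select `id`,`name` from `source_db`.`employee_valid` a where a.processing_dttm = '20170620100000'
     *     union
     *     select a.`id`,a.`name` from `target_db`.`employee` a
     *       left outer join (select `id`,`name` from `source_db`.`employee_valid` where processing_dttm = '20170620100000') b
     *       on (a.`id` = b.`id`)
     *     where (b.`id` is null)
     *   ) t
     *
     * The join condition and the null check come from ColumnSpec.toPrimaryKeyJoinSQL and toPrimaryKeys; the exact quoting may differ.
     */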
    /**
     * Generates a query to merge two tables containing partitions on a primary key.
     *
     * @param selectFields       the list of fields in the select clause of the source table
     * @param partitionSpec      the partition specification
     * @param sourceSchema       the name of the source table schema or database
     * @param sourceTable        the source table
     * @param targetSchema       the name of the target table schema or database
     * @param targetTable        the target table
     * @param feedPartitionValue the partition of the source table to use
     * @param columnSpecs        the column specifications
     * @return the sql
     */
    protected String generatePKMergePartitionQuery(@Nonnull final String[] selectFields, @Nonnull final PartitionSpec partitionSpec, @Nonnull final String sourceSchema,
                                                   @Nonnull final String sourceTable, @Nonnull final String targetSchema, @Nonnull final String targetTable,
                                                   @Nonnull final String feedPartitionValue, @Nonnull final ColumnSpec[] columnSpecs) {
        // Include alias
        String selectSQL = StringUtils.join(selectFields, ",");
        String[] selectFieldsWithAlias = selectFieldsForAlias(selectFields, "a");
        String selectSQLWithAlias = StringUtils.join(selectFieldsWithAlias, ",");

        String joinOnClause = ColumnSpec.toPrimaryKeyJoinSQL(columnSpecs, "a", "b");
        String[] primaryKeys = ColumnSpec.toPrimaryKeys(columnSpecs);
        PartitionSpec partitionSpecWithAlias = partitionSpec.newForAlias("a");
        String anyPK = primaryKeys[0];

        List<PartitionBatch> batches = createPartitionBatchesforPKMerge(partitionSpec, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue, joinOnClause);
        String targetPartitionWhereClause = targetPartitionsWhereClause(PartitionBatch.toPartitionBatchesForAlias(batches, "a"), false);

        // TODO: If the records matching the primary key between the source and target are in a different partition
        // AND the matching records are the only remaining records of the partition, then the following sql will fail to overwrite the
        // remaining record. We need to detect this and then delete the partition? This is a complex scenario..

        String sbSourceQuery = "select " + selectSQL + "," + partitionSpec.toDynamicSelectSQLSpec() + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable)
                               + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue);

        // The first select finds all records in the source (valid) table
        // The second select finds all records in the target that should be preserved for impacted partitions
        StringBuilder sb = new StringBuilder();
        sb.append("insert overwrite table ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" ")
            .append(partitionSpec.toDynamicPartitionSpec())
            .append("select ").append(selectSQL).append(",").append(partitionSpec.toPartitionSelectSQL()).append(" from (")
            .append(" select ").append(selectSQLWithAlias).append(",").append(partitionSpecWithAlias.toDynamicSelectSQLSpec())
            .append(" from ").append(HiveUtils.quoteIdentifier(sourceSchema, sourceTable)).append(" a")
            .append(" where ")
            .append(" a.processing_dttm = ").append(HiveUtils.quoteString(feedPartitionValue))
            .append(" union all ")
            .append(" select ").append(selectSQLWithAlias).append(",").append(partitionSpecWithAlias.toDynamicSelectSQLSpec())
            .append(" from ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" a left outer join (").append(sbSourceQuery).append(") b ")
            .append(" on (").append(joinOnClause).append(")")
            .append(" where ")
            .append(" (b.").append(anyPK).append(" is null)");

        if (targetPartitionWhereClause != null) {
            sb.append(" and (").append(targetPartitionWhereClause).append(")");
        }
        sb.append(") t");

        return sb.toString();
    }
    /**
     * Finds all target partitions that contain keys matching records in the source feed partition.
     *
     * @param spec               the partition spec
     * @param sourceSchema       the name of the source table schema or database
     * @param sourceTable        the source table
     * @param targetSchema       the name of the target table schema or database
     * @param targetTable        the target table
     * @param feedPartitionValue the partition of the source table to use
     * @param joinOnClause       the JOIN clause for the source and target tables
     * @return the matching partitions
     */
    protected List<PartitionBatch> createPartitionBatchesforPKMerge(@Nonnull final PartitionSpec spec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable,
                                                                    @Nonnull final String targetSchema, @Nonnull final String targetTable,
                                                                    @Nonnull final String feedPartitionValue, @Nonnull final String joinOnClause) {
        List<PartitionBatch> v;
        PartitionSpec aliasSpecA = spec.newForAlias("a");

        // Find all partitions that contain matching keys
        String sql = "select " + aliasSpecA.toPartitionSelectSQL() + ", count(0)"
                     + " from " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " a join " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable) + " b"
                     + " on " + joinOnClause
                     + " where b.processing_dttm = '" + feedPartitionValue + "'"
                     + " group by " + aliasSpecA.toPartitionSelectSQL();

        try (final Statement st = conn.createStatement()) {
            logger.info("Selecting target partitions query [" + sql + "]");
            ResultSet rs = doSelectSQL(st, sql);
            v = toPartitionBatches(spec, rs);
        } catch (SQLException e) {
            logger.error("Failed to select partition batches SQL {} with error {}", sql, e);
            throw new RuntimeException("Failed to select partition batches", e);
        }
        return v;
    }

    protected void doExecuteSQL(String sql) {
        try (final Statement st = conn.createStatement()) {
            logger.info("Executing doMerge batch sql {}", sql);
            st.execute(sql);
        } catch (SQLException e) {
            logger.error("Failed to execute {} with error {}", sql, e);
            throw new RuntimeException("Failed to execute query", e);
        }
    }

    protected ResultSet doSelectSQL(Statement st, String sql) throws SQLException {
        logger.info("Executing sql select {}", sql);
        return st.executeQuery(sql);
    }

    /*
     * Generates batches of partitions from the result set of a distinct-partitions query. The last column of each row is expected to hold the record count.
     */
    protected List<PartitionBatch> toPartitionBatches(PartitionSpec spec, ResultSet rs) throws SQLException {
        Vector<PartitionBatch> v = new Vector<>();
        int count = rs.getMetaData().getColumnCount();
        while (rs.next()) {
            String[] values = new String[count];
            for (int i = 1; i <= count; i++) {
                Object oVal = rs.getObject(i);
                String sVal = (oVal == null ? "" : oVal.toString());
                values[i - 1] = StringUtils.defaultString(sVal, "");
            }
            Long numRecords = rs.getLong(count);
            v.add(new PartitionBatch(numRecords, spec, values));
        }
        logger.info("Number of partitions [" + v.size() + "]");
        return v;
    }
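    /*
     * Example (hypothetical): a distinct-partition query over partition columns (country, year) might return rows
     * such as ('USA', 2017, 450) and ('Canada', 2017, 75); toPartitionBatches turns each row into a PartitionBatch
     * whose record count is taken from the final count column.
     */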
    /**
     * Generates batches of partitions in the source table.
     *
     * @param spec          the partition specification
     * @param sourceSchema  the schema or database name of the source table
     * @param sourceTable   the source table name
     * @param feedPartition the source processing partition value
     * @return the distinct partitions of the source table for the feed partition
     */
    protected List<PartitionBatch> createPartitionBatches(@Nonnull final PartitionSpec spec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable,
                                                          @Nonnull final String feedPartition) {
        List<PartitionBatch> v;
        String sql = "";
        try (final Statement st = conn.createStatement()) {
            sql = spec.toDistinctSelectSQL(sourceSchema, sourceTable, feedPartition);
            logger.info("Executing batch query [" + sql + "]");
            ResultSet rs = doSelectSQL(st, sql);
            v = toPartitionBatches(spec, rs);
        } catch (SQLException e) {
            logger.error("Failed to select partition batches SQL {} with error {}", sql, e);
            throw new RuntimeException("Failed to select partition batches", e);
        }
        return v;
    }

    /**
     * Returns the list of columns that are common to both the source and target tables.
     *
     * <p>The column names are quoted and escaped for use in a SQL query.</p>
     *
     * @param sourceSchema  the name of the source table schema or database
     * @param sourceTable   the name of the source table
     * @param targetSchema  the name of the target table schema or database
     * @param targetTable   the name of the target table
     * @param partitionSpec the partition specifications, or {@code null} if none
     * @return the columns for a SELECT statement
     */
    protected String[] getSelectFields(@Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String targetSchema,
                                       @Nonnull final String targetTable, @Nullable final PartitionSpec partitionSpec) {
        List<String> srcFields = resolveTableSchema(sourceSchema, sourceTable);
        List<String> destFields = resolveTableSchema(targetSchema, targetTable);

        // Find common fields
        destFields.retainAll(srcFields);

        // Eliminate any partition columns
        if (partitionSpec != null) {
            destFields.removeAll(partitionSpec.getKeyNames());
        }
        String[] fields = destFields.toArray(new String[0]);
        for (int i = 0; i < fields.length; i++) {
            fields[i] = HiveUtils.quoteIdentifier(fields[i]);
        }
        return fields;
    }

    private String[] selectFieldsForAlias(String[] selectFields, String alias) {
        return Arrays.stream(selectFields).map(s -> alias + "." + s).toArray(String[]::new);
    }

    /**
     * Retrieves the schema of the specified table.
     *
     * @param schema the database name
     * @param table  the table name
     * @return the list of columns
     */
    protected List<String> resolveTableSchema(@Nonnull final String schema, @Nonnull final String table) {
        List<String> columnSet = new Vector<>();
        try (final Statement st = conn.createStatement()) {
            // Use the default database to resolve ambiguity between schema.table and table.column
            // https://issues.apache.org/jira/browse/HIVE-12184
            st.execute("use default");
            String ddl = "desc " + HiveUtils.quoteIdentifier(schema, table);
            logger.info("Resolving table schema [{}]", ddl);
            ResultSet rs = doSelectSQL(st, ddl);
            while (rs.next()) {
                // The first blank row marks the start of the partition info
                if (StringUtils.isEmpty(rs.getString(1))) {
                    break;
                }
                columnSet.add(rs.getString(1));
            }
        } catch (SQLException e) {
            throw new RuntimeException("Failed to inspect schema", e);
        }
        return columnSet;
    }
}