/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.conversion.hive.extractor;
import java.io.IOException;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.thrift.TException;
import com.google.common.base.Optional;
import com.google.common.collect.Lists;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.WorkUnitState;
import gobblin.data.management.conversion.hive.avro.AvroSchemaManager;
import gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset;
import gobblin.data.management.conversion.hive.entities.QueryBasedHiveConversionEntity;
import gobblin.data.management.conversion.hive.entities.SchemaAwareHivePartition;
import gobblin.data.management.conversion.hive.entities.SchemaAwareHiveTable;
import gobblin.data.management.conversion.hive.watermarker.PartitionLevelWatermarker;
import gobblin.source.extractor.DataRecordException;
import gobblin.util.AutoReturnableObject;
/**
 * <p>
 * Extracts {@link QueryBasedHiveConversionEntity}s. A {@link QueryBasedHiveConversionEntity} can represent a
 * hive table or a hive partition. Note that this extractor does not extract rows of a partition or a table. Entire
 * table or partition is considered as a record.
 * </p>
 * <p>
 * The database name, table name and work unit used here are inherited from {@link HiveBaseExtractor}. Using them,
 * the extractor asks the Hive metastore for the {@link Table} and, when the work unit carries both a partition name
 * and a partition schema url, the {@link Partition}. Each is paired with the Avro {@link Schema} loaded from the
 * schema url recorded in the work unit.
 * </p>
 */
@Slf4j
public class HiveConvertExtractor extends HiveBaseExtractor<Schema, QueryBasedHiveConversionEntity> {

  /**
   * Entities still waiting to be read. The constructor adds at most one entity; {@link #readRecord} removes it.
   * Stays empty for watermark work units.
   */
  private final List<QueryBasedHiveConversionEntity> conversionEntities = Lists.newArrayList();

  /**
   * Builds the single {@link QueryBasedHiveConversionEntity} this extractor will emit.
   *
   * @param state work unit state; when {@link PartitionLevelWatermarker#IS_WATERMARK_WORKUNIT_KEY} is set to true
   *        the work unit is ignored and no entity is created
   * @param fs file system used to load the Avro schemas from the schema urls recorded in the work unit
   * @throws IllegalStateException if the dataset is not a {@link ConvertibleHiveDataset}
   * @throws IOException declared by this constructor; see the schema loading and metastore client calls
   * @throws TException declared by this constructor; see the metastore calls
   * @throws HiveException declared by this constructor
   */
  public HiveConvertExtractor(WorkUnitState state, FileSystem fs) throws IOException, TException, HiveException {
    super(state);

    // Watermark work units carry no table/partition to convert; leave the entity list empty.
    if (state.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
      log.info("Ignoring Watermark workunit for {}", state.getProp(ConfigurationKeys.DATASET_URN_KEY));
      return;
    }

    if (!(this.hiveDataset instanceof ConvertibleHiveDataset)) {
      throw new IllegalStateException("HiveConvertExtractor is only compatible with ConvertibleHiveDataset");
    }
    ConvertibleHiveDataset convertibleHiveDataset = (ConvertibleHiveDataset) this.hiveDataset;

    // The client handle is closed by try-with-resources once the metastore lookups are done.
    try (AutoReturnableObject<IMetaStoreClient> client = this.pool.getClient()) {
      Table table = client.get().getTable(this.dbName, this.tableName);
      Schema tableSchema = AvroSchemaManager.getSchemaFromUrl(this.hiveWorkUnit.getTableSchemaUrl(), fs);
      SchemaAwareHiveTable schemaAwareHiveTable = new SchemaAwareHiveTable(table, tableSchema);

      // A partition is attached only when the work unit has BOTH a partition name and a partition schema url.
      // NOTE(review): a partition name without a schema url is silently treated as a table-level entity;
      // confirm that this is intended.
      SchemaAwareHivePartition schemaAwareHivePartition = null;
      if (this.hiveWorkUnit.getPartitionName().isPresent() && this.hiveWorkUnit.getPartitionSchemaUrl().isPresent()) {
        Partition partition =
            client.get().getPartition(this.dbName, this.tableName, this.hiveWorkUnit.getPartitionName().get());
        Schema partitionSchema =
            AvroSchemaManager.getSchemaFromUrl(this.hiveWorkUnit.getPartitionSchemaUrl().get(), fs);
        schemaAwareHivePartition = new SchemaAwareHivePartition(table, partition, partitionSchema);
      }

      this.conversionEntities.add(new QueryBasedHiveConversionEntity(convertibleHiveDataset, schemaAwareHiveTable,
          Optional.fromNullable(schemaAwareHivePartition)));
    }
  }

  /**
   * Returns the Avro schema of the hive table held by the pending entity. The table schema is returned even when
   * the entity also carries a partition.
   *
   * @return the table's Avro schema, or a schema of type {@link Type#NULL} when there is no pending entity (for
   *         example a watermark work unit, or after the entity has been read)
   */
  @Override
  public Schema getSchema() throws IOException {
    if (this.conversionEntities.isEmpty()) {
      return Schema.create(Type.NULL);
    }
    return this.conversionEntities.get(0).getHiveTable().getAvroSchema();
  }

  /**
   * There is only one record ({@link QueryBasedHiveConversionEntity}) to be read. This
   * {@link QueryBasedHiveConversionEntity} is removed from the {@link #conversionEntities} list after it is read,
   * so when gobblin runtime calls this method the second time, it returns <code>null</code>.
   *
   * @param reuse ignored by this implementation
   * @return the pending entity, or <code>null</code> when none is left
   */
  @Override
  public QueryBasedHiveConversionEntity readRecord(QueryBasedHiveConversionEntity reuse)
      throws DataRecordException, IOException {
    if (this.conversionEntities.isEmpty()) {
      return null;
    }
    return this.conversionEntities.remove(0);
  }
}