/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gobblin.compliance.purger;

import java.io.IOException;
import java.util.Map;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;

import gobblin.compliance.ComplianceConfigurationKeys;
import gobblin.compliance.DatasetDescriptor;
import gobblin.compliance.HivePartitionDataset;
import gobblin.compliance.HivePartitionFinder;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
import gobblin.source.extractor.Extractor;
import gobblin.util.reflection.GobblinConstructorUtils;


/**
 * This extractor doesn't extract anything, but is used to instantiate and pass a
 * {@link PurgeableHivePartitionDataset} to the converter.
 *
 * @author adsharma
 */
public class HivePurgerExtractor
    implements Extractor<PurgeableHivePartitionDatasetSchema, PurgeableHivePartitionDataset> {
  private PurgeableHivePartitionDataset record;
  private State state;
  private boolean read;

  public HivePurgerExtractor(WorkUnitState state) throws IOException {
    this.read = false;
    this.state = new State(state);
  }

  @Override
  public PurgeableHivePartitionDatasetSchema getSchema() {
    return new PurgeableHivePartitionDatasetSchema();
  }

  /**
   * There is only one record, a {@link PurgeableHivePartitionDataset}, to be read per partition. This method
   * returns null on every subsequent call to indicate the end of reading.
   */
  @Override
  public PurgeableHivePartitionDataset readRecord(PurgeableHivePartitionDataset record) throws IOException {
    if (this.read) {
      return null;
    }
    this.read = true;
    if (this.record == null) {
      this.record = createPurgeableHivePartitionDataset(this.state);
    }
    return this.record;
  }

  @Override
  public long getExpectedRecordCount() {
    return 1;
  }

  /**
   * Watermark is not managed by this extractor.
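   * A constant value of 0 is returned, since each work unit emits exactly one record and there is no
   * incremental state to track.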
   */
  @Override
  public long getHighWatermark() {
    return 0;
  }

  @Override
  public void close() throws IOException {
  }

  @VisibleForTesting
  public void setRecord(PurgeableHivePartitionDataset record) {
    this.record = record;
  }

  /**
   * Builds a {@link PurgeableHivePartitionDataset} for the partition named in the state, after validating
   * that all required compliance properties are present.
   */
  private PurgeableHivePartitionDataset createPurgeableHivePartitionDataset(State state) throws IOException {
    HivePartitionDataset hivePartitionDataset =
        HivePartitionFinder.findDataset(state.getProp(ComplianceConfigurationKeys.PARTITION_NAME), state);

    Preconditions.checkArgument(state.contains(ComplianceConfigurationKeys.COMPLIANCEID_KEY),
        "Missing property " + ComplianceConfigurationKeys.COMPLIANCEID_KEY);
    Preconditions.checkArgument(state.contains(ComplianceConfigurationKeys.COMPLIANCE_ID_TABLE_KEY),
        "Missing property " + ComplianceConfigurationKeys.COMPLIANCE_ID_TABLE_KEY);
    Preconditions.checkArgument(state.contains(ComplianceConfigurationKeys.TIMESTAMP),
        "Missing property " + ComplianceConfigurationKeys.TIMESTAMP);

    boolean simulate = state.getPropAsBoolean(ComplianceConfigurationKeys.COMPLIANCE_JOB_SIMULATE,
        ComplianceConfigurationKeys.DEFAULT_COMPLIANCE_JOB_SIMULATE);
    String complianceIdentifier = state.getProp(ComplianceConfigurationKeys.COMPLIANCEID_KEY);
    String complianceIdTable = state.getProp(ComplianceConfigurationKeys.COMPLIANCE_ID_TABLE_KEY);
    String timeStamp = state.getProp(ComplianceConfigurationKeys.TIMESTAMP);
    boolean specifyPartitionFormat = state.getPropAsBoolean(ComplianceConfigurationKeys.SPECIFY_PARTITION_FORMAT,
        ComplianceConfigurationKeys.DEFAULT_SPECIFY_PARTITION_FORMAT);

    State datasetState = new State();
    datasetState.addAll(state.getProperties());

    PurgeableHivePartitionDataset dataset = new PurgeableHivePartitionDataset(hivePartitionDataset);
    dataset.setComplianceId(complianceIdentifier);
    dataset.setComplianceIdTable(complianceIdTable);
    dataset.setComplianceField(getComplianceField(state, hivePartitionDataset));
    dataset.setTimeStamp(timeStamp);
    dataset.setState(datasetState);
    dataset.setSimulate(simulate);
    dataset.setSpecifyPartitionFormat(specifyPartitionFormat);
    return dataset;
  }

  /**
   * Resolves the compliance field by instantiating the configured {@link DatasetDescriptor} from the
   * dataset descriptor stored in the table parameters.
   */
  private String getComplianceField(State state, HivePartitionDataset dataset) {
    Map<String, String> tableParams = dataset.getTableParams();
    Preconditions.checkArgument(tableParams.containsKey(ComplianceConfigurationKeys.DATASET_DESCRIPTOR_KEY),
        "Missing table property " + ComplianceConfigurationKeys.DATASET_DESCRIPTOR_KEY);
    String datasetDescriptorClass = state.getProp(ComplianceConfigurationKeys.DATASET_DESCRIPTOR_CLASS,
        ComplianceConfigurationKeys.DEFAULT_DATASET_DESCRIPTOR_CLASS);
    DatasetDescriptor descriptor = GobblinConstructorUtils.invokeConstructor(DatasetDescriptor.class,
        datasetDescriptorClass, tableParams.get(ComplianceConfigurationKeys.DATASET_DESCRIPTOR_KEY),
        Optional.fromNullable(state.getProp(ComplianceConfigurationKeys.DATASET_DESCRIPTOR_FIELDPATH)));
    return descriptor.getComplianceField();
  }
}