/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.hive.datasets;

import co.cask.cdap.api.data.batch.RecordWritable;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.DatasetManagementException;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.common.DatasetNotFoundException;
import co.cask.cdap.format.StructuredRecordStringConverter;
import co.cask.tephra.TransactionAware;
import com.google.gson.Gson;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.reflect.Type;

/**
 * Map reduce output format to write to datasets that implement {@link RecordWritable}.
 */
public class DatasetOutputFormat implements OutputFormat<Void, Text> {
  private static final Logger LOG = LoggerFactory.getLogger(DatasetOutputFormat.class);

  @Override
  public RecordWriter<Void, Text> getRecordWriter(FileSystem ignored, final JobConf jobConf, String name,
                                                  Progressable progress) throws IOException {
    DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf);
    try {
      datasetAccessor.initialize();
      return new DatasetRecordWriter(datasetAccessor);
    } catch (Exception e) {
      try {
        datasetAccessor.close();
      } catch (IOException e1) {
        LOG.warn("Exception closing dataset accessor after failure to return a DatasetRecordWriter.", e1);
      }
      throw new IOException("Could not get dataset.", e);
    }
  }

  @Override
  public void checkOutputSpecs(FileSystem ignored, JobConf jobConf) throws IOException {
    // This is called prior to returning a RecordWriter. We make sure here that the
    // dataset we want to write to is RecordWritable.
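    // Each failure mode below is rethrown as an IOException so that Hive surfaces a
    // clear error during job setup, rather than failing later in the tasks.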
    try (DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf)) {
      try {
        datasetAccessor.initialize();
      } catch (DatasetNotFoundException e) {
        throw new IOException(String.format("Dataset '%s' does not exist",
                                            datasetAccessor.getDatasetId()), e);
      } catch (DatasetManagementException | ClassNotFoundException e) {
        throw new IOException(String.format("Could not instantiate dataset '%s'",
                                            datasetAccessor.getDatasetId()), e);
      }

      if (!(datasetAccessor.getDataset() instanceof RecordWritable)) {
        throw new IOException(String.format("Dataset '%s' is not RecordWritable.",
                                            datasetAccessor.getDatasetId()));
      }
    }
  }

  private static class DatasetRecordWriter implements RecordWriter<Void, Text> {
    private final DatasetAccessor datasetAccessor;
    private final RecordWritable recordWritable;
    private final Type recordType;
    // Reused across writes instead of instantiating a new Gson for every record.
    private final Gson gson = new Gson();
    private Schema recordSchema;

    public DatasetRecordWriter(DatasetAccessor datasetAccessor) {
      this.datasetAccessor = datasetAccessor;
      this.recordWritable = datasetAccessor.getDataset();
      this.recordType = recordWritable.getRecordType();

      if (recordType == StructuredRecord.class) {
        try {
          DatasetSpecification datasetSpec = datasetAccessor.getDatasetSpec();
          String schemaStr = datasetSpec.getProperty(DatasetProperties.SCHEMA);
          // should never happen, as this should have been checked at table creation
          if (schemaStr == null) {
            throw new IllegalStateException(
              String.format("Dataset '%s' does not have the schema property.", datasetSpec.getName()));
          }
          recordSchema = Schema.parseJson(schemaStr);
        } catch (IOException | DatasetManagementException e) {
          try {
            recordWritable.close();
          } catch (IOException e1) {
            LOG.warn("Exception closing dataset {} after failing to look up its schema.",
                     datasetAccessor.getDatasetId(), e1);
          }
          throw new RuntimeException("Unable to look up schema for dataset.", e);
        }
      }
    }

    @Override
    public void write(Void key, Text value) throws IOException {
      if (value == null) {
        throw new IOException("Writable value is null.");
      }
      // Each value is a JSON-encoded record. StructuredRecord datasets are decoded against
      // the dataset's declared schema; any other record type is deserialized with Gson.
      if (recordType == StructuredRecord.class) {
        recordWritable.write(StructuredRecordStringConverter.fromJsonString(value.toString(), recordSchema));
      } else {
        recordWritable.write(gson.fromJson(value.toString(), recordType));
      }
    }

    @Override
    public void close(Reporter reporter) throws IOException {
      try {
        if (recordWritable instanceof TransactionAware) {
          try {
            // Commit changes made to the dataset being written.
            // NOTE: because the transaction wrapping a Hive query is a long running one,
            // we don't track changes and don't check conflicts - we can just commit the changes.
            ((TransactionAware) recordWritable).commitTx();
          } catch (Exception e) {
            LOG.error("Could not commit changes for table {}", recordWritable, e);
            throw new IOException(e);
          }
        }
      } finally {
        // Close the accessor even if closing the dataset itself fails.
        try {
          recordWritable.close();
        } finally {
          datasetAccessor.close();
        }
      }
    }
  }
}
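// A sketch of typical wiring, under the assumption that a Hive storage handler drives
// this class (the handler itself is not shown here): the job is pointed at this format
// via jobConf.setOutputFormat(DatasetOutputFormat.class), and each row reaches write()
// as a JSON-encoded Text value with a null key, per the contract documented above.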