/* * Copyright 2013 Cloudera Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.cdk.morphline.avro; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.Map; import org.apache.avro.Schema; import org.apache.avro.file.CodecFactory; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericContainer; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.io.Encoder; import org.apache.avro.io.EncoderFactory; import com.cloudera.cdk.morphline.api.Command; import com.cloudera.cdk.morphline.api.CommandBuilder; import com.cloudera.cdk.morphline.api.MorphlineContext; import com.cloudera.cdk.morphline.api.MorphlineRuntimeException; import com.cloudera.cdk.morphline.api.Record; import com.cloudera.cdk.morphline.base.AbstractCommand; import com.cloudera.cdk.morphline.base.Configs; import com.cloudera.cdk.morphline.base.Fields; import com.cloudera.cdk.morphline.base.Validator; import com.cloudera.cdk.morphline.stdio.AbstractParser; import com.google.common.base.Preconditions; import com.google.common.io.Closeables; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; /** * Command that serializes the Avro records contained in the _attachment_body field into a byte * array and replaces the _attachment_body field with that byte array. * * @since 0.9.0 */ public final class WriteAvroToByteArrayBuilder implements CommandBuilder { @Override public Collection<String> getNames() { return Collections.singletonList("writeAvroToByteArray"); } @Override public Command build(Config config, Command parent, Command child, MorphlineContext context) { return new WriteAvroToByteArray(this, config, parent, child, context); } /////////////////////////////////////////////////////////////////////////////// // Nested classes: /////////////////////////////////////////////////////////////////////////////// private static final class WriteAvroToByteArray extends AbstractCommand { private final Format format; private final CodecFactory codecFactory; private final Map<String,String> metadata = new HashMap(); public WriteAvroToByteArray(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) { super(builder, config, parent, child, context); this.format = new Validator<Format>().validateEnum( config, getConfigs().getString(config, "format", Format.container.toString()), Format.class); String codec = getConfigs().getString(config, "codec", null); if (codec == null) { this.codecFactory = null; } else { this.codecFactory = CodecFactory.fromString(codec); } Config metadataConfig = getConfigs().getConfig(config, "metadata", ConfigFactory.empty()); for (Map.Entry<String, Object> entry : new Configs().getEntrySet(metadataConfig)) { this.metadata.put(entry.getKey(), entry.getValue().toString()); } validateArguments(); } @Override protected boolean doProcess(Record inputRecord) { Record outputRecord = inputRecord.copy(); AbstractParser.removeAttachments(outputRecord); ByteArrayOutputStream bout = new ByteArrayOutputStream(1024); if (format == Format.container) { writeContainer(inputRecord, bout); } else { writeContainerless(inputRecord, bout); } outputRecord.put(Fields.ATTACHMENT_BODY, bout.toByteArray()); // pass record to next command in chain: return super.doProcess(outputRecord); } private void writeContainer(Record src, OutputStream dst) { DataFileWriter dataFileWriter = null; try { try { Schema schema = null; for (Object attachment : src.get(Fields.ATTACHMENT_BODY)) { Preconditions.checkNotNull(attachment); GenericContainer datum = (GenericContainer) attachment; schema = getSchema(datum, schema); if (dataFileWriter == null) { // init GenericDatumWriter datumWriter = new GenericDatumWriter(schema); dataFileWriter = new DataFileWriter(datumWriter); if (codecFactory != null) { dataFileWriter.setCodec(codecFactory); } for (Map.Entry<String,String> entry : metadata.entrySet()) { dataFileWriter.setMeta(entry.getKey(), entry.getValue()); } dataFileWriter.create(schema, dst); } dataFileWriter.append(datum); } if (dataFileWriter != null) { dataFileWriter.flush(); } } catch (IOException e) { throw new MorphlineRuntimeException(e); } } finally { Closeables.closeQuietly(dataFileWriter); } } private void writeContainerless(Record src, OutputStream dst) { try { GenericDatumWriter datumWriter = new GenericDatumWriter(); Encoder encoder = null; Schema schema = null; for (Object attachment : src.get(Fields.ATTACHMENT_BODY)) { Preconditions.checkNotNull(attachment); GenericContainer datum = (GenericContainer) attachment; schema = getSchema(datum, schema); datumWriter.setSchema(schema); if (encoder == null) { // init if (format == Format.containerlessJSON) { encoder = EncoderFactory.get().jsonEncoder(schema, dst); } else { encoder = EncoderFactory.get().binaryEncoder(dst, null); } } datumWriter.write(datum, encoder); } encoder.flush(); } catch (IOException e) { throw new MorphlineRuntimeException(e); } } private Schema getSchema(GenericContainer datum, Schema lastSchema) { Schema schema = datum.getSchema(); if (lastSchema != null && lastSchema != schema) { throw new MorphlineRuntimeException("Schemas must be identical: " + schema + ", lastSchema: " + lastSchema); } return schema; } } /////////////////////////////////////////////////////////////////////////////// // Nested classes: /////////////////////////////////////////////////////////////////////////////// private static enum Format { container, containerlessJSON, containerlessBinary } }