/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.apache.avro.hadoop.io; import java.io.IOException; import org.apache.avro.Schema; import org.apache.avro.mapred.AvroKey; import org.apache.avro.mapred.AvroValue; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.hadoop.io.SequenceFile.Metadata; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.util.Progressable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A wrapper around a Hadoop {@link org.apache.hadoop.io.SequenceFile} that * also supports reading and writing Avro data. * * <p>The vanilla Hadoop <code>SequenceFile</code> contains a <i>header</i> * followed by a sequence of <i>records</i>. A <i>record</i> consists of a * <i>key</i> and a <i>value</i>. The <i>key</i> and <i>value</i> must * either:</p> * * <ul> * <li>implement the <code>Writable</code> interface, or</li> * <li>be accepted by a <code>Serialization</code> registered with the * <code>SerializationFactory</code>.</li> * </ul> * * <p>Since Avro data are Plain Old Java Objects (e.g., <code>Integer</code> * for data with schema <i>"int"</i>), they do not implement <i>Writable</i>. * Furthermore, a {@link org.apache.hadoop.io.Serialization} implementation * cannot determine whether an object instance of type * <code>CharSequence</code> that also implements <code>Writable</code> should * be serialized using Avro or WritableSerialization.</p> * * <p>The solution implemented in <code>AvroSequenceFile</code> is to:</p> * * <ul> * <li>wrap Avro key data in an <code>AvroKey</code> object,</li> * <li>wrap Avro value data in an <code>AvroValue</code> object,</li> * <li>configure and register <code>AvroSerialization</code> with the * <code>SerializationFactory</code>, which will accept only objects that are instances * of either <code>AvroKey</code> or <code>AvroValue</code>, and</li> * <li>store the Avro key and value schemas in the SequenceFile <i>header</i>.</li> * </ul> */ public class AvroSequenceFile { private static final Logger LOG = LoggerFactory.getLogger(AvroSequenceFile.class); /** The SequencFile.Metadata field for the Avro key writer schema. */ public static final Text METADATA_FIELD_KEY_SCHEMA = new Text("avro.key.schema"); /** The SequencFile.Metadata field for the Avro value writer schema. */ public static final Text METADATA_FIELD_VALUE_SCHEMA = new Text("avro.value.schema"); /** Constructor disabled for this container class. */ private AvroSequenceFile() {} /** * Creates a writer from a set of options. * * <p>Since there are different implementations of <code>Writer</code> depending on the * compression type, this method constructs the appropriate subclass depending on the * compression type given in the <code>options</code>.</p> * * @param options The options for the writer. * @return A new writer instance. * @throws IOException If the writer cannot be created. */ public static SequenceFile.Writer createWriter(Writer.Options options) throws IOException { return SequenceFile.createWriter( options.getFileSystem(), options.getConfigurationWithAvroSerialization(), options.getOutputPath(), options.getKeyClass(), options.getValueClass(), options.getBufferSizeBytes(), options.getReplicationFactor(), options.getBlockSizeBytes(), options.getCompressionType(), options.getCompressionCodec(), options.getProgressable(), options.getMetadataWithAvroSchemas()); } /** * A writer for an uncompressed SequenceFile that supports Avro data. */ public static class Writer extends SequenceFile.Writer { /** * A helper class to encapsulate the options that can be used to construct a Writer. */ public static class Options { /** The default write buffer size in bytes. */ public static final int DEFAULT_BUFFER_SIZE_BYTES = 4096; /** * A magic value representing the default for buffer size, block size, and * replication factor. */ private static final short DEFAULT = -1; private FileSystem mFileSystem; private Configuration mConf; private Path mOutputPath; private Class<?> mKeyClass; private Schema mKeyWriterSchema; private Class<?> mValueClass; private Schema mValueWriterSchema; private int mBufferSizeBytes; private short mReplicationFactor; private long mBlockSizeBytes; private Progressable mProgressable; private CompressionType mCompressionType; private CompressionCodec mCompressionCodec; private Metadata mMetadata; /** * Creates a new <code>Options</code> instance with default values. */ public Options() { mBufferSizeBytes = DEFAULT; mReplicationFactor = DEFAULT; mBlockSizeBytes = DEFAULT; mCompressionType = CompressionType.NONE; mMetadata = new Metadata(); } /** * Sets the filesystem the SequenceFile should be written to. * * @param fileSystem The filesystem. * @return This options instance. */ public Options withFileSystem(FileSystem fileSystem) { if (null == fileSystem) { throw new IllegalArgumentException("Filesystem may not be null"); } mFileSystem = fileSystem; return this; } /** * Sets the Hadoop configuration. * * @param conf The configuration. * @return This options instance. */ public Options withConfiguration(Configuration conf) { if (null == conf) { throw new IllegalArgumentException("Configuration may not be null"); } mConf = conf; return this; } /** * Sets the output path for the SequenceFile. * * @param outputPath The output path. * @return This options instance. */ public Options withOutputPath(Path outputPath) { if (null == outputPath) { throw new IllegalArgumentException("Output path may not be null"); } mOutputPath = outputPath; return this; } /** * Sets the class of the key records to be written. * * <p>If the keys will be Avro data, use {@link * #withKeySchema(org.apache.avro.Schema)} to specify the writer schema. The key * class will be automatically set to {@link org.apache.avro.mapred.AvroKey}.</p> * * @param keyClass The key class. * @return This options instance. */ public Options withKeyClass(Class<?> keyClass) { if (null == keyClass) { throw new IllegalArgumentException("Key class may not be null"); } mKeyClass = keyClass; return this; } /** * Sets the writer schema of the key records when using Avro data. * * <p>The key class will automatically be set to {@link * org.apache.avro.mapred.AvroKey}, so there is no need to call {@link * #withKeyClass(Class)} when using this method.</p> * * @param keyWriterSchema The writer schema for the keys. * @return This options instance. */ public Options withKeySchema(Schema keyWriterSchema) { if (null == keyWriterSchema) { throw new IllegalArgumentException("Key schema may not be null"); } withKeyClass(AvroKey.class); mKeyWriterSchema = keyWriterSchema; return this; } /** * Sets the class of the value records to be written. * * <p>If the values will be Avro data, use {@link * #withValueSchema(org.apache.avro.Schema)} to specify the writer schema. The value * class will be automatically set to {@link org.apache.avro.mapred.AvroValue}.</p> * * @param valueClass The value class. * @return This options instance. */ public Options withValueClass(Class<?> valueClass) { if (null == valueClass) { throw new IllegalArgumentException("Value class may not be null"); } mValueClass = valueClass; return this; } /** * Sets the writer schema of the value records when using Avro data. * * <p>The value class will automatically be set to {@link * org.apache.avro.mapred.AvroValue}, so there is no need to call {@link * #withValueClass(Class)} when using this method.</p> * * @param valueWriterSchema The writer schema for the values. * @return This options instance. */ public Options withValueSchema(Schema valueWriterSchema) { if (null == valueWriterSchema) { throw new IllegalArgumentException("Value schema may not be null"); } withValueClass(AvroValue.class); mValueWriterSchema = valueWriterSchema; return this; } /** * Sets the write buffer size in bytes. * * @param bytes The desired buffer size. * @return This options instance. */ public Options withBufferSizeBytes(int bytes) { if (bytes < 0) { throw new IllegalArgumentException("Buffer size may not be negative"); } mBufferSizeBytes = bytes; return this; } /** * Sets the desired replication factor for the file. * * @param replicationFactor The replication factor. * @return This options instance. */ public Options withReplicationFactor(short replicationFactor) { if (replicationFactor <= 0) { throw new IllegalArgumentException("Replication factor must be positive"); } mReplicationFactor = replicationFactor; return this; } /** * Sets the desired size of the file blocks. * * @param bytes The desired block size in bytes. * @return This options instance. */ public Options withBlockSizeBytes(long bytes) { if (bytes <= 0) { throw new IllegalArgumentException("Block size must be positive"); } mBlockSizeBytes = bytes; return this; } /** * Sets an object to report progress to. * * @param progressable A progressable object to track progress. * @return This options instance. */ public Options withProgressable(Progressable progressable) { mProgressable = progressable; return this; } /** * Sets the type of compression. * * @param compressionType The type of compression for the output file. * @return This options instance. */ public Options withCompressionType(CompressionType compressionType) { mCompressionType = compressionType; return this; } /** * Sets the compression codec to use if it is enabled. * * @param compressionCodec The compression codec. * @return This options instance. */ public Options withCompressionCodec(CompressionCodec compressionCodec) { mCompressionCodec = compressionCodec; return this; } /** * Sets the metadata that should be stored in the file <i>header</i>. * * @param metadata The file metadata. * @return This options instance. */ public Options withMetadata(Metadata metadata) { if (null == metadata) { throw new IllegalArgumentException("Metadata may not be null"); } mMetadata = metadata; return this; } /** * Gets the filesystem the SequenceFile should be written to. * * @return The file system to write to. */ public FileSystem getFileSystem() { if (null == mFileSystem) { throw new RuntimeException("Must call Options.withFileSystem()"); } return mFileSystem; } /** * Gets the Hadoop configuration. * * @return The Hadoop configuration. */ public Configuration getConfiguration() { return mConf; } /** * Gets the Hadoop configuration with Avro serialization registered. * * @return The Hadoop configuration. */ public Configuration getConfigurationWithAvroSerialization() { Configuration conf = getConfiguration(); if (null == conf) { throw new RuntimeException("Must call Options.withConfiguration()"); } Configuration confWithAvro = new Configuration(conf); if (null != mKeyWriterSchema) { AvroSerialization.setKeyWriterSchema(confWithAvro, mKeyWriterSchema); } if (null != mValueWriterSchema) { AvroSerialization.setValueWriterSchema(confWithAvro, mValueWriterSchema); } AvroSerialization.addToConfiguration(confWithAvro); return confWithAvro; } /** * Gets the output path for the sequence file. * * @return The output path. */ public Path getOutputPath() { if (null == mOutputPath) { throw new RuntimeException("Must call Options.withOutputPath()"); } return mOutputPath; } /** * Gets the class of the key records. * * @return The key class. */ public Class<?> getKeyClass() { if (null == mKeyClass) { throw new RuntimeException( "Must call Options.withKeyClass() or Options.withKeySchema()"); } return mKeyClass; } /** * Gets the class of the value records. * * @return The value class. */ public Class<?> getValueClass() { if (null == mValueClass) { throw new RuntimeException( "Must call Options.withValueClass() or Options.withValueSchema()"); } return mValueClass; } /** * Gets the desired size of the buffer used when flushing records to disk. * * @return The buffer size in bytes. */ public int getBufferSizeBytes() { if (DEFAULT == mBufferSizeBytes) { return getConfiguration().getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE_BYTES); } return mBufferSizeBytes; } /** * Gets the desired number of replicas to store for each block of the file. * * @return The replciation factor for the blocks of the file. */ public short getReplicationFactor() { if (DEFAULT == mReplicationFactor) { return getFileSystem().getDefaultReplication(); } return mReplicationFactor; } /** * Gets the desired size of the file blocks. * * @return The size of a file block in bytes. */ public long getBlockSizeBytes() { if (DEFAULT == mBlockSizeBytes) { return getFileSystem().getDefaultBlockSize(); } return mBlockSizeBytes; } /** * Gets the object to report progress to. * * @return A progressable object to track progress. */ public Progressable getProgressable() { return mProgressable; } /** * Gets the type of compression. * * @return The compression type. */ public CompressionType getCompressionType() { return mCompressionType; } /** * Gets the compression codec. * * @return The compression codec. */ public CompressionCodec getCompressionCodec() { return mCompressionCodec; } /** * Gets the SequenceFile metadata to store in the <i>header</i>. * * @return The metadata header. */ public Metadata getMetadata() { return mMetadata; } /** * Gets the metadata to store in the file header, which includes * any necessary Avro writer schemas. * * @return The metadata header with Avro writer schemas if Avro data is being written. */ private Metadata getMetadataWithAvroSchemas() { // mMetadata was intialized in the constructor, and cannot be set to null. assert null != mMetadata; if (null != mKeyWriterSchema) { mMetadata.set(METADATA_FIELD_KEY_SCHEMA, new Text(mKeyWriterSchema.toString())); } if (null != mValueWriterSchema) { mMetadata.set(METADATA_FIELD_VALUE_SCHEMA, new Text(mValueWriterSchema.toString())); } return mMetadata; } } /** * Creates a new <code>Writer</code> to a SequenceFile that supports Avro data. * * @param options The writer options. * @throws IOException If the writer cannot be initialized. */ public Writer(Options options) throws IOException { super(options.getFileSystem(), options.getConfigurationWithAvroSerialization(), options.getOutputPath(), options.getKeyClass(), options.getValueClass(), options.getBufferSizeBytes(), options.getReplicationFactor(), options.getBlockSizeBytes(), options.getProgressable(), options.getMetadataWithAvroSchemas()); } } /** * A reader for SequenceFiles that may contain Avro data. */ public static class Reader extends SequenceFile.Reader { /** * A helper class to encapsulate the options that can be used to construct a Reader. */ public static class Options { private FileSystem mFileSystem; private Path mInputPath; private Configuration mConf; private Schema mKeyReaderSchema; private Schema mValueReaderSchema; /** * Sets the filesystem the SequenceFile should be read from. * * @param fileSystem The filesystem. * @return This options instance. */ public Options withFileSystem(FileSystem fileSystem) { if (null == fileSystem) { throw new IllegalArgumentException("Filesystem may not be null"); } mFileSystem = fileSystem; return this; } /** * Sets the input path for the SequenceFile. * * @param inputPath The input path. * @return This options instance. */ public Options withInputPath(Path inputPath) { if (null == inputPath) { throw new IllegalArgumentException("Input path may not be null"); } mInputPath = inputPath; return this; } /** * Sets the Hadoop configuration. * * @param conf The configuration. * @return This options instance. */ public Options withConfiguration(Configuration conf) { if (null == conf) { throw new IllegalArgumentException("Configuration may not be null"); } mConf = conf; return this; } /** * Sets the reader schema of the key records when using Avro data. * * <p>If not set, the writer schema will be used as the reader schema.</p> * * @param keyReaderSchema The reader schema for the keys. * @return This options instance. */ public Options withKeySchema(Schema keyReaderSchema) { mKeyReaderSchema = keyReaderSchema; return this; } /** * Sets the reader schema of the value records when using Avro data. * * <p>If not set, the writer schema will be used as the reader schema.</p> * * @param valueReaderSchema The reader schema for the values. * @return This options instance. */ public Options withValueSchema(Schema valueReaderSchema) { mValueReaderSchema = valueReaderSchema; return this; } /** * Gets the filesystem the SequenceFile should be read rom. * * @return The file system to read from. */ public FileSystem getFileSystem() { if (null == mFileSystem) { throw new RuntimeException("Must call Options.withFileSystem()"); } return mFileSystem; } /** * Gets the input path for the sequence file. * * @return The input path. */ public Path getInputPath() { if (null == mInputPath) { throw new RuntimeException("Must call Options.withInputPath()"); } return mInputPath; } /** * Gets the Hadoop configuration. * * @return The Hadoop configuration. */ public Configuration getConfiguration() { return mConf; } /** * Gets the Hadoop configuration with Avro serialization registered. * * @return The Hadoop configuration. * @throws IOException If there is an error configuring Avro serialization. */ public Configuration getConfigurationWithAvroSerialization() throws IOException { Configuration conf = getConfiguration(); if (null == conf) { throw new RuntimeException("Must call Options.withConfiguration()"); } // Configure schemas and add Avro serialization to the configuration. Configuration confWithAvro = new Configuration(conf); AvroSerialization.addToConfiguration(confWithAvro); // Read the metadata header from the SequenceFile to get the writer schemas. Metadata metadata = AvroSequenceFile.getMetadata( getFileSystem(), getInputPath(), confWithAvro); // Set the key schema if present in the metadata. Text keySchemaText = metadata.get(METADATA_FIELD_KEY_SCHEMA); if (null != keySchemaText) { LOG.debug("Using key writer schema from SequenceFile metadata: " + keySchemaText.toString()); AvroSerialization.setKeyWriterSchema( confWithAvro, Schema.parse(keySchemaText.toString())); if (null != mKeyReaderSchema) { AvroSerialization.setKeyReaderSchema(confWithAvro, mKeyReaderSchema); } } // Set the value schema if present in the metadata. Text valueSchemaText = metadata.get(METADATA_FIELD_VALUE_SCHEMA); if (null != valueSchemaText) { LOG.debug("Using value writer schema from SequenceFile metadata: " + valueSchemaText.toString()); AvroSerialization.setValueWriterSchema( confWithAvro, Schema.parse(valueSchemaText.toString())); if (null != mValueReaderSchema) { AvroSerialization.setValueReaderSchema(confWithAvro, mValueReaderSchema); } } return confWithAvro; } } /** * Creates a new <code>Reader</code> from a SequenceFile that supports Avro data. * * @param options The reader options. * @throws IOException If the reader cannot be initialized. */ public Reader(Options options) throws IOException { super(options.getFileSystem(), options.getInputPath(), options.getConfigurationWithAvroSerialization()); } } /** * Open and read just the metadata header from a SequenceFile. * * @param fs The FileSystem the SequenceFile is on. * @param path The path to the file. * @param conf The Hadoop configuration. * @return The metadata header. * @throws IOException If the metadata cannot be read from the file. */ private static Metadata getMetadata(FileSystem fs, Path path, Configuration conf) throws IOException { SequenceFile.Reader metadataReader = null; try { metadataReader = new SequenceFile.Reader(fs, path, conf); return metadataReader.getMetadata(); } finally { if (null != metadataReader) { metadataReader.close(); } } } }