/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.hadoop.pig;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.pig.StoreFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;

/**
 * Pig StoreFunc which stores Tuples in a Hadoop SequenceFile.  Hadoop
 * SequenceFiles are made up of unordered (key,value) pairs, so the
 * pairs can be added in any order; the input does not need to be
 * sorted.
 *
 * In a Hadoop SequenceFile, the key and the value are each typed
 * according to the Hadoop type system.  The types of the key and the
 * value are specified when the SequenceFile is created.  Thus, when
 * this StoreFunc is initialized, the key and value types must be
 * given.  If the types are not specified, they default to Text
 * (i.e. Strings).
 *
 * For example, in a Pig script:
 *
 *   STORE foo INTO 'foo' USING SequenceFileStorage();  -- Default (Text,Text)
 *
 * or
 *
 *   STORE foo INTO 'foo' USING SequenceFileStorage( 'org.apache.hadoop.io.LongWritable',
 *                                                   'org.apache.hadoop.io.BytesWritable' );
 */
public class SequenceFileStorage extends StoreFunc
{
  /*
   * It's possible that we can actually get the Class objects in the
   * constructor; however, I'm not sure that the whole environment is
   * set up when this class is instantiated, so I think it's safer to
   * defer getting the Class objects until they are actually needed in
   * the RecordWriter.
   */
  String keyType   = "org.apache.hadoop.io.Text";
  String valueType = "org.apache.hadoop.io.Text";

  /*
   * We create a null object for the key and value types.  If we need
   * to write a null to the sequence file, then we just use these
   * instances.
   */
  Writable nullKey;
  Writable nullValue;

  Text emptyTextKey = new Text("");

  RecordWriter writer;

  public SequenceFileStorage()
  {
  }

  public SequenceFileStorage( String valueType )
  {
    this.valueType = valueType;
  }

  public SequenceFileStorage( String keyType, String valueType )
  {
    this.keyType   = keyType;
    this.valueType = valueType;
  }
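
  /*
   * Minimal usage sketch for the constructors above.  The alias and
   * relation names here are hypothetical, not part of this class; in a
   * Pig script the key/value Writable class names are passed as
   * constructor arguments, either inline or via DEFINE:
   *
   *   DEFINE SeqStore org.archive.hadoop.pig.SequenceFileStorage(
   *     'org.apache.hadoop.io.LongWritable', 'org.apache.hadoop.io.Text' );
   *
   *   STORE offsets INTO 'offsets.seq' USING SeqStore;
   *
   * The one-argument form sets only the value type; the key type stays
   * at its default of Text.
   */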

  /**
   * Most of this method is cut/pasted from the Hadoop
   * SequenceFileOutputFormat.  The big difference is that we use the
   * key and value types given to this Pig storage class rather than
   * the ones set by the job configuration.
   */
  public OutputFormat getOutputFormat() throws IOException
  {
    return new SequenceFileOutputFormat()
      {
        public RecordWriter getRecordWriter( TaskAttemptContext context )
          throws IOException, InterruptedException
        {
          Configuration conf = context.getConfiguration();

          Class keyClass, valueClass;
          try
            {
              keyClass   = conf.getClassByName( keyType );
              valueClass = conf.getClassByName( valueType );
            }
          catch ( ClassNotFoundException cnfe )
            {
              throw new IOException( cnfe );
            }

          // Instantiate null objects for the key and value types.
          // See getWritable() for their use.
          try
            {
              nullKey   = (Writable) keyClass.newInstance();
              nullValue = (Writable) valueClass.newInstance();
            }
          catch ( Exception e )
            {
              throw new IOException( e );
            }

          CompressionCodec codec = null;
          CompressionType compressionType = CompressionType.NONE;
          if ( getCompressOutput( context ) )
            {
              // Find the kind of compression to do.
              compressionType = getOutputCompressionType( context );

              // Find the right codec.
              Class<?> codecClass = getOutputCompressorClass( context, DefaultCodec.class );
              codec = (CompressionCodec) ReflectionUtils.newInstance( codecClass, conf );
            }

          // Get the path of the temporary output file.
          Path file = getDefaultWorkFile( context, "" );
          FileSystem fs = file.getFileSystem( conf );

          final SequenceFile.Writer out =
            SequenceFile.createWriter( fs, conf, file,
                                       keyClass, valueClass,
                                       compressionType, codec, context );

          return new RecordWriter()
            {
              public void write( Object key, Object value ) throws IOException
              {
                out.append( key, value );
              }

              public void close( TaskAttemptContext context ) throws IOException
              {
                out.close();
              }
            };
        }
      };
  }

  public void setStoreLocation( String location, Job job ) throws IOException
  {
    FileOutputFormat.setOutputPath( job, new Path( location ) );
  }

  public void prepareToWrite( RecordWriter writer ) throws IOException
  {
    this.writer = writer;
  }

  /**
   * Tuples must have one or two fields.  With two fields, the first is
   * stored as the SequenceFile 'key' and the second as the 'value';
   * with a single field, it is stored as the 'value' and the
   * already-instantiated null key object is written as the key.
   */
  public void putNext( Tuple tuple ) throws IOException
  {
    try
      {
        Writable key, value;

        int size = tuple.size();

        if ( size == 2 )
          {
            key   = getWritable( tuple.get(0), this.nullKey   );
            value = getWritable( tuple.get(1), this.nullValue );
          }
        else if ( size == 1 )
          {
            key   = this.nullKey;
            value = getWritable( tuple.get(0), this.nullValue );
          }
        else
          {
            throw new IOException( "Invalid tuple size, must be 1 or 2: " + size );
          }

        this.writer.write( key, value );
      }
    catch ( InterruptedException ie )
      {
        throw new IOException( ie );
      }
  }
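
  /*
   * Sketch of the two tuple shapes putNext() accepts.  The relation and
   * field names are hypothetical; only the arity matters:
   *
   *   -- Two fields: field 0 becomes the key, field 1 the value.
   *   pairs = FOREACH records GENERATE url, digest;
   *   STORE pairs INTO 'pairs.seq' USING SequenceFileStorage();
   *
   *   -- One field: stored as the value, with a null key object.
   *   digests = FOREACH records GENERATE digest;
   *   STORE digests INTO 'digests.seq' USING SequenceFileStorage();
   */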

  /**
   * Convert the Pig tupleValue to the corresponding Hadoop Writable object.
   */
  public Writable getWritable( Object tupleValue, Writable nullWritable ) throws IOException
  {
    switch ( DataType.findType( tupleValue ) )
      {
      case DataType.BOOLEAN:
        return new BooleanWritable( (Boolean) tupleValue );

      case DataType.BYTE:
        return new ByteWritable( (Byte) tupleValue );

      case DataType.CHARARRAY:
        return new Text( (String) tupleValue );

      case DataType.INTEGER:
        return new IntWritable( (Integer) tupleValue );

      case DataType.LONG:
        return new LongWritable( (Long) tupleValue );

      case DataType.DOUBLE:
        return new DoubleWritable( (Double) tupleValue );

      case DataType.FLOAT:
        return new FloatWritable( (Float) tupleValue );

      // Pig represents bytearrays as DataByteArray, so unwrap it to get
      // the underlying byte[] rather than casting the value directly.
      case DataType.BYTEARRAY:
        return new BytesWritable( ((DataByteArray) tupleValue).get() );

      // If we get a 'null' from Pig, just pass through the
      // already-instantiated Hadoop nullWritable.
      case DataType.NULL:
        return nullWritable;

      // Don't know what to do with these complex data types.
      case DataType.BAG:
      case DataType.ERROR:
      case DataType.MAP:
      case DataType.TUPLE:
      case DataType.UNKNOWN:
      default:
        throw new IOException( "Cannot write values of type: " + DataType.findTypeName( tupleValue ) );
      }
  }

}
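
/*
 * Minimal read-back sketch, assuming a standard Hadoop classpath and a
 * hypothetical output part file 'pairs.seq/part-r-00000'.  It is not part of
 * this class; it only illustrates that the stored key/value classes match the
 * Writable types given to the constructor:
 *
 *   Configuration conf = new Configuration();
 *   Path part = new Path( "pairs.seq/part-r-00000" );
 *   FileSystem fs = part.getFileSystem( conf );
 *   SequenceFile.Reader reader = new SequenceFile.Reader( fs, part, conf );
 *
 *   Writable key   = (Writable) ReflectionUtils.newInstance( reader.getKeyClass(),   conf );
 *   Writable value = (Writable) ReflectionUtils.newInstance( reader.getValueClass(), conf );
 *
 *   while ( reader.next( key, value ) )
 *     {
 *       System.out.println( key + "\t" + value );
 *     }
 *   reader.close();
 */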