/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.builtin;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Properties;

import org.apache.avro.Schema;
import org.apache.avro.SchemaParseException;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericData;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.LoadPushDown;
import org.apache.pig.PigWarning;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreFuncInterface;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.avro.AvroArrayReader;
import org.apache.pig.impl.util.avro.AvroRecordReader;
import org.apache.pig.impl.util.avro.AvroRecordWriter;
import org.apache.pig.impl.util.avro.AvroStorageSchemaConversionUtilities;
import org.apache.pig.impl.util.avro.AvroTupleWrapper;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.primitives.Longs;

/**
 * Pig UDF for reading and writing Avro data.
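 *
 * <p>A minimal usage sketch (the aliases, paths, record name, and options
 * below are illustrative only):</p>
 * <pre>
 * -- read Avro records, taking the schema from the data files themselves
 * events = LOAD 'input_dir' USING AvroStorage();
 *
 * -- write Avro records under an explicit record name, with options
 * STORE events INTO 'output_dir'
 *   USING AvroStorage('event', '-namespace com.example -doublecolons');
 * </pre>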
 *
 */
public class AvroStorage extends LoadFunc
    implements StoreFuncInterface, LoadMetadata, LoadPushDown {

  /**
   * Creates a new instance of AvroStorage without specifying the schema.
   * Useful for simply loading data.
   */
  public AvroStorage() {
    this(null, null);
  }

  /**
   * Creates a new instance of AvroStorage.
   * @param sn Specifies the input/output schema or record name.
   */
  public AvroStorage(final String sn) {
    this(sn, null);
  }

  private String schemaName = "record";
  private String schemaNameSpace = null;
  protected boolean allowRecursive = false;
  protected boolean doubleColonsToDoubleUnderscores = false;
  protected Schema schema;
  protected final Log log = LogFactory.getLog(getClass());

  /**
   * Creates a new instance of AvroStorage, specifying output schema
   * properties.
   * @param sn Specifies the input/output schema or record name.
   * @param opts Options for AvroStorage:
   * <ul>
   * <li><code>-namespace</code> Namespace for an automatically generated
   * output schema.</li>
   * <li><code>-schemafile</code> Specifies URL for avro schema file
   * from which to read the input schema (can be local file, hdfs,
   * url, etc).</li>
   * <li><code>-examplefile</code> Specifies URL for avro data file from
   * which to copy the output schema (can be local file, hdfs, url, etc).</li>
   * <li><code>-allowrecursive</code> Option to allow recursive schema
   * definitions (default is false).</li>
   * <li><code>-doublecolons</code> Option to translate Pig schema names
   * with double colons to names with double underscores (default is false).</li>
   * </ul>
   */
  public AvroStorage(final String sn, final String opts) {
    super();
    if (sn != null) {
      try {
        Schema s = (new Schema.Parser()).parse(sn); // must be a valid schema
        setInputAvroSchema(s);
        setOutputAvroSchema(s);
      } catch (SchemaParseException e) {
        // not a valid schema, use as a record name
        schemaName = sn;
      }
    }
    if (opts != null) {
      String[] optsArr = opts.split(" ");
      Options validOptions = new Options();
      try {
        CommandLineParser parser = new GnuParser();
        validOptions.addOption("n", "namespace", true,
            "Namespace for an automatically generated output schema");
        validOptions.addOption("f", "schemafile", true,
            "Specifies URL for avro schema file from which to read "
                + "the input or output schema");
        validOptions.addOption("e", "examplefile", true,
            "Specifies URL for avro data file from which to copy "
                + "the output schema");
        validOptions.addOption("r", "allowrecursive", false,
            "Option to allow recursive schema definitions (default is false)");
        validOptions.addOption("d", "doublecolons", false,
            "Option to translate Pig schema names with double colons "
                + "to names with double underscores (default is false)");
        CommandLine configuredOptions = parser.parse(validOptions, optsArr);
        schemaNameSpace = configuredOptions.getOptionValue("namespace", null);
        allowRecursive = configuredOptions.hasOption('r');
        doubleColonsToDoubleUnderscores = configuredOptions.hasOption('d');
        if (configuredOptions.hasOption('f')) {
          try {
            Path p = new Path(configuredOptions.getOptionValue('f'));
            Schema s = new Schema.Parser()
                .parse((FileSystem.get(p.toUri(), new Configuration()).open(p)));
            setInputAvroSchema(s);
            setOutputAvroSchema(s);
          } catch (FileNotFoundException fnfe) {
            System.err.printf("file not found exception\n");
            log.warn("Schema file not found when instantiating AvroStorage. (If the "
(If the " + "schema was described in a local file on the front end, and this message " + "is in the back end log, you can ignore this mesasge.)", fnfe); } } else if (configuredOptions.hasOption('e')) { setOutputAvroSchema( getAvroSchema(configuredOptions.getOptionValue('e'), new Job(new Configuration()))); } } catch (ParseException e) { log.error("Exception in AvroStorage", e); log.error("AvroStorage called with arguments " + sn + ", " + opts); warn("ParseException in AvroStorage", PigWarning.UDF_WARNING_1); HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("AvroStorage(',', '[options]')", validOptions); throw new RuntimeException(e); } catch (IOException e) { log.warn("Exception in AvroStorage", e); log.warn("AvroStorage called with arguments " + sn + ", " + opts); warn("IOException in AvroStorage", PigWarning.UDF_WARNING_1); throw new RuntimeException(e); } } } /** * Context signature for this UDF instance. */ protected String udfContextSignature = null; @Override public final void setUDFContextSignature(final String signature) { udfContextSignature = signature; super.setUDFContextSignature(signature); } /** * Internal function for getting the Properties object associated with * this UDF instance. * @return The Properties object associated with this UDF instance */ protected final Properties getProperties() { if (udfContextSignature == null) { return getProperties(AvroStorage.class, null); } else { return getProperties(AvroStorage.class, udfContextSignature); } } /** * Internal function for getting the Properties object associated with * this UDF instance. * @param c Class of this UDF * @param signature Signature string * @return The Properties object associated with this UDF instance */ @SuppressWarnings("rawtypes") protected final Properties getProperties(final Class c, final String signature) { UDFContext context = UDFContext.getUDFContext(); if (signature == null) { return context.getUDFProperties(c); } else { return context.getUDFProperties(c, new String[] {signature}); } } /* * @see org.apache.pig.LoadMetadata#getSchema(java.lang.String, * org.apache.hadoop.mapreduce.Job) */ @Override public final ResourceSchema getSchema(final String location, final Job job) throws IOException { if (schema == null) { Schema s = getAvroSchema(location, job); setInputAvroSchema(s); } ResourceSchema rs = AvroStorageSchemaConversionUtilities .avroSchemaToResourceSchema(schema, allowRecursive); return rs; } /** * Reads the avro schema at the specified location. * @param location Location of file * @param job Hadoop job object * @return an Avro Schema object derived from the specified file * @throws IOException * */ protected final Schema getAvroSchema(final String location, final Job job) throws IOException { return getAvroSchema(new Path(location), job); } /** * A PathFilter that filters out invisible files. */ protected static final PathFilter VISIBLE_FILES = new PathFilter() { @Override public boolean accept(final Path p) { return (!(p.getName().startsWith("_") || p.getName().startsWith("."))); } }; /** * Reads the avro schema at the specified location. 
   * @param p Location of file
   * @param job Hadoop job object
   * @return an Avro Schema object derived from the specified file
   * @throws IOException
   */
  public Schema getAvroSchema(final Path p, final Job job)
      throws IOException {
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    FileSystem fs = FileSystem.get(p.toUri(), job.getConfiguration());
    FileStatus[] statusArray = fs.globStatus(p);

    if (statusArray == null) {
      throw new IOException("Path " + p.toString() + " does not exist.");
    }

    if (statusArray.length == 0) {
      throw new IOException("No path matches pattern " + p.toString());
    }

    Path filePath = depthFirstSearchForFile(statusArray, fs);

    if (filePath == null) {
      throw new IOException("No path matches pattern " + p.toString());
    }

    InputStream hdfsInputStream = fs.open(filePath);
    DataFileStream<Object> avroDataStream =
        new DataFileStream<Object>(hdfsInputStream, avroReader);
    Schema s = avroDataStream.getSchema();
    avroDataStream.close();
    return s;
  }

  /**
   * Finds a valid path for a file from a FileStatus object.
   * @param fileStatus FileStatus object corresponding to a file,
   * or a directory.
   * @param fileSystem FileSystem in which the file should be found
   * @return The first file found
   * @throws IOException
   */
  private Path depthFirstSearchForFile(final FileStatus fileStatus,
      final FileSystem fileSystem) throws IOException {
    if (fileSystem.isFile(fileStatus.getPath())) {
      return fileStatus.getPath();
    } else {
      return depthFirstSearchForFile(
          fileSystem.listStatus(fileStatus.getPath(), VISIBLE_FILES),
          fileSystem);
    }
  }

  /**
   * Finds a valid path for a file from an array of FileStatus objects.
   * @param statusArray Array of FileStatus objects in which to search
   * for the file.
   * @param fileSystem FileSystem in which to search for the first file.
   * @return The first file found.
   * @throws IOException
   */
  protected Path depthFirstSearchForFile(final FileStatus[] statusArray,
      final FileSystem fileSystem) throws IOException {

    // Most recent files first
    Arrays.sort(statusArray,
        new Comparator<FileStatus>() {
          @Override
          public int compare(final FileStatus fs1, final FileStatus fs2) {
            return Longs.compare(fs2.getModificationTime(),
                fs1.getModificationTime());
          }
        });

    for (FileStatus f : statusArray) {
      Path p = depthFirstSearchForFile(f, fileSystem);
      if (p != null) {
        return p;
      }
    }

    return null;
  }

  /*
   * @see org.apache.pig.LoadMetadata#getStatistics(java.lang.String,
   * org.apache.hadoop.mapreduce.Job)
   */
  @Override
  public final ResourceStatistics getStatistics(final String location,
      final Job job) throws IOException {
    return null;
  }

  /*
   * @see org.apache.pig.LoadMetadata#getPartitionKeys(java.lang.String,
   * org.apache.hadoop.mapreduce.Job)
   */
  @Override
  public final String[] getPartitionKeys(final String location,
      final Job job) throws IOException {
    return null;
  }

  /*
   * @see
   * org.apache.pig.LoadMetadata#setPartitionFilter(org.apache.pig.Expression)
   */
  @Override
  public void setPartitionFilter(final Expression partitionFilter)
      throws IOException {
  }

  /*
   * @see
   * org.apache.pig.StoreFuncInterface#relToAbsPathForStoreLocation(java.lang
   * .String, org.apache.hadoop.fs.Path)
   */
  @Override
  public final String relToAbsPathForStoreLocation(final String location,
      final Path curDir) throws IOException {
    return LoadFunc.getAbsolutePath(location, curDir);
  }

  /*
   * @see org.apache.pig.StoreFuncInterface#getOutputFormat()
   */
  @Override
  public OutputFormat<NullWritable, Object> getOutputFormat()
      throws IOException {

    /**
     * Hadoop output format for AvroStorage.
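     * Writes each incoming Pig Tuple out as an Avro record via
     * {@link AvroRecordWriter}, using the output schema bound to the writer
     * in {@code prepareToWrite}.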
     */
    class AvroStorageOutputFormat
        extends FileOutputFormat<NullWritable, Object> {

      @Override
      public RecordWriter<NullWritable, Object> getRecordWriter(
          final TaskAttemptContext tc)
          throws IOException, InterruptedException {
        return new AvroRecordWriter(
            // avroStorageOutputFormatSchema,
            getDefaultWorkFile(tc, AvroOutputFormat.EXT),
            tc.getConfiguration());
      }
    }

    return new AvroStorageOutputFormat();
  }

  /*
   * @see org.apache.pig.StoreFuncInterface#setStoreLocation(java.lang.String,
   * org.apache.hadoop.mapreduce.Job)
   */
  @Override
  public final void setStoreLocation(final String location, final Job job)
      throws IOException {
    FileOutputFormat.setOutputPath(job, new Path(location));
  }

  /**
   * Pig property name for the output avro schema.
   */
  public static final String OUTPUT_AVRO_SCHEMA =
      "org.apache.pig.builtin.AvroStorage.output.schema";

  /*
   * @see
   * org.apache.pig.StoreFuncInterface#checkSchema(org.apache.pig.ResourceSchema)
   */
  @Override
  public final void checkSchema(final ResourceSchema rs) throws IOException {
    if (rs == null) {
      throw new IOException("checkSchema: called with null ResourceSchema");
    }
    Schema avroSchema = AvroStorageSchemaConversionUtilities
        .resourceSchemaToAvroSchema(rs,
            (schemaName == null || schemaName.length() == 0)
                ? "pig_output" : schemaName,
            schemaNameSpace,
            Maps.<String, List<Schema>> newHashMap(),
            doubleColonsToDoubleUnderscores);
    if (avroSchema == null) {
      throw new IOException(
          "checkSchema: could not translate ResourceSchema to Avro Schema");
    }
    setOutputAvroSchema(avroSchema);
  }

  /**
   * Sets the output avro schema to {@code s}.
   * @param s An Avro schema
   */
  protected final void setOutputAvroSchema(final Schema s) {
    schema = s;
    getProperties().setProperty(OUTPUT_AVRO_SCHEMA, s.toString());
  }

  /**
   * Utility function that gets the output schema from the udf
   * properties for this instance of the store function.
   * @return the output schema associated with this UDF
   */
  protected final Schema getOutputAvroSchema() {
    if (schema == null) {
      String schemaString = getProperties().getProperty(OUTPUT_AVRO_SCHEMA);
      if (schemaString != null) {
        schema = (new Schema.Parser()).parse(schemaString);
      }
    }
    return schema;
  }

  /**
   * RecordWriter used by this UDF instance.
   */
  private RecordWriter<NullWritable, Object> writer;

  /*
   * @see
   * org.apache.pig.StoreFuncInterface#prepareToWrite(org.apache.hadoop.mapreduce
   * .RecordWriter)
   */
  @SuppressWarnings({ "unchecked", "rawtypes" })
  @Override
  public final void prepareToWrite(final RecordWriter w) throws IOException {
    if (this.udfContextSignature == null) {
      throw new IOException(this.getClass().toString()
          + ".prepareToWrite called without setting udf context signature");
    }
    writer = (RecordWriter<NullWritable, Object>) w;
    ((AvroRecordWriter) writer).prepareToWrite(getOutputAvroSchema());
  }

  /*
   * @see org.apache.pig.StoreFuncInterface#putNext(org.apache.pig.data.Tuple)
   */
  @Override
  public final void putNext(final Tuple t) throws IOException {
    try {
      writer.write(null, t);
    } catch (InterruptedException e) {
      log.error("InterruptedException in putNext");
      throw new IOException(e);
    }
  }

  /*
   * @see
   * org.apache.pig.StoreFuncInterface#setStoreFuncUDFContextSignature(java.
   * lang.String)
   */
  @Override
  public final void setStoreFuncUDFContextSignature(final String signature) {
    udfContextSignature = signature;
    super.setUDFContextSignature(signature);
  }

  /*
   * @see org.apache.pig.StoreFuncInterface#cleanupOnFailure(java.lang.String,
   * org.apache.hadoop.mapreduce.Job)
   */
  @Override
  public final void cleanupOnFailure(final String location, final Job job)
      throws IOException {
    StoreFunc.cleanupOnFailureImpl(location, job);
  }

  /**
   * Pig property name for the input avro schema.
   */
  public static final String INPUT_AVRO_SCHEMA =
      "org.apache.pig.builtin.AvroStorage.input.schema";

  /*
   * @see org.apache.pig.LoadFunc#setLocation(java.lang.String,
   * org.apache.hadoop.mapreduce.Job)
   */
  @Override
  public void setLocation(final String location, final Job job)
      throws IOException {
    FileInputFormat.setInputPaths(job, location);
    if (schema == null) {
      schema = getInputAvroSchema();
      if (schema == null) {
        schema = getAvroSchema(location, job);
        if (schema == null) {
          throw new IOException(
              "Could not determine avro schema for location " + location);
        }
        setInputAvroSchema(schema);
      }
    }
  }

  /**
   * Sets the input avro schema to {@code s}.
   * @param s The specified schema
   */
  protected final void setInputAvroSchema(final Schema s) {
    schema = s;
    getProperties().setProperty(INPUT_AVRO_SCHEMA, s.toString());
  }

  /**
   * Helper function that reads the input avro schema from the UDF
   * Properties.
   * @return The input avro schema
   */
  public final Schema getInputAvroSchema() {
    if (schema == null) {
      String schemaString = getProperties().getProperty(INPUT_AVRO_SCHEMA);
      if (schemaString != null) {
        Schema s = new Schema.Parser().parse(schemaString);
        schema = s;
      }
    }
    return schema;
  }

  /*
   * @see org.apache.pig.LoadFunc#getInputFormat()
   */
  @Override
  public InputFormat<NullWritable, GenericData.Record> getInputFormat()
      throws IOException {
    return new org.apache.pig.backend.hadoop.executionengine.mapReduceLayer
        .PigFileInputFormat<NullWritable, GenericData.Record>() {

      @Override
      public RecordReader<NullWritable, GenericData.Record>
          createRecordReader(final InputSplit is, final TaskAttemptContext tc)
          throws IOException, InterruptedException {
        Schema s = getInputAvroSchema();
        RecordReader<NullWritable, GenericData.Record> rr = null;
        if (s.getType() == Type.ARRAY) {
          rr = new AvroArrayReader(s);
        } else {
          rr = new AvroRecordReader(s);
        }
        rr.initialize(is, tc);
        tc.setStatus(is.toString());
        return rr;
      }
    };
  }

  @SuppressWarnings("rawtypes")
  private RecordReader reader;
  PigSplit split;

  /*
   * @see
   * org.apache.pig.LoadFunc#prepareToRead(org.apache.hadoop.mapreduce.RecordReader
   * , org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit)
   */
  @SuppressWarnings("rawtypes")
  @Override
  public final void prepareToRead(final RecordReader r, final PigSplit s)
      throws IOException {
    reader = r;
    split = s;
  }

  /*
   * @see org.apache.pig.LoadFunc#getNext()
   */
  @Override
  public final Tuple getNext() throws IOException {
    try {
      if (reader.nextKeyValue()) {
        return new AvroTupleWrapper<GenericData.Record>(
            (GenericData.Record) reader.getCurrentValue());
      } else {
        return null;
      }
    } catch (InterruptedException e) {
      throw new IOException("Wrapped Interrupted Exception", e);
    }
  }

  @Override
  public void cleanupOnSuccess(final String location, final Job job)
      throws IOException {
  }

  @Override
  public List<OperatorSet> getFeatures() {
    return Lists.newArrayList(LoadPushDown.OperatorSet.PROJECTION);
  }

  /**
   * List of required fields passed by pig in a push down projection.
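   * <p>For example (the aliases are illustrative), a script line such as
   * {@code B = FOREACH A GENERATE name, id;} causes Pig to push the two
   * required fields into this loader, which then narrows the Avro reader
   * schema to just those fields in {@code pushProjection}.</p>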
   */
  protected RequiredFieldList requiredFieldList;

  /*
   * @see
   * org.apache.pig.LoadPushDown#pushProjection(org.apache.pig.LoadPushDown.
   * RequiredFieldList)
   */
  @Override
  public RequiredFieldResponse pushProjection(final RequiredFieldList rfl)
      throws FrontendException {
    requiredFieldList = rfl;

    Schema newSchema = AvroStorageSchemaConversionUtilities
        .newSchemaFromRequiredFieldList(schema, rfl);
    if (newSchema != null) {
      schema = newSchema;
      setInputAvroSchema(schema);
      return new RequiredFieldResponse(true);
    } else {
      log.warn("could not select fields subset " + rfl + "\n");
      warn("could not select fields subset", PigWarning.UDF_WARNING_2);
      return new RequiredFieldResponse(false);
    }
  }
}