/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.zebra.pig;
import java.io.IOException;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.zebra.mapreduce.BasicTableOutputFormat;
import org.apache.hadoop.zebra.mapreduce.ZebraOutputPartition;
import org.apache.hadoop.zebra.mapreduce.ZebraSchema;
import org.apache.hadoop.zebra.mapreduce.ZebraSortInfo;
import org.apache.hadoop.zebra.mapreduce.ZebraStorageHint;
import org.apache.hadoop.zebra.parser.ParseException;
import org.apache.hadoop.zebra.types.ZebraConf;
import org.apache.pig.LoadFunc;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreMetadata;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.UDFContext;
/**
 * Pig {@link StoreFunc} implementation that writes tuples to Zebra tables
 * through {@link BasicTableOutputFormat}.
 *
 * <p>The output schema and sort columns are derived in
 * {@link #checkSchema(ResourceSchema)}, stashed in the {@link UDFContext}
 * properties keyed by this store function's signature, and copied into the job
 * configuration in {@link #setStoreLocation(String, Job)}.
 */
public class TableStorer extends StoreFunc implements StoreMetadata {
  private static final Log LOG = LogFactory.getLog(TableStorer.class);

  // Keys under which checkSchema() hands its results to setStoreLocation().
  private static final String UDFCONTEXT_OUTPUT_SCHEMA = "zebra.UDFContext.outputSchema";
  private static final String UDFCONTEXT_SORT_INFO = "zebra.UDFContext.sortInfo";
  private static final String UDFCONTEXT_OUTPUT_CHECKTYPE = "zebra.UDFContext.checkType";

  private String storageHintString = null;
  private String udfContextSignature = null;
  private RecordWriter<BytesWritable, Tuple> tableRecordWriter = null;
  private String partitionClassString = null;
  // Resolved lazily from partitionClassString the first time multiple output
  // locations are configured. Kept package-private as in the original API.
  Class<? extends ZebraOutputPartition> partitionClass = null;
  private String partitionClassArgumentsString = null;

  /** Creates a storer with no storage hint and no output partition class. */
  public TableStorer() {
  }

  /**
   * @param storageHintString storage hint passed to
   *        {@link ZebraStorageHint#createZebraStorageHint(String)}
   */
  public TableStorer(String storageHintString) {
    this.storageHintString = storageHintString;
  }

  /**
   * @param storageHintString storage hint passed to
   *        {@link ZebraStorageHint#createZebraStorageHint(String)}
   * @param partitionClassString name of the {@link ZebraOutputPartition} class
   *        used when the store location lists more than one path
   */
  public TableStorer(String storageHintString, String partitionClassString) {
    this.storageHintString = storageHintString;
    this.partitionClassString = partitionClassString;
  }

  /**
   * @param storageHintString storage hint passed to
   *        {@link ZebraStorageHint#createZebraStorageHint(String)}
   * @param partitionClassString name of the {@link ZebraOutputPartition} class
   *        used when the store location lists more than one path
   * @param partitionClassArgumentsString arguments string forwarded together
   *        with the partition class to the output format
   */
  public TableStorer(String storageHintString, String partitionClassString,
      String partitionClassArgumentsString) {
    this.storageHintString = storageHintString;
    this.partitionClassString = partitionClassString;
    this.partitionClassArgumentsString = partitionClassArgumentsString;
  }

  /**
   * Writes one tuple through the record writer supplied to
   * {@link #prepareToWrite(RecordWriter)}; the key is always {@code null}.
   *
   * @param tuple the tuple to store
   * @throws IOException if the writer fails, or if the write is interrupted
   *         (the interrupt status is restored and the cause is chained)
   */
  @Override
  public void putNext(Tuple tuple) throws IOException {
    try {
      tableRecordWriter.write(null, tuple);
    } catch (InterruptedException e) {
      // Restore the flag so code further up the stack can still observe the
      // interruption, and keep the original exception as the cause.
      Thread.currentThread().interrupt();
      throw new IOException(e.getMessage(), e);
    }
  }

  /**
   * Validates the schema and its sort information, converts it to a Zebra
   * schema, and records the results in the UDF context properties.
   *
   * @param schema the schema of the data to be stored
   * @throws IOException if a sort column has no name or is not of an atomic
   *         type, or if the schema cannot be converted to a Zebra schema
   */
  @Override
  public void checkSchema(ResourceSchema schema) throws IOException {
    // A descending sort key disables sorting entirely: the sort column list
    // stays empty and the schema's sort keys are not inspected.
    String sortColumnNames = hasDescendingSortKey(schema) ? "" : buildSortColumnNames(schema);

    // Convert resource schema to zebra schema
    org.apache.hadoop.zebra.schema.Schema zebraSchema;
    try {
      zebraSchema = SchemaConverter.convertFromResourceSchema(schema);
    } catch (ParseException ex) {
      throw new IOException("Exception thrown from SchemaConverter: " + ex.getMessage(), ex);
    }

    Properties properties = UDFContext.getUDFContext().getUDFProperties(
        this.getClass(), new String[]{ udfContextSignature });
    properties.setProperty(UDFCONTEXT_OUTPUT_SCHEMA, zebraSchema.toString());
    properties.setProperty(UDFCONTEXT_SORT_INFO, sortColumnNames);

    // This is to turn off type check for potential corner cases - for internal use only;
    if ("no".equals(System.getenv("zebra_output_checktype"))) {
      properties.setProperty(UDFCONTEXT_OUTPUT_CHECKTYPE, "no");
    }
  }

  /** Returns true, after logging a warning, if any sort key order of the schema is descending. */
  private static boolean hasDescendingSortKey(ResourceSchema schema) {
    for (ResourceSchema.Order order : schema.getSortKeyOrders()) {
      if (order == ResourceSchema.Order.DESCENDING) {
        LOG.warn("Sorting in descending order is not supported by Zebra and the table will be unsorted.");
        return true;
      }
    }
    return false;
  }

  /** Validates the schema's sort key fields and joins their names with commas. */
  private static String buildSortColumnNames(ResourceSchema schema) throws IOException {
    ResourceFieldSchema[] fields = schema.getFields();
    int[] sortKeys = schema.getSortKeys();
    StringBuilder names = new StringBuilder();
    for (int i = 0; i < sortKeys.length; i++) {
      ResourceFieldSchema field = fields[sortKeys[i]];
      String name = field.getName();
      if (name == null) {
        throw new IOException("Zebra does not support column positional reference yet");
      }
      if (!org.apache.pig.data.DataType.isAtomic(field.getType())) {
        throw new IOException("Field [" + name + "] is not of simple type as required for a sort column now.");
      }
      if (i > 0) {
        names.append(",");
      }
      names.append(name);
    }
    return names.toString();
  }

  /**
   * @return a new {@link BasicTableOutputFormat}
   */
  @SuppressWarnings("unchecked")
  @Override
  public org.apache.hadoop.mapreduce.OutputFormat getOutputFormat()
      throws IOException {
    return new BasicTableOutputFormat();
  }

  /**
   * Remembers the record writer that {@link #putNext(Tuple)} writes to.
   *
   * @param writer the record writer to use
   * @throws IOException if {@code writer} is {@code null}
   */
  @SuppressWarnings("unchecked")
  @Override
  public void prepareToWrite(RecordWriter writer)
      throws IOException {
    tableRecordWriter = writer;
    if (tableRecordWriter == null) {
      throw new IOException("Invalid type of writer. Expected type: TableRecordWriter.");
    }
  }

  /**
   * Resolves the store location against the current directory by delegating
   * to {@link LoadFunc#getAbsolutePath(String, Path)}.
   */
  @Override
  public String relToAbsPathForStoreLocation(String location, Path curDir)
      throws IOException {
    return LoadFunc.getAbsolutePath(location, curDir);
  }

  /**
   * Configures the job's output path(s) and copies the schema, sort and
   * storage information recorded by {@link #checkSchema(ResourceSchema)} into
   * the job.
   *
   * @param location a single output path, or several paths separated by commas
   * @param job the job to configure
   * @throws IOException if the location yields no path, if multiple paths are
   *         given but no partition class name is available or the class cannot
   *         be found, or if the storage information is rejected
   */
  @Override
  public void setStoreLocation(String location, Job job) throws IOException {
    Configuration conf = job.getConfiguration();

    String[] outputs = location.split(",");
    if (outputs.length == 1) {
      BasicTableOutputFormat.setOutputPath(job, new Path(location));
    } else if (outputs.length > 1) {
      if (partitionClass == null) {
        partitionClass = loadPartitionClass(conf);
      }
      Path[] paths = new Path[outputs.length];
      for (int i = 0; i < paths.length; i++) {
        paths[i] = new Path(outputs[i]);
      }
      BasicTableOutputFormat.setMultipleOutputs(job, partitionClass, partitionClassArgumentsString, paths);
    } else {
      // split() drops trailing empty strings, so a location made only of
      // commas ends up here with zero paths.
      throw new IOException("Invalid location : " + location);
    }

    // Get schema string and sorting info from UDFContext and re-store them to
    // job config.
    Properties properties = UDFContext.getUDFContext().getUDFProperties(
        this.getClass(), new String[]{ udfContextSignature });
    ZebraSchema zSchema = ZebraSchema.createZebraSchema(properties.getProperty(UDFCONTEXT_OUTPUT_SCHEMA));
    ZebraSortInfo zSortInfo = ZebraSortInfo.createZebraSortInfo(properties.getProperty(UDFCONTEXT_SORT_INFO), null);
    ZebraStorageHint zStorageHint = ZebraStorageHint.createZebraStorageHint(storageHintString);
    try {
      BasicTableOutputFormat.setStorageInfo(job, zSchema, zStorageHint, zSortInfo);
    } catch (ParseException e) {
      throw new IOException("Invalid storage info: " + e.getMessage(), e);
    }

    // Get checktype information from UDFContext and re-store it to job config;
    if ("no".equals(properties.getProperty(UDFCONTEXT_OUTPUT_CHECKTYPE))) {
      ZebraConf.setCheckType(conf, false);
    }
  }

  /**
   * Loads the output partition class named by {@code partitionClassString}
   * through the job configuration.
   */
  @SuppressWarnings("unchecked") // The loaded class is not verified here; the cast is unchecked.
  private Class<? extends ZebraOutputPartition> loadPartitionClass(Configuration conf)
      throws IOException {
    if (partitionClassString == null) {
      // Fail with a clear message instead of a NullPointerException from the
      // class lookup.
      throw new IOException(
          "Multiple output locations require an output partition class name, but none was given to TableStorer");
    }
    try {
      return (Class<? extends ZebraOutputPartition>) conf.getClassByName(partitionClassString);
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
  }

  /**
   * Closes the table output for the job; the schema and location arguments
   * are not used.
   */
  @Override
  public void storeSchema(ResourceSchema schema, String location, Job job)
      throws IOException {
    //TODO: This is temporary - we will do close at cleanupJob() when OutputCommitter is ready.
    BasicTableOutputFormat.close(job);
  }

  /**
   * Records the signature used to look up this store function's UDF context
   * properties.
   */
  @Override
  public void setStoreFuncUDFContextSignature(String signature) {
    udfContextSignature = signature;
  }

  /** Statistics are not stored; this method does nothing. */
  @Override
  public void storeStatistics(ResourceStatistics stats, String location,
      Job job) throws IOException {
    // no-op
  }
}