TableStorer.java example

Explorer
spork-streaming-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.zebra.pig;

import java.io.IOException;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.zebra.mapreduce.BasicTableOutputFormat;
import org.apache.hadoop.zebra.mapreduce.ZebraOutputPartition;
import org.apache.hadoop.zebra.mapreduce.ZebraSchema;
import org.apache.hadoop.zebra.mapreduce.ZebraSortInfo;
import org.apache.hadoop.zebra.mapreduce.ZebraStorageHint;
import org.apache.hadoop.zebra.parser.ParseException;
import org.apache.hadoop.zebra.types.ZebraConf;
import org.apache.pig.LoadFunc;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreMetadata;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.UDFContext;

/**
 * Pig {@link StoreFunc} implementation for writing Zebra tables.
 *
 * <p>{@link #checkSchema(ResourceSchema)} converts the Pig schema and sort
 * keys into Zebra form and stashes them in the {@link UDFContext} properties
 * keyed by this store function's signature;
 * {@link #setStoreLocation(String, Job)} reads them back and applies them,
 * together with the storage hint and output path(s), to the job.
 */
public class TableStorer extends StoreFunc implements StoreMetadata {
    private static final Log LOG = LogFactory.getLog(TableStorer.class);

    private static final String UDFCONTEXT_OUTPUT_SCHEMA = "zebra.UDFContext.outputSchema";
    private static final String UDFCONTEXT_SORT_INFO = "zebra.UDFContext.sortInfo";
    private static final String UDFCONTEXT_OUTPUT_CHECKTYPE = "zebra.UDFContext.checkType";

    /** Value of the check-type switch that disables output type checking. */
    private static final String CHECKTYPE_DISABLED = "no";

    private String storageHintString = null;
    private String udfContextSignature = null;
    private RecordWriter<BytesWritable, Tuple> tableRecordWriter = null;
    private String partitionClassString = null;
    Class<? extends ZebraOutputPartition> partitionClass = null;
    private String partitionClassArgumentsString = null;

    /** Creates a storer with no storage hint and no output partition class. */
    public TableStorer() {
    }

    /**
     * Creates a storer with the given storage hint.
     *
     * @param storageHintString storage hint passed to
     *        {@link ZebraStorageHint#createZebraStorageHint(String)}
     */
    public TableStorer(String storageHintString) {
      this( storageHintString, null, null );
    }

    /**
     * Creates a storer with a storage hint and an output partition class.
     *
     * @param storageHintString storage hint passed to
     *        {@link ZebraStorageHint#createZebraStorageHint(String)}
     * @param partitionClassString name of the {@link ZebraOutputPartition}
     *        class, loaded when the store location names several outputs
     */
    public TableStorer(String storageHintString, String partitionClassString) {
      this( storageHintString, partitionClassString, null );
    }

    /**
     * Creates a storer with a storage hint, an output partition class and the
     * arguments string handed to that partition class.
     *
     * @param storageHintString storage hint passed to
     *        {@link ZebraStorageHint#createZebraStorageHint(String)}
     * @param partitionClassString name of the {@link ZebraOutputPartition}
     *        class, loaded when the store location names several outputs
     * @param partitionClassArgumentsString arguments string forwarded to
     *        {@code BasicTableOutputFormat.setMultipleOutputs}
     */
    public TableStorer(String storageHintString, String partitionClassString, String partitionClassArgumentsString) {
      this.storageHintString = storageHintString;
      this.partitionClassString = partitionClassString;
      this.partitionClassArgumentsString = partitionClassArgumentsString;
    }


    /**
     * Writes one tuple through the record writer supplied to
     * {@link #prepareToWrite(RecordWriter)}; the key is always {@code null}.
     *
     * @param tuple the tuple to write
     * @throws IOException if the writer fails or the write is interrupted
     */
    @Override
    public void putNext(Tuple tuple) throws IOException {
      try {
        tableRecordWriter.write( null, tuple );
      } catch (InterruptedException e) {
        // Restore the interrupt flag so code further up the stack can still
        // observe the interruption, and keep the cause for diagnosis.
        Thread.currentThread().interrupt();
        throw new IOException( "Interrupted while writing tuple to Zebra table", e );
      }
    }

    /**
     * Validates the schema and sort keys, converts the schema to a Zebra
     * schema, and records the schema string, the comma-separated sort column
     * names and (optionally) the check-type switch in the UDFContext
     * properties for this store function's signature.
     *
     * <p>If any sort key is descending, a warning is logged and the sort
     * column list is left empty.
     *
     * @param schema the schema of the data to be stored
     * @throws IOException if a sort column has no name or is not of an atomic
     *         type, or if the schema cannot be converted to a Zebra schema
     */
    @Override
    public void checkSchema(ResourceSchema schema) throws IOException {
      String sortColumnNames = "";
      if( hasDescendingSortKey( schema ) ) {
        LOG.warn("Sorting in descending order is not supported by Zebra and the table will be unsorted.");
      } else {
        sortColumnNames = buildSortColumnNames( schema );
      }

      // Convert resource schema to zebra schema
      org.apache.hadoop.zebra.schema.Schema zebraSchema;
      try {
          zebraSchema = SchemaConverter.convertFromResourceSchema( schema );
      } catch (ParseException ex) {
          throw new IOException( "Exception thrown from SchemaConverter: " + ex.getMessage(), ex );
      }

      Properties properties = UDFContext.getUDFContext().getUDFProperties(
              this.getClass(), new String[]{ udfContextSignature } );
      properties.setProperty( UDFCONTEXT_OUTPUT_SCHEMA, zebraSchema.toString() );
      properties.setProperty( UDFCONTEXT_SORT_INFO, sortColumnNames );

      // This is to turn off type check for potential corner cases - for internal use only;
      if( CHECKTYPE_DISABLED.equals( System.getenv("zebra_output_checktype") ) ) {
        properties.setProperty( UDFCONTEXT_OUTPUT_CHECKTYPE, CHECKTYPE_DISABLED );
      }
    }

    /** Returns true if any sort key order in the schema is DESCENDING. */
    private static boolean hasDescendingSortKey(ResourceSchema schema) {
      ResourceSchema.Order[] orders = schema.getSortKeyOrders();
      if( orders == null ) {
        return false;
      }
      for( ResourceSchema.Order order : orders ) {
        if( order == ResourceSchema.Order.DESCENDING ) {
          return true;
        }
      }
      return false;
    }

    /**
     * Joins the names of the schema's sort key fields with commas, rejecting
     * unnamed or non-atomic sort columns.
     */
    private static String buildSortColumnNames(ResourceSchema schema) throws IOException {
      int[] sortKeys = schema.getSortKeys();
      if( sortKeys == null || sortKeys.length == 0 ) {
        return "";
      }

      ResourceFieldSchema[] fields = schema.getFields();
      StringBuilder names = new StringBuilder();
      for( int i = 0; i < sortKeys.length; i++ ) {
        ResourceFieldSchema field = fields[sortKeys[i]];
        String name = field.getName();
        if( name == null ) {
          throw new IOException("Zebra does not support column positional reference yet");
        }
        if( !org.apache.pig.data.DataType.isAtomic( field.getType() ) ) {
          throw new IOException( "Field [" + name + "] is not of simple type as required for a sort column now." );
        }
        if( i > 0 ) {
          names.append( "," );
        }
        names.append( name );
      }
      return names.toString();
    }

    /**
     * Returns a new {@link BasicTableOutputFormat}.
     *
     * @return the output format used to write the Zebra table
     * @throws IOException declared by the overridden method; not thrown here
     */
    @SuppressWarnings("unchecked")
    @Override
    public org.apache.hadoop.mapreduce.OutputFormat getOutputFormat()
    throws IOException {
      return new BasicTableOutputFormat();
    }

    /**
     * Remembers the record writer that {@link #putNext(Tuple)} writes to.
     *
     * @param writer the record writer to use
     * @throws IOException if {@code writer} is {@code null}
     */
    @SuppressWarnings("unchecked")
    @Override
    public void prepareToWrite(RecordWriter writer)
    throws IOException {
      if( writer == null ) {
          // Clear any previously held writer, matching the state the original
          // assignment-then-check left behind.
          tableRecordWriter = null;
          throw new IOException( "Invalid type of writer. Expected type: TableRecordWriter." );
      }
      tableRecordWriter = writer;
    }

    /**
     * Resolves the store location against the current directory by delegating
     * to {@link LoadFunc#getAbsolutePath(String, Path)}.
     *
     * @param location the location as given in the script
     * @param curDir the current working directory
     * @return the resolved location
     * @throws IOException if the location cannot be resolved
     */
    @Override
    public String relToAbsPathForStoreLocation(String location, Path curDir)
    throws IOException {
      return LoadFunc.getAbsolutePath( location, curDir );
    }

    /**
     * Configures the job's output path(s) and Zebra storage info.
     *
     * <p>The location is split on commas. If that yields exactly one element,
     * the whole location string is used as the single output path. If it
     * yields several, each becomes an output path and the configured
     * partition class decides which output a row goes to. The schema and sort
     * info stored by {@link #checkSchema(ResourceSchema)} are then read back
     * from the UDFContext and applied to the job, along with the storage hint
     * and the check-type switch.
     *
     * @param location the store location, possibly comma-separated
     * @param job the job being configured
     * @throws IOException if the location is invalid, the partition class is
     *         missing, cannot be loaded or is not a
     *         {@link ZebraOutputPartition}, or the storage info is rejected
     */
    @Override
    public void setStoreLocation(String location, Job job) throws IOException {
      Configuration conf = job.getConfiguration();

      String[] outputs = location.split(",");

      if (outputs.length == 1) {
        BasicTableOutputFormat.setOutputPath(job, new Path(location));
      } else if (outputs.length > 1) {
        if (partitionClass == null) {
          partitionClass = loadPartitionClass( conf );
        }

        Path[] paths = new Path[outputs.length];
        for (int i = 0; i < paths.length; i++) {
          paths[i] = new Path(outputs[i]);
        }

        BasicTableOutputFormat.setMultipleOutputs(job, partitionClass, partitionClassArgumentsString, paths);
      } else {
        throw new IOException( "Invalid location : " + location);
      }

      // Get schema string and sorting info from UDFContext and re-store them to
      // job config.
      Properties properties = UDFContext.getUDFContext().getUDFProperties(
              this.getClass(), new String[]{ udfContextSignature } );
      ZebraSchema zSchema = ZebraSchema.createZebraSchema(properties.getProperty(UDFCONTEXT_OUTPUT_SCHEMA));
      ZebraSortInfo zSortInfo = ZebraSortInfo.createZebraSortInfo(properties.getProperty(UDFCONTEXT_SORT_INFO), null);
      ZebraStorageHint zStorageHint = ZebraStorageHint.createZebraStorageHint(storageHintString);
      try {
        BasicTableOutputFormat.setStorageInfo(job, zSchema, zStorageHint, zSortInfo);
      } catch (ParseException e) {
        throw new IOException( "Invalid storage info: " + e.getMessage(), e );
      }

      // Get checktype information from UDFContext and re-store it to job config;
      if( CHECKTYPE_DISABLED.equals( properties.getProperty(UDFCONTEXT_OUTPUT_CHECKTYPE) ) ) {
        ZebraConf.setCheckType(conf, false);
      }
    }

    /**
     * Loads the configured partition class by name and verifies that it is a
     * {@link ZebraOutputPartition}.
     */
    private Class<? extends ZebraOutputPartition> loadPartitionClass(Configuration conf)
    throws IOException {
      if( partitionClassString == null ) {
        throw new IOException(
            "Multiple output locations require an output partition class, but none was specified." );
      }

      Class<?> loaded;
      try {
        loaded = conf.getClassByName( partitionClassString );
      } catch (ClassNotFoundException e) {
        throw new IOException(e);
      }

      // Check the type up front instead of relying on an unchecked cast.
      if( !ZebraOutputPartition.class.isAssignableFrom( loaded ) ) {
        throw new IOException( "Partition class " + partitionClassString
            + " is not a subclass of " + ZebraOutputPartition.class.getName() );
      }
      return loaded.asSubclass( ZebraOutputPartition.class );
    }

    /**
     * Closes the Zebra output for the job; the schema argument is not used.
     *
     * @param schema unused
     * @param location unused
     * @param job the job whose output is closed
     * @throws IOException if closing the output fails
     */
    @Override
    public void storeSchema(ResourceSchema schema, String location, Job job)
    throws IOException {
      //TODO: This is temporary - we will do close at cleanupJob() when OutputCommitter is ready.
      BasicTableOutputFormat.close(job);
    }

    /**
     * Records the signature used as the key for this instance's UDFContext
     * properties.
     *
     * @param signature the signature for this store function instance
     */
    @Override
    public void setStoreFuncUDFContextSignature(String signature) {
      udfContextSignature = signature;
    }

    /**
     * Does nothing; statistics are not stored.
     *
     * @param stats unused
     * @param location unused
     * @param job unused
     * @throws IOException declared by the interface; not thrown here
     */
    @Override
    public void storeStatistics(ResourceStatistics stats, String location,
            Job job) throws IOException {
      // no-op
    }
}