/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership.  The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.pig.piggybank.storage;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe;
import org.apache.hadoop.hive.serde2.columnar.ColumnarStruct;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.Expression;
import org.apache.pig.FileInputLoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.LoadPushDown;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.piggybank.storage.hiverc.HiveRCInputFormat;
import org.apache.pig.piggybank.storage.hiverc.HiveRCRecordReader;
import org.apache.pig.piggybank.storage.hiverc.HiveRCSchemaUtil;
import org.apache.pig.piggybank.storage.partition.PathPartitionHelper;

/**
 * Loader for Hive RC Columnar files.<br/>
 * Supports the following types:<br/>
 *
 * <table>
 * <tr>
 * <th>Hive Type</th>
 * <th>Pig Type from DataType</th>
 * </tr>
 * <tr>
 * <td>string</td>
 * <td>CHARARRAY</td>
 * </tr>
 * <tr>
 * <td>int</td>
 * <td>INTEGER</td>
 * </tr>
 * <tr>
 * <td>bigint or long</td>
 * <td>LONG</td>
 * </tr>
 * <tr>
 * <td>float</td>
 * <td>FLOAT</td>
 * </tr>
 * <tr>
 * <td>double</td>
 * <td>DOUBLE</td>
 * </tr>
 * <tr>
 * <td>boolean</td>
 * <td>BOOLEAN</td>
 * </tr>
 * <tr>
 * <td>byte</td>
 * <td>BYTE</td>
 * </tr>
 * <tr>
 * <td>array</td>
 * <td>TUPLE</td>
 * </tr>
 * <tr>
 * <td>map</td>
 * <td>MAP</td>
 * </tr>
 * </table>
 *
 * <p/>
 * <b>Partitions</b><br/>
 * The input paths are scanned by the loader for [partition name]=[value]
 * patterns in the subdirectories.<br/>
 * If detected, these partitions are appended to the table schema.<br/>
 * For example, if you have the directory structure:
 *
 * <pre>
 * /user/hive/warehouse/mytable
 * /year=2010/month=02/day=01
 * </pre>
 *
 * The mytable schema is (id int,name string).<br/>
 * The final schema returned in Pig will be (id:int, name:chararray,
 * year:chararray, month:chararray, day:chararray).<br/>
 * <p/>
 * Usage 1:
 * <p/>
 * To load a hive table: uid bigint, ts long, arr ARRAY<string,string>, m
 * MAP<String, String><br/>
 * <code>
 * <pre>
 * a = LOAD 'file' USING HiveColumnarLoader("uid bigint, ts long, arr array<string,string>, m map<string,string>");
 * -- to reference the fields
 * b = FOREACH a GENERATE uid, ts, arr, m;
 * </pre>
 * </code>
 * <p/>
 * Usage 2:
 * <p/>
 * To load a hive table: uid bigint, ts long, arr ARRAY<string,string>, m
 * MAP<String, String>, only processing dates 2009-10-01 to 2009-10-02 in a
 * date-partitioned hive table.<br/>
 * <b>Old Usage</b><br/>
 * <b>Note:</b> The partitions can now be filtered by using Pig's FILTER operator.<br/>
 * <code>
 * <pre>
 * a = LOAD 'file' USING HiveColumnarLoader("uid bigint, ts long, arr array<string,string>, m map<string,string>", "2009-10-01:2009-10-02");
 * -- to reference the fields
 * b = FOREACH a GENERATE uid, ts, arr, m;
 * </pre>
 * </code>
 * <br/>
 * <b>New Usage</b><br/>
 * <code>
 * <pre>
 * a = LOAD 'file' USING HiveColumnarLoader("uid bigint, ts long, arr array<string,string>, m map<string,string>");
 * f = FILTER a BY daydate >= '2009-10-01' AND daydate <= '2009-10-02';
 * </pre>
 * </code>
 * <p/>
 * Usage 3:
 * <p/>
 * To load a hive table: uid bigint, ts long, arr ARRAY<string,string>, m
 * MAP<String, String>, only reading columns uid and ts for dates 2009-10-01 to
 * 2009-10-02.<br/>
 * <b>Old Usage</b><br/>
 * <b>Note:</b> This behaviour is now supported in Pig by LoadPushDown: explicitly
 * listing the columns to load in the constructor is ignored, and Pig will
 * automatically send the columns used by the script to the loader.<br/>
 * <code>
 * <pre>
 * a = LOAD 'file' USING HiveColumnarLoader("uid bigint, ts long, arr array<string,string>, m map<string,string>");
 * f = FILTER a BY daydate >= '2009-10-01' AND daydate <= '2009-10-02';
 * -- to reference the fields
 * b = FOREACH f GENERATE uid, ts;
 * </pre>
 * </code>
 * <p/>
 * <b>Issues</b>
 * <p/>
 * <u>Table schema definition</u><br/>
 * The schema definition must be a column name followed by a space and the
 * column type, then a comma with no space before the next column name, and so
 * on.<br/>
 * So "column1 string, column2 string" will not work; it must be
 * "column1 string,column2 string".
 * <p/>
 * <u>Partitioning</u><br/>
 * Partitions must be in the format [partition name]=[partition value]<br/>
 * Only strings are supported in the partitioning.<br/>
 * Partitions must follow the same naming convention for all sub directories in
 * a table.<br/>
 * For example, the following is not valid:<br/>
 *
 * <pre>
 * mytable/hour=00
 * mytable/day=01/hour=00
 * </pre>
 *
 **/
public class HiveColumnarLoader extends FileInputLoadFunc implements
        LoadMetadata, LoadPushDown {

    public static final String PROJECTION_ID = HiveColumnarLoader.class
            .getName() + ".projection";

    public static final String DATE_RANGE = HiveColumnarLoader.class.getName()
            + ".date-range";

    /**
     * Regex to filter out column names
     */
    protected static final Pattern pcols = Pattern.compile("[a-zA-Z_0-9]*[ ]");

    protected static final Log LOG = LogFactory
            .getLog(HiveColumnarLoader.class);

    protected TupleFactory tupleFactory = TupleFactory.getInstance();

    String signature = "";
    // we need to save the dateRange from the constructor, if provided, to add
    // to the UDFContext only when the signature is available.
    String dateRange = null;

    HiveRCRecordReader reader;
    ColumnarSerDe serde = null;
    Configuration conf = null;

    ResourceSchema pigSchema;
    boolean partitionKeysSet = false;

    BytesRefArrayWritable buff = null;

    private Properties props;
    private HiveConf hiveConf;

    transient int[] requiredColumns;
    transient Set<String> partitionColumns;

    /**
     * Implements the logic for searching partition keys and applying partition
     * filtering
     */
    transient PathPartitionHelper pathPartitionerHelper = new PathPartitionHelper();

    transient Path currentPath = null;
    transient Map<String, String> currentPathPartitionKeyMap;

    /**
     * Table schema should be a space and comma separated string describing the
     * Hive schema.<br/>
     * For example "uid BIGINT, pid long" means one column uid of type BIGINT
     * and one column pid of type LONG.<br/>
     * The types are not case sensitive.
     *
     * @param table_schema
     *            This property cannot be null
     */
    public HiveColumnarLoader(String table_schema) {
        setup(table_schema);
    }

    /**
     * This constructor is for backward compatibility.
     *
     * Table schema should be a space and comma separated string describing the
     * Hive schema.<br/>
     * For example "uid BIGINT, pid long" means one column uid of type BIGINT
     * and one column pid of type LONG.<br/>
     * The types are not case sensitive.
     *
     * @param table_schema
     *            This property cannot be null
     * @param dateRange
     *            String
     * @param columns
     *            String not used any more
     */
    public HiveColumnarLoader(String table_schema, String dateRange,
            String columns) {
        setup(table_schema);
        this.dateRange = dateRange;
    }

    /**
     * This constructor is for backward compatibility.
     *
     * Table schema should be a space and comma separated string describing the
     * Hive schema.<br/>
     * For example "uid BIGINT, pid long" means one column uid of type BIGINT
     * and one column pid of type LONG.<br/>
     * The types are not case sensitive.
     *
     * @param table_schema
     *            This property cannot be null
     * @param dateRange
     *            String
     */
    public HiveColumnarLoader(String table_schema, String dateRange) {
        setup(table_schema);
        this.dateRange = dateRange;
    }

    private Properties getUDFContext() {
        return UDFContext.getUDFContext().getUDFProperties(this.getClass(),
                new String[] { signature });
    }
    @Override
    public InputFormat<LongWritable, BytesRefArrayWritable> getInputFormat()
            throws IOException {
        LOG.info("Signature: " + signature);
        return new HiveRCInputFormat(signature);
    }

    @Override
    public Tuple getNext() throws IOException {
        Tuple tuple = null;

        try {
            if (reader.nextKeyValue()) {

                BytesRefArrayWritable buff = reader.getCurrentValue();
                ColumnarStruct struct = readColumnarStruct(buff);

                tuple = readColumnarTuple(struct, reader.getSplitPath());
            }

        } catch (InterruptedException e) {
            throw new IOException(e.toString(), e);
        }

        return tuple;
    }
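
    // Illustration (not part of the loader): a sketch of how the pushed-down
    // projection reaches prepareToRead() below, assuming the example schema
    // from the class Javadoc.
    //
    //   a = LOAD 'file' USING HiveColumnarLoader(
    //           "uid bigint, ts long, arr array<string,string>, m map<string,string>");
    //   b = FOREACH a GENERATE uid, ts;   -- only columns 0 and 1 are used
    //
    // Pig calls pushProjection() with fields 0 and 1, which stores "0,1" under
    // the PROJECTION_ID key in the UDFContext; getRequiredColumns() parses that
    // back into the index array {0, 1}. If no projection was pushed,
    // prepareToRead() falls back to reading every table column plus any
    // partition columns.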
    @Override
    public void prepareToRead(
            @SuppressWarnings("rawtypes") RecordReader reader, PigSplit split)
            throws IOException {

        this.reader = (HiveRCRecordReader) reader;

        // check that the required indexes actually exist i.e. the columns that
        // should be read.
        // assuming this is always defined simplifies the readColumnarTuple
        // logic.

        int requiredIndexes[] = getRequiredColumns();
        if (requiredIndexes == null) {

            int fieldLen = pigSchema.getFields().length;

            // the partition keys, if any, should already exist
            String[] partitionKeys = getPartitionKeys(null, null);
            if (partitionKeys != null) {
                fieldLen += partitionKeys.length;
            }

            requiredIndexes = new int[fieldLen];

            for (int i = 0; i < fieldLen; i++) {
                requiredIndexes[i] = i;
            }

            this.requiredColumns = requiredIndexes;
        }

        try {
            serde = new ColumnarSerDe();
            serde.initialize(hiveConf, props);
        } catch (SerDeException e) {
            LOG.error(e.toString(), e);
            throw new IOException(e);
        }
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, location);
    }

    /**
     * Does the configuration setup and schema parsing.
     *
     * @param table_schema
     *            String
     */
    private void setup(String table_schema) {

        if (table_schema == null)
            throw new RuntimeException(
                    "The table schema must be defined as colname type, colname type. All types are hive types");

        // create basic configuration for hdfs and hive
        conf = new Configuration();
        hiveConf = new HiveConf(conf, SessionState.class);

        // parse the table_schema string
        List<String> types = HiveRCSchemaUtil.parseSchemaTypes(table_schema);
        List<String> cols = HiveRCSchemaUtil.parseSchema(pcols, table_schema);

        List<FieldSchema> fieldSchemaList = new ArrayList<FieldSchema>(
                cols.size());

        for (int i = 0; i < cols.size(); i++) {
            fieldSchemaList.add(new FieldSchema(cols.get(i), HiveRCSchemaUtil
                    .findPigDataType(types.get(i))));
        }

        pigSchema = new ResourceSchema(new Schema(fieldSchemaList));

        props = new Properties();

        // setting table schema properties for ColumnarSerDe
        // these properties are never changed by the columns to read filter,
        // because the columnar serde needs to know the
        // complete format of each record.
        props.setProperty(Constants.LIST_COLUMNS,
                HiveRCSchemaUtil.listToString(cols));
        props.setProperty(Constants.LIST_COLUMN_TYPES,
                HiveRCSchemaUtil.listToString(types));
    }
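
    // Illustration (not part of the loader): a sketch of what setup() derives
    // from a schema string, using the example schema from the class Javadoc.
    // The exact parsing is delegated to HiveRCSchemaUtil as called above.
    //
    //   table_schema = "uid bigint, ts long, arr array<string,string>, m map<string,string>"
    //
    //   cols      -> [uid, ts, arr, m]
    //   types     -> [bigint, long, array<string,string>, map<string,string>]
    //   pigSchema -> uid:LONG, ts:LONG, arr:TUPLE, m:MAP
    //                (per the Hive-to-Pig mapping in the class Javadoc)
    //
    // The full column list and type list are also handed to the ColumnarSerDe
    // via the two properties set above, so the serde always sees the complete
    // row layout even when only some columns are projected.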
    /**
     * Uses the ColumnarSerde to deserialize the buff:BytesRefArrayWritable into
     * a ColumnarStruct instance.
     *
     * @param buff
     *            BytesRefArrayWritable
     * @return ColumnarStruct
     */
    private ColumnarStruct readColumnarStruct(BytesRefArrayWritable buff) {
        // use ColumnarSerDe to deserialize row
        ColumnarStruct struct = null;
        try {
            struct = (ColumnarStruct) serde.deserialize(buff);
        } catch (SerDeException e) {
            LOG.error(e.toString(), e);
            throw new RuntimeException(e.toString(), e);
        }

        return struct;
    }

    /**
     * Reads only the columns that were requested via projection push down.<br/>
     *
     * @param struct
     *            ColumnarStruct
     * @param path
     *            Path
     * @return Tuple
     * @throws IOException
     */
    private Tuple readColumnarTuple(ColumnarStruct struct, Path path)
            throws IOException {

        int[] columnIndexes = getRequiredColumns();

        // the partition keys if any will already be in the UDFContext here.
        String[] partitionKeys = getPartitionKeys(null, null);

        // only re-read the partition key values if the path has changed
        if (currentPath == null || !currentPath.equals(path)) {
            currentPathPartitionKeyMap = (partitionKeys == null) ? null
                    : pathPartitionerHelper.getPathPartitionKeyValues(path
                            .toString());
            currentPath = path;
        }

        // if partitionColumns is null this value will stop the for loop
        // below from trying to add any partition columns
        // that do not exist
        int partitionColumnStartIndex = Integer.MAX_VALUE;

        if (!(partitionColumns == null || partitionColumns.size() == 0)) {
            // partition columns are always appended to the schema fields.
            partitionColumnStartIndex = pigSchema.getFields().length;
        }

        // create tuple with the previously determined size
        Tuple t = tupleFactory.newTuple(columnIndexes.length);

        // read in all columns
        for (int i = 0; i < columnIndexes.length; i++) {
            int columnIndex = columnIndexes[i];

            if (columnIndex < partitionColumnStartIndex) {
                Object obj = struct.getField(columnIndex);
                Object pigType = HiveRCSchemaUtil
                        .extractPigTypeFromHiveType(obj);

                t.set(i, pigType);

            } else {
                // read the partition columns
                // will only be executed if partitionColumns is not null
                String key = partitionKeys[columnIndex
                        - partitionColumnStartIndex];
                Object value = currentPathPartitionKeyMap.get(key);
                t.set(i, value);
            }

        }

        return t;
    }

    /**
     * Will parse the required columns from the UDFContext properties if the
     * requiredColumns[] variable is null, or else just return the
     * requiredColumns.
     *
     * @return int[]
     */
    private int[] getRequiredColumns() {

        if (requiredColumns == null) {

            Properties properties = getUDFContext();
            String projectionStr = properties.getProperty(PROJECTION_ID);

            if (projectionStr != null) {
                String[] split = projectionStr.split(",");

                int columnIndexes[] = new int[split.length];
                int index = 0;
                for (String splitItem : split) {
                    columnIndexes[index++] = Integer.parseInt(splitItem);
                }

                requiredColumns = columnIndexes;
            }

        }

        return requiredColumns;
    }

    /**
     * Reads the partition columns
     *
     * @param location
     * @param job
     * @return
     */
    private Set<String> getPartitionColumns(String location, Job job) {

        if (partitionColumns == null) {
            // read the partition columns from the UDF Context first.
            // if not in the UDF context then read it using the PathPartitioner.

            Properties properties = getUDFContext();

            if (properties == null)
                properties = new Properties();

            String partitionColumnStr = properties
                    .getProperty(PathPartitionHelper.PARTITION_COLUMNS);

            if (partitionColumnStr == null
                    && !(location == null || job == null)) {
                // if it hasn't been written yet.
                Set<String> partitionColumnSet;

                try {
                    partitionColumnSet = pathPartitionerHelper
                            .getPartitionKeys(location, job.getConfiguration());
                } catch (IOException e) {

                    RuntimeException rte = new RuntimeException(e);
                    rte.setStackTrace(e.getStackTrace());
                    throw rte;

                }

                if (partitionColumnSet != null) {

                    StringBuilder buff = new StringBuilder();

                    int i = 0;
                    for (String column : partitionColumnSet) {
                        if (i++ != 0) {
                            buff.append(',');
                        }

                        buff.append(column);
                    }

                    String buffStr = buff.toString().trim();

                    if (buffStr.length() > 0) {
                        properties.setProperty(
                                PathPartitionHelper.PARTITION_COLUMNS,
                                buff.toString());
                    }

                    partitionColumns = partitionColumnSet;

                }

            } else {
                // the partition columns have already been set in the UDF Context
                if (partitionColumnStr != null) {
                    String split[] = partitionColumnStr.split(",");
                    partitionColumns = new LinkedHashSet<String>();
                    if (split.length > 0) {
                        for (String splitItem : split) {
                            partitionColumns.add(splitItem);
                        }
                    }
                }
            }

        }

        return partitionColumns;
    }

    @Override
    public String[] getPartitionKeys(String location, Job job)
            throws IOException {
        Set<String> partitionKeys = getPartitionColumns(location, job);

        return partitionKeys == null ? null : partitionKeys
                .toArray(new String[] {});
    }
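
    // Illustration (not part of the loader): a sketch of how partition columns
    // surface, assuming the directory layout from the class Javadoc.
    //
    //   /user/hive/warehouse/mytable/year=2010/month=02/day=01/<rc files>
    //
    //   getPartitionKeys(...) -> [year, month, day]
    //   getSchema(...)        -> the table fields followed by year:chararray,
    //                            month:chararray, day:chararray
    //
    // In readColumnarTuple() above, any requested column index past the table's
    // own fields is filled from the path's key=value pairs rather than from the
    // RCFile, since partition values exist only in the directory names.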
    @Override
    public ResourceSchema getSchema(String location, Job job)
            throws IOException {

        if (!partitionKeysSet) {
            Set<String> keys = getPartitionColumns(location, job);

            if (!(keys == null || keys.size() == 0)) {

                // re-edit the pigSchema to contain the new partition keys.
                ResourceFieldSchema[] fields = pigSchema.getFields();

                LOG.debug("Schema: " + Arrays.toString(fields));

                ResourceFieldSchema[] newFields = Arrays.copyOf(fields,
                        fields.length + keys.size());

                int index = fields.length;

                for (String key : keys) {
                    newFields[index++] = new ResourceFieldSchema(
                            new FieldSchema(key, DataType.CHARARRAY));
                }

                pigSchema.setFields(newFields);

                LOG.debug("Added partition fields: " + keys
                        + " to loader schema");
                LOG.debug("Schema is: " + Arrays.toString(newFields));
            }

            partitionKeysSet = true;
        }

        return pigSchema;
    }

    @Override
    public ResourceStatistics getStatistics(String location, Job job)
            throws IOException {
        return null;
    }

    @Override
    public void setPartitionFilter(Expression partitionFilter)
            throws IOException {
        getUDFContext().setProperty(
                PathPartitionHelper.PARITITION_FILTER_EXPRESSION,
                partitionFilter.toString());
    }

    @Override
    public List<OperatorSet> getFeatures() {
        return Arrays.asList(LoadPushDown.OperatorSet.PROJECTION);
    }

    @Override
    public RequiredFieldResponse pushProjection(
            RequiredFieldList requiredFieldList) throws FrontendException {
        // save the required field list to the UDFContext properties.
        StringBuilder buff = new StringBuilder();

        int i = 0;
        for (RequiredField f : requiredFieldList.getFields()) {
            if (i++ != 0)
                buff.append(',');

            buff.append(f.getIndex());
        }

        Properties properties = getUDFContext();
        properties.setProperty(PROJECTION_ID, buff.toString());

        return new RequiredFieldResponse(true);
    }

    @Override
    public void setUDFContextSignature(String signature) {
        super.setUDFContextSignature(signature);
        LOG.debug("Signature: " + signature);
        this.signature = signature;

        // this provides backwards compatibility:
        // the HiveRCInputFormat will read this and, if set, will perform the
        // needed partition filtering
        if (dateRange != null) {
            getUDFContext().setProperty(DATE_RANGE, dateRange);
        }
    }

}