// PathPartitionHelper.java example
//
// Explorer
// spork-streaming-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.pig.piggybank.storage.partition;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;

import javax.el.ELContext;
import javax.el.ELResolver;
import javax.el.ExpressionFactory;
import javax.el.FunctionMapper;
import javax.el.ValueExpression;
import javax.el.VariableMapper;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.log4j.Logger;
import org.apache.pig.LoadFunc;
import org.apache.pig.impl.util.UDFContext;

/**
 * Implements the logic for:<br/>
 * <ul>
 * <li>Listing partition keys and values used in an hdfs path</li>
 * <li>Filtering of partitions from a pig filter operator expression</li>
 * </ul>
 * <p/>
 * <b>Restrictions</b> <br/>
 * Function calls are not supported by this partition helper and it can only
 * handle String values.<br/>
 * This is normally not a problem given that partition values are part of the
 * hdfs folder path and are given a<br/>
 * determined value that would not need parsing by any external processes.<br/>
 * <p/>
 * The partition columns and the partition filter expression are exchanged
 * through the {@link UDFContext} properties of the loader class/signature,
 * under the keys {@link #PARTITION_COLUMNS} and
 * {@link #PARITITION_FILTER_EXPRESSION}.
 */
public class PathPartitionHelper {

    /**
     * UDFContext property key holding the comma separated partition column
     * names. The key is prefixed with the fully qualified class name, exactly
     * like {@link #PARITITION_FILTER_EXPRESSION}.
     */
    public static final String PARTITION_COLUMNS = PathPartitionHelper.class
            .getName() + ".partition-columns";

    /**
     * UDFContext property key holding the partition filter expression.
     */
    public static final String PARITITION_FILTER_EXPRESSION = PathPartitionHelper.class
            .getName() + ".partition-filter";

    private static final Logger LOG = Logger
            .getLogger(PathPartitionHelper.class);

    /** Delegate that extracts partition keys and values from paths. */
    transient PathPartitioner pathPartitioner = new PathPartitioner();

    /**
     * Returns the Partition keys and each key's value for a single location.<br/>
     * That is the location must be something like
     * mytable/partition1=a/partition2=b/myfile.<br/>
     * This method will return a map with [partition1='a', partition2='b']<br/>
     * The work is delegated to the PathPartitioner class
     * 
     * @param location
     *            String path containing the partition key=value folders
     * @return Map of String, String
     * @throws IOException
     */
    public Map<String, String> getPathPartitionKeyValues(String location)
            throws IOException {
        return pathPartitioner.getPathPartitionKeyValues(location);
    }

    /**
     * Returns the partition keys for a location.<br/>
     * The work is delegated to the PathPartitioner class
     * 
     * @param location
     *            String must be the base directory for the partitions
     * @param conf
     *            Configuration passed through to the PathPartitioner
     * @return Set of partition key names as returned by the PathPartitioner
     * @throws IOException
     */
    public Set<String> getPartitionKeys(String location, Configuration conf)
            throws IOException {
        return pathPartitioner.getPartitionKeys(location, conf);
    }

    /**
     * Sets the PARITITION_FILTER_EXPRESSION property in the UDFContext
     * identified by the loaderClass and signature.
     * 
     * @param partitionFilterExpression
     *            String expression stored as the property value
     * @param loaderClass
     *            loader class used to look up the UDF properties
     * @param signature
     *            loader signature used to look up the UDF properties
     * @throws IOException
     */
    public void setPartitionFilterExpression(String partitionFilterExpression,
            Class<? extends LoadFunc> loaderClass, String signature)
            throws IOException {

        Properties udfProperties = UDFContext.getUDFContext()
                .getUDFProperties(loaderClass, new String[] { signature });

        udfProperties.setProperty(PARITITION_FILTER_EXPRESSION,
                partitionFilterExpression);

    }

    /**
     * Reads the partition keys from the location i.e the base directory and
     * stores them, comma separated, in the PARTITION_COLUMNS property of the
     * UDFContext identified by the loaderClass and signature.<br/>
     * Nothing is stored when the partition key set is null.
     * 
     * @param location
     *            String must be the base directory for the partitions
     * @param conf
     *            Configuration passed through to the PathPartitioner
     * @param loaderClass
     *            loader class used to look up the UDF properties
     * @param signature
     *            loader signature used to look up the UDF properties
     * @throws IOException
     */
    public void setPartitionKeys(String location, Configuration conf,
            Class<? extends LoadFunc> loaderClass, String signature)
            throws IOException {

        Set<String> partitionKeys = getPartitionKeys(location, conf);

        if (partitionKeys != null) {
            // join the keys with a comma; listStatus splits on the same
            // separator
            StringBuilder buff = new StringBuilder();
            int i = 0;
            for (String key : partitionKeys) {
                if (i++ != 0) {
                    buff.append(",");
                }

                buff.append(key);
            }

            UDFContext.getUDFContext()
                    .getUDFProperties(loaderClass, new String[] { signature })
                    .setProperty(PARTITION_COLUMNS, buff.toString());
        }

    }

    /**
     * Finds the input files for which splits should be calculated, pruning
     * the partitions rejected by the partition filter expression.<br/>
     * The partition columns and the (optional) filter expression are read
     * from the UDFContext properties of the loaderClass and signature.<br/>
     * Each input path that is a directory is walked recursively, one
     * partition level per directory level; an input path that is a file is
     * added as is.
     * 
     * @param ctx
     *            JobContext
     * @param loaderClass
     *            this is chosen to be a subclass of LoadFunc to maintain some
     *            consistency.
     * @param signature
     *            loader signature used to look up the UDF properties
     * @return List of FileStatus for the selected files, or null when no
     *         PARTITION_COLUMNS property has been set for this loader
     * @throws IOException
     */
    public List<FileStatus> listStatus(JobContext ctx,
            Class<? extends LoadFunc> loaderClass, String signature)
            throws IOException {

        Properties properties = UDFContext.getUDFContext().getUDFProperties(
                loaderClass, new String[] { signature });

        String partitionExpression = properties
                .getProperty(PARITITION_FILTER_EXPRESSION);

        // the factory is only created when there is an expression to
        // evaluate; a null factory means "accept every partition"
        ExpressionFactory expressionFactory = null;

        if (partitionExpression != null) {
            expressionFactory = ExpressionFactory.newInstance();
        }

        String partitionColumnStr = properties
                .getProperty(PathPartitionHelper.PARTITION_COLUMNS);
        String[] partitionKeys = (partitionColumnStr == null) ? null
                : partitionColumnStr.split(",");

        Path[] inputPaths = FileInputFormat.getInputPaths(ctx);

        List<FileStatus> splitPaths = null;

        if (partitionKeys != null) {

            splitPaths = new ArrayList<FileStatus>();

            for (Path inputPath : inputPaths) {
                // for each input path work recursively through each partition
                // level to find the rc files

                FileSystem fs = inputPath.getFileSystem(ctx.getConfiguration());

                if (fs.getFileStatus(inputPath).isDir()) {
                    // assure that we are at the root of the partition tree.
                    FileStatus fileStatusArr[] = fs.listStatus(inputPath);

                    if (fileStatusArr != null) {
                        for (FileStatus childFileStatus : fileStatusArr) {
                            getPartitionedFiles(expressionFactory,
                                    partitionExpression, fs, childFileStatus,
                                    0, partitionKeys, splitPaths);
                        }
                    }

                } else {
                    splitPaths.add(fs.getFileStatus(inputPath));
                }

            }

            if (splitPaths.size() < 1) {
                LOG.error("Not split paths where found, please check that the filter logic for the partition keys does not filter out everything ");
            }

        }

        return splitPaths;
    }

    /**
     * Recursively works through all directories, skipping filtered partitions.<br/>
     * Entries whose name starts with "_" are ignored. While inside the
     * partition tree only directories whose name starts with the partition
     * key of the current level are followed; at the last partition level the
     * partition filter expression is evaluated against the path. Below the
     * partition tree every file is added to splitPaths.
     * 
     * @param expressionFactory
     *            factory used to evaluate the filter expression, may be null
     * @param partitionExpression
     *            the partition filter expression, may be null
     * @param fs
     *            FileSystem used to list the directories
     * @param fileStatus
     *            the file or directory to inspect
     * @param partitionLevel
     *            index into partitionKeys for the level of fileStatus
     * @param partitionKeys
     *            partition key names ordered by directory level
     * @param splitPaths
     *            output list that receives the selected files
     * @throws IOException
     */
    private void getPartitionedFiles(ExpressionFactory expressionFactory,
            String partitionExpression, FileSystem fs, FileStatus fileStatus,
            int partitionLevel, String[] partitionKeys,
            List<FileStatus> splitPaths) throws IOException {

        String partition = (partitionLevel < partitionKeys.length) ? partitionKeys[partitionLevel]
                : null;

        Path path = fileStatus.getPath();

        // filter out hidden files
        if (path.getName().startsWith("_")) {
            return;
        }

        // pre filter logic
        // return if any of the logic is not true
        if (partition != null) {
            if (fileStatus.isDir()) {

                // check that the dir name starts with the partition name
                // NOTE(review): this is a prefix match, so a key "year" also
                // accepts a directory such as "yearly=1" - confirm whether a
                // "key=" match is wanted.
                if (!path.getName().startsWith(partition)) {
                    return;
                }

            } else {
                // else its a file but not at the end of the partition tree so
                // its ignored.
                return;
            }

            // this means we are inside the partition so that the path will
            // contain all partitions plus its values
            // we can apply the partition filter expression here that was passed
            // to the HiveColumnarLoader.setPartitionExpression
            if (partitionLevel == (partitionKeys.length - 1)
                    && !evaluatePartitionExpression(expressionFactory,
                            partitionExpression, path)) {

                LOG.debug("Pruning partition: " + path);
                return;

            }

        }

        // after this point we know that the partition is either null
        // which means we are at the end of the partition tree and all files
        // sub directories should be included.
        // or that we are still navigating the partition tree.
        int nextPartitionLevel = partitionLevel + 1;

        // iterate over directories if fileStatus is a dir.
        FileStatus[] childStatusArr = null;

        if (fileStatus.isDir()) {
            if ((childStatusArr = fs.listStatus(path)) != null) {
                for (FileStatus childFileStatus : childStatusArr) {
                    getPartitionedFiles(expressionFactory, partitionExpression,
                            fs, childFileStatus, nextPartitionLevel,
                            partitionKeys, splitPaths);
                }
            }
        } else {
            // add file to splitPaths
            splitPaths.add(fileStatus);
        }

    }

    /**
     * Evaluates the partition filter expression against the partition
     * key/values found in the path.<br/>
     * The expression is wrapped in "${" and "}" when it does not already start
     * with "${", and is evaluated as a Boolean with the partition key/values
     * of the path exposed as variables.
     * 
     * @param expressionFactory
     *            ExpressionFactory, when null no evaluation is done
     * @param partitionExpression
     *            String
     * @param path
     *            Path
     * @return boolean result of the expression, true when expressionFactory is
     *         null
     * @throws IOException
     */
    private boolean evaluatePartitionExpression(
            ExpressionFactory expressionFactory, String partitionExpression,
            Path path) throws IOException {

        boolean ret = true;

        if (expressionFactory != null) {
            if (!partitionExpression.startsWith("${")) {
                partitionExpression = "${" + partitionExpression + "}";
            }

            // partition key -> value pairs taken from the path become the
            // variables visible to the expression
            Map<String, String> context = pathPartitioner
                    .getPathPartitionKeyValues(path.toString());

            MapVariableMapper mapper = new MapVariableMapper(expressionFactory,
                    context);
            VariableContext varContext = new VariableContext(mapper);

            ValueExpression evalExpression = expressionFactory
                    .createValueExpression(varContext, partitionExpression,
                            Boolean.class);

            ret = (Boolean) evalExpression.getValue(varContext);

            LOG.debug("Evaluated: " + partitionExpression + " returned: " + ret);

        }

        return ret;
    }

    /**
     * 
     * ELContext implementation containing the VariableMapper MapVariableMapper
     * 
     */
    class VariableContext extends ELContext {

        VariableMapper variableMapper;

        VariableContext(VariableMapper variableMapper) {
            this.variableMapper = variableMapper;
        }

        /**
         * No ELResolver is supplied by this context; variables are exposed
         * through {@link #getVariableMapper()} only.
         * 
         * @return always null
         */
        @Override
        public ELResolver getELResolver() {
            return null;
        }

        /**
         * No FunctionMapper is supplied by this context.
         * 
         * @return always null
         */
        @Override
        public FunctionMapper getFunctionMapper() {
            return null;
        }

        /**
         * @return the VariableMapper passed to the constructor
         */
        @Override
        public VariableMapper getVariableMapper() {
            return variableMapper;
        }

    }

    /**
     * Implementation for the VariableMapper that takes the values in a Map and
     * creates ValueExpression objects for each.
     * 
     */
    class MapVariableMapper extends VariableMapper {
        private Map<String, ValueExpression> valueExpressionMap;

        /**
         * Wraps each map value in a String typed ValueExpression, keyed by the
         * map key.
         * 
         * @param expressionFactory
         *            factory used to create the ValueExpression objects
         * @param variableMap
         *            variable name to String value
         */
        public MapVariableMapper(ExpressionFactory expressionFactory,
                Map<String, String> variableMap) {

            valueExpressionMap = new HashMap<String, ValueExpression>();

            for (Entry<String, String> entry : variableMap.entrySet()) {
                ValueExpression valExpr = expressionFactory
                        .createValueExpression(entry.getValue(), String.class);
                valueExpressionMap.put(entry.getKey(), valExpr);
            }

        }

        /**
         * @return the ValueExpression registered for variableName, or null
         *         when there is none
         */
        @Override
        public ValueExpression resolveVariable(String variableName) {
            return valueExpressionMap.get(variableName);
        }

        /**
         * @return the ValueExpression previously registered for variableName,
         *         or null when there was none
         */
        @Override
        public ValueExpression setVariable(String variableName,
                ValueExpression valueExpression) {
            return valueExpressionMap.put(variableName, valueExpression);
        }

    }

}