/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.pig.piggybank.storage.hiverc;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.hive.serde2.lazy.LazyArray;
import org.apache.hadoop.hive.serde2.lazy.LazyBoolean;
import org.apache.hadoop.hive.serde2.lazy.LazyByte;
import org.apache.hadoop.hive.serde2.lazy.LazyDouble;
import org.apache.hadoop.hive.serde2.lazy.LazyFloat;
import org.apache.hadoop.hive.serde2.lazy.LazyInteger;
import org.apache.hadoop.hive.serde2.lazy.LazyLong;
import org.apache.hadoop.hive.serde2.lazy.LazyMap;
import org.apache.hadoop.hive.serde2.lazy.LazyShort;
import org.apache.hadoop.hive.serde2.lazy.LazyString;
import org.apache.pig.data.DataType;
import org.apache.pig.data.InternalMap;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

/**
 * Implements helper methods for:<br/>
 * <ul>
 * <li>Parsing the hive table schema string.</li>
 * <li>Converting from hive to pig types</li>
 * <li>Converting from pig to hive types</li>
 * </ul>
 */
public class HiveRCSchemaUtil {

    private static final TupleFactory tupleFactory = TupleFactory.getInstance();

    /**
     * Regex to filter out column types: matches either a plain word
     * (e.g. " string") or a generic suffix (e.g. "<string,int>").
     */
    protected static final Pattern ptypes = Pattern
            .compile("([ ][a-zA-Z0-9]*)|([a-zA-Z_0-9]*[<][a-zA-Z,_0-9]*[>])");

    /**
     * General schema parsing method, is used to parse the column names.
     *
     * @param pattern
     *            Pattern applied repeatedly to the schema string; each
     *            non-empty trimmed match is collected
     * @param schema
     *            String
     * @return List of String (never null; empty if nothing matched)
     */
    public static List<String> parseSchema(Pattern pattern, String schema) {
        List<String> types = new ArrayList<String>();
        Matcher m = pattern.matcher(schema);
        while (m.find()) {
            String item = m.group().trim();
            if (item.length() > 0)
                types.add(item);
        }
        return types;
    }

    /**
     * Parses the schema types and returns a List of these.<br/>
     * Generic container types are re-joined with their type parameters, e.g.
     * the two matches "map" and "&lt;string,int&gt;" become "map&lt;string,int&gt;".
     *
     * @param schema
     *            hive schema type string, e.g. "string,int,map&lt;string,int&gt;"
     * @return List of String
     * @throws RuntimeException
     *             if a map or array type has no generic parameters following it
     */
    public static List<String> parseSchemaTypes(String schema) {
        List<String> types = new ArrayList<String>();
        Matcher m = ptypes.matcher(schema);
        while (m.find()) {
            String item = m.group().trim();
            if (item.length() == 0)
                continue;
            if (item.equalsIgnoreCase("map")) {
                // the next match must be the generic parameter list
                if (m.find()) {
                    types.add(item + m.group().trim());
                } else {
                    throw new RuntimeException(
                            "Map must have generic types specified");
                }
            } else if (item.equalsIgnoreCase("array")) {
                // the next match must be the generic parameter list
                if (m.find()) {
                    types.add(item + m.group().trim());
                } else {
                    throw new RuntimeException(
                            "Array must have generic types specified");
                }
            } else {
                types.add(item);
            }
        }
        return types;
    }

    /**
     * Joins the trimmed items of the list into a comma-separated string.
     * <p>
     * FIX: the original {@code buff.delete(len, len)} was a no-op (begin ==
     * end deletes nothing), leaving a trailing comma, and threw
     * {@code StringIndexOutOfBoundsException} for an empty list. The separator
     * is now prepended so no trailing comma is ever produced.
     *
     * @param list
     *            items to join; may be empty
     * @return String comma-separated trimmed items, "" for an empty list
     */
    public static final String listToString(List<String> list) {
        StringBuilder buff = new StringBuilder();
        for (String item : list) {
            if (buff.length() > 0)
                buff.append(',');
            buff.append(item.trim());
        }
        return buff.toString();
    }

    /**
     * Extract the date from the hive file names e.g
     * /user/hive/warehouse/table/daydate=2009-10-01/upload001/0002.dat<br/>
     * This method will extract the 2009-10-01 from this name.
     * <p>
     * FIX: the original threw {@code StringIndexOutOfBoundsException} when no
     * '/' followed the date (indexOf returned -1), and when the name started
     * with "daydate=" it returned everything to end-of-string instead of
     * stopping at the next '/'. Both branches now cut at the next '/' or, if
     * absent, at end-of-string.
     *
     * @param fileName
     *            path possibly containing a "daydate=" segment
     * @return String the date value, or null if "daydate=" is not present
     */
    public static final String extractDayDate(String fileName) {
        int index = fileName.indexOf("daydate=");
        if (index < 0)
            return null;
        int start = index + "daydate=".length();
        int end = fileName.indexOf('/', start);
        if (end < 0)
            end = fileName.length();
        return fileName.substring(start, end);
    }

    /**
     * Returns a set of columns, with the column names trimmed
     *
     * @param columnsToRead
     *            comma-separated column names
     * @return Set sorted set of the trimmed names (duplicates collapse)
     */
    public static final Set<String> compileSet(String columnsToRead) {
        Set<String> columnsSet = new TreeSet<String>();
        for (String column : columnsToRead.split(",")) {
            columnsSet.add(column.trim());
        }
        return columnsSet;
    }

    /**
     * Returns the pig DataType for the hive type
     * <p>
     * FIX: container types are now matched with {@code startsWith} instead of
     * {@code contains}, so a nested generic such as
     * "map&lt;string,array&lt;int&gt;&gt;" maps to MAP rather than being
     * misclassified as TUPLE by the earlier "array" containment check.
     *
     * @param hiveType
     *            hive type name, case-insensitive
     * @return byte from DataType; DataType.ERROR for unrecognized types
     */
    public static byte findPigDataType(String hiveType) {
        hiveType = hiveType.toLowerCase();
        if (hiveType.equals("string"))
            return DataType.CHARARRAY;
        else if (hiveType.equals("int"))
            return DataType.INTEGER;
        else if (hiveType.equals("bigint") || hiveType.equals("long"))
            return DataType.LONG;
        else if (hiveType.equals("float"))
            return DataType.FLOAT;
        else if (hiveType.equals("double"))
            return DataType.DOUBLE;
        else if (hiveType.equals("boolean"))
            return DataType.BOOLEAN;
        else if (hiveType.equals("byte"))
            return DataType.INTEGER;
        else if (hiveType.startsWith("array"))
            return DataType.TUPLE;
        else if (hiveType.startsWith("map"))
            return DataType.MAP;
        else
            return DataType.ERROR;
    }

    /**
     * Converts from a hive type to a pig type: lazy containers recurse into
     * {@link #parseLazyArrayToPigArray(LazyArray)} /
     * {@link #parseLazyMapToPigMap(LazyMap)}, lazy scalars unwrap to their
     * java primitives. Unrecognized values are returned unchanged.
     *
     * @param value
     *            Object hive type
     * @return Object pig type
     */
    public static Object extractPigTypeFromHiveType(Object value) {
        if (value instanceof LazyArray) {
            value = parseLazyArrayToPigArray((LazyArray) value);
        } else if (value instanceof LazyMap) {
            value = parseLazyMapToPigMap((LazyMap) value);
        } else if (value instanceof LazyString) {
            value = ((LazyString) value).getWritableObject().toString();
        } else if (value instanceof LazyInteger) {
            value = ((LazyInteger) value).getWritableObject().get();
        } else if (value instanceof LazyLong) {
            value = ((LazyLong) value).getWritableObject().get();
        } else if (value instanceof LazyFloat) {
            value = ((LazyFloat) value).getWritableObject().get();
        } else if (value instanceof LazyDouble) {
            value = ((LazyDouble) value).getWritableObject().get();
        } else if (value instanceof LazyBoolean) {
            // NOTE(review): booleans are deliberately mapped to 1/0 ints here
            // (inconsistent with findPigDataType's BOOLEAN mapping) —
            // presumably for Pig versions lacking native boolean support;
            // preserved as-is, confirm before changing.
            boolean boolvalue = ((LazyBoolean) value).getWritableObject()
                    .get();
            value = (boolvalue) ? 1 : 0;
        } else if (value instanceof LazyByte) {
            value = (int) ((LazyByte) value).getWritableObject().get();
        } else if (value instanceof LazyShort) {
            value = ((LazyShort) value).getWritableObject().get();
        }
        return value;
    }

    /**
     * Converts the LazyMap to a InternalMap.
     *
     * @param map
     *            LazyMap
     * @return InternalMap with keys and values converted to java primitives;
     *         empty if the underlying map is null
     */
    public static InternalMap parseLazyMapToPigMap(LazyMap map) {
        InternalMap pigmap = new InternalMap();
        Map<Object, Object> javamap = map.getMap();
        if (javamap != null) {
            // for each item in the map extract the java primitive type
            for (Entry<Object, Object> entry : javamap.entrySet()) {
                pigmap.put(extractPigTypeFromHiveType(entry.getKey()),
                        extractPigTypeFromHiveType(entry.getValue()));
            }
        }
        return pigmap;
    }

    /**
     * Converts the LazyArray to a Tuple.<br/>
     *
     * @param arr
     *            LazyArray
     * @return Tuple whose fields are the array items converted to java
     *         primitives
     */
    public static Tuple parseLazyArrayToPigArray(LazyArray arr) {
        List<Object> list = new ArrayList<Object>();
        // each item inside the LazyArray must be converted to its java
        // primitive type
        for (Object item : arr.getList()) {
            list.add(extractPigTypeFromHiveType(item));
        }
        return tupleFactory.newTuple(list);
    }
}