/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.pig.piggybank.storage.hiverc;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.hive.serde2.lazy.LazyArray;
import org.apache.hadoop.hive.serde2.lazy.LazyBoolean;
import org.apache.hadoop.hive.serde2.lazy.LazyByte;
import org.apache.hadoop.hive.serde2.lazy.LazyDouble;
import org.apache.hadoop.hive.serde2.lazy.LazyFloat;
import org.apache.hadoop.hive.serde2.lazy.LazyInteger;
import org.apache.hadoop.hive.serde2.lazy.LazyLong;
import org.apache.hadoop.hive.serde2.lazy.LazyMap;
import org.apache.hadoop.hive.serde2.lazy.LazyShort;
import org.apache.hadoop.hive.serde2.lazy.LazyString;
import org.apache.pig.data.DataType;
import org.apache.pig.data.InternalMap;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
/**
 * Static helper methods used when reading Hive RC files from Pig:
 * <ul>
 * <li>parsing column names and column types out of a hive table schema
 * string,</li>
 * <li>mapping a hive type name to a pig {@link DataType} constant,</li>
 * <li>converting hive lazy objects (primitives, arrays, maps) into the
 * corresponding pig values,</li>
 * <li>small string utilities (joining lists, splitting column lists,
 * extracting the {@code daydate} partition value from a path).</li>
 * </ul>
 */
public class HiveRCSchemaUtil {

    private static final TupleFactory tupleFactory = TupleFactory.getInstance();

    /** Key that precedes the date value inside a hive partition path. */
    private static final String DAYDATE_KEY = "daydate=";

    /**
     * Regex to filter out column types. It matches either a single space
     * followed by alphanumeric characters (a plain type name) or an optional
     * identifier directly followed by a {@code <...>} generic section.
     */
    protected static final Pattern ptypes = Pattern
            .compile("([ ][a-zA-Z0-9]*)|([a-zA-Z_0-9]*[<][a-zA-Z,_0-9]*[>])");

    /**
     * General schema parsing method, is used to parse the column names.
     * Every non-empty (after trimming) match of the pattern is collected in
     * the order in which it occurs in the schema.
     *
     * @param pattern
     *            compiled pattern whose matches should be collected
     * @param schema
     *            schema string to search
     * @return List of the trimmed, non-empty matches
     */
    public static List<String> parseSchema(Pattern pattern, String schema) {
        List<String> matches = new ArrayList<String>();
        Matcher matcher = pattern.matcher(schema);

        while (matcher.find()) {
            String item = matcher.group().trim();
            if (item.length() > 0) {
                matches.add(item);
            }
        }

        return matches;
    }

    /**
     * Parses the schema types and returns a List of these. The types are
     * located with {@link #ptypes}; a {@code map} or {@code array} token is
     * concatenated with the match that follows it (its generic section) so
     * that e.g. {@code map<string,string>} ends up as a single entry.
     *
     * @param schema
     *            hive table schema string
     * @return List of the type strings in schema order
     * @throws RuntimeException
     *             if a {@code map} or {@code array} token is not followed by
     *             any further match
     */
    public static List<String> parseSchemaTypes(String schema) {
        List<String> types = new ArrayList<String>();
        Matcher matcher = ptypes.matcher(schema);

        while (matcher.find()) {
            String item = matcher.group().trim();
            if (item.length() == 0) {
                // the pattern also matches a lone space; nothing to record
                continue;
            }

            if (item.equalsIgnoreCase("map")) {
                types.add(item + nextGenericSection(matcher,
                        "Map must have generic types specified"));
            } else if (item.equalsIgnoreCase("array")) {
                types.add(item + nextGenericSection(matcher,
                        "Array must have generic types specified"));
            } else {
                types.add(item);
            }
        }

        return types;
    }

    /**
     * Advances the matcher to the next match and returns it trimmed; used to
     * fetch the generic section that follows a map/array token.
     */
    private static String nextGenericSection(Matcher matcher,
            String errorMessage) {
        if (!matcher.find()) {
            throw new RuntimeException(errorMessage);
        }
        return matcher.group().trim();
    }

    /**
     * Trims the items in the list and joins them with a comma. No separator
     * is written after the last item, and an empty list yields an empty
     * string.
     *
     * @param list
     *            items to join
     * @return String the comma separated, trimmed items
     */
    public static final String listToString(List<String> list) {
        StringBuilder joined = new StringBuilder();
        String separator = "";

        for (String item : list) {
            joined.append(separator).append(item.trim());
            // every item after the first one is preceded by a comma
            separator = ",";
        }

        return joined.toString();
    }

    /**
     * Extract the date from the hive file names e.g.
     * /user/hive/warehouse/table/daydate=2009-10-01/upload001/0002.dat
     * <p>
     * This method will extract the 2009-10-01 from this name. The value
     * starts right after {@code daydate=} and ends at the next {@code '/'},
     * or at the end of the string when no {@code '/'} follows.
     *
     * @param fileName
     *            file name or path to inspect
     * @return String the date value, or null if the name contains no
     *         {@code daydate=} key
     */
    public static final String extractDayDate(String fileName) {
        int keyStart = fileName.indexOf(DAYDATE_KEY);
        if (keyStart < 0) {
            return null;
        }

        int valueStart = keyStart + DAYDATE_KEY.length();
        int valueEnd = fileName.indexOf('/', valueStart);
        if (valueEnd < 0) {
            // the partition value is the last path element
            valueEnd = fileName.length();
        }

        return fileName.substring(valueStart, valueEnd);
    }

    /**
     * Returns a sorted set of columns, with the column names trimmed.
     *
     * @param columnsToRead
     *            comma separated column names
     * @return Set of the trimmed column names
     */
    public static final Set<String> compileSet(String columnsToRead) {
        Set<String> columns = new TreeSet<String>();

        for (String column : columnsToRead.split(",")) {
            columns.add(column.trim());
        }

        return columns;
    }

    /**
     * Returns the pig DataType for the hive type. The comparison is case
     * insensitive and independent of the default locale.
     * <p>
     * All integral types narrower than int (byte/tinyint, short/smallint) are
     * reported as {@link DataType#INTEGER}, matching the widening done by
     * {@link #extractPigTypeFromHiveType(Object)}.
     *
     * @param hiveType
     *            hive type name, e.g. {@code string} or
     *            {@code map<string,string>}
     * @return byte from DataType, {@link DataType#ERROR} for unknown types
     */
    public static byte findPigDataType(String hiveType) {
        // Locale.ROOT avoids locale specific case mappings (e.g. Turkish
        // dotless i) turning "INT" into something that no longer matches.
        String type = hiveType.toLowerCase(java.util.Locale.ROOT);

        if (type.equals("string")) {
            return DataType.CHARARRAY;
        }
        if (type.equals("int")) {
            return DataType.INTEGER;
        }
        if (type.equals("bigint") || type.equals("long")) {
            return DataType.LONG;
        }
        if (type.equals("float")) {
            return DataType.FLOAT;
        }
        if (type.equals("double")) {
            return DataType.DOUBLE;
        }
        if (type.equals("boolean")) {
            return DataType.BOOLEAN;
        }
        if (type.equals("byte") || type.equals("tinyint")
                || type.equals("short") || type.equals("smallint")) {
            return DataType.INTEGER;
        }
        if (type.startsWith("map")) {
            // checked before the array test so that a map whose generic
            // section mentions "array" is not mistaken for an array
            return DataType.MAP;
        }
        if (type.contains("array")) {
            return DataType.TUPLE;
        }
        if (type.contains("map")) {
            return DataType.MAP;
        }

        return DataType.ERROR;
    }

    /**
     * Converts from a hive type to a pig type.
     * <ul>
     * <li>LazyArray is converted to a Tuple, LazyMap to an InternalMap.</li>
     * <li>LazyString becomes a String; LazyInteger, LazyLong, LazyFloat and
     * LazyDouble become the matching boxed java type.</li>
     * <li>LazyByte and LazyShort are widened to an Integer.</li>
     * <li>LazyBoolean becomes the Integer 1 (true) or 0 (false).</li>
     * <li>Any other value, including null, is returned unchanged.</li>
     * </ul>
     *
     * @param value
     *            Object hive type
     * @return Object pig type
     */
    public static Object extractPigTypeFromHiveType(Object value) {
        if (value instanceof LazyArray) {
            return parseLazyArrayToPigArray((LazyArray) value);
        }
        if (value instanceof LazyMap) {
            return parseLazyMapToPigMap((LazyMap) value);
        }
        if (value instanceof LazyString) {
            return ((LazyString) value).getWritableObject().toString();
        }
        if (value instanceof LazyInteger) {
            return ((LazyInteger) value).getWritableObject().get();
        }
        if (value instanceof LazyLong) {
            return ((LazyLong) value).getWritableObject().get();
        }
        if (value instanceof LazyFloat) {
            return ((LazyFloat) value).getWritableObject().get();
        }
        if (value instanceof LazyDouble) {
            return ((LazyDouble) value).getWritableObject().get();
        }
        if (value instanceof LazyBoolean) {
            boolean flag = ((LazyBoolean) value).getWritableObject().get();
            return flag ? 1 : 0;
        }
        if (value instanceof LazyByte) {
            return (int) ((LazyByte) value).getWritableObject().get();
        }
        if (value instanceof LazyShort) {
            // widened explicitly, like LazyByte above; without the cast the
            // short would be boxed as a java.lang.Short
            return (int) ((LazyShort) value).getWritableObject().get();
        }

        return value;
    }

    /**
     * Converts the LazyMap to a InternalMap. Keys and values are both
     * converted with {@link #extractPigTypeFromHiveType(Object)}. If the
     * LazyMap returns a null java map the result is an empty InternalMap.
     *
     * @param map
     *            LazyMap
     * @return InternalMap
     */
    public static InternalMap parseLazyMapToPigMap(LazyMap map) {
        InternalMap pigMap = new InternalMap();
        Map<Object, Object> hiveEntries = map.getMap();

        if (hiveEntries == null) {
            return pigMap;
        }

        // for each item in the map extract the java primitive type
        for (Entry<Object, Object> entry : hiveEntries.entrySet()) {
            Object pigKey = extractPigTypeFromHiveType(entry.getKey());
            Object pigValue = extractPigTypeFromHiveType(entry.getValue());
            pigMap.put(pigKey, pigValue);
        }

        return pigMap;
    }

    /**
     * Converts the LazyArray to a Tuple. Each element is converted with
     * {@link #extractPigTypeFromHiveType(Object)}. If the LazyArray returns a
     * null list the result is an empty Tuple, mirroring the null handling in
     * {@link #parseLazyMapToPigMap(LazyMap)}.
     *
     * @param arr
     *            LazyArray
     * @return Tuple
     */
    public static Tuple parseLazyArrayToPigArray(LazyArray arr) {
        List<Object> hiveItems = arr.getList();
        List<Object> pigItems = new ArrayList<Object>();

        if (hiveItems != null) {
            // each item inside the LazyArray must be converted to its java
            // primitive type
            for (Object item : hiveItems) {
                pigItems.add(extractPigTypeFromHiveType(item));
            }
        }

        return tupleFactory.newTuple(pigItems);
    }
}