/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.data; import java.lang.reflect.ParameterizedType; import java.lang.reflect.Type; import java.math.BigDecimal; import java.math.BigInteger; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; import org.apache.hadoop.io.WritableComparable; import org.apache.pig.PigException; import org.apache.pig.ResourceSchema; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.builtin.ToDate; import org.apache.pig.classification.InterfaceAudience; import org.apache.pig.classification.InterfaceStability; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.pig.impl.logicalLayer.schema.SchemaMergeException; import org.joda.time.DateTime; /** * A class of static final values used to encode data type and a number of * static helper functions for manipulating data objects. The data type * values could be * done as an enumeration, but it is done as byte codes instead to save * creating objects. */ @InterfaceAudience.Public @InterfaceStability.Stable public class DataType { // IMPORTANT! This list can be used to record values of data on disk, // so do not change the values. You may strand user data. // IMPORTANT! Order matters here, as compare() below uses the order to // order unlike datatypes. Don't change this ordering. // Spaced unevenly to leave room for new entries without changing // values or creating order issues. public static final byte UNKNOWN = 0; public static final byte NULL = 1; public static final byte BOOLEAN = 5; public static final byte BYTE = 6; // internal use only public static final byte INTEGER = 10; public static final byte LONG = 15; public static final byte FLOAT = 20; public static final byte DOUBLE = 25; public static final byte DATETIME = 30; public static final byte BYTEARRAY = 50; public static final byte CHARARRAY = 55; public static final byte BIGINTEGER = 65; public static final byte BIGDECIMAL = 70; /** * Internal use only. */ public static final byte BIGCHARARRAY = 60; //internal use only; for storing/loading chararray bigger than 64K characters in BinStorage public static final byte MAP = 100; public static final byte TUPLE = 110; public static final byte BAG = 120; /** * Internal use only; used to store WriteableComparable objects * for creating ordered index in MergeJoin. Expecting a object that * implements Writable interface and has default constructor */ public static final byte GENERIC_WRITABLECOMPARABLE = 123; /** * Internal use only. */ public static final byte INTERNALMAP = 127; // internal use only; for maps that are object->object. Used by FindQuantiles. public static final byte ERROR = -1; /** * Determine the datatype of an object. * @param o Object to test. * @return byte code of the type, or ERROR if we don't know. */ public static byte findType(Object o) { if (o == null) { return NULL; } // Try to put the most common first if (o instanceof DataByteArray) { return BYTEARRAY; } else if (o instanceof String) { return CHARARRAY; } else if (o instanceof Tuple) { return TUPLE; } else if (o instanceof DataBag) { return BAG; } else if (o instanceof Integer) { return INTEGER; } else if (o instanceof Long) { return LONG; } else if (o instanceof InternalMap) { return INTERNALMAP; } else if (o instanceof Map) { return MAP; } else if (o instanceof Float) { return FLOAT; } else if (o instanceof Double) { return DOUBLE; } else if (o instanceof Boolean) { return BOOLEAN; } else if (o instanceof DateTime) { return DATETIME; } else if (o instanceof Byte) { return BYTE; } else if (o instanceof BigInteger) { return BIGINTEGER; } else if (o instanceof BigDecimal) { return BIGDECIMAL; } else if (o instanceof WritableComparable) { return GENERIC_WRITABLECOMPARABLE; } else {return ERROR;} } /** * Given a Type object determine the data type it represents. This isn't * cheap, as it uses reflection, so use sparingly. * @param t Type to examine * @return byte code of the type, or ERROR if we don't know. */ public static byte findType(Type t) { if (t == null) { return NULL; } // Try to put the most common first if (t == DataByteArray.class) { return BYTEARRAY; } else if (t == String.class) { return CHARARRAY; } else if (t == Integer.class) { return INTEGER; } else if (t == Long.class) { return LONG; } else if (t == Float.class) { return FLOAT; } else if (t == Double.class) { return DOUBLE; } else if (t == Boolean.class) { return BOOLEAN; } else if (t == Byte.class) { return BYTE; } else if (t == BigInteger.class) { return BIGINTEGER; } else if (t == BigDecimal.class) { return BIGDECIMAL; } else if (t == DateTime.class) { return DATETIME; } else if (t == InternalMap.class) { return INTERNALMAP; } else { // Might be a tuple or a bag, need to check the interfaces it // implements if (t instanceof Class) { return extractTypeFromClass(t); }else if (t instanceof ParameterizedType){ ParameterizedType impl=(ParameterizedType)t; Class c=(Class)impl.getRawType(); return extractTypeFromClass(c); } return ERROR; } } private static byte extractTypeFromClass(Type t) { Class c = (Class)t; Class[] ioeInterfaces = c.getInterfaces(); Class[] interfaces = null; if(c.isInterface()){ interfaces = new Class[ioeInterfaces.length+1]; interfaces[0] = c; for (int i = 1; i < interfaces.length; i++) { interfaces[i] = ioeInterfaces[i-1]; } } else { interfaces = ioeInterfaces; } boolean matchedWritableComparable = false; for (int i = 0; i < interfaces.length; i++) { if (interfaces[i].getName().equals("org.apache.pig.data.Tuple")) { return TUPLE; } else if (interfaces[i].getName().equals("org.apache.pig.data.DataBag")) { return BAG; } else if (interfaces[i].getName().equals("java.util.Map")) { return MAP; } else if (interfaces[i].getName().equals("org.apache.hadoop.io.WritableComparable")) { // use GENERIC_WRITABLECOMPARABLE type only as last resort matchedWritableComparable = true; } } if(matchedWritableComparable) { return GENERIC_WRITABLECOMPARABLE; } return ERROR; } /** * Return the number of types Pig knows about. * @return number of types */ public static int numTypes(){ byte[] types = genAllTypes(); return types.length; } /** * Get an array of all type values. * @return byte array with an entry for each type. */ public static byte[] genAllTypes(){ byte[] types = { DataType.BAG, DataType.BIGCHARARRAY, DataType.BOOLEAN, DataType.BYTE, DataType.BYTEARRAY, DataType.CHARARRAY, DataType.DOUBLE, DataType.FLOAT, DataType.DATETIME, DataType.GENERIC_WRITABLECOMPARABLE, DataType.INTEGER, DataType.INTERNALMAP, DataType.LONG, DataType.MAP, DataType.TUPLE, DataType.BIGINTEGER, DataType.BIGDECIMAL}; return types; } private static String[] genAllTypeNames(){ String[] names = { "BAG", "BIGCHARARRAY", "BOOLEAN", "BYTE", "BYTEARRAY", "CHARARRAY", "DOUBLE", "FLOAT", "DATETIME", "GENERIC_WRITABLECOMPARABLE", "INTEGER", "INTERNALMAP", "LONG", "MAP", "TUPLE", "BIGINTEGER", "BIGDECIMAL" }; return names; } /** * Get a map of type values to type names. * @return map */ public static Map<Byte, String> genTypeToNameMap(){ byte[] types = genAllTypes(); String[] names = genAllTypeNames(); Map<Byte,String> ret = new HashMap<Byte, String>(); for(int i=0;i<types.length;i++){ ret.put(types[i], names[i]); } return ret; } /** * Get a map of type names to type values. * @return map */ public static Map<String, Byte> genNameToTypeMap(){ byte[] types = genAllTypes(); String[] names = genAllTypeNames(); Map<String, Byte> ret = new HashMap<String, Byte>(); for(int i=0;i<types.length;i++){ ret.put(names[i], types[i]); } return ret; } /** * Get the type name. * @param o Object to test. * @return type name, as a String. */ public static String findTypeName(Object o) { return findTypeName(findType(o)); } /** * Get the type name from the type byte code * @param dt Type byte code * @return type name, as a String. */ public static String findTypeName(byte dt) { switch (dt) { case NULL: return "NULL"; case BOOLEAN: return "boolean"; case BYTE: return "byte"; case INTEGER: return "int"; case BIGINTEGER: return "biginteger"; case BIGDECIMAL: return "bigdecimal"; case LONG: return "long"; case FLOAT: return "float"; case DOUBLE: return "double"; case DATETIME: return "datetime"; case BYTEARRAY: return "bytearray"; case BIGCHARARRAY: return "bigchararray"; case CHARARRAY: return "chararray"; case MAP: return "map"; case INTERNALMAP: return "internalmap"; case TUPLE: return "tuple"; case BAG: return "bag"; case GENERIC_WRITABLECOMPARABLE: return "generic_writablecomparable"; default: return "Unknown"; } } public static Class<?> findTypeClass(byte dt) { switch (dt) { case NULL: return Void.TYPE; case BOOLEAN: return Boolean.TYPE; case BYTE: return Byte.TYPE; case INTEGER: return Integer.TYPE; case BIGINTEGER: return BigInteger.class; case BIGDECIMAL: return BigDecimal.class; case LONG: return Long.TYPE; case FLOAT: return Float.TYPE; case DOUBLE: return Double.TYPE; case DATETIME: return DateTime.class; case BYTEARRAY: return DataByteArray.class; case BIGCHARARRAY: return String.class; case CHARARRAY: return String.class; case MAP: return Map.class; case INTERNALMAP: return InternalMap.class; case TUPLE: return Tuple.class; case BAG: return DataBag.class; case GENERIC_WRITABLECOMPARABLE: return WritableComparable.class; default: throw new RuntimeException("Invalid type has no corresponding class: " + dt); } } /** * Determine whether the this data type is complex. * @param dataType Data type code to test. * @return true if dataType is bag, tuple, or map. */ public static boolean isComplex(byte dataType) { return ((dataType == BAG) || (dataType == TUPLE) || (dataType == MAP) || (dataType == INTERNALMAP)); } /** * Determine whether the object is complex or atomic. * @param o Object to determine type of. * @return true if dataType is bag, tuple, or map. */ public static boolean isComplex(Object o) { return isComplex(findType(o)); } /** * Determine whether the this data type is atomic. * @param dataType Data type code to test. * @return true if dataType is bytearray, bigchararray, chararray, integer, long, * float, or boolean. */ public static boolean isAtomic(byte dataType) { return ((dataType == BYTEARRAY) || (dataType == CHARARRAY) || (dataType == BIGCHARARRAY) || (dataType == INTEGER) || (dataType == BIGINTEGER) || (dataType == BIGDECIMAL) || (dataType == LONG) || (dataType == FLOAT) || (dataType == DOUBLE) || (dataType == BOOLEAN) || (dataType == BYTE) || (dataType == DATETIME) || (dataType == GENERIC_WRITABLECOMPARABLE)); } /** * Determine whether the this data type is atomic. * @param o Object to determine type of. * @return true if dataType is bytearray, chararray, integer, long, * float, or boolean. */ public static boolean isAtomic(Object o) { return isAtomic(findType(o)); } /** * Determine whether the this object can have a schema. * @param o Object to determine if it has a schema * @return true if the type can have a valid schema (i.e., bag or tuple) */ public static boolean isSchemaType(Object o) { return isSchemaType(findType(o)); } /** * Determine whether the this data type can have a schema. * @param dataType dataType to determine if it has a schema * @return true if the type can have a valid schema (i.e., bag or tuple) */ public static boolean isSchemaType(byte dataType) { return ((dataType == BAG) || (dataType == TUPLE) || dataType == MAP); } /** /** * Compare two objects to each other. This function is necessary * because there's no super class that implements compareTo. This * function provides an (arbitrary) ordering of objects of different * types as follows: NULL < BOOLEAN < BYTE < INTEGER < LONG < * FLOAT < DOUBLE < DATETIME < BYTEARRAY < STRING < MAP < * TUPLE < BAG. No other functions should implement this cross * object logic. They should call this function for it instead. * @param o1 First object * @param o2 Second object * @return -1 if o1 is less, 0 if they are equal, 1 if o2 is less. */ public static int compare(Object o1, Object o2) { byte dt1 = findType(o1); byte dt2 = findType(o2); return compare(o1, o2, dt1, dt2); } /** * Same as {@link #compare(Object, Object)}, but does not use reflection to determine the type * of passed in objects, relying instead on the caller to provide the appropriate values, as * determined by {@link DataType#findType(Object)}. * * Use this version in cases where multiple objects of the same type have to be repeatedly compared. * @param o1 first object * @param o2 second object * @param dt1 type, as byte value, of o1 * @param dt2 type, as byte value, of o2 * @return -1 if o1 is < o2, 0 if they are equal, 1 if o1 > o2 */ @SuppressWarnings("unchecked") public static int compare(Object o1, Object o2, byte dt1, byte dt2) { if (dt1 == dt2) { if(o1 == null) { if(o2 == null) { return 0; } else { return -1; } } else { if(o2 == null) { return 1; } } switch (dt1) { case NULL: return 0; case BOOLEAN: return ((Boolean)o1).compareTo((Boolean)o2); case BYTE: return ((Byte)o1).compareTo((Byte)o2); case INTEGER: return ((Integer)o1).compareTo((Integer)o2); case LONG: return ((Long)o1).compareTo((Long)o2); case FLOAT: return ((Float)o1).compareTo((Float)o2); case DOUBLE: return ((Double)o1).compareTo((Double)o2); case DATETIME: return ((DateTime)o1).compareTo((DateTime)o2); case BYTEARRAY: return ((DataByteArray)o1).compareTo(o2); case CHARARRAY: return ((String)o1).compareTo((String)o2); case BIGINTEGER: return ((BigInteger)o1).compareTo((BigInteger)o2); case BIGDECIMAL: return ((BigDecimal)o1).compareTo((BigDecimal)o2); case MAP: { Map<String, Object> m1 = (Map<String, Object>)o1; Map<String, Object> m2 = (Map<String, Object>)o2; int sz1 = m1.size(); int sz2 = m2.size(); if (sz1 < sz2) { return -1; } else if (sz1 > sz2) { return 1; } else { // This is bad, but we have to sort the keys of the maps in order // to be commutative. TreeMap<String, Object> tm1 = new TreeMap<String, Object>(m1); TreeMap<String, Object> tm2 = new TreeMap<String, Object>(m2); Iterator<Map.Entry<String, Object> > i1 = tm1.entrySet().iterator(); Iterator<Map.Entry<String, Object> > i2 = tm2.entrySet().iterator(); while (i1.hasNext()) { Map.Entry<String, Object> entry1 = i1.next(); Map.Entry<String, Object> entry2 = i2.next(); int c = entry1.getKey().compareTo(entry2.getKey()); if (c != 0) { return c; } else { c = compare(entry1.getValue(), entry2.getValue()); if (c != 0) { return c; } } } return 0; } } case GENERIC_WRITABLECOMPARABLE: return ((Comparable)o1).compareTo(o2); case INTERNALMAP: return -1; // Don't think anyway will want to do this. case TUPLE: return ((Tuple)o1).compareTo(o2); case BAG: return ((DataBag)o1).compareTo(o2); default: throw new RuntimeException("Unkown type " + dt1 + " in compare"); } } else if (dt1 < dt2) { return -1; } else { return 1; } } public static byte[] toBytes(Object o) throws ExecException { return toBytes(o, findType(o)); } @SuppressWarnings("unchecked") public static byte[] toBytes(Object o, byte type) throws ExecException { switch (type) { case BOOLEAN: //return ((Boolean) o).booleanValue() ? new byte[] {1} : new byte[] {0}; return ((Boolean) o).toString().getBytes(); case BYTE: return new byte[] {((Byte) o)}; case BIGINTEGER: case BIGDECIMAL: case INTEGER: case DOUBLE: case FLOAT: case LONG: return ((Number) o).toString().getBytes(); case DATETIME: return ((DateTime) o).toString().getBytes(); case CHARARRAY: return ((String) o).getBytes(); case MAP: return mapToString((Map<String, Object>) o).getBytes(); case TUPLE: return ((Tuple) o).toString().getBytes(); case BYTEARRAY: return ((DataByteArray) o).get(); case BAG: return ((DataBag) o).toString().getBytes(); case NULL: return null; default: int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a ByteArray"; throw new ExecException(msg, errCode, PigException.INPUT); } } /** * Force a data object to a Boolean, if possible. Any numeric type can be * forced to a Boolean, as well as CharArray, ByteArray. Complex types * cannot be forced to a Boolean. This isn't particularly efficient, so if * you already <b>know</b> that the object you have is a Boolean you should * just cast it. * * @param o * object to cast * @param type * of the object you are casting * @return The object as a Boolean. * @throws ExecException * if the type can't be forced to a Boolean. */ public static Boolean toBoolean(Object o, byte type) throws ExecException { try { switch (type) { case NULL: return null; case BOOLEAN: return (Boolean) o; case BYTE: return Boolean.valueOf(((Byte) o).byteValue() != 0); case INTEGER: return Boolean.valueOf(((Integer) o).intValue() != 0); case LONG: return Boolean.valueOf(((Long) o).longValue() != 0L); case BIGINTEGER: return Boolean.valueOf(!BigInteger.ZERO.equals(((BigInteger) o))); case BIGDECIMAL: return Boolean.valueOf(!BigDecimal.ZERO.equals(((BigDecimal) o))); case FLOAT: return Boolean.valueOf(((Float) o).floatValue() != 0.0F); case DOUBLE: return Boolean.valueOf(((Double) o).doubleValue() != 0.0D); case BYTEARRAY: String str = ((DataByteArray) o).toString(); if (str.equalsIgnoreCase("true")) { return Boolean.TRUE; } else if (str.equalsIgnoreCase("false")) { return Boolean.FALSE; } else { return null; } case CHARARRAY: if (((String) o).equalsIgnoreCase("true")) { return Boolean.TRUE; } else if (((String) o).equalsIgnoreCase("false")) { return Boolean.FALSE; } else { return null; } case DATETIME: case MAP: case INTERNALMAP: case TUPLE: case BAG: case UNKNOWN: default: int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a Boolean"; throw new ExecException(msg, errCode, PigException.INPUT); } } catch (ClassCastException cce) { throw cce; } catch (ExecException ee) { throw ee; } catch (NumberFormatException nfe) { int errCode = 1074; String msg = "Problem with formatting. Could not convert " + o + " to Float."; throw new ExecException(msg, errCode, PigException.INPUT, nfe); } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to Float."; throw new ExecException(msg, errCode, PigException.BUG); } } public static Boolean toBoolean(Object o) throws ExecException { return toBoolean(o, findType(o)); } /** * Force a data object to an Integer, if possible. Any numeric type * can be forced to an Integer (though precision may be lost), as well * as CharArray, ByteArray, or Boolean. Complex types cannot be * forced to an Integer. This isn't particularly efficient, so if you * already <b>know</b> that the object you have is an Integer you * should just cast it. * @param o object to cast * @param type of the object you are casting * @return The object as an Integer. * @throws ExecException if the type can't be forced to an Integer. */ public static Integer toInteger(Object o,byte type) throws ExecException { try { switch (type) { case BOOLEAN: if (((Boolean)o) == true) { return Integer.valueOf(1); } else { return Integer.valueOf(0); } case BYTE: return Integer.valueOf(((Byte)o).intValue()); case INTEGER: return (Integer)o; case LONG: return Integer.valueOf(((Long)o).intValue()); case FLOAT: return Integer.valueOf(((Float)o).intValue()); case DOUBLE: return Integer.valueOf(((Double)o).intValue()); case BYTEARRAY: return Integer.valueOf(((DataByteArray)o).toString()); case CHARARRAY: return Integer.valueOf((String)o); case BIGINTEGER: return Integer.valueOf(((BigInteger)o).intValue()); case BIGDECIMAL: return Integer.valueOf(((BigDecimal)o).intValue()); case NULL: return null; case DATETIME: return Integer.valueOf(Long.valueOf(((DateTime)o).getMillis()).intValue()); case MAP: case INTERNALMAP: case TUPLE: case BAG: case UNKNOWN: default: int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to an Integer"; throw new ExecException(msg, errCode, PigException.INPUT); } } catch (ClassCastException cce) { throw cce; } catch (ExecException ee) { throw ee; } catch (NumberFormatException nfe) { int errCode = 1074; String msg = "Problem with formatting. Could not convert " + o + " to Integer."; throw new ExecException(msg, errCode, PigException.INPUT, nfe); } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to Integer."; throw new ExecException(msg, errCode, PigException.BUG); } } /** * Force a data object to an Integer, if possible. Any numeric type * can be forced to an Integer (though precision may be lost), as well * as CharArray, ByteArray, or Boolean. Complex types cannot be * forced to an Integer. This isn't particularly efficient, so if you * already <b>know</b> that the object you have is an Integer you * should just cast it. Unlike {@link #toInteger(Object, byte)} this * method will first determine the type of o and then do the cast. * Use {@link #toInteger(Object, byte)} if you already know the type. * @param o object to cast * @return The object as an Integer. * @throws ExecException if the type can't be forced to an Integer. */ public static Integer toInteger(Object o) throws ExecException { return toInteger(o, findType(o)); } /** * Force a data object to a Long, if possible. Any numeric type * can be forced to a Long (though precision may be lost), as well * as CharArray, ByteArray, or Boolean. Complex types cannot be * forced to a Long. This isn't particularly efficient, so if you * already <b>know</b> that the object you have is a Long you * should just cast it. * @param o object to cast * @param type of the object you are casting * @return The object as a Long. * @throws ExecException if the type can't be forced to a Long. */ public static Long toLong(Object o,byte type) throws ExecException { try { switch (type) { case BOOLEAN: if (((Boolean)o) == true) { return Long.valueOf(1); } else { return Long.valueOf(0); } case BYTE: return Long.valueOf(((Byte)o).longValue()); case INTEGER: return Long.valueOf(((Integer)o).longValue()); case LONG: return (Long)o; case FLOAT: return Long.valueOf(((Float)o).longValue()); case DOUBLE: return Long.valueOf(((Double)o).longValue()); case BYTEARRAY: return Long.valueOf(((DataByteArray)o).toString()); case CHARARRAY: return Long.valueOf((String)o); case BIGINTEGER: return Long.valueOf(((BigInteger)o).longValue()); case BIGDECIMAL: return Long.valueOf(((BigDecimal)o).longValue()); case NULL: return null; case DATETIME: return Long.valueOf(((DateTime)o).getMillis()); case MAP: case INTERNALMAP: case TUPLE: case BAG: case UNKNOWN: default: int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a Long"; throw new ExecException(msg, errCode, PigException.INPUT); } } catch (ClassCastException cce) { throw cce; } catch (ExecException ee) { throw ee; } catch (NumberFormatException nfe) { int errCode = 1074; String msg = "Problem with formatting. Could not convert " + o + " to Long."; throw new ExecException(msg, errCode, PigException.INPUT, nfe); } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to Long."; throw new ExecException(msg, errCode, PigException.BUG); } } /** * Force a data object to a Long, if possible. Any numeric type * can be forced to a Long (though precision may be lost), as well * as CharArray, ByteArray, or Boolean. Complex types cannot be * forced to an Long. This isn't particularly efficient, so if you * already <b>know</b> that the object you have is a Long you * should just cast it. Unlike {@link #toLong(Object, byte)} this * method will first determine the type of o and then do the cast. * Use {@link #toLong(Object, byte)} if you already know the type. * @param o object to cast * @return The object as a Long. * @throws ExecException if the type can't be forced to an Long. */ public static Long toLong(Object o) throws ExecException { return toLong(o, findType(o)); } /** * Force a data object to a Float, if possible. Any numeric type * can be forced to a Float (though precision may be lost), as well * as CharArray, ByteArray. Complex types cannot be * forced to a Float. This isn't particularly efficient, so if you * already <b>know</b> that the object you have is a Float you * should just cast it. * @param o object to cast * @param type of the object you are casting * @return The object as a Float. * @throws ExecException if the type can't be forced to a Float. */ public static Float toFloat(Object o,byte type) throws ExecException { try { switch (type) { case BOOLEAN: return (Boolean) o ? Float.valueOf(1.0F) : Float.valueOf(0.0F); case INTEGER: return new Float(((Integer)o).floatValue()); case LONG: return new Float(((Long)o).floatValue()); case FLOAT: return (Float)o; case DOUBLE: return new Float(((Double)o).floatValue()); case DATETIME: return new Float(Long.valueOf(((DateTime)o).getMillis()).floatValue()); case BYTEARRAY: return Float.valueOf(((DataByteArray)o).toString()); case CHARARRAY: return Float.valueOf((String)o); case BIGINTEGER: return Float.valueOf(((BigInteger)o).floatValue()); case BIGDECIMAL: return Float.valueOf(((BigDecimal)o).floatValue()); case NULL: return null; case BYTE: case MAP: case INTERNALMAP: case TUPLE: case BAG: case UNKNOWN: default: int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a Float"; throw new ExecException(msg, errCode, PigException.INPUT); } } catch (ClassCastException cce) { throw cce; } catch (ExecException ee) { throw ee; } catch (NumberFormatException nfe) { int errCode = 1074; String msg = "Problem with formatting. Could not convert " + o + " to Float."; throw new ExecException(msg, errCode, PigException.INPUT, nfe); } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to Float."; throw new ExecException(msg, errCode, PigException.BUG); } } /** * Force a data object to a Float, if possible. Any numeric type * can be forced to a Float (though precision may be lost), as well * as CharArray, ByteArray, or Boolean. Complex types cannot be * forced to an Float. This isn't particularly efficient, so if you * already <b>know</b> that the object you have is a Float you * should just cast it. Unlike {@link #toFloat(Object, byte)} this * method will first determine the type of o and then do the cast. * Use {@link #toFloat(Object, byte)} if you already know the type. * @param o object to cast * @return The object as a Float. * @throws ExecException if the type can't be forced to an Float. */ public static Float toFloat(Object o) throws ExecException { return toFloat(o, findType(o)); } /** * Force a data object to a Double, if possible. Any numeric type * can be forced to a Double, as well * as CharArray, ByteArray. Complex types cannot be * forced to a Double. This isn't particularly efficient, so if you * already <b>know</b> that the object you have is a Double you * should just cast it. * @param o object to cast * @param type of the object you are casting * @return The object as a Double. * @throws ExecException if the type can't be forced to a Double. */ public static Double toDouble(Object o,byte type) throws ExecException { try { switch (type) { case BOOLEAN: return (Boolean) o ? Double.valueOf(1.0D) : Double.valueOf(0.0D); case INTEGER: return new Double(((Integer)o).doubleValue()); case LONG: return new Double(((Long)o).doubleValue()); case FLOAT: return new Double(((Float)o).doubleValue()); case DOUBLE: return (Double)o; case DATETIME: return new Double(Long.valueOf(((DateTime)o).getMillis()).doubleValue()); case BYTEARRAY: return Double.valueOf(((DataByteArray)o).toString()); case CHARARRAY: return Double.valueOf((String)o); case BIGINTEGER: return Double.valueOf(((BigInteger)o).doubleValue()); case BIGDECIMAL: return Double.valueOf(((BigDecimal)o).doubleValue()); case NULL: return null; case BYTE: case MAP: case INTERNALMAP: case TUPLE: case BAG: case UNKNOWN: default: int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a Double"; throw new ExecException(msg, errCode, PigException.INPUT); } } catch (ClassCastException cce) { throw cce; } catch (ExecException ee) { throw ee; } catch (NumberFormatException nfe) { int errCode = 1074; String msg = "Problem with formatting. Could not convert " + o + " to Double."; throw new ExecException(msg, errCode, PigException.INPUT, nfe); } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to Double."; throw new ExecException(msg, errCode, PigException.BUG); } } /** * Force a data object to a DateTime, if possible. Only CharArray, ByteArray * can be forced to a DateTime. Numeric types and complex types * cannot be forced to a DateTime. This isn't particularly efficient, so if * you already <b>know</b> that the object you have is a DateTime you should * just cast it. * * @param o * object to cast * @param type * of the object you are casting * @return The object as a Boolean. * @throws ExecException * if the type can't be forced to a Boolean. */ public static DateTime toDateTime(Object o, byte type) throws ExecException { try { switch (type) { case NULL: return null; case BYTEARRAY: return new DateTime(((DataByteArray) o).toString()); case CHARARRAY: // the string can contain just date part or date part plus time part return ToDate.extractDateTime((String) o); case INTEGER: return new DateTime(((Integer) o).longValue()); case LONG: return new DateTime(((Long) o).longValue()); case FLOAT: return new DateTime(((Float) o).longValue()); case DOUBLE: return new DateTime(((Double) o).longValue()); case BIGINTEGER: return new DateTime(((BigInteger) o).longValue()); case BIGDECIMAL: return new DateTime(((BigDecimal) o).longValue()); case DATETIME: return (DateTime) o; case BOOLEAN: case BYTE: case MAP: case INTERNALMAP: case TUPLE: case BAG: case UNKNOWN: default: int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a Boolean"; throw new ExecException(msg, errCode, PigException.INPUT); } } catch (ClassCastException cce) { throw cce; } catch (ExecException ee) { throw ee; } catch (NumberFormatException nfe) { int errCode = 1074; String msg = "Problem with formatting. Could not convert " + o + " to Float."; throw new ExecException(msg, errCode, PigException.INPUT, nfe); } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to Float."; throw new ExecException(msg, errCode, PigException.BUG); } } public static DateTime toDateTime(Object o) throws ExecException { return toDateTime(o, findType(o)); } /** * Force a data object to a Double, if possible. Any numeric type * can be forced to a Double, as well * as CharArray, ByteArray, or Boolean. Complex types cannot be * forced to an Double. This isn't particularly efficient, so if you * already <b>know</b> that the object you have is a Double you * should just cast it. Unlike {@link #toDouble(Object, byte)} this * method will first determine the type of o and then do the cast. * Use {@link #toDouble(Object, byte)} if you already know the type. * @param o object to cast * @return The object as a Double. * @throws ExecException if the type can't be forced to an Double. */ public static Double toDouble(Object o) throws ExecException { return toDouble(o, findType(o)); } public static BigInteger toBigInteger(Object o) throws ExecException { return toBigInteger(o, findType(o)); } public static BigInteger toBigInteger(Object o,byte type) throws ExecException { try { switch (type) { case BOOLEAN: return (Boolean) o ? BigInteger.ONE : BigInteger.ZERO; case INTEGER: return BigInteger.valueOf(((Integer)o).longValue()); case LONG: return BigInteger.valueOf(((Long)o).longValue()); case FLOAT: return BigInteger.valueOf(((Float)o).longValue()); case DOUBLE: return BigInteger.valueOf(((Double)o).longValue()); case BYTEARRAY: return new BigInteger(((DataByteArray)o).toString()); case CHARARRAY: return new BigInteger((String)o); case BIGINTEGER: return (BigInteger)o; case BIGDECIMAL: return ((BigDecimal)o).toBigInteger(); case DATETIME: return BigInteger.valueOf(((DateTime)o).getMillis()); case NULL: return null; case BYTE: case MAP: case INTERNALMAP: case TUPLE: case BAG: case UNKNOWN: default: int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a BigInteger."; throw new ExecException(msg, errCode, PigException.INPUT); } } catch (ClassCastException cce) { throw cce; } catch (ExecException ee) { throw ee; } catch (NumberFormatException nfe) { int errCode = 1074; String msg = "Problem with formatting. Could not convert " + o + " to BigInteger."; throw new ExecException(msg, errCode, PigException.INPUT, nfe); } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to BigInteger."; throw new ExecException(msg, errCode, PigException.BUG); } } public static BigDecimal toBigDecimal(Object o) throws ExecException { return toBigDecimal(o, findType(o)); } public static BigDecimal toBigDecimal(Object o,byte type) throws ExecException { try { switch (type) { case BOOLEAN: return (Boolean) o ? BigDecimal.ONE : BigDecimal.ZERO; case INTEGER: return BigDecimal.valueOf(((Integer)o).longValue()); case LONG: return BigDecimal.valueOf(((Long)o).longValue()); case FLOAT: return BigDecimal.valueOf(((Float)o).doubleValue()); case DOUBLE: return BigDecimal.valueOf(((Double)o).doubleValue()); case BYTEARRAY: return new BigDecimal(((DataByteArray)o).toString()); case CHARARRAY: return new BigDecimal((String)o); case BIGINTEGER: return new BigDecimal((BigInteger)o); case BIGDECIMAL: return (BigDecimal)o; case DATETIME: return BigDecimal.valueOf(((DateTime)o).getMillis()); case NULL: return null; case BYTE: case MAP: case INTERNALMAP: case TUPLE: case BAG: case UNKNOWN: default: int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a BigDecimal."; throw new ExecException(msg, errCode, PigException.INPUT); } } catch (ClassCastException cce) { throw cce; } catch (ExecException ee) { throw ee; } catch (NumberFormatException nfe) { int errCode = 1074; String msg = "Problem with formatting. Could not convert " + o + " to BigDecimal."; throw new ExecException(msg, errCode, PigException.INPUT, nfe); } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to BigDecimal."; throw new ExecException(msg, errCode, PigException.BUG); } } /** * Force a data object to a String, if possible. Any simple (atomic) type * can be forced to a String including ByteArray. Complex types cannot be * forced to a String. This isn't particularly efficient, so if you * already <b>know</b> that the object you have is a String you * should just cast it. * @param o object to cast * @param type of the object you are casting * @return The object as a String. * @throws ExecException if the type can't be forced to a String. */ public static String toString(Object o,byte type) throws ExecException { try { switch (type) { case INTEGER: return ((Integer)o).toString(); case LONG: return ((Long)o).toString(); case FLOAT: return ((Float)o).toString(); case DOUBLE: return ((Double)o).toString(); case DATETIME: return ((DateTime)o).toString(); case BYTEARRAY: return ((DataByteArray)o).toString(); case CHARARRAY: return ((String)o); case BIGINTEGER: return ((BigInteger)o).toString(); case BIGDECIMAL: return ((BigDecimal)o).toString(); case NULL: return null; case BOOLEAN: return ((Boolean)o).toString(); case BYTE: return ((Byte)o).toString(); case MAP: case INTERNALMAP: case TUPLE: case BAG: case UNKNOWN: default: int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a String"; throw new ExecException(msg, errCode, PigException.INPUT); } } catch (ClassCastException cce) { throw cce; } catch (ExecException ee) { throw ee; } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to String."; throw new ExecException(msg, errCode, PigException.BUG); } } /** * Force a data object to a String, if possible. Any simple (atomic) type * can be forced to a String including ByteArray. Complex types cannot be * forced to a String. This isn't particularly efficient, so if you * already <b>know</b> that the object you have is a String you * should just cast it. Unlike {@link #toString(Object, byte)} this * method will first determine the type of o and then do the cast. * Use {@link #toString(Object, byte)} if you already know the type. * @param o object to cast * @return The object as a String. * @throws ExecException if the type can't be forced to a String. */ public static String toString(Object o) throws ExecException { return toString(o, findType(o)); } /** * If this object is a map, return it as a map. * This isn't particularly efficient, so if you * already <b>know</b> that the object you have is a Map you * should just cast it. * @param o object to cast * @return The object as a Map. * @throws ExecException if the type can't be forced to a Double. */ @SuppressWarnings("unchecked") public static Map<String, Object> toMap(Object o) throws ExecException { if (o == null) { return null; } if (o instanceof Map && !(o instanceof InternalMap)) { try { return (Map<String, Object>)o; } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to Map."; throw new ExecException(msg, errCode, PigException.BUG); } } else { int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a Map"; throw new ExecException(msg, errCode, PigException.INPUT); } } /** * If this object is a tuple, return it as a tuple. * This isn't particularly efficient, so if you * already <b>know</b> that the object you have is a Tuple you * should just cast it. * @param o object to cast * @return The object as a Double. * @throws ExecException if the type can't be forced to a Double. */ public static Tuple toTuple(Object o) throws ExecException { if (o == null) { return null; } if (o instanceof Tuple) { try { return (Tuple)o; } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to Tuple."; throw new ExecException(msg, errCode, PigException.BUG); } } else { int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a Tuple"; throw new ExecException(msg, errCode, PigException.INPUT); } } /** * If this object is a bag, return it as a bag. * This isn't particularly efficient, so if you * already <b>know</b> that the object you have is a bag you * should just cast it. * @param o object to cast * @return The object as a Double. * @throws ExecException if the type can't be forced to a Double. */ public static DataBag toBag(Object o) throws ExecException { if (o == null) { return null; } if (o instanceof DataBag) { try { return (DataBag)o; } catch (Exception e) { int errCode = 2054; String msg = "Internal error. Could not convert " + o + " to Bag."; throw new ExecException(msg, errCode, PigException.BUG); } } else { int errCode = 1071; String msg = "Cannot convert a " + findTypeName(o) + " to a DataBag"; throw new ExecException(msg, errCode, PigException.INPUT); } } /** * Purely for debugging */ public static void spillTupleContents(Tuple t, String label) { System.out.print("Tuple " + label + " "); Iterator<Object> i = t.getAll().iterator(); for (int j = 0; i.hasNext(); j++) { System.out.print(j + ":" + i.next().getClass().getName() + " "); } System.out.println(t.toString()); } /** * Determine if this type is a numeric type. * @param t type (as byte value) to test * @return true if this is a numeric type, false otherwise */ public static boolean isNumberType(byte t) { switch (t) { case INTEGER: return true ; case LONG: return true ; case FLOAT: return true ; case DOUBLE: return true ; case BIGINTEGER: return true ; case BIGDECIMAL: return true ; default: return false ; } } /** * Determine if this is a type that can work can be done on. * @param t type (as a byte value) to test * @return false if the type is unknown, null, or error; true otherwise. */ public static boolean isUsableType(byte t) { switch (t) { case UNKNOWN: return false ; case NULL: return false ; case ERROR: return false ; default :return true ; } } /** * Test if one type can cast to the other. * @param castType data type of the cast type * @param inputType data type of the input * @return true or false */ public static boolean castable(byte castType, byte inputType) { // Only legal types can be cast to if ( (!DataType.isUsableType(castType)) || (!DataType.isUsableType(inputType)) ) { return false; } // Same type is castable if (castType==inputType) { return true; } // Numerical type is castable if ( (DataType.isNumberType(castType)) && (DataType.isNumberType(inputType)) ) { return true; } // databyte can cast to anything if (inputType == DataType.BYTEARRAY) { return true; } // Cast numerical type to string, or vice versa is valid if (DataType.isNumberType(inputType)&&castType==DataType.CHARARRAY || DataType.isNumberType(castType)&&inputType==DataType.CHARARRAY) return true; // else return false return false; } /** * Merge types if possible. Merging types means finding a type that one * or both types can be upcast to. * @param type1 * @param type2 * @return the merged type, or DataType.ERROR if not successful */ public static byte mergeType(byte type1, byte type2) { // Only legal types can be merged if ( (!DataType.isUsableType(type1)) || (!DataType.isUsableType(type2)) ) { return DataType.ERROR ; } // Same type is OK if (type1==type2) { return type1 ; } // Both are number so we return the bigger type if ( (DataType.isNumberType(type1)) && (DataType.isNumberType(type2)) ) { return type1>type2 ? type1:type2 ; } // One is bytearray and the other is (number or chararray) if (type1 == DataType.BYTEARRAY) { return type2 ; } if (type2 == DataType.BYTEARRAY) { return type1 ; } // else return just ERROR return DataType.ERROR ; } /** * Given a map, turn it into a String. * @param m map * @return string representation of the map */ public static String mapToString(Map<String, Object> m) { boolean hasNext = false; StringBuilder sb = new StringBuilder(); sb.append("["); for(Map.Entry<String, Object> e: m.entrySet()) { if(hasNext) { sb.append(","); } else { hasNext = true; } sb.append(e.getKey()); sb.append("#"); Object val = e.getValue(); if(val != null) { sb.append(val.toString()); } } sb.append("]"); return sb.toString(); } /** * Test whether two byte arrays (Java byte arrays not Pig byte arrays) are * equal. I have no idea why we have this function. * @param lhs byte array 1 * @param rhs byte array 2 * @return true if both are null or the two are the same length and have * the same bytes. */ public static boolean equalByteArrays(byte[] lhs, byte[] rhs) { if(lhs == null && rhs == null) { return true; } if(lhs == null || rhs == null) { return false; } if(lhs.length != rhs.length) { return false; } for(int i = 0; i < lhs.length; ++i) { if(lhs[i] != rhs[i]) { return false; } } return true; } /** * Utility method that determines the schema from the passed in dataType. * If the dataType is Bag or Tuple, then we need to determine the schemas inside this dataType; * for this we iterate through the fields inside this field. This method works both for raw objects * and ResourceSchema.ResourceFieldSchema field descriptions; the specific behavior is determined by the klass * parameter. * @param dataType DataType.CHARARRAY, DataType.TUPLE, and so on * @param fieldIter iterator over the fields if this is a tuple or a bag * @param fieldNum number of fields inside the field if a tuple * @param klass should be Object or ResourceSchema.ResourceFieldSchema * @return * @throws ExecException * @throws FrontendException * @throws SchemaMergeException */ @SuppressWarnings("deprecation") private static Schema.FieldSchema determineFieldSchema(byte dataType, Iterator fieldIter, long fieldNum, Class klass ) throws ExecException, FrontendException, SchemaMergeException { switch (dataType) { case NULL: return new Schema.FieldSchema(null, BYTEARRAY); case BOOLEAN: case INTEGER: case LONG: case FLOAT: case DOUBLE: case BIGINTEGER: case BIGDECIMAL: case DATETIME: case BYTEARRAY: case CHARARRAY: case MAP: return new Schema.FieldSchema(null, dataType); case TUPLE: { Schema schema = null; if(fieldNum != 0) { schema = new Schema(); for(int i = 0; i < fieldNum; ++i) { schema.add(determineFieldSchema(klass.cast(fieldIter.next()))); } } return new Schema.FieldSchema(null, schema, TUPLE); } case BAG: { Schema schema = null; Schema bagSchema = null; if(fieldNum != 0) { ArrayList<Schema> schemas = new ArrayList<Schema>(); while (fieldIter.hasNext() ) { schemas.add(determineFieldSchema(klass.cast(fieldIter.next())).schema); } schema = schemas.get(0); if(null == schema) { Schema.FieldSchema tupleFs = new Schema.FieldSchema(null, null, TUPLE); bagSchema = new Schema(tupleFs); bagSchema.setTwoLevelAccessRequired(true); return new Schema.FieldSchema(null, bagSchema, BAG); } int schemaSize = schema.size(); for(int i = 1; i < schemas.size(); ++i) { Schema currSchema = schemas.get(i); if((null == currSchema) || (currSchema.size() != schemaSize)) { Schema.FieldSchema tupleFs = new Schema.FieldSchema(null, null, TUPLE); bagSchema = new Schema(tupleFs); bagSchema.setTwoLevelAccessRequired(true); return new Schema.FieldSchema(null, bagSchema, BAG); } schema = Schema.mergeSchema(schema, currSchema, false, false, false); } Schema.FieldSchema tupleFs = new Schema.FieldSchema(null, schema, TUPLE); bagSchema = new Schema(tupleFs); // since this schema has tuple field schema which internally // has a list of field schemas for the actual items in the bag // an access to any field in the bag is a two level access bagSchema.setTwoLevelAccessRequired(true); } return new Schema.FieldSchema(null, bagSchema, BAG); } default: { int errCode = 1073; String msg = "Cannot determine field schema"; throw new ExecException(msg, errCode, PigException.INPUT); } } } /*** * Determine the field schema of an ResourceFieldSchema * @param rcFieldSchema the rcFieldSchema we want translated * @return the field schema corresponding to the object * @throws ExecException,FrontendException,SchemaMergeException */ public static Schema.FieldSchema determineFieldSchema(ResourceSchema.ResourceFieldSchema rcFieldSchema) throws ExecException, FrontendException, SchemaMergeException { byte dt = rcFieldSchema.getType(); Iterator<ResourceSchema.ResourceFieldSchema> fieldIter = null; long fieldNum = 0; if (dt == TUPLE || dt == BAG ) { fieldIter = Arrays.asList(rcFieldSchema.getSchema().getFields()).iterator(); fieldNum = rcFieldSchema.getSchema().getFields().length; } return determineFieldSchema(dt, fieldIter, fieldNum, ResourceSchema.ResourceFieldSchema.class); } /*** * Determine the field schema of an object * @param o the object whose field schema is to be determined * @return the field schema corresponding to the object * @throws ExecException,FrontendException,SchemaMergeException */ public static Schema.FieldSchema determineFieldSchema(Object o) throws ExecException, FrontendException, SchemaMergeException { byte dt = findType(o); Iterator fieldIter = null; long fieldNum = 0; if ( dt == TUPLE ) { fieldIter = ((Tuple) o).getAll().iterator(); fieldNum = ((Tuple) o).size(); } else if ( dt == BAG ) { fieldNum = ((DataBag) o).size(); fieldIter = ((DataBag)o).iterator(); } return determineFieldSchema(dt, fieldIter, fieldNum, Object.class); } }