/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hive.hcatalog.pig;

import java.io.IOException;
import java.math.BigDecimal;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreMetadata;
import org.apache.pig.backend.BackendException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;

/**
 * Base class for HCatStorer and HCatEximStorer
 */
abstract class HCatBaseStorer extends StoreFunc implements StoreMetadata {
  private static final Logger LOG = LoggerFactory.getLogger(HCatBaseStorer.class);
  private static final List<Type> SUPPORTED_INTEGER_CONVERSIONS =
    Lists.newArrayList(Type.TINYINT, Type.SMALLINT, Type.INT);
  protected static final String COMPUTED_OUTPUT_SCHEMA = "hcat.output.schema";
  protected final List<String> partitionKeys;
  protected final Map<String, String> partitions;
  protected Schema pigSchema;
  private RecordWriter<WritableComparable<?>, HCatRecord> writer;
  protected HCatSchema computedSchema;
  protected static final String PIG_SCHEMA = "hcat.pig.store.schema";
  /**
   * Controls what happens when incoming Pig value is out-of-range for target Hive column
   */
  static final String ON_OOR_VALUE_OPT = "onOutOfRangeValue";
  /**
   * prop name in Configuration/context
   */
  static final String ON_OORA_VALUE_PROP = "hcat.pig.store.onoutofrangevalue";
  /**
   * valid values for {@link #ON_OOR_VALUE_OPT}
   */
  public static enum OOR_VALUE_OPT_VALUES {Null, Throw}
  protected String sign;
  // it's key that this is a per-HCatStorer-instance object
  private final DataLossLogger dataLossLogger = new DataLossLogger();
  private final OOR_VALUE_OPT_VALUES onOutOfRange;
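  /**
   * {@code partSpecs} is a comma-separated list of {@code key=value} partition assignments;
   * {@code schema} is an optional Pig schema string for the data being stored. An illustrative
   * Pig Latin invocation (table and alias names here are hypothetical):
   * <pre>{@code
   *   A = LOAD 'input' AS (emp_id:int, emp_name:chararray);
   *   STORE A INTO 'default.employee'
   *     USING org.apache.hive.hcatalog.pig.HCatStorer('ds=20110924,region=us');
   * }</pre>
   * would reach this constructor with {@code partSpecs = "ds=20110924,region=us"} and a
   * {@code null} schema.
   */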
  public HCatBaseStorer(String partSpecs, String schema) throws Exception {
    partitionKeys = new ArrayList<String>();
    partitions = new HashMap<String, String>();
    if (partSpecs != null && !partSpecs.trim().isEmpty()) {
      String[] partKVPs = partSpecs.split(",");
      for (String partKVP : partKVPs) {
        String[] partKV = partKVP.split("=");
        if (partKV.length == 2) {
          String partKey = partKV[0].trim();
          partitionKeys.add(partKey);
          partitions.put(partKey, partKV[1].trim());
        } else {
          throw new FrontendException("Invalid partition column specification. " + partSpecs,
            PigHCatUtil.PIG_EXCEPTION_CODE);
        }
      }
    }
    if (schema != null && !schema.trim().isEmpty()) {
      pigSchema = Utils.getSchemaFromString(schema);
    }
    Properties udfProps = UDFContext.getUDFContext().getUDFProperties(this.getClass(),
      new String[]{sign});
    onOutOfRange = OOR_VALUE_OPT_VALUES.valueOf(
      udfProps.getProperty(ON_OORA_VALUE_PROP, getDefaultValue().name()));
  }

  static OOR_VALUE_OPT_VALUES getDefaultValue() {
    return OOR_VALUE_OPT_VALUES.Null;
  }

  @Override
  public void checkSchema(ResourceSchema resourceSchema) throws IOException {
    /* The schema provided by the user and the schema computed by Pig
     * at the time of calling store must match.
     */
    Schema runtimeSchema = Schema.getPigSchema(resourceSchema);
    if (pigSchema != null) {
      if (!Schema.equals(runtimeSchema, pigSchema, false, true)) {
        throw new FrontendException("Schema provided in store statement doesn't match with the " +
          "Schema returned by Pig run-time. Schema provided in HCatStorer: " + pigSchema.toString() +
          " Schema received from Pig runtime: " + runtimeSchema.toString(),
          PigHCatUtil.PIG_EXCEPTION_CODE);
      }
    } else {
      pigSchema = runtimeSchema;
    }
    UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{sign})
      .setProperty(PIG_SCHEMA, ObjectSerializer.serialize(pigSchema));
  }
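  // Note on lifecycle: checkSchema() runs on the Pig front end, while StoreFunc instances are
  // re-created on the backend (task) side, so state is handed off through UDFContext properties:
  // the Pig schema is serialized above under PIG_SCHEMA, and the computed HCat output schema is
  // read back under COMPUTED_OUTPUT_SCHEMA in prepareToWrite() below.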
  /**
   * Constructs HCatSchema from pigSchema. Passed tableSchema is the existing
   * schema of the table in metastore.
   */
  protected HCatSchema convertPigSchemaToHCatSchema(Schema pigSchema, HCatSchema tableSchema)
      throws FrontendException {
    if (LOG.isDebugEnabled()) {
      LOG.debug("convertPigSchemaToHCatSchema(pigSchema,tblSchema)=(" + pigSchema + "," + tableSchema + ")");
    }
    List<HCatFieldSchema> fieldSchemas = new ArrayList<HCatFieldSchema>(pigSchema.size());
    for (FieldSchema fSchema : pigSchema.getFields()) {
      try {
        HCatFieldSchema hcatFieldSchema = getColFromSchema(fSchema.alias, tableSchema);
        // If writing to a partitioned table, pigSchema will have more columns than tableSchema:
        // partition columns are not part of tableSchema... e.g. TestHCatStorer#testPartColsInData()
        //HCatUtil.assertNotNull(hcatFieldSchema, "Nothing matching '" + fSchema.alias + "' found " +
        //  "in target table schema", LOG);
        fieldSchemas.add(getHCatFSFromPigFS(fSchema, hcatFieldSchema, pigSchema, tableSchema));
      } catch (HCatException he) {
        throw new FrontendException(he.getMessage(), PigHCatUtil.PIG_EXCEPTION_CODE, he);
      }
    }
    HCatSchema s = new HCatSchema(fieldSchemas);
    LOG.debug("convertPigSchemaToHCatSchema(computed)=(" + s + ")");
    return s;
  }
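  /**
   * Decides whether the single-field tuple that Pig wraps around each bag element should be
   * discarded when mapping a Pig bag to an HCat array. For example (illustrative schemas):
   * a Pig bag {@code {(x:int)}} maps to {@code array<int>}, dropping the tuple wrapper, while
   * a bag of multi-field tuples such as {@code {(x:int, y:chararray)}} maps to
   * {@code array<struct<...>>} and the tuple is kept.
   */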
  public static boolean removeTupleFromBag(HCatFieldSchema hcatFieldSchema, FieldSchema bagFieldSchema)
      throws HCatException {
    if (hcatFieldSchema != null
        && hcatFieldSchema.getArrayElementSchema().get(0).getType() != Type.STRUCT) {
      return true;
    }
    // Column was not found in table schema. It's a new column.
    List<FieldSchema> tupSchema = bagFieldSchema.schema.getFields();
    if (hcatFieldSchema == null && tupSchema.size() == 1
        && (tupSchema.get(0).schema == null
            || (tupSchema.get(0).type == DataType.TUPLE && tupSchema.get(0).schema.size() == 1))) {
      return true;
    }
    return false;
  }

  /**
   * Here we are processing the HCat table schema as derived from the metastore;
   * thus it should have information about all fields/sub-fields, but not about partition columns.
   */
  private HCatFieldSchema getHCatFSFromPigFS(FieldSchema fSchema, HCatFieldSchema hcatFieldSchema,
      Schema pigSchema, HCatSchema tableSchema) throws FrontendException, HCatException {
    if (hcatFieldSchema == null) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("hcatFieldSchema is null for fSchema '" + fSchema.alias + "'");
        //throw new IllegalArgumentException("hcatFieldSchema is null; fSchema=" + fSchema + " " +
        //  "(pigSchema, tableSchema)=(" + pigSchema + "," + tableSchema + ")");
      }
    }
    byte type = fSchema.type;
    switch (type) {
    case DataType.CHARARRAY:
    case DataType.BIGCHARARRAY:
      if (hcatFieldSchema != null && hcatFieldSchema.getTypeInfo() != null) {
        return new HCatFieldSchema(fSchema.alias, hcatFieldSchema.getTypeInfo(), null);
      }
      return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.stringTypeInfo, null);
    case DataType.INTEGER:
      if (hcatFieldSchema != null) {
        if (!SUPPORTED_INTEGER_CONVERSIONS.contains(hcatFieldSchema.getType())) {
          throw new FrontendException("Unsupported type: " + type + " in Pig's schema",
            PigHCatUtil.PIG_EXCEPTION_CODE);
        }
        return new HCatFieldSchema(fSchema.alias, hcatFieldSchema.getTypeInfo(), null);
      }
      return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.intTypeInfo, null);
    case DataType.LONG:
      return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.longTypeInfo, null);
    case DataType.FLOAT:
      return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.floatTypeInfo, null);
    case DataType.DOUBLE:
      return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.doubleTypeInfo, null);
    case DataType.BYTEARRAY:
      return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.binaryTypeInfo, null);
    case DataType.BOOLEAN:
      return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.booleanTypeInfo, null);
    case DataType.DATETIME:
      // Pig DATETIME can map to DATE or TIMESTAMP (see HCatBaseStorer#validateSchema()),
      // which is controlled by the Hive target table information.
      if (hcatFieldSchema != null && hcatFieldSchema.getTypeInfo() != null) {
        return new HCatFieldSchema(fSchema.alias, hcatFieldSchema.getTypeInfo(), null);
      }
      return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.timestampTypeInfo, null);
    case DataType.BIGDECIMAL:
      if (hcatFieldSchema != null && hcatFieldSchema.getTypeInfo() != null) {
        return new HCatFieldSchema(fSchema.alias, hcatFieldSchema.getTypeInfo(), null);
      }
      return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.decimalTypeInfo, null);
    case DataType.BAG:
      Schema bagSchema = fSchema.schema;
      List<HCatFieldSchema> arrFields = new ArrayList<HCatFieldSchema>(1);
      FieldSchema field;
      // Find out if we need to throw away the tuple or not.
      if (removeTupleFromBag(hcatFieldSchema, fSchema)) {
        field = bagSchema.getField(0).schema.getField(0);
      } else {
        field = bagSchema.getField(0);
      }
      arrFields.add(getHCatFSFromPigFS(field,
        hcatFieldSchema == null ? null : hcatFieldSchema.getArrayElementSchema().get(0),
        pigSchema, tableSchema));
      return new HCatFieldSchema(fSchema.alias, Type.ARRAY, new HCatSchema(arrFields), "");
    case DataType.TUPLE:
      List<HCatFieldSchema> hcatFSs = new ArrayList<HCatFieldSchema>();
      HCatSchema structSubSchema = hcatFieldSchema == null ? null : hcatFieldSchema.getStructSubSchema();
      List<FieldSchema> fields = fSchema.schema.getFields();
      for (int i = 0; i < fields.size(); i++) {
        FieldSchema fieldSchema = fields.get(i);
        hcatFSs.add(getHCatFSFromPigFS(fieldSchema,
          structSubSchema == null ? null : structSubSchema.get(i), pigSchema, tableSchema));
      }
      return new HCatFieldSchema(fSchema.alias, Type.STRUCT, new HCatSchema(hcatFSs), "");
    case DataType.MAP: {
      // Pig's schema contains no type information about the map's keys and values,
      // so for a new column assume map<string,string>; for an existing column,
      // return whatever the existing column contains.
      HCatFieldSchema valFS;
      List<HCatFieldSchema> valFSList = new ArrayList<HCatFieldSchema>(1);
      if (hcatFieldSchema != null) {
        return HCatFieldSchema.createMapTypeFieldSchema(fSchema.alias,
          hcatFieldSchema.getMapKeyTypeInfo(), hcatFieldSchema.getMapValueSchema(), "");
      }
      // Column not found in target table. It's a new column whose schema is map<string,string>.
      valFS = new HCatFieldSchema(fSchema.alias, TypeInfoFactory.stringTypeInfo, "");
      valFSList.add(valFS);
      return HCatFieldSchema.createMapTypeFieldSchema(fSchema.alias,
        TypeInfoFactory.stringTypeInfo, new HCatSchema(valFSList), "");
    }
    case DataType.BIGINTEGER:
      // fall through; doesn't map to a Hive/HCat type; here for completeness
    default:
      throw new FrontendException("Unsupported type: " + type + " in Pig's schema",
        PigHCatUtil.PIG_EXCEPTION_CODE);
    }
  }

  @Override
  public void prepareToWrite(RecordWriter writer) throws IOException {
    this.writer = writer;
    computedSchema = (HCatSchema) ObjectSerializer.deserialize(
      UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{sign})
        .getProperty(COMPUTED_OUTPUT_SCHEMA));
  }

  @Override
  public void putNext(Tuple tuple) throws IOException {
    List<Object> outgoing = new ArrayList<Object>(tuple.size());
    int i = 0;
    for (HCatFieldSchema fSchema : computedSchema.getFields()) {
      outgoing.add(getJavaObj(tuple.get(i++), fSchema));
    }
    try {
      writer.write(null, new DefaultHCatRecord(outgoing));
    } catch (InterruptedException e) {
      throw new BackendException("Error while writing tuple: " + tuple,
        PigHCatUtil.PIG_EXCEPTION_CODE, e);
    }
  }
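  // Illustrative conversions performed by getJavaObj() below (the values are hypothetical).
  // With the default onOutOfRangeValue=Null policy, each out-of-range value becomes NULL and a
  // warning is logged once per column/type:
  //   Pig int 70000                 -> Hive smallint     => NULL (exceeds Short.MAX_VALUE)
  //   Pig bigdecimal 123.456        -> Hive decimal(5,2) => NULL (precision 6 > 5)
  //   Pig datetime 2013-01-01T10:00 -> Hive date         => NULL (time component is not midnight)
  //   Pig chararray "hello"         -> Hive varchar(3)   => NULL (length 5 > 3)
  // With onOutOfRangeValue=Throw, the same values raise a BackendException instead.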
  /**
   * Converts a Pig value object to a Hive value object. This method assumes that
   * {@link #validateSchema(org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema, org.apache.hive.hcatalog.data.schema.HCatFieldSchema, org.apache.pig.impl.logicalLayer.schema.Schema, org.apache.hive.hcatalog.data.schema.HCatSchema, int)},
   * which checks that the types in the Pig schema are compatible with the target Hive table,
   * has been called.
   */
  private Object getJavaObj(Object pigObj, HCatFieldSchema hcatFS) throws HCatException, BackendException {
    try {
      if (pigObj == null) {
        return null;
      }
      // The real work-horse. Spend time and energy in this method if there is
      // a need to keep HCatStorer lean and fast.
      Type type = hcatFS.getType();
      switch (type) {
      case BINARY:
        return ((DataByteArray) pigObj).get();
      case STRUCT:
        HCatSchema structSubSchema = hcatFS.getStructSubSchema();
        // Unwrap the tuple.
        List<Object> all = ((Tuple) pigObj).getAll();
        ArrayList<Object> converted = new ArrayList<Object>(all.size());
        for (int i = 0; i < all.size(); i++) {
          converted.add(getJavaObj(all.get(i), structSubSchema.get(i)));
        }
        return converted;
      case ARRAY:
        // Unwrap the bag.
        DataBag pigBag = (DataBag) pigObj;
        HCatFieldSchema tupFS = hcatFS.getArrayElementSchema().get(0);
        boolean needTuple = tupFS.getType() == Type.STRUCT;
        List<Object> bagContents = new ArrayList<Object>((int) pigBag.size());
        Iterator<Tuple> bagItr = pigBag.iterator();
        while (bagItr.hasNext()) {
          // If there is only one element in the tuple contained in the bag, we throw away the tuple.
          bagContents.add(getJavaObj(needTuple ? bagItr.next() : bagItr.next().get(0), tupFS));
        }
        return bagContents;
      case MAP:
        Map<?, ?> pigMap = (Map<?, ?>) pigObj;
        Map<Object, Object> typeMap = new HashMap<Object, Object>();
        for (Entry<?, ?> entry : pigMap.entrySet()) {
          // The value has a schema and not a FieldSchema.
          typeMap.put(
            // Schema validation enforces that the key is a String.
            (String) entry.getKey(),
            getJavaObj(entry.getValue(), hcatFS.getMapValueSchema().get(0)));
        }
        return typeMap;
      case STRING:
      case INT:
      case BIGINT:
      case FLOAT:
      case DOUBLE:
        return pigObj;
      case SMALLINT:
        if ((Integer) pigObj < Short.MIN_VALUE || (Integer) pigObj > Short.MAX_VALUE) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return ((Integer) pigObj).shortValue();
      case TINYINT:
        if ((Integer) pigObj < Byte.MIN_VALUE || (Integer) pigObj > Byte.MAX_VALUE) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return ((Integer) pigObj).byteValue();
      case BOOLEAN:
        if (pigObj instanceof String) {
          if (((String) pigObj).trim().compareTo("0") == 0) {
            return Boolean.FALSE;
          }
          if (((String) pigObj).trim().compareTo("1") == 0) {
            return Boolean.TRUE;
          }
          throw new BackendException("Unexpected type " + type + " for value " + pigObj
            + " of class " + pigObj.getClass().getName(), PigHCatUtil.PIG_EXCEPTION_CODE);
        }
        return Boolean.parseBoolean(pigObj.toString());
      case DECIMAL:
        BigDecimal bd = (BigDecimal) pigObj;
        DecimalTypeInfo dti = (DecimalTypeInfo) hcatFS.getTypeInfo();
        if (bd.precision() > dti.precision() || bd.scale() > dti.scale()) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return HiveDecimal.create(bd);
      case CHAR:
        String charVal = (String) pigObj;
        CharTypeInfo cti = (CharTypeInfo) hcatFS.getTypeInfo();
        if (charVal.length() > cti.getLength()) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return new HiveChar(charVal, cti.getLength());
      case VARCHAR:
        String varcharVal = (String) pigObj;
        VarcharTypeInfo vti = (VarcharTypeInfo) hcatFS.getTypeInfo();
        if (varcharVal.length() > vti.getLength()) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return new HiveVarchar(varcharVal, vti.getLength());
      case TIMESTAMP:
        DateTime dt = (DateTime) pigObj;
        return new Timestamp(dt.getMillis()); // getMillis() returns UTC time regardless of TZ
      case DATE:
        /* We ignore any TZ setting on the Pig value since java.sql.Date doesn't have it (in any
         * meaningful way). So if the Pig value has a 0 time component (midnight), we assume it
         * reasonably 'fits' into a Hive DATE. If the time part is not 0, it's considered out of
         * range for the target type.
         */
        DateTime dateTime = ((DateTime) pigObj);
        if (dateTime.getMillisOfDay() != 0) {
          handleOutOfRangeValue(pigObj, hcatFS,
            "Time component must be 0 (midnight) in local timezone; Local TZ val='" + pigObj + "'");
          return null;
        }
        /* java.sql.Date is a poorly defined API. Some (all?) SerDes call toString() on it
         * [e.g. LazySimpleSerDe, uses LazyUtils.writePrimitiveUTF8()], which automatically adjusts
         * for local timezone. Date.valueOf() also uses local timezone (as does Date(int,int,int)).
         * Also see PigHCatUtil#extractPigObject() for the corresponding read op. This way a
         * DATETIME from Pig, when stored into Hive and read back, comes back with the same value.
         */
        return new Date(dateTime.getYear() - 1900, dateTime.getMonthOfYear() - 1, dateTime.getDayOfMonth());
      default:
        throw new BackendException("Unexpected HCat type " + type + " for value " + pigObj
          + " of class " + pigObj.getClass().getName(), PigHCatUtil.PIG_EXCEPTION_CODE);
      }
    } catch (BackendException e) {
      // Provide the path to the field in the error message.
      throw new BackendException(
        (hcatFS.getName() == null ? " " : hcatFS.getName() + ".") + e.getMessage(), e);
    }
  }
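  // The out-of-range policy handled below is selected through the ON_OORA_VALUE_PROP property
  // (read in the constructor). With HCatStorer it is typically supplied through the storer's
  // option string; an illustrative Pig statement (table/alias names hypothetical, see HCatStorer
  // for the exact option parsing):
  //   STORE A INTO 'default.employee'
  //     USING org.apache.hive.hcatalog.pig.HCatStorer('', '', '-onOutOfRangeValue Throw');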
  private void handleOutOfRangeValue(Object pigObj, HCatFieldSchema hcatFS) throws BackendException {
    handleOutOfRangeValue(pigObj, hcatFS, null);
  }

  /**
   * Depending on user config, throws an exception or logs a message if the incoming Pig value
   * is out-of-range for the target type.
   * @param additionalMsg may be {@code null}
   */
  private void handleOutOfRangeValue(Object pigObj, HCatFieldSchema hcatFS, String additionalMsg)
      throws BackendException {
    String msg = "Pig value '" + pigObj + "' is outside the bounds of column " + hcatFS.getName()
      + " with type "
      + (hcatFS.getTypeInfo() == null ? hcatFS.getType() : hcatFS.getTypeInfo().getTypeName())
      + (additionalMsg == null ? "" : "[" + additionalMsg + "]");
    switch (onOutOfRange) {
    case Throw:
      throw new BackendException(msg, PigHCatUtil.PIG_EXCEPTION_CODE);
    case Null:
      dataLossLogger.logDataLossMsg(hcatFS, pigObj, msg);
      break;
    default:
      throw new BackendException("Unexpected " + ON_OOR_VALUE_OPT + " value: '" + onOutOfRange + "'");
    }
  }

  @Override
  public String relToAbsPathForStoreLocation(String location, Path curDir) throws IOException {
    // We need to override this method since the default impl assumes an HDFS-based location string.
    return location;
  }

  @Override
  public void setStoreFuncUDFContextSignature(String signature) {
    sign = signature;
  }
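  /**
   * Validates that each field of pigSchema can be stored into the corresponding column of
   * tblSchema (see {@link #validateSchema}) and that the table schema itself follows Pig rules.
   */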
  protected void doSchemaValidations(Schema pigSchema, HCatSchema tblSchema)
      throws FrontendException, HCatException {
    // Iterate through all the elements in the Pig schema and do the validations dictated
    // by the semantics, consulting the HCatSchema of the table when need be.
    int columnPos = 0; // helps with debug messages
    for (FieldSchema pigField : pigSchema.getFields()) {
      HCatFieldSchema hcatField = getColFromSchema(pigField.alias, tblSchema);
      validateSchema(pigField, hcatField, pigSchema, tblSchema, columnPos++);
    }
    try {
      PigHCatUtil.validateHCatTableSchemaFollowsPigRules(tblSchema);
    } catch (IOException e) {
      throw new FrontendException("HCatalog schema is not compatible with Pig: " + e.getMessage(),
        PigHCatUtil.PIG_EXCEPTION_CODE, e);
    }
  }

  /**
   * This method encodes which Pig type can map to (be stored in) which HCat type.
   * @throws HCatException
   * @throws FrontendException
   */
  private void validateSchema(FieldSchema pigField, HCatFieldSchema hcatField,
      Schema topLevelPigSchema, HCatSchema topLevelHCatSchema, int columnPos)
      throws HCatException, FrontendException {
    validateAlias(pigField.alias);
    byte type = pigField.type;
    if (DataType.isComplex(type)) {
      switch (type) {
      case DataType.MAP:
        if (hcatField != null) {
          if (hcatField.getMapKeyType() != Type.STRING) {
            throw new FrontendException("Key Type of map must be String " + hcatField,
              PigHCatUtil.PIG_EXCEPTION_CODE);
          }
          // Map values can be primitive or complex.
        }
        break;
      case DataType.BAG:
        HCatSchema arrayElementSchema = hcatField == null ? null : hcatField.getArrayElementSchema();
        for (FieldSchema innerField : pigField.schema.getField(0).schema.getFields()) {
          validateSchema(innerField, getColFromSchema(pigField.alias, arrayElementSchema),
            topLevelPigSchema, topLevelHCatSchema, columnPos);
        }
        break;
      case DataType.TUPLE:
        HCatSchema structSubSchema = hcatField == null ? null : hcatField.getStructSubSchema();
        for (FieldSchema innerField : pigField.schema.getFields()) {
          validateSchema(innerField, getColFromSchema(pigField.alias, structSubSchema),
            topLevelPigSchema, topLevelHCatSchema, columnPos);
        }
        break;
      default:
        throw new FrontendException("Internal Error.", PigHCatUtil.PIG_EXCEPTION_CODE);
      }
    } else if (hcatField != null) {
      // There is no point trying to validate further if we have no type info about the target field.
      switch (type) {
      case DataType.BIGDECIMAL:
        throwTypeMismatchException(type, Lists.newArrayList(Type.DECIMAL), hcatField, columnPos);
        break;
      case DataType.DATETIME:
        throwTypeMismatchException(type, Lists.newArrayList(Type.TIMESTAMP, Type.DATE), hcatField,
          columnPos);
        break;
      case DataType.BYTEARRAY:
        throwTypeMismatchException(type, Lists.newArrayList(Type.BINARY), hcatField, columnPos);
        break;
      case DataType.BIGINTEGER:
        throwTypeMismatchException(type, Collections.<Type>emptyList(), hcatField, columnPos);
        break;
      case DataType.BOOLEAN:
        throwTypeMismatchException(type, Lists.newArrayList(Type.BOOLEAN), hcatField, columnPos);
        break;
      case DataType.CHARARRAY:
        throwTypeMismatchException(type, Lists.newArrayList(Type.STRING, Type.CHAR, Type.VARCHAR),
          hcatField, columnPos);
        break;
      case DataType.DOUBLE:
        throwTypeMismatchException(type, Lists.newArrayList(Type.DOUBLE), hcatField, columnPos);
        break;
      case DataType.FLOAT:
        throwTypeMismatchException(type, Lists.newArrayList(Type.FLOAT), hcatField, columnPos);
        break;
      case DataType.INTEGER:
        throwTypeMismatchException(type, Lists.newArrayList(Type.INT, Type.BIGINT,
          Type.TINYINT, Type.SMALLINT), hcatField, columnPos);
        break;
      case DataType.LONG:
        throwTypeMismatchException(type, Lists.newArrayList(Type.BIGINT), hcatField, columnPos);
        break;
      default:
        throw new FrontendException("'" + type + "' Pig datatype in column " + columnPos
          + "(0-based) is not supported by HCat", PigHCatUtil.PIG_EXCEPTION_CODE);
      }
    } else {
      if (false) {
        // see HIVE-6194
        throw new FrontendException("(pigSch,hcatSchema)=(" + pigField + "," + "" + hcatField
          + ") (topPig, topHcat)=(" + topLevelPigSchema + "," + "" + topLevelHCatSchema + ")");
      }
    }
  }
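  /**
   * Throws if the actual HCat type of the target column is not among the HCat types that the
   * given Pig type is allowed to map to (despite the name, the throw is conditional).
   */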
  private static void throwTypeMismatchException(byte pigDataType, List<Type> hcatRequiredType,
      HCatFieldSchema hcatActualField, int columnPos) throws FrontendException {
    if (!hcatRequiredType.contains(hcatActualField.getType())) {
      throw new FrontendException(
        "Pig '" + DataType.findTypeName(pigDataType) + "' type in column " + columnPos
          + "(0-based) cannot map to HCat '" + hcatActualField.getType() + "' type. Target field"
          + " must be of HCat type {" + StringUtils.join(hcatRequiredType, " or ") + "}");
    }
  }

  private void validateAlias(String alias) throws FrontendException {
    if (alias == null) {
      throw new FrontendException("Column name for a field is not specified. Please provide the "
        + "full schema as an argument to HCatStorer.", PigHCatUtil.PIG_EXCEPTION_CODE);
    }
    if (alias.matches(".*[A-Z]+.*")) {
      throw new FrontendException("Column names should all be in lowercase. Invalid name found: "
        + alias, PigHCatUtil.PIG_EXCEPTION_CODE);
    }
  }

  // Finds a column by name in the HCatSchema; if not found, returns null.
  private HCatFieldSchema getColFromSchema(String alias, HCatSchema tblSchema) {
    if (tblSchema != null) {
      for (HCatFieldSchema hcatField : tblSchema.getFields()) {
        if (hcatField != null && hcatField.getName() != null
            && hcatField.getName().equalsIgnoreCase(alias)) {
          return hcatField;
        }
      }
    }
    // It's a new column.
    return null;
  }

  @Override
  public void cleanupOnFailure(String location, Job job) throws IOException {
    // No-op.
  }

  @Override
  public void storeStatistics(ResourceStatistics stats, String arg1, Job job) throws IOException {
  }

  /**
   * TODO: when the job is complete, should print the msgCount table to the log.
   */
  private static final class DataLossLogger {
    private static final Map<String, Integer> msgCount = new HashMap<String, Integer>();

    private static String getColumnTypeKey(HCatFieldSchema fieldSchema) {
      return fieldSchema.getName() + "_"
        + (fieldSchema.getTypeInfo() == null ? fieldSchema.getType() : fieldSchema.getTypeInfo());
    }

    private void logDataLossMsg(HCatFieldSchema fieldSchema, Object pigObj, String msg) {
      String key = getColumnTypeKey(fieldSchema);
      if (!msgCount.containsKey(key)) {
        msgCount.put(key, 0);
        LOG.warn(msg + " Will write NULL instead. Only 1 such message per type/column is emitted.");
      }
      msgCount.put(key, msgCount.get(key) + 1);
    }
  }
}