/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hive.hcatalog.streaming;

import com.google.common.annotations.VisibleForTesting;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.io.BytesWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;

/**
 * Streaming Writer that handles delimited input (e.g. CSV).
 * Delimited input is parsed and reordered to match the column order of the table.
 * Uses LazySimpleSerDe to process the delimited input.
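 *
 * <p>A minimal usage sketch, for illustration only; the metastore URI, database
 * and table names, column names, and batch size below are placeholders, not part
 * of this API:</p>
 * <pre>{@code
 * HiveEndPoint endPoint = new HiveEndPoint("thrift://localhost:9083", "mydb", "mytbl", null);
 * StreamingConnection conn = endPoint.newConnection(true);
 * DelimitedInputWriter writer =
 *     new DelimitedInputWriter(new String[]{"id", "msg"}, ",", endPoint, null, conn);
 * TransactionBatch batch = conn.fetchTransactionBatch(10, writer);
 * batch.beginNextTransaction();
 * batch.write("1,hello".getBytes());
 * batch.commit();
 * batch.close();
 * conn.close();
 * }</pre>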
 */
public class DelimitedInputWriter extends AbstractRecordWriter {
  private final boolean reorderingNeeded;
  private String delimiter;
  private char serdeSeparator;
  private int[] fieldToColMapping;
  private final ArrayList<String> tableColumns;
  private LazySimpleSerDe serde = null;

  private final LazySimpleStructObjectInspector recordObjInspector;
  private final ObjectInspector[] bucketObjInspectors;
  private final StructField[] bucketStructFields;

  private static final Logger LOG = LoggerFactory.getLogger(DelimitedInputWriter.class.getName());

  /**
   * Constructor. Uses the default separator of the LazySimpleSerDe.
   * @param colNamesForFields Column name assignment for input fields. Nulls or empty
   *                          strings in the array indicate fields to be skipped.
   * @param delimiter input field delimiter
   * @param endPoint Hive endpoint
   * @param conn connection this Writer is to be used with
   * @throws ConnectionError Problem talking to Hive
   * @throws ClassNotFoundException SerDe class not found
   * @throws SerializationError SerDe initialization/interaction failed
   * @throws StreamingException Problem acquiring file system path for partition
   * @throws InvalidColumn any element in colNamesForFields refers to a nonexistent column
   */
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint, StreamingConnection conn)
          throws ClassNotFoundException, ConnectionError, SerializationError,
                 InvalidColumn, StreamingException {
    this(colNamesForFields, delimiter, endPoint, null, conn);
  }

  /**
   * Constructor. Uses the default separator of the LazySimpleSerDe.
   * @param colNamesForFields Column name assignment for input fields. Nulls or empty
   *                          strings in the array indicate fields to be skipped.
   * @param delimiter input field delimiter
   * @param endPoint Hive endpoint
   * @param conf a Hive conf object. Can be null if not using advanced Hive settings.
   * @param conn connection this Writer is to be used with
   * @throws ConnectionError Problem talking to Hive
   * @throws ClassNotFoundException SerDe class not found
   * @throws SerializationError SerDe initialization/interaction failed
   * @throws StreamingException Problem acquiring file system path for partition
   * @throws InvalidColumn any element in colNamesForFields refers to a nonexistent column
   */
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn)
          throws ClassNotFoundException, ConnectionError, SerializationError,
                 InvalidColumn, StreamingException {
    this(colNamesForFields, delimiter, endPoint, conf,
            (char) LazySerDeParameters.DefaultSeparators[0], conn);
  }

  /**
   * Constructor. Allows overriding the separator of the LazySimpleSerDe.
   * @param colNamesForFields Column name assignment for input fields. Nulls or empty
   *                          strings in the array indicate fields to be skipped.
   * @param delimiter input field delimiter
   * @param endPoint Hive endpoint
   * @param conf a Hive conf object. Set to null if not using advanced Hive settings.
   * @param serdeSeparator separator used when encoding data that is fed into the
   *                       LazySimpleSerDe. Ensure this separator does not occur
   *                       in the field data.
   * @param conn connection this Writer is to be used with
   * @throws ConnectionError Problem talking to Hive
   * @throws ClassNotFoundException SerDe class not found
   * @throws SerializationError SerDe initialization/interaction failed
   * @throws StreamingException Problem acquiring file system path for partition
   * @throws InvalidColumn any element in colNamesForFields refers to a nonexistent column
   */
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint, HiveConf conf, char serdeSeparator,
                              StreamingConnection conn)
          throws ClassNotFoundException, ConnectionError, SerializationError,
                 InvalidColumn, StreamingException {
    super(endPoint, conf, conn);
    this.tableColumns = getCols(tbl);
    this.serdeSeparator = serdeSeparator;
    this.delimiter = delimiter;
    this.fieldToColMapping = getFieldReordering(colNamesForFields, getTableColumns());
    this.reorderingNeeded = isReorderingNeeded(delimiter, getTableColumns());
    LOG.debug("Field reordering needed = " + this.reorderingNeeded
            + ", for endpoint " + endPoint);
    this.serde = createSerde(tbl, conf, serdeSeparator);

    // get ObjectInspectors for the entire record and the bucketed columns
    try {
      this.recordObjInspector = (LazySimpleStructObjectInspector) serde.getObjectInspector();
      this.bucketObjInspectors = getObjectInspectorsForBucketedCols(bucketIds, recordObjInspector);
    } catch (SerDeException e) {
      throw new SerializationError("Unable to get ObjectInspector for bucket columns", e);
    }

    // get StructFields for the bucketed columns
    bucketStructFields = new StructField[bucketIds.size()];
    List<? extends StructField> allFields = recordObjInspector.getAllStructFieldRefs();
    for (int i = 0; i < bucketIds.size(); i++) {
      bucketStructFields[i] = allFields.get(bucketIds.get(i));
    }
  }

  /**
   * @deprecated As of release 1.3/2.1. Replaced by
   * {@link #DelimitedInputWriter(String[], String, HiveEndPoint, StreamingConnection)}
   */
  @Deprecated
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint)
          throws ClassNotFoundException, ConnectionError, SerializationError,
                 InvalidColumn, StreamingException {
    this(colNamesForFields, delimiter, endPoint, null, null);
  }

  /**
   * @deprecated As of release 1.3/2.1. Replaced by
   * {@link #DelimitedInputWriter(String[], String, HiveEndPoint, HiveConf, StreamingConnection)}
   */
  @Deprecated
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint, HiveConf conf)
          throws ClassNotFoundException, ConnectionError, SerializationError,
                 InvalidColumn, StreamingException {
    this(colNamesForFields, delimiter, endPoint, conf,
            (char) LazySerDeParameters.DefaultSeparators[0], null);
  }

  /**
   * @deprecated As of release 1.3/2.1. Replaced by
   * {@link #DelimitedInputWriter(String[], String, HiveEndPoint, HiveConf, char, StreamingConnection)}
   */
  @Deprecated
  public DelimitedInputWriter(String[] colNamesForFields, String delimiter,
                              HiveEndPoint endPoint, HiveConf conf, char serdeSeparator)
          throws ClassNotFoundException, StreamingException {
    this(colNamesForFields, delimiter, endPoint, conf, serdeSeparator, null);
  }

  private boolean isReorderingNeeded(String delimiter, ArrayList<String> tableColumns) {
    return !(delimiter.equals(String.valueOf(getSerdeSeparator()))
            && areFieldsInColOrder(fieldToColMapping)
            && tableColumns.size() >= fieldToColMapping.length);
  }

  private static boolean areFieldsInColOrder(int[] fieldToColMapping) {
    for (int i = 0; i < fieldToColMapping.length; ++i) {
      if (fieldToColMapping[i] != i) {
        return false;
      }
    }
    return true;
  }
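  /**
   * Computes, for each input field, the index of the table column it maps to,
   * or -1 for fields that should be skipped. An illustrative example (the column
   * names here are made up, not part of the API): for table columns
   * {@code (a, b, c)} and {@code colNamesForFields = {"c", null, "a"}}, the
   * result is {@code {2, -1, 0}}.
   */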
  @VisibleForTesting
  static int[] getFieldReordering(String[] colNamesForFields, List<String> tableColNames)
          throws InvalidColumn {
    int[] result = new int[colNamesForFields.length];
    for (int i = 0; i < colNamesForFields.length; ++i) {
      result[i] = -1;
    }
    int i = -1, fieldLabelCount = 0;
    for (String col : colNamesForFields) {
      ++i;
      if (col == null || col.trim().isEmpty()) {
        continue;
      }
      ++fieldLabelCount;
      int loc = tableColNames.indexOf(col);
      if (loc == -1) {
        throw new InvalidColumn("Column '" + col + "' not found in table for input field "
                + (i + 1));
      }
      result[i] = loc;
    }
    if (fieldLabelCount > tableColNames.size()) {
      throw new InvalidColumn("Number of field names exceeds the number of columns in table");
    }
    return result;
  }

  // Reorders fields in the record to match the order of columns in the table
  protected byte[] reorderFields(byte[] record) throws UnsupportedEncodingException {
    if (!reorderingNeeded) {
      return record;
    }
    String[] reorderedFields = new String[getTableColumns().size()];
    // decode with an explicit charset so behavior does not depend on the platform default
    String decoded = new String(record, StandardCharsets.UTF_8);
    // the delimiter is a literal string, not a regex, so quote it before splitting
    // (otherwise delimiters such as "|" or "." are misinterpreted)
    String[] fields = decoded.split(Pattern.quote(delimiter), -1);
    for (int i = 0; i < fieldToColMapping.length; ++i) {
      int newIndex = fieldToColMapping[i];
      if (newIndex != -1) {
        reorderedFields[newIndex] = fields[i];
      }
    }
    return join(reorderedFields, getSerdeSeparator());
  }

  // Joins the items with the given separator; null items are rendered as empty fields.
  // TODO: perhaps can be made more efficient by creating a byte[] directly
  private static byte[] join(String[] items, char separator) {
    if (items.length == 0) {
      return "".getBytes(StandardCharsets.UTF_8);
    }
    StringBuilder buff = new StringBuilder(100);
    int i = 0;
    for (; i < items.length - 1; ++i) {
      if (items[i] != null) {
        buff.append(items[i]);
      }
      buff.append(separator);
    }
    if (items[i] != null) {
      buff.append(items[i]);
    }
    return buff.toString().getBytes(StandardCharsets.UTF_8);
  }

  protected ArrayList<String> getTableColumns() {
    return tableColumns;
  }
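  /**
   * Writes one delimited record in the given transaction: the input fields are
   * reordered to table-column order, deserialized through the SerDe, and handed
   * to the RecordUpdater for the record's bucket.
   */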
  @Override
  public void write(long transactionId, byte[] record)
          throws SerializationError, StreamingIOFailure {
    try {
      byte[] orderedFields = reorderFields(record);
      Object encodedRow = encode(orderedFields);
      int bucket = getBucket(encodedRow);
      getRecordUpdater(bucket).insert(transactionId, encodedRow);
    } catch (IOException e) {
      throw new StreamingIOFailure("Error writing record in transaction ("
              + transactionId + ")", e);
    }
  }

  @Override
  public AbstractSerDe getSerde() {
    return serde;
  }

  protected LazySimpleStructObjectInspector getRecordObjectInspector() {
    return recordObjInspector;
  }

  @Override
  protected StructField[] getBucketStructFields() {
    return bucketStructFields;
  }

  protected ObjectInspector[] getBucketObjectInspectors() {
    return bucketObjInspectors;
  }

  @Override
  public Object encode(byte[] record) throws SerializationError {
    try {
      BytesWritable blob = new BytesWritable();
      blob.set(record, 0, record.length);
      return serde.deserialize(blob);
    } catch (SerDeException e) {
      throw new SerializationError("Unable to convert byte[] record into Object", e);
    }
  }

  /**
   * Creates a LazySimpleSerDe initialized from the table's metadata, with the
   * field delimiter overridden by the given separator.
   * @param tbl the table whose metadata seeds the SerDe properties
   * @param conf Hive configuration
   * @param serdeSeparator separator to use as the SerDe field delimiter
   * @return the initialized SerDe
   * @throws SerializationError if the SerDe could not be initialized
   */
  protected static LazySimpleSerDe createSerde(Table tbl, HiveConf conf, char serdeSeparator)
          throws SerializationError {
    try {
      Properties tableProps = MetaStoreUtils.getTableMetadata(tbl);
      tableProps.setProperty("field.delim", String.valueOf(serdeSeparator));
      LazySimpleSerDe serde = new LazySimpleSerDe();
      SerDeUtils.initializeSerDe(serde, conf, tableProps, null);
      return serde;
    } catch (SerDeException e) {
      throw new SerializationError("Error initializing serde", e);
    }
  }

  private ArrayList<String> getCols(Table table) {
    List<FieldSchema> cols = table.getSd().getCols();
    ArrayList<String> colNames = new ArrayList<String>(cols.size());
    for (FieldSchema col : cols) {
      colNames.add(col.getName().toLowerCase());
    }
    return colNames;
  }

  public char getSerdeSeparator() {
    return serdeSeparator;
  }
}