/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hive.hcatalog.streaming;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.RegexSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.Text;

/**
 * Streaming Writer that parses text input records with a regex, using
 * {@link org.apache.hadoop.hive.serde2.RegexSerDe}.
 */
public class StrictRegexWriter extends AbstractRecordWriter {
  private RegexSerDe serde;
  private final StructObjectInspector recordObjInspector;
  private final ObjectInspector[] bucketObjInspectors;
  private final StructField[] bucketStructFields;

  /**
   * @param endPoint the end point to write to
   * @param conn connection this Writer is to be used with
   * @throws ConnectionError
   * @throws SerializationError
   * @throws StreamingException
   */
  public StrictRegexWriter(HiveEndPoint endPoint, StreamingConnection conn)
          throws ConnectionError, SerializationError, StreamingException {
    this(null, endPoint, null, conn);
  }

  /**
   * @param endPoint the end point to write to
   * @param conf a Hive conf object. Should be null if not using advanced Hive settings.
   * @param conn connection this Writer is to be used with
   * @throws ConnectionError
   * @throws SerializationError
   * @throws StreamingException
   */
  public StrictRegexWriter(HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn)
          throws ConnectionError, SerializationError, StreamingException {
    this(null, endPoint, conf, conn);
  }
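  /*
   * Usage sketch: a minimal, hypothetical example of driving this writer
   * through the streaming API. The metastore URI, database/table names, and
   * regex below are illustrative assumptions, not values required by this
   * class; the target table is assumed to be transactional and bucketed, as
   * the streaming API requires.
   *
   *   HiveEndPoint endPoint =
   *       new HiveEndPoint("thrift://localhost:9083", "default", "alerts", null);
   *   StreamingConnection conn = endPoint.newConnection(true);
   *   StrictRegexWriter writer =
   *       new StrictRegexWriter("([^,]*),(.*)", endPoint, null, conn);
   *   TransactionBatch txnBatch = conn.fetchTransactionBatch(10, writer);
   *   txnBatch.beginNextTransaction();
   *   txnBatch.write("a,b".getBytes(StandardCharsets.UTF_8));
   *   txnBatch.commit();
   *   txnBatch.close();
   *   conn.close();
   */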
  /**
   * @param regex regex used to parse the data
   * @param endPoint the end point to write to
   * @param conf a Hive conf object. Should be null if not using advanced Hive settings.
   * @param conn connection this Writer is to be used with
   * @throws ConnectionError
   * @throws SerializationError
   * @throws StreamingException
   */
  public StrictRegexWriter(String regex, HiveEndPoint endPoint, HiveConf conf, StreamingConnection conn)
          throws ConnectionError, SerializationError, StreamingException {
    super(endPoint, conf, conn);
    this.serde = createSerde(tbl, conf, regex);

    // get ObjInspectors for entire record and bucketed cols
    try {
      recordObjInspector = (StructObjectInspector) serde.getObjectInspector();
      this.bucketObjInspectors = getObjectInspectorsForBucketedCols(bucketIds, recordObjInspector);
    } catch (SerDeException e) {
      throw new SerializationError("Unable to get ObjectInspector for bucket columns", e);
    }

    // get StructFields for bucketed cols
    bucketStructFields = new StructField[bucketIds.size()];
    List<? extends StructField> allFields = recordObjInspector.getAllStructFieldRefs();
    for (int i = 0; i < bucketIds.size(); i++) {
      bucketStructFields[i] = allFields.get(bucketIds.get(i));
    }
  }

  @Override
  public AbstractSerDe getSerde() {
    return serde;
  }

  @Override
  protected StructObjectInspector getRecordObjectInspector() {
    return recordObjInspector;
  }

  @Override
  protected StructField[] getBucketStructFields() {
    return bucketStructFields;
  }

  @Override
  protected ObjectInspector[] getBucketObjectInspectors() {
    return bucketObjInspectors;
  }

  @Override
  public void write(long transactionId, byte[] record)
          throws StreamingIOFailure, SerializationError {
    try {
      Object encodedRow = encode(record);
      int bucket = getBucket(encodedRow);
      getRecordUpdater(bucket).insert(transactionId, encodedRow);
    } catch (IOException e) {
      throw new StreamingIOFailure("Error writing record in transaction("
              + transactionId + ")", e);
    }
  }

  /**
   * Creates RegexSerDe
   * @param tbl used to create serde
   * @param conf used to create serde
   * @param regex used to create serde
   * @return the initialized RegexSerDe
   * @throws SerializationError if serde could not be initialized
   */
  private static RegexSerDe createSerde(Table tbl, HiveConf conf, String regex)
          throws SerializationError {
    try {
      Properties tableProps = MetaStoreUtils.getTableMetadata(tbl);
      tableProps.setProperty(RegexSerDe.INPUT_REGEX, regex);
      ArrayList<String> tableColumns = getCols(tbl);
      tableProps.setProperty(serdeConstants.LIST_COLUMNS, StringUtils.join(tableColumns, ","));
      RegexSerDe serde = new RegexSerDe();
      SerDeUtils.initializeSerDe(serde, conf, tableProps, null);
      return serde;
    } catch (SerDeException e) {
      throw new SerializationError("Error initializing serde " + RegexSerDe.class.getName(), e);
    }
  }

  private static ArrayList<String> getCols(Table table) {
    List<FieldSchema> cols = table.getSd().getCols();
    ArrayList<String> colNames = new ArrayList<String>(cols.size());
    for (FieldSchema col : cols) {
      colNames.add(col.getName().toLowerCase());
    }
    return colNames;
  }

  /**
   * Encodes UTF-8 encoded string bytes into an Object using RegexSerDe.
   *
   * @param utf8StrRecord the record as UTF-8 encoded bytes
   * @return The encoded object
   * @throws SerializationError
   */
  @Override
  public Object encode(byte[] utf8StrRecord) throws SerializationError {
    try {
      Text blob = new Text(utf8StrRecord);
      return serde.deserialize(blob);
    } catch (SerDeException e) {
      throw new SerializationError("Unable to convert byte[] record into Object", e);
    }
  }
}