/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.serde2;

import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.MetadataListStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

/**
 * MetadataTypedColumnsetSerDe.
 *
 * A SerDe for delimited text rows: each row is split on a single-character
 * separator (Ctrl-A, "\001", by default) into a list of string columns, and a
 * configurable marker string ("\N" by default) is translated to null. Column
 * names come from the table metadata (the "columns" property) rather than
 * from the data itself; when no column names are available, the row is
 * exposed as a single ColumnSet struct.
 */
@SerDeSpec(schemaProps = {
    serdeConstants.SERIALIZATION_FORMAT,
    serdeConstants.SERIALIZATION_NULL_FORMAT,
    serdeConstants.SERIALIZATION_LIB,
    serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST })
public class MetadataTypedColumnsetSerDe extends AbstractSerDe {

  public static final Logger LOG = LoggerFactory
      .getLogger(MetadataTypedColumnsetSerDe.class.getName());

  public static final String DefaultSeparator = "\001";
  private String separator;

  public static final String defaultNullString = "\\N";
  private String nullString;

  private List<String> columnNames;
  private ObjectInspector cachedObjectInspector;

  private boolean lastColumnTakesRest = false;
  private int splitLimit = -1;

  @Override
  public String toString() {
    return "MetaDataTypedColumnsetSerDe[" + separator + "," + columnNames + "]";
  }

  public MetadataTypedColumnsetSerDe() throws SerDeException {
    separator = DefaultSeparator;
  }

  /**
   * Interpret a property value as a single byte code if it parses as a number
   * (e.g. "9" means the tab character); otherwise use the literal string.
   */
  private String getByteValue(String altValue, String defaultVal) {
    if (altValue != null && altValue.length() > 0) {
      try {
        byte[] b = new byte[1];
        b[0] = Byte.parseByte(altValue);
        return new String(b);
      } catch (NumberFormatException e) {
        return altValue;
      }
    }
    return defaultVal;
  }

  @Override
  public void initialize(Configuration job, Properties tbl) throws SerDeException {
    String altSep = tbl.getProperty(serdeConstants.SERIALIZATION_FORMAT);
    separator = getByteValue(altSep, DefaultSeparator);

    String altNull = tbl.getProperty(serdeConstants.SERIALIZATION_NULL_FORMAT);
    nullString = getByteValue(altNull, defaultNullString);

    String columnProperty = tbl.getProperty("columns");
    String serdeName = tbl.getProperty(serdeConstants.SERIALIZATION_LIB);
    // Tables that were serialized with columnsetSerDe don't have metadata,
    // so this hack applies to all such tables.
    boolean columnsetSerDe = false;
    if ((serdeName != null)
        && serdeName.equals("org.apache.hadoop.hive.serde.thrift.columnsetSerDe")) {
      columnsetSerDe = true;
    }
    final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER)
        ? tbl.getProperty(serdeConstants.COLUMN_NAME_DELIMITER)
        : String.valueOf(SerDeUtils.COMMA);
    if (columnProperty == null || columnProperty.length() == 0 || columnsetSerDe) {
      // Hack for tables with no columns:
      // treat it as a table with a single column called "col".
      cachedObjectInspector = ObjectInspectorFactory
          .getReflectionObjectInspector(ColumnSet.class,
              ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    } else {
      columnNames = Arrays.asList(columnProperty.split(columnNameDelimiter));
      cachedObjectInspector = MetadataListStructObjectInspector
          .getInstance(columnNames);
    }

    String lastColumnTakesRestString = tbl
        .getProperty(serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST);
    lastColumnTakesRest = (lastColumnTakesRestString != null
        && lastColumnTakesRestString.equalsIgnoreCase("true"));
    splitLimit = (lastColumnTakesRest && columnNames != null)
        ? columnNames.size() : -1;

    LOG.debug(getClass().getName() + ": initialized with columnNames: "
        + columnNames + " and separator code=" + (int) separator.charAt(0)
        + " lastColumnTakesRest=" + lastColumnTakesRest + " splitLimit="
        + splitLimit);
  }
  /**
   * Split the row into columns.
   *
   * @param limit
   *          up to limit columns will be produced (the last column takes all
   *          the rest); -1 for unlimited.
   * @return the ColumnSet object
   * @throws Exception
   */
  public static Object deserialize(ColumnSet c, String row, String sep,
      String nullString, int limit) throws Exception {
    if (c.col == null) {
      c.col = new ArrayList<String>();
    } else {
      c.col.clear();
    }
    String[] l1 = row.split(sep, limit);

    for (String s : l1) {
      if (s.equals(nullString)) {
        c.col.add(null);
      } else {
        c.col.add(s);
      }
    }
    return (c);
  }

  ColumnSet deserializeCache = new ColumnSet();

  @Override
  public Object deserialize(Writable field) throws SerDeException {
    String row = null;
    if (field instanceof BytesWritable) {
      BytesWritable b = (BytesWritable) field;
      try {
        row = Text.decode(b.getBytes(), 0, b.getLength());
      } catch (CharacterCodingException e) {
        throw new SerDeException(e);
      }
    } else if (field instanceof Text) {
      row = field.toString();
    }
    try {
      deserialize(deserializeCache, row, separator, nullString, splitLimit);
      if (columnNames != null) {
        assert (columnNames.size() == deserializeCache.col.size());
      }
      return deserializeCache;
    } catch (ClassCastException e) {
      throw new SerDeException(this.getClass().getName()
          + " expects Text or BytesWritable", e);
    } catch (Exception e) {
      throw new SerDeException(e);
    }
  }
  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return cachedObjectInspector;
  }

  @Override
  public Class<? extends Writable> getSerializedClass() {
    return Text.class;
  }

  Text serializeCache = new Text();

  @Override
  public Writable serialize(Object obj, ObjectInspector objInspector)
      throws SerDeException {
    if (objInspector.getCategory() != Category.STRUCT) {
      throw new SerDeException(getClass().toString()
          + " can only serialize struct types, but we got: "
          + objInspector.getTypeName());
    }
    StructObjectInspector soi = (StructObjectInspector) objInspector;
    List<? extends StructField> fields = soi.getAllStructFieldRefs();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < fields.size(); i++) {
      if (i > 0) {
        sb.append(separator);
      }
      Object column = soi.getStructFieldData(obj, fields.get(i));
      if (fields.get(i).getFieldObjectInspector().getCategory() == Category.PRIMITIVE) {
        // For a primitive object, serialize to a plain string.
        sb.append(column == null ? nullString : column.toString());
      } else {
        // For a complex object, serialize to JSON format.
        sb.append(SerDeUtils.getJSONString(column,
            fields.get(i).getFieldObjectInspector()));
      }
    }
    serializeCache.set(sb.toString());
    return serializeCache;
  }

  @Override
  public SerDeStats getSerDeStats() {
    // no support for statistics
    return null;
  }
}
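
/*
 * Usage sketch (illustrative only, not part of the original class): how this
 * SerDe could be driven directly from table properties. The class name, the
 * column names, and the row contents below are hypothetical, and the
 * serialize() round trip assumes MetadataListStructObjectInspector accepts
 * the ColumnSet produced by deserialize(); treat this as a sketch rather
 * than a tested harness.
 */
class MetadataTypedColumnsetSerDeUsageSketch {
  public static void main(String[] args) throws Exception {
    Properties tbl = new Properties();
    // Three string columns; serialization.format/null.format are left unset,
    // so the default Ctrl-A ("\001") separator and "\N" null marker apply.
    tbl.setProperty("columns", "a,b,c");

    MetadataTypedColumnsetSerDe serDe = new MetadataTypedColumnsetSerDe();
    serDe.initialize(new Configuration(), tbl);

    // "\N" in the middle field becomes a Java null in the column list.
    Text rawRow = new Text("1" + "\001" + "\\N" + "\001" + "3");
    ColumnSet row = (ColumnSet) serDe.deserialize(rawRow);
    System.out.println(row.col); // [1, null, 3]

    // Serializing back produces a Ctrl-A-delimited Text row: 1^A\N^A3.
    Writable out = serDe.serialize(row, serDe.getObjectInspector());
    System.out.println(out);
  }
}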