package org.apache.hadoop.hive.cassandra.serde;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.cassandra.input.LazyCassandraRow;
import org.apache.hadoop.hive.cassandra.output.CassandraPut;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.SerDeParameters;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Writable;
public abstract class AbstractColumnSerDe implements SerDe {
public static final Log LOG = LogFactory.getLog(AbstractColumnSerDe.class.getName());
public static final String CASSANDRA_KEYSPACE_NAME = "cassandra.ks.name"; // keyspace
public static final String CASSANDRA_KEYSPACE_REPFACTOR = "cassandra.ks.repfactor"; //keyspace replication factor
public static final String CASSANDRA_KEYSPACE_STRATEGY = "cassandra.ks.strategy"; //keyspace replica placement strategy
public static final String CASSANDRA_CF_NAME = "cassandra.cf.name"; // column family
public static final String CASSANDRA_CF_COUNTERS = "cassandra.cf.counters"; // flag this as a counter CF
public static final String CASSANDRA_RANGE_BATCH_SIZE = "cassandra.range.size";
public static final String CASSANDRA_SLICE_PREDICATE_SIZE = "cassandra.slice.predicate.size";
public static final String CASSANDRA_SPLIT_SIZE = "cassandra.input.split.size";
public static final String CASSANDRA_HOST = "cassandra.host"; // initialHost
public static final String CASSANDRA_PORT = "cassandra.port"; // rcpPort
public static final String CASSANDRA_PARTITIONER = "cassandra.partitioner"; // partitioner
public static final String CASSANDRA_COL_MAPPING = "cassandra.columns.mapping";
public static final String CASSANDRA_BATCH_MUTATION_SIZE = "cassandra.batchmutate.size";
public static final String CASSANDRA_SLICE_PREDICATE_COLUMN_NAMES = "cassandra.slice.predicate.column_names";
public static final String CASSANDRA_SLICE_PREDICATE_RANGE_START = "cassandra.slice.predicate.range.start";
public static final String CASSANDRA_SLICE_PREDICATE_RANGE_FINISH = "cassandra.slice.predicate.range.finish";
public static final String CASSANDRA_SLICE_PREDICATE_RANGE_COMPARATOR = "cassandra.slice.predicate.range.comparator";
public static final String CASSANDRA_SLICE_PREDICATE_RANGE_REVERSED = "cassandra.slice.predicate.range.reversed";
public static final String CASSANDRA_SLICE_PREDICATE_RANGE_COUNT = "cassandra.slice.predicate.range.count";
public static final String CASSANDRA_ENABLE_WIDEROW_ITERATOR = "cassandra.enable.widerow.iterator";
public static final String CASSANDRA_SPECIAL_COLUMN_KEY = "row_key";
public static final String CASSANDRA_SPECIAL_COLUMN_COL = "column_name";
public static final String CASSANDRA_SPECIAL_COLUMN_SCOL= "sub_column_name";
public static final String CASSANDRA_SPECIAL_COLUMN_VAL = "value";
public static final String CASSANDRA_KEY_COLUMN = ":key";
public static final String CASSANDRA_COLUMN_COLUMN = ":column";
public static final String CASSANDRA_SUBCOLUMN_COLUMN = ":subcolumn";
public static final String CASSANDRA_VALUE_COLUMN = ":value";
public static final String CASSANDRA_CONSISTENCY_LEVEL = "cassandra.consistency.level";
public static final String CASSANDRA_THRIFT_MODE = "cassandra.thrift.mode";
public static final int DEFAULT_SPLIT_SIZE = 64 * 1024;
public static final int DEFAULT_RANGE_BATCH_SIZE = 1000;
public static final int DEFAULT_SLICE_PREDICATE_SIZE = 1000;
public static final String DEFAULT_CASSANDRA_HOST = "localhost";
public static final String DEFAULT_CASSANDRA_PORT = "9160";
public static final String DEFAULT_CONSISTENCY_LEVEL = "ONE";
public static final int DEFAULT_BATCH_MUTATION_SIZE = 500;
/* names of columns from SerdeParameters */
protected List<String> cassandraColumnNames;
/* index of key column in results */
protected int iKey;
protected TableMapping mapping;
protected ObjectInspector cachedObjectInspector;
protected SerDeParameters serdeParams;
protected LazyCassandraRow cachedCassandraRow;
protected String cassandraColumnFamily;
protected List<BytesWritable> cassandraColumnNamesBytes;
@Override
public void initialize(Configuration conf, Properties tbl) throws SerDeException {
initCassandraSerDeParameters(conf, tbl, getClass().getName());
cachedObjectInspector = createObjectInspector();
cachedCassandraRow = new LazyCassandraRow(
(LazySimpleStructObjectInspector) cachedObjectInspector);
if (LOG.isDebugEnabled()) {
LOG.debug("CassandraSerDe initialized with : columnNames = "
+ StringUtils.join(serdeParams.getColumnNames(), ",")
+ " columnTypes = "
+ StringUtils.join(serdeParams.getColumnTypes(), ",")
+ " cassandraColumnMapping = "
+ cassandraColumnNames);
}
}
/**
* Create the object inspector.
*
* @return object inspector
*/
protected abstract ObjectInspector createObjectInspector();
/*
*
* @see org.apache.hadoop.hive.serde2.Deserializer#deserialize(org.apache.hadoop.io.Writable)
* Turns a Cassandra row into a Hive row.
*/
@Override
public Object deserialize(Writable w) throws SerDeException {
if (!(w instanceof MapWritable)) {
throw new SerDeException(getClass().getName() + ": expects MapWritable not "+w.getClass().getName());
}
MapWritable columnMap = (MapWritable) w;
cachedCassandraRow.init(columnMap, cassandraColumnNames, cassandraColumnNamesBytes);
return cachedCassandraRow;
}
@Override
public ObjectInspector getObjectInspector() throws SerDeException {
return cachedObjectInspector;
}
@Override
public Class<? extends Writable> getSerializedClass() {
return CassandraPut.class;
}
/*
* Turns obj (a Hive Row) into a cassandra data format.
*/
@Override
public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
if (objInspector.getCategory() != Category.STRUCT) {
throw new SerDeException(getClass().toString()
+ " can only serialize struct types, but we got: "
+ objInspector.getTypeName());
}
// Prepare the field ObjectInspectors
StructObjectInspector soi = (StructObjectInspector) objInspector;
List<? extends StructField> fields = soi.getAllStructFieldRefs();
List<Object> list = soi.getStructFieldsDataAsList(obj);
List<? extends StructField> declaredFields =
(serdeParams.getRowTypeInfo() != null &&
((StructTypeInfo) serdeParams.getRowTypeInfo())
.getAllStructFieldNames().size() > 0) ?
((StructObjectInspector) getObjectInspector()).getAllStructFieldRefs()
: null;
try {
assert iKey >= 0;
return mapping.getWritable(fields, list, declaredFields);
} catch (IOException e) {
throw new SerDeException("Unable to serialize this object! " + e);
}
}
protected abstract void initCassandraSerDeParameters(Configuration job, Properties tbl, String serdeName)
throws SerDeException;
/**
* Parses the cassandra columns mapping to identify the column name.
* One of the Hive table columns maps to the cassandra row key, by default the
* first column.
*
* @param columnMapping - the column mapping specification to be parsed
* @return a list of cassandra column names
*/
public static List<String> parseColumnMapping(String columnMapping)
{
assert StringUtils.isNotBlank(columnMapping);
String[] columnArray = columnMapping.split(",");
String[] trimmedColumnArray = trim(columnArray);
List<String> columnList = Arrays.asList(trimmedColumnArray);
int iKey = columnList.indexOf(CASSANDRA_KEY_COLUMN);
if (iKey == -1) {
columnList = new ArrayList<String>(columnList);
columnList.add(0, CASSANDRA_KEY_COLUMN);
}
return columnList;
}
/**
* Return the column mapping created from column names.
*
* @param colNames column names in array format
* @return column mapping string
*/
public static String createColumnMappingString(String[] colNames) {
//First check of this is a "transposed_table" by seeing if all
//values match our special column names
boolean isTransposedTable = true;
boolean hasKey = false;
boolean hasVal = false;
boolean hasCol = false;
boolean hasSubCol = false;
String transposedMapping = "";
for(String column : colNames) {
if (column.equalsIgnoreCase(CASSANDRA_SPECIAL_COLUMN_KEY)){
transposedMapping += ","+CASSANDRA_KEY_COLUMN;
hasKey = true;
} else if(column.equalsIgnoreCase(CASSANDRA_SPECIAL_COLUMN_COL)){
transposedMapping += ","+CASSANDRA_COLUMN_COLUMN;
hasCol = true;
} else if(column.equalsIgnoreCase(CASSANDRA_SPECIAL_COLUMN_SCOL)){
transposedMapping += ","+CASSANDRA_SUBCOLUMN_COLUMN;
hasSubCol = true;
} else if(column.equalsIgnoreCase(CASSANDRA_SPECIAL_COLUMN_VAL)){
transposedMapping += ","+CASSANDRA_VALUE_COLUMN;
hasVal = true;
} else {
isTransposedTable = false;
break;
}
}
if(isTransposedTable && !(colNames.length == 1 && hasKey)){
if(!hasKey || !hasVal || !hasCol ) {
throw new IllegalArgumentException("Transposed table definition missing required fields!");
}
return transposedMapping.substring(1);//skip leading ,
}
//Regular non-transposed logic. The first column maps to the key automatically.
StringBuilder mappingStr = new StringBuilder(CASSANDRA_KEY_COLUMN);
for (int i = 1; i < colNames.length; i++) {
mappingStr.append(",");
mappingStr.append(colNames[i]);
}
return mappingStr.toString();
}
/*
* Creates the cassandra column mappings from the hive column names.
* This would be triggered when no cassandra.columns.mapping has been defined
* in the user query.
*
* row_key is a special column name, it maps to the key of a row in cassandra;
* column_name maps to the name of a column/supercolumn;
* value maps to the value of a column;
* sub_column_name maps to the name of a column (This can only be used for a super column family.)
*
* @param tblColumnStr hive table column names
*/
public static String createColumnMappingString(String tblColumnStr) {
if(StringUtils.isBlank(tblColumnStr)) {
throw new IllegalArgumentException("table must have columns");
}
String[] colNames = tblColumnStr.split(",");
return createColumnMappingString(colNames);
}
/**
* Parse cassandra column family name from table properties.
*
* @param tbl table properties
* @return cassandra column family name
* @throws SerDeException error parsing column family name
*/
protected String getCassandraColumnFamily(Properties tbl) throws SerDeException {
String result = tbl.getProperty(CASSANDRA_CF_NAME);
if (result == null) {
result = tbl
.getProperty(org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_NAME);
if (result == null) {
throw new SerDeException("CassandraColumnFamily not defined" + tbl.toString());
}
if (result.indexOf(".") != -1) {
result = result.substring(result.indexOf(".") + 1);
}
}
return result;
}
/**
* Parse the column mappping from table properties. If cassandra.columns.mapping
* is defined in the property, use it to create the mapping. Otherwise, create the mapping from table
* columns using the default mapping.
*
* @param tbl table properties
* @return A list of column names
* @throws SerDeException
*/
protected List<String> parseOrCreateColumnMapping(Properties tbl) throws SerDeException {
String prop = tbl.getProperty(CASSANDRA_COL_MAPPING);
if (prop != null) {
return parseColumnMapping(prop);
} else {
String tblColumnStr = tbl.getProperty(Constants.LIST_COLUMNS);
if (tblColumnStr != null) {
//auto-create
String mappingStr = createColumnMappingString(tblColumnStr);
if (LOG.isDebugEnabled()) {
LOG.debug("table column string: " + tblColumnStr);
LOG.debug("Auto-created mapping string: " + mappingStr);
}
return Arrays.asList(mappingStr.split(","));
} else {
throw new SerDeException("Can't find table column definitions");
}
}
}
/**
* Set the table mapping. We only support transposed mapping and regular table mapping for now.
*
* @throws SerDeException
*/
protected void setTableMapping() throws SerDeException {
if (isTransposed(cassandraColumnNames)) {
mapping = new TransposedMapping(cassandraColumnFamily, cassandraColumnNames, serdeParams);
} else {
mapping = new RegularTableMapping(cassandraColumnFamily, cassandraColumnNames, serdeParams);
}
}
/**
* Trim the white spaces, new lines from the input array.
*
* @param input a input string array
* @return a trimmed string array
*/
protected static String[] trim(String[] input) {
String[] trimmed = new String[input.length];
for (int i = 0; i < input.length; i++) {
trimmed[i] = input[i].trim();
}
return trimmed;
}
/**
* Return if a table is a transposed. A table is transposed when the column mapping is like
* (:key, :column, :value) or (:key, :column, :subcolumn, :value).
*
* @param column mapping
* @return true if a table is transposed, otherwise false
*/
public static boolean isTransposed(List<String> columnNames)
{
if(columnNames == null || columnNames.size() == 0) {
throw new IllegalArgumentException("no cassandra column information found");
}
boolean hasKey = false;
boolean hasColumn = false;
boolean hasValue = false;
boolean hasSubColumn = false;
for (String column : columnNames) {
if (column.equalsIgnoreCase(CASSANDRA_KEY_COLUMN)) {
hasKey = true;
} else if (column.equalsIgnoreCase(CASSANDRA_COLUMN_COLUMN)) {
hasColumn = true;
} else if (column.equalsIgnoreCase(CASSANDRA_SUBCOLUMN_COLUMN)) {
hasSubColumn = true;
} else if (column.equalsIgnoreCase(CASSANDRA_VALUE_COLUMN)) {
hasValue = true;
} else {
return false;
}
}
//only requested row key
if(columnNames.size() == 1 && hasKey) {
return false;
}
if(!hasKey || !hasValue || !hasColumn) {
return false;
}
return true;
}
/**
* @return 0-based offset of the key column within the table
*/
public int getKeyColumnOffset() {
return iKey;
}
protected class ColumnData {
}
@Override
public SerDeStats getSerDeStats() {
// TODO Auto-generated method stub
return null;
}
}