package org.apache.hadoop.hive.mastiff;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import cn.ac.ncic.mastiff.ValPair;
import cn.ac.ncic.mastiff.utils.Bytes;
/**
* Cell represents the underlying structure of a given column family (cf).
* <p>
* In the deserialization procedure, each cf is associated with a Cell object, which is used in the
* following order:
* <ol>
* <li>
* init the object with the cf's column types; this yields the exact position (offset and length)
* of each fixed length field and the index position of each var length field</li>
* <li>
* for each ValPair that corresponds to this cf, calculate the exact position of the var length
* fields</li>
* </ol>
*
* Adapted from {@link cn.ac.ncic.mastiff.hive.serde.lazy.ClusterAccessor}
*/
public class Cell {

  /**
   * Position of one field inside a {@link cn.ac.ncic.mastiff.ValPair}.
   * <p>
   * {@code offset} and {@code length} locate the field's bytes. For variable length fields,
   * {@code indirectBeginOffset} and {@code indirectEndOffset} are the positions of the short
   * values from which the field's begin and end offsets are read; {@code indirectBeginOffset}
   * is -1 for the first variable length field, whose begin offset is already known after
   * {@link Cell#init(List)}. For fixed length fields both indirect members keep their default
   * value.
   */
  static class ClmLocation {
    public int offset;
    public int length;
    public int indirectBeginOffset;
    public int indirectEndOffset;
  }

  // column types of the column family, in declaration order
  List<TypeInfo> cols;
  // one location per column, indexed like cols
  List<ClmLocation> locations;

  // column indexes grouped by the byte width reported by getDataLen
  List<Integer> aligned8;
  List<Integer> aligned4;
  List<Integer> aligned2;
  List<Integer> unaligned_fixed;
  List<Integer> unaligned_var;

  // the offset of the area that stores all the offsets of variable fields
  // (-1 when there are no variable fields)
  int firstIndirectOffset;
  // the offset of the last entry in that area (-1 when there are no variable fields)
  int lastIndirectOffset;
  // the offset of the area that stores all the variable data (-1 when there is none)
  int firstVarOffset;
  // used to calculate the offsets of each field
  int curLen;
  // the aligned bytes
  int alignedBytes;

  public Cell() {
  }

  /**
   * Get the exact position of fixed length fields and the index position of var length fields.
   * <p>
   * Offsets are assigned in this order: 8-byte fields, 4-byte fields, 2-byte fields, one
   * short-sized offset entry per var length field, 1-byte fields, and finally the start of the
   * var length data.
   *
   * @param colTypes
   *          Type infos correspond to the column family
   */
  public void init(List<TypeInfo> colTypes) {
    cols = colTypes;
    aligned8 = new ArrayList<Integer>();
    aligned4 = new ArrayList<Integer>();
    aligned2 = new ArrayList<Integer>();
    unaligned_fixed = new ArrayList<Integer>(); // Byte & Boolean
    unaligned_var = new ArrayList<Integer>(); // String
    locations = new ArrayList<ClmLocation>();
    curLen = 0;

    // Group every column index by its width and remember that width as the field length.
    for (int i = 0; i < cols.size(); i++) {
      int datalen = getDataLen(cols.get(i));
      if (datalen > 0) {
        switch (datalen) {
        case 8:
          aligned8.add(i);
          break;
        case 4:
          aligned4.add(i);
          break;
        case 2:
          aligned2.add(i);
          break;
        case 1:
          unaligned_fixed.add(i);
          break;
        }
      } else {
        // Both 0 (string) and -1 (type not listed in getDataLen) end up here.
        // NOTE(review): unlisted types are silently treated as var length - confirm intended.
        unaligned_var.add(i);
      }
      ClmLocation curLoc = new ClmLocation();
      curLoc.length = datalen;
      locations.add(curLoc);
    }

    // Calculate the aligned bytes: the widest fixed width that is present
    alignedBytes = 1;
    if (aligned8.size() > 0) {
      alignedBytes = 8;
    } else if (aligned4.size() > 0) {
      alignedBytes = 4;
    } else if (aligned2.size() > 0) {
      alignedBytes = 2;
    }

    initFixedLenObjects(aligned8);
    initFixedLenObjects(aligned4);
    initFixedLenObjects(aligned2);

    boolean hasVarLen = unaligned_var.size() != 0;

    // Reserve one short per var length field. Each field's end-offset entry doubles as the
    // begin-offset entry of the next var length field.
    firstIndirectOffset = hasVarLen ? curLen : -1;
    // Deliberately NOT named lastIndirectOffset: a local of that name would shadow the field
    // and leave the field unassigned.
    int prevIndirectOffset = -1;
    for (int col : unaligned_var) {
      ClmLocation curLoc = locations.get(col);
      curLoc.indirectBeginOffset = prevIndirectOffset;
      curLoc.indirectEndOffset = curLen;
      prevIndirectOffset = curLen;
      curLen += Bytes.SIZEOF_SHORT;
    }
    lastIndirectOffset = hasVarLen ? curLen - Bytes.SIZEOF_SHORT : -1;

    initFixedLenObjects(unaligned_fixed);

    if (hasVarLen) {
      // The first var length field starts right after all fixed size content.
      locations.get(unaligned_var.get(0)).offset = curLen;
      firstVarOffset = curLen;
      // NOTE(review): overrides any wider alignment computed above - confirm intended.
      alignedBytes = 2;
    } else {
      firstVarOffset = -1;
    }
  }

  /**
   * Get each field's location (offset and length) in the current vp.
   * <p>
   * Fixed length fields' locations have already been calculated in init, so only the var length
   * fields are updated here. The begin offset of the first var length field is the one assigned
   * in init; every other begin offset and every end offset is read as a short from
   * {@code vp.data}, at {@code vp.offset} plus the field's indirect offset.
   *
   * @param vp
   *          Current vp to be processed
   */
  public void getFieldLocation(ValPair vp) {
    for (Integer colId : unaligned_var) {
      ClmLocation curLocation = locations.get(colId);
      int beginOffset;
      if (curLocation.indirectBeginOffset == -1) { // first var length
        beginOffset = curLocation.offset;
      } else {
        beginOffset = Bytes.toShort(vp.data, vp.offset + curLocation.indirectBeginOffset);
      }
      // NOTE(review): offsets are read as signed shorts - confirm rows never exceed that range.
      int endOffset = Bytes.toShort(vp.data, vp.offset + curLocation.indirectEndOffset);
      curLocation.offset = beginOffset;
      curLocation.length = endOffset - beginOffset;
    }
  }

  /**
   * Tell whether the column family contains var length fields. Only meaningful after
   * {@link #init(List)} has been called.
   *
   * @return true if init recorded a var length data area
   */
  public boolean isVarLen() {
    return firstVarOffset != -1;
  }

  /**
   * Get the total length of the fixed length fields. Only meaningful after {@link #init(List)}
   * has been called.
   *
   * @return the accumulated length in bytes, or -1 if there are var length fields
   */
  public int getFixedLen() {
    return firstVarOffset != -1 ? -1 : curLen;
  }

  /**
   * Get type's length in SegmentFile
   *
   * @param type
   *          the column type, looked up by its type name
   * @return type's length in bytes; 0 for string; -1 for any type name not listed here
   */
  private static int getDataLen(TypeInfo type) {
    String typeName = type.getTypeName();
    if (typeName.equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
      return Bytes.SIZEOF_BOOLEAN;
    } else if (typeName.equals(serdeConstants.BINARY_TYPE_NAME)) {
      // NOTE(review): binary is given a single byte here - confirm against the writer side.
      return Bytes.SIZEOF_BYTE;
    } else if (typeName.equals(serdeConstants.SMALLINT_TYPE_NAME)) {
      return Bytes.SIZEOF_SHORT;
    } else if (typeName.equals(serdeConstants.INT_TYPE_NAME)) {
      return Bytes.SIZEOF_INT;
    } else if (typeName.equals(serdeConstants.BIGINT_TYPE_NAME)) {
      return Bytes.SIZEOF_LONG;
    } else if (typeName.equals(serdeConstants.FLOAT_TYPE_NAME)) {
      return Bytes.SIZEOF_FLOAT;
    } else if (typeName.equals(serdeConstants.DOUBLE_TYPE_NAME)) {
      return Bytes.SIZEOF_DOUBLE;
    } else if (typeName.equals(serdeConstants.STRING_TYPE_NAME)) {
      return 0;
    } else if (typeName.equals(serdeConstants.DATE_TYPE_NAME)) {
      return Bytes.SIZEOF_LONG;
    } else if (typeName.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
      return Bytes.SIZEOF_LONG;
    } else if (typeName.equals(serdeConstants.TINYINT_TYPE_NAME)) {
      return Bytes.SIZEOF_BYTE;
    } else {
      return -1;
    }
  }

  /**
   * Get the per-column locations.
   *
   * @return the internal list built by {@link #init(List)} (not a copy), indexed like the column
   *         types passed to init
   */
  public List<ClmLocation> getLocations() {
    return locations;
  }

  /**
   * Assign consecutive offsets, starting at the current length, to the given fixed length
   * columns and advance the current length by each column's length.
   *
   * @param cols
   *          indexes of the columns to lay out, in layout order
   */
  private void initFixedLenObjects(List<Integer> cols) {
    for (int col : cols) {
      ClmLocation loc = locations.get(col);
      loc.offset = curLen;
      curLen += loc.length;
    }
  }
}