/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.operator.cube;

import java.util.HashMap;
import java.util.Map;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.codehaus.jackson.JsonNode;

import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.DataType;
import com.linkedin.cubert.utils.CodeDictionary;
import com.linkedin.cubert.utils.JsonUtils;

/**
 * Manages the dimensions for the CUBE operator.
 * <p>
 * The primary capabilities provided by this class are:
 * <ul>
 * <li>Extracting the dimensions from the input tuple (via the
 * {@link #extractDimensionKey(Tuple)} method)</li>
 *
 * <li>Enumerating the "ancestors" for the dimension keys (via the
 * {@link #ancestors(DimensionKey)} method)</li>
 *
 * <li>Writing back the dimensions into a tuple (via the
 * {@link #outputKey(DimensionKey, Tuple)} method).</li>
 * </ul>
 * <p>
 * The {@link DimensionKey} returned by {@link #extractDimensionKey(Tuple)} and the array
 * returned by the {@code ancestors} methods are single instances owned by this object and
 * are overwritten on every call; callers that need to retain a key must copy it.
 *
 * @author Maneesh Varshney
 *
 */
public class CubeDimensions
{
    // The null bit vector is a single int with one bit per dimension, so no more than
    // this many dimensions can be represented.
    private static final int MAX_DIMENSIONS = Integer.SIZE;

    // A full cube has 2^n ancestors; 2^30 is the largest power of two that fits in a
    // positive int (and therefore in a Java array length).
    private static final int MAX_FULL_CUBE_DIMENSIONS = 30;

    /*
     * Meta data for dimensions (index in input schema, data type, and offset in the
     * dimension key)
     */

    // the index of dimension columns in the input tuple
    private final int[] inputIndex;

    // the index of the dimension columns in the output tuple
    private final int[] outputIndex;

    // the data type of dimension columns
    private final DataType[] dimensionTypes;

    // the offset of the dimension columns in the int[] array within the {@link
    // DimensionKey}
    private final int[] dimensionOffsets;

    /* information regarding ancestors */

    // total number of ancestors
    private final int numAncestors;

    // is this a full cube
    private final boolean isFullCube;

    // For an ancestor, certain dimensions have "ALL" semantics. In the DimensionKey these
    // dimensions store 0 as their values. The following two arrays provide a fast way to
    // zero out these ALL dimensions

    // For each ancestor, how many ints within the DimensionKey have to be zeroed out (the
    // length of this array is equal to the number of ancestors)
    private final int[] zeroedDimArrayLength;

    // For each ancestor, the actual indexes within the DimensionKey that have to be
    // zeroed out.
    private final byte[][] zeroedDimIndex;

    /* singleton copies of dimension key and ancestors */
    private final DimensionKey key;
    private final DimensionKey[] ancestors;

    // The last int in the DimensionKey stores the nullBitVector. This variable refers to
    // the index to this int within the array
    private final int nullBitVectorIndex;

    // dictionary for string dimensions (null entries for non-string dimensions)
    private final CodeDictionary[] dictionaries;

    /**
     * Builds the dimension metadata and pre-computes the ancestors' null bit vectors.
     *
     * @param inputSchema
     *            schema used to look up the index and type of each dimension column
     * @param outputSchema
     *            schema used to look up the output index of each dimension column
     * @param dimensions
     *            names of the dimension columns
     * @param groupingSetsJson
     *            the grouping sets; when {@code null}, JSON null, or empty a full cube
     *            (2^n ancestors) is generated. Otherwise each entry is a comma separated
     *            list of dimension names (the empty string rolls up every dimension).
     * @throws IllegalArgumentException
     *             if there are more dimensions than bits in the null bit vector, if a
     *             full cube would have more ancestors than fit in an int, or if a
     *             grouping set names a column that is not a dimension
     */
    public CubeDimensions(BlockSchema inputSchema,
                          BlockSchema outputSchema,
                          String[] dimensions,
                          JsonNode groupingSetsJson)
    {
        if (dimensions.length > MAX_DIMENSIONS)
            throw new IllegalArgumentException("CUBE supports at most " + MAX_DIMENSIONS
                    + " dimensions, but " + dimensions.length + " were specified");

        // create the arrays
        inputIndex = new int[dimensions.length];
        outputIndex = new int[dimensions.length];
        dimensionTypes = new DataType[dimensions.length];
        dimensionOffsets = new int[dimensions.length];
        dictionaries = new CodeDictionary[dimensions.length];

        // initialize the above arrays
        int idx = 0;
        int offset = 0;
        for (String dim : dimensions)
        {
            inputIndex[idx] = inputSchema.getIndex(dim);
            outputIndex[idx] = outputSchema.getIndex(dim);
            dimensionTypes[idx] = inputSchema.getType(inputIndex[idx]);
            dimensionOffsets[idx] = offset;
            offset++;

            // pad one more int if the data type is long ("encoded" as 2 ints)
            if (dimensionTypes[idx] == DataType.LONG)
                offset++;

            // create dictionary if the dimension is string
            if (dimensionTypes[idx] == DataType.STRING)
                dictionaries[idx] = new CodeDictionary();

            idx++;
        }

        // the "last" int in the dimension key is used to store the null bit vector
        nullBitVectorIndex = offset;

        // create the dimension key
        key = new DimensionKey(nullBitVectorIndex);
        key.getArray()[nullBitVectorIndex] = 0;

        // determine if this is a full cube
        isFullCube =
                (groupingSetsJson == null) || (groupingSetsJson.isNull())
                        || groupingSetsJson.size() == 0;

        // determine the number of ancestors
        if (isFullCube)
        {
            if (dimensions.length > MAX_FULL_CUBE_DIMENSIONS)
                throw new IllegalArgumentException("A full cube supports at most "
                        + MAX_FULL_CUBE_DIMENSIONS + " dimensions, but "
                        + dimensions.length
                        + " were specified; use grouping sets instead");

            numAncestors = 1 << dimensions.length;
        }
        else
        {
            numAncestors = groupingSetsJson.size();
        }

        // allocate the ancestors
        ancestors = new DimensionKey[numAncestors];
        for (int i = 0; i < numAncestors; i++)
            ancestors[i] = new DimensionKey(nullBitVectorIndex);

        // pre-assign null bit vector for the ancestors
        assignNullBitVector(dimensions, groupingSetsJson);

        // assign zeroedDimIndex for the ancestors. At most every value slot of the key
        // (i.e. everything before the null bit vector) can be zeroed out.
        zeroedDimArrayLength = new int[numAncestors];
        zeroedDimIndex = new byte[numAncestors][nullBitVectorIndex];
        assignZeroedDimensions(dimensions);
    }

    /**
     * @return the number of ints in a dimension key: one slot per dimension (two for
     *         LONG dimensions) plus the trailing null bit vector
     */
    public int getDimensionKeyLength()
    {
        return nullBitVectorIndex + 1;
    }

    /**
     * Stores, in each ancestor key, the bit vector whose bit {@code j} is set when
     * dimension {@code j} is rolled up (output as null) for that ancestor.
     */
    private void assignNullBitVector(String[] dimensions, JsonNode groupingSetsJson)
    {
        if (isFullCube)
        {
            // Ancestor i uses the binary representation of i: a 0 bit at position j
            // means dimension j is rolled up.
            for (int i = 0; i < numAncestors; i++)
            {
                int bitmap = i;
                int nullBitVector = 0;
                for (int j = 0; j < dimensions.length; j++)
                {
                    boolean lastBitZero = (bitmap % 2 == 0);
                    bitmap >>= 1;
                    if (lastBitZero)
                        nullBitVector = nullBitVector | (1 << j);
                }
                ancestors[i].getArray()[nullBitVectorIndex] = nullBitVector;
            }
        }
        else
        {
            String[] groupingSetsInput = JsonUtils.asArray(groupingSetsJson);

            Map<String, Integer> dimensionPosition = new HashMap<String, Integer>();
            for (int i = 0; i < dimensions.length; i++)
                dimensionPosition.put(dimensions[i], i);

            for (int i = 0; i < groupingSetsInput.length; i++)
            {
                // start with every bit set (all dimensions rolled up) and clear the bit
                // of each dimension that is part of the grouping set
                int nullBitVector = -1;

                // an empty grouping set keeps all bits set: every dimension is rolled up
                if (!groupingSetsInput[i].equals(""))
                {
                    String[] fields = groupingSetsInput[i].split(",");
                    for (String field : fields)
                    {
                        int position =
                                positionOf(dimensionPosition,
                                           field,
                                           groupingSetsInput[i]);
                        nullBitVector &= ~(1 << position);
                    }
                }
                ancestors[i].getArray()[nullBitVectorIndex] = nullBitVector;
            }
        }
    }

    /**
     * Resolves a grouping-set field to its dimension position. The exact name is tried
     * first, then the name with surrounding whitespace removed.
     *
     * @throws IllegalArgumentException
     *             if the field is not one of the dimensions
     */
    private static int positionOf(Map<String, Integer> dimensionPosition,
                                  String field,
                                  String groupingSet)
    {
        Integer position = dimensionPosition.get(field);
        if (position == null)
            position = dimensionPosition.get(field.trim());

        if (position == null)
            throw new IllegalArgumentException("Grouping set [" + groupingSet
                    + "] refers to [" + field.trim()
                    + "], which is not one of the dimensions "
                    + dimensionPosition.keySet());

        return position;
    }

    /**
     * For each ancestor, records which int slots of the dimension key must be zeroed
     * out, based on the ancestor's pre-assigned null bit vector.
     */
    private void assignZeroedDimensions(String[] dimensions)
    {
        for (int i = 0; i < numAncestors; i++)
        {
            int nullBitVector = ancestors[i].getArray()[nullBitVectorIndex];
            int idx = 0;
            for (int j = 0; j < dimensions.length; j++)
            {
                boolean isDimZeroed = (nullBitVector & (1 << j)) != 0;
                if (isDimZeroed)
                {
                    zeroedDimIndex[i][idx++] = (byte) dimensionOffsets[j];

                    // a LONG dimension occupies two consecutive ints
                    if (dimensionTypes[j] == DataType.LONG)
                        zeroedDimIndex[i][idx++] = (byte) (dimensionOffsets[j] + 1);
                }
            }
            zeroedDimArrayLength[i] = idx;
        }
    }

    /**
     * Encodes the dimension columns of the tuple into the shared dimension key.
     *
     * @param tuple
     *            the input tuple
     * @return the shared key instance, overwritten by the next call to this method
     * @throws ExecException
     *             if reading a field from the tuple fails
     * @throws RuntimeException
     *             if a dimension value is null or its type is not supported
     */
    public DimensionKey extractDimensionKey(Tuple tuple) throws ExecException
    {
        int[] array = key.getArray();
        for (int i = 0; i < inputIndex.length; i++)
        {
            Object dim = tuple.get(inputIndex[i]);
            if (dim == null)
                throw new RuntimeException("Dimension is null for tuple " + tuple);

            switch (dimensionTypes[i])
            {
            case BOOLEAN:
                array[dimensionOffsets[i]] = ((Boolean) dim) ? 1 : 0;
                break;
            case INT:
                array[dimensionOffsets[i]] = ((Number) dim).intValue();
                break;
            case LONG:
            {
                long val = ((Number) dim).longValue();
                array[dimensionOffsets[i]] = (int) (val >> 32); // upper 32 bits
                array[dimensionOffsets[i] + 1] = (int) val; // lower 32 bits
                break;
            }
            case STRING:
            {
                CodeDictionary dict = dictionaries[i];
                int code = dict.getCodeForKey((String) dim);

                // -1 is treated as "not in the dictionary yet"
                if (code == -1)
                    code = dict.addKey((String) dim);
                array[dimensionOffsets[i]] = code;
                break;
            }
            default:
                throw new RuntimeException("Type of dimension is not BOOLEAN, INT, LONG or STRING for tuple "
                        + tuple + " at col " + i);
            }
        }

        return key;
    }

    /**
     * Fills in the ancestor keys for the given key.
     *
     * @param key
     *            the dimension key whose ancestors are required
     * @return the shared ancestor array, overwritten by the next call to this method
     */
    public DimensionKey[] ancestors(DimensionKey key)
    {
        for (int i = 0; i < numAncestors; i++)
        {
            int[] array = ancestors[i].getArray();

            // copy the dimension values from the original key; the ancestor's own
            // pre-assigned null bit vector (the last int) is left untouched
            System.arraycopy(key.getArray(), 0, array, 0, nullBitVectorIndex);

            // zero out the dimensions that have null bit set
            for (int j = 0; j < zeroedDimArrayLength[i]; j++)
                array[zeroedDimIndex[i][j]] = 0;
        }

        return ancestors;
    }

    /**
     * Convenience method: extracts the dimension key from the tuple and fills in its
     * ancestors.
     *
     * @param tuple
     *            the input tuple
     * @return the shared ancestor array, overwritten by the next call
     * @throws ExecException
     *             if reading a field from the tuple fails
     */
    public DimensionKey[] ancestors(Tuple tuple) throws ExecException
    {
        return ancestors(extractDimensionKey(tuple));
    }

    /**
     * Decodes the dimension key into the dimension columns of the output tuple.
     * Dimensions whose bit is set in the key's null bit vector are written as null.
     *
     * @param key
     *            the dimension key to decode
     * @param outputTuple
     *            the tuple that receives the dimension values
     * @throws ExecException
     *             if writing a field to the tuple fails
     * @throws RuntimeException
     *             if the type of a dimension is not supported
     */
    public void outputKey(DimensionKey key, Tuple outputTuple) throws ExecException
    {
        int[] array = key.getArray();
        int nullBitVector = array[nullBitVectorIndex];

        for (int dim = 0; dim < outputIndex.length; dim++)
        {
            int dimIndex = outputIndex[dim];
            boolean isNull = (nullBitVector & (1 << dim)) != 0;

            if (isNull)
            {
                outputTuple.set(dimIndex, null);
                continue;
            }

            switch (dimensionTypes[dim])
            {
            case BOOLEAN:
                outputTuple.set(dimIndex, array[dimensionOffsets[dim]] == 1);
                break;
            case INT:
                outputTuple.set(dimIndex, array[dimensionOffsets[dim]]);
                break;
            case LONG:
            {
                // reassemble the long from its upper and lower 32-bit halves; the
                // mask prevents sign extension of the lower half
                long val = (long) (array[dimensionOffsets[dim]]);
                val = (val << 32) | (array[dimensionOffsets[dim] + 1] & 0xFFFFFFFFL);
                outputTuple.set(dimIndex, val);
                break;
            }
            case STRING:
            {
                CodeDictionary dict = dictionaries[dim];
                String str = dict.getValueForCode(array[dimensionOffsets[dim]]);
                outputTuple.set(dimIndex, str);
                break;
            }
            default:
                throw new RuntimeException("Type of dimension is not BOOLEAN, INT, LONG or STRING for dimension "
                        + dim + " (type " + dimensionTypes[dim] + ")");
            }
        }
    }
}