/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.operator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;
import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.ColumnType;
import com.linkedin.cubert.block.DataType;
/**
 * Tuple operator that flattens nested columns of its (single) input block.
 * <p>
 * The columns to flatten are listed under {@code "genExpressions"} in the operator
 * JSON. Each entry names an input column ({@code "col"}), a {@link FlattenType}
 * ({@code "flatten"}) and, optionally, the names and types of the produced columns
 * ({@code "output"}).
 * <p>
 * Flattening happens in two stages for every input tuple:
 * <ol>
 * <li>{@link #flattenBagNext()} expands bag columns. When several bag columns are
 * flattened, one tuple is emitted per combination of their elements (the iterators
 * are advanced like the wheels of an odometer).</li>
 * <li>{@link #tupleFlatten(Tuple)} splices the fields of tuple columns into the
 * output tuple.</li>
 * </ol>
 * A null or empty bag is treated as a bag holding one empty tuple, so the input row
 * is still emitted once, with nulls for the flattened fields.
 */
public class FlattenBagOperator implements TupleOperator
{
    /**
     * How a column is flattened: {@code TUPLE} splices the fields of a tuple,
     * {@code BAG} emits one row per bag element, and {@code BAG_TUPLE} does both.
     */
    public static enum FlattenType
    {
        TUPLE(0), BAG(1), BAG_TUPLE(2);

        private final int value;

        private FlattenType(int value)
        {
            this.value = value;
        }

        public int getValue()
        {
            return value;
        }

        /**
         * Looks up a flatten type by name, ignoring case.
         *
         * @param s the type name
         * @return the matching constant
         * @throws IllegalArgumentException if {@code s} is null or matches no constant
         */
        public static FlattenType fromString(String s) throws IllegalArgumentException
        {
            for (FlattenType ftype : FlattenType.values())
            {
                // constant on the left: a null argument falls through to the
                // IllegalArgumentException below instead of raising an NPE
                if (ftype.toString().equalsIgnoreCase(s))
                    return ftype;
            }
            throw new IllegalArgumentException("Unmatched FlattenType");
        }
    }

    // input column positions that are being flattened (sorted ascending)
    private final Set<Integer> columnIndexArray = new TreeSet<Integer>();

    // flatten type, keyed by input column position
    private final HashMap<Integer, FlattenType> flattenPositions =
            new HashMap<Integer, FlattenType>();

    // most recently emitted bag-expanded tuple for the current input tuple
    private Tuple outTuple = null;

    // one iterator per bag-flattened column, in ascending column order
    private final ArrayList<Iterator<Tuple>> odometerIterators =
            new ArrayList<Iterator<Tuple>>();

    private Block inputBlock;
    private Tuple inTuple;
    private BlockSchema outSchema;

    // output column definitions, keyed by the name of the flattened input column
    private final Map<String, List<ColumnType>> outputColumnTypeMap =
            new HashMap<String, List<ColumnType>>();

    private final Set<String> flattenColumnNameSet = new HashSet<String>();

    // number of fields in a fully flattened output tuple
    private int resultFieldCount;
    private BlockSchema inSchema;

    // stand-in for null and empty bags: a bag holding a single empty tuple
    private DataBag nullBag;

    /**
     * Binds the operator to its input block and parses the operator JSON.
     *
     * @param input the input blocks; only the first one is used
     * @param operatorJson the operator definition (must contain "genExpressions")
     * @param props not used by this operator
     */
    @Override
    public void setInput(Map<String, Block> input,
                         JsonNode operatorJson,
                         BlockProperties props) throws IOException,
            InterruptedException
    {
        inputBlock = input.values().iterator().next();
        BlockSchema inputSchema = inputBlock.getProperties().getSchema();
        init(operatorJson, inputSchema);
        createNullBag();
    }

    /** Creates the one-element bag (holding an empty tuple) used for null and empty bags. */
    private void createNullBag()
    {
        nullBag = BagFactory.getInstance().newDefaultBag();
        Tuple emptyTuple = TupleFactory.getInstance().newTuple(0);
        nullBag.add(emptyTuple);
    }

    /**
     * Parses the "genExpressions" list of the operator JSON, records which columns
     * are flattened and how, and derives the output schema.
     */
    private void init(JsonNode operatorJson, BlockSchema inputSchema)
    {
        JsonNode exprListNode = operatorJson.get("genExpressions");
        Iterator<JsonNode> it = exprListNode.getElements();
        while (it.hasNext())
        {
            JsonNode e = it.next();
            String colName = e.get("col").getTextValue();
            int columnId = inputSchema.getIndex(colName);

            JsonNode flattenNode = e.get("flatten");
            if (flattenNode == null)
            {
                // No flatten type: the runtime path copies such a column through
                // unchanged, so it must not be expanded in the output schema either.
                continue;
            }

            flattenColumnNameSet.add(colName);
            columnIndexArray.add(columnId);
            flattenPositions.put(columnId,
                                 FlattenType.fromString(flattenNode.getTextValue()));

            // output column definitions; left empty when "output" is absent, in
            // which case generateOutSchema falls back to the column's own schema
            List<ColumnType> outputColumTypeList = new ArrayList<ColumnType>();
            outputColumnTypeMap.put(colName, outputColumTypeList);
            JsonNode outputNode = e.get("output");
            if (outputNode != null)
            {
                for (JsonNode j : outputNode)
                {
                    String outColName = j.get("col").getTextValue();
                    DataType outType = DataType.valueOf(j.get("type").getTextValue());
                    outputColumTypeList.add(new ColumnType(outColName, outType));
                }
            }
        }
        this.outSchema = generateOutSchema(inputSchema);
    }

    /**
     * Builds the output schema: flattened columns are replaced by their declared
     * (or schema-derived) output columns, all other columns are kept as they are.
     * Also records the input schema and the resulting field count.
     *
     * @throws RuntimeException if a flattened column has no nested schema, or if its
     *         declared output columns do not match the nested schema in number
     */
    private BlockSchema generateOutSchema(BlockSchema inputSchema)
    {
        inSchema = inputSchema;
        List<ColumnType> colTypeList = new ArrayList<ColumnType>();
        resultFieldCount = 0;

        for (ColumnType ct : inSchema.getColumnTypes())
        {
            String colName = ct.getName();
            if (!flattenColumnNameSet.contains(colName))
            {
                colTypeList.add(ct);
                resultFieldCount++;
                continue;
            }

            BlockSchema columnSchema = ct.getColumnSchema();
            FlattenType ftype = flattenPositions.get(inputSchema.getIndex(colName));

            // handle nested tuple schema for bags. [BAG [TUPLE [x, y]]]
            // Guarded against a missing schema so that the checks below can report
            // it with a descriptive message.
            if (columnSchema != null && ftype == FlattenType.BAG_TUPLE)
            {
                ColumnType[] ctypes = columnSchema.getColumnTypes();
                // Unnest the tuple schema into columnSchema.
                if (ctypes != null && ctypes.length == 1
                        && ctypes[0].getType() == DataType.TUPLE)
                    columnSchema = ctypes[0].getColumnSchema();
            }

            List<ColumnType> outputColTypeDef = outputColumnTypeMap.get(colName);
            if (outputColTypeDef != null && !outputColTypeDef.isEmpty())
            {
                if (columnSchema == null || columnSchema.getColumnTypes() == null)
                    throw new RuntimeException("invalid schema for column = " + colName + " columnSchema = " + columnSchema);
                if (outputColTypeDef.size() != columnSchema.getColumnTypes().length)
                    throw new RuntimeException("Output column specification does not match number of input fields for "
                            + colName + " outputColTypeDef size = " + outputColTypeDef.size() + " columnSchema = " + columnSchema);
                colTypeList.addAll(outputColTypeDef);
            }
            else
            {
                // no explicit output columns: reuse the nested schema as it is
                if (columnSchema == null)
                    throw new RuntimeException("Schema is unkown for col: " + colName);

                List<ColumnType> subColTypes =
                        Arrays.asList(columnSchema.getColumnTypes());
                outputColTypeDef = new ArrayList<ColumnType>(subColTypes);
                colTypeList.addAll(subColTypes);
                outputColumnTypeMap.put(colName, outputColTypeDef);
            }
            resultFieldCount += outputColTypeDef.size();
        }
        return new BlockSchema(colTypeList.toArray(new ColumnType[0]));
    }

    /**
     * Returns the next fully flattened tuple, or null when the input is exhausted.
     */
    @Override
    public Tuple next() throws IOException,
            InterruptedException
    {
        Tuple bagFlattened = flattenBagNext();
        if (bagFlattened == null)
            return null;
        return tupleFlatten(bagFlattened);
    }

    /**
     * Splices the fields of every TUPLE / BAG_TUPLE column into a new output tuple;
     * all other columns are copied through unchanged.
     *
     * @param inTuple a tuple laid out like the input schema, with bag columns
     *        already replaced by a single bag element
     * @return a new tuple with {@code resultFieldCount} fields
     * @throws RuntimeException if fewer fields were produced than the output schema has
     */
    public Tuple tupleFlatten(Tuple inTuple) throws IOException,
            InterruptedException
    {
        Tuple flattened = TupleFactory.getInstance().newTuple(resultFieldCount);

        // for each position, retrieve either column value or flattened value
        int outidx = 0;
        for (int colId = 0; colId < inTuple.size(); colId++)
        {
            Object o = inTuple.get(colId);
            if (!isFlattenTuple(flattenPositions.get(colId)))
            {
                flattened.set(outidx++, o);
                continue;
            }

            String inputColumnName = inSchema.getColumnNames()[colId];
            int nColumnFields = this.outputColumnTypeMap.get(inputColumnName).size();
            Tuple t = (Tuple) o;
            // a null or empty tuple yields nulls for all of its output columns
            boolean empty = (t == null || t.size() == 0);
            for (int i = 0; i < nColumnFields; i++)
                flattened.set(outidx++, empty ? null : t.get(i));
        }

        if (outidx < resultFieldCount)
            throw new RuntimeException(String.format("FlattenTuple: found fewer fields than expected=%d, found=%d",
                                                     resultFieldCount,
                                                     outidx));
        return flattened;
    }

    /**
     * Returns the next bag-expanded tuple: the next combination of bag elements for
     * the current input tuple or, once those are used up, the first combination for
     * the next input tuple.
     *
     * @return the tuple, or null when the input block has no more tuples
     */
    public Tuple flattenBagNext() throws IOException,
            InterruptedException
    {
        if (outTuple != null)
        {
            Tuple t = currentTupleNext();
            if (t != null)
                return t;
        }

        Tuple nextInput = inputBlock.next();
        if (nextInput == null)
        {
            // Forget the finished tuple so that a further call asks the input block
            // again instead of replaying combinations of the last input tuple.
            outTuple = null;
            return null;
        }
        initCurrentTuple(nextInput);
        return this.outTuple;
    }

    /** True for BAG and BAG_TUPLE; false otherwise, including null. */
    private boolean isFlattenBag(FlattenType ftype)
    {
        return ftype == FlattenType.BAG || ftype == FlattenType.BAG_TUPLE;
    }

    /** True for TUPLE and BAG_TUPLE; false otherwise, including null. */
    private boolean isFlattenTuple(FlattenType ftype)
    {
        return ftype == FlattenType.TUPLE || ftype == FlattenType.BAG_TUPLE;
    }

    /**
     * Returns a fresh iterator over the given bag. Null and empty bags are dealt
     * with as if they contained a single empty tuple, so the returned iterator
     * always has at least one element.
     */
    private Iterator<Tuple> bagIterator(DataBag dbag)
    {
        if (dbag == null || dbag.size() == 0)
            return nullBag.iterator();
        return dbag.iterator();
    }

    /**
     * Makes {@code inTuple} the current input tuple: opens one iterator per
     * bag-flattened column and seeds {@code outTuple} with the first combination.
     */
    private void initCurrentTuple(Tuple inTuple) throws ExecException
    {
        this.inTuple = inTuple;
        this.odometerIterators.clear();
        for (int columnId : columnIndexArray)
        {
            if (!isFlattenBag(flattenPositions.get(columnId)))
                continue;
            odometerIterators.add(bagIterator((DataBag) inTuple.get(columnId)));
        }
        seedOutTuple();
    }

    /**
     * Builds the first output tuple for the current input tuple: every
     * bag-flattened column is replaced by the first element of its bag, every other
     * column is copied as it is.
     */
    private void seedOutTuple() throws ExecException
    {
        this.outTuple = TupleFactory.getInstance().newTuple(inTuple.size());
        // index into odometerIterators; advances in step with the bag columns
        // because both are visited in ascending column order
        int oid = 0;
        for (int idx = 0; idx < inTuple.size(); idx++)
        {
            if (!isFlattenBag(flattenPositions.get(idx)))
            {
                this.outTuple.set(idx, this.inTuple.get(idx));
                continue;
            }
            Iterator<Tuple> tupleIt = odometerIterators.get(oid++);
            this.outTuple.set(idx, tupleIt.next());
        }
    }

    /**
     * Advances to the next combination of bag elements for the current input tuple.
     * The first bag column that still has elements is advanced; every exhausted
     * column before it is rewound to its first element.
     *
     * @return the next combination (which also becomes {@code outTuple}), or null
     *         when all combinations have been emitted
     */
    private Tuple currentTupleNext() throws ExecException
    {
        int nfields = outTuple.size();
        Tuple resultTuple = TupleFactory.getInstance().newTuple(nfields);
        // copy current outTuple to resultTuple;
        for (int idx = 0; idx < nfields; idx++)
            resultTuple.set(idx, outTuple.get(idx));

        // Figure out which of the iterators can be advanced and which needs reset.
        int oid = 0;
        for (int columnId : columnIndexArray)
        {
            if (!isFlattenBag(flattenPositions.get(columnId)))
                continue;

            Iterator<Tuple> tupleIt = odometerIterators.get(oid);
            if (tupleIt.hasNext())
            {
                resultTuple.set(columnId, tupleIt.next());
                outTuple = resultTuple;
                return resultTuple;
            }

            // Exhausted: rewind this column to its first element and carry over to
            // the next bag column. The column value has to be re-seeded here,
            // otherwise it would keep the last element of the bag and the first
            // element would be paired with the wrong combination later on.
            tupleIt = bagIterator((DataBag) this.inTuple.get(columnId));
            resultTuple.set(columnId, tupleIt.next());
            odometerIterators.set(oid, tupleIt);
            oid++;
        }
        // every column wrapped around: all combinations have been emitted
        return null;
    }

    /**
     * Derives the post condition of this operator: the flattened output schema,
     * with the partition keys and sort keys of the (first) pre condition.
     */
    @Override
    public PostCondition getPostCondition(Map<String, PostCondition> preConditions,
                                          JsonNode json) throws PreconditionException
    {
        PostCondition condition = preConditions.values().iterator().next();
        BlockSchema inputSchema = condition.getSchema();
        this.init(json, inputSchema);
        return new PostCondition(outSchema,
                                 condition.getPartitionKeys(),
                                 condition.getSortKeys());
    }
}