/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.operator;

import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.ColumnType;
import com.linkedin.cubert.block.DataType;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;
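
/**
 * A TupleOperator implementing FLATTEN over tuple- and bag-typed columns.
 * Bag columns are expanded one element per output row; when several bag
 * columns are flattened together, their cross product is generated by
 * treating the bag iterators as the digits of an odometer. Tuple columns
 * are expanded in place into their individual fields.
 *
 * Illustrative example (hypothetical data): flattening the bag column of the
 * row (1, {(x), (y)}) as BAG yields the rows (1, (x)) and (1, (y)); as
 * BAG_TUPLE, the element tuples are expanded as well, yielding (1, x) and
 * (1, y).
 */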
public class FlattenOperator2 implements TupleOperator
{
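    /**
     * TUPLE expands a nested tuple column into its fields; BAG emits one
     * output row per bag element, leaving each element tuple nested;
     * BAG_TUPLE does both: one row per bag element, with the element tuple
     * expanded into its fields.
     */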
    public enum FlattenType
{
TUPLE(0), BAG(1), BAG_TUPLE(2);
private final int value;
private FlattenType(int value)
{
this.value = value;
}
public static FlattenType fromString(String s) throws IllegalArgumentException
{
for (FlattenType ftype : FlattenType.values())
{
if (s.equalsIgnoreCase(ftype.toString()))
return ftype;
}
            throw new IllegalArgumentException("Unknown FlattenType: " + s);
}
}
    // TODO: use this data structure as the value of a TreeMap<Integer, FlattenInfo>
    // (inputColumnToFlattenInfo) to replace columnIndexArray, flattenPositions and
    // inputColumnIndexToOutputTypes
class FlattenInfo {
private FlattenType type;
private List<ColumnType> outputColumnTypes;
}
    // input column positions that are being flattened, kept in ascending order
    // so that iterators and column positions can be walked in the same order
    private final Set<Integer> columnIndexArray = new TreeSet<Integer>();
private final Map<Integer, FlattenType> flattenPositions = new HashMap<Integer, FlattenType>();
    // the current combination of bag elements for the current input tuple
    private Tuple outTuple = null;
    // TODO: refactor the odometer into a class of its own
    private final ArrayList<Iterator<Tuple>> odometerIterators = new ArrayList<Iterator<Tuple>>();
private Block inputBlock;
private Tuple inTuple;
private BlockSchema outSchema;
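    // for each flattened input column (keyed by position), the output columns it expands into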
private final Map<Integer, List<ColumnType>> inputColumnIndexToOutputTypes =
new HashMap<Integer, List<ColumnType>>();
private final Set<String> flattenColumnNameSet = new HashSet<String>();
private DataBag nullBag;
@Override
public void setInput(Map<String, Block> input,
JsonNode operatorJson,
BlockProperties props) throws IOException,
InterruptedException
{
inputBlock = input.values().iterator().next();
init(operatorJson, inputBlock.getProperties().getSchema());
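        // A bag holding a single empty tuple; it stands in for null or empty
        // bags so that flattening them still yields one output row (of nulls).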
nullBag = BagFactory.getInstance().newDefaultBag();
nullBag.add(TupleFactory.getInstance().newTuple(0));
}
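    /**
     * Parses the "genExpressions" array of the operator json. Each entry
     * names an input column ("col"), optionally a flatten type ("flatten"),
     * and optionally its output columns ("output"). A sketch of the expected
     * shape, with made-up names:
     *
     *   {"genExpressions": [{"col": "events", "flatten": "BAG_TUPLE",
     *       "output": [{"col": "ts", "type": "LONG"}]}]}
     */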
private void init(JsonNode operatorJson, BlockSchema inputSchema)
{
JsonNode exprListNode = operatorJson.get("genExpressions");
Iterator<JsonNode> it = exprListNode.getElements();
while (it.hasNext())
{
JsonNode expr = it.next();
String colName = expr.get("col").getTextValue();
int columnId = inputSchema.getIndex(colName);
flattenColumnNameSet.add(colName);
columnIndexArray.add(columnId);
if (expr.has("flatten"))
{
                final String flattenTypeStr = expr.get("flatten").getTextValue();
                flattenPositions.put(columnId, FlattenType.fromString(flattenTypeStr));
                // placeholder only; the real iterators are created per input tuple in initCurrentTuple()
                odometerIterators.add(null);
                // Extract the output column definitions from 'expr', if published there
                List<ColumnType> outputColumnTypeList = new ArrayList<ColumnType>();
                inputColumnIndexToOutputTypes.put(columnId, outputColumnTypeList);
                JsonNode outputNode = expr.get("output");
                if (outputNode != null)
                {
                    for (JsonNode j : outputNode)
                    {
                        String outColName = j.get("col").getTextValue();
                        DataType outType = DataType.valueOf(j.get("type").getTextValue());
                        outputColumnTypeList.add(new ColumnType(outColName, outType));
                    }
                }
}
}
this.outSchema = generateOutSchema(inputSchema);
}
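    /**
     * Builds the output schema: non-flattened columns are copied through
     * unchanged, and each flattened column is replaced by its output columns,
     * taken from the json when published there and from the nested input
     * column schema otherwise.
     */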
private BlockSchema generateOutSchema(BlockSchema inputSchema)
{
List<ColumnType> outputColumnTypes = new ArrayList<ColumnType>();
for (ColumnType ct : inputSchema.getColumnTypes())
{
String colName = ct.getName();
int colIndex = inputSchema.getIndex(colName);
if (! flattenColumnNameSet.contains(colName))
{
outputColumnTypes.add(ct);
}
else
{
                BlockSchema inputNestedColumnSchema = ct.getColumnSchema();
                if (inputNestedColumnSchema != null)
                {
                    // a bag schema is declared as a single tuple column; descend into the tuple's schema
                    ColumnType[] ctypes = inputNestedColumnSchema.getColumnTypes();
                    if (ctypes != null && ctypes.length == 1 && ctypes[0].getType() == DataType.TUPLE)
                        inputNestedColumnSchema = ctypes[0].getColumnSchema();
                }
                List<ColumnType> flattenedOutputColumnTypes = inputColumnIndexToOutputTypes.get(colIndex);
                if (flattenedOutputColumnTypes != null && !flattenedOutputColumnTypes.isEmpty())
                {
                    // output schema is published in the json
                    // TODO: assert that the output schema in the json matches the nested input schema for the column
                    if (inputNestedColumnSchema == null || inputNestedColumnSchema.getColumnTypes() == null)
                        throw new RuntimeException("Invalid schema for column: " + colName
                                + " column schema: " + inputNestedColumnSchema);
                    if (flattenedOutputColumnTypes.size() != inputNestedColumnSchema.getColumnTypes().length)
                        throw new RuntimeException("Output column specification does not match the number of input fields for "
                                + colName);
                }
                else
                {
                    // output schema is not published in the json; extract it from the nested input column schema
                    if (inputNestedColumnSchema == null)
                        throw new RuntimeException("Schema is unknown for column: " + colName);
                    flattenedOutputColumnTypes =
                            new ArrayList<ColumnType>(Arrays.asList(inputNestedColumnSchema.getColumnTypes()));
                    inputColumnIndexToOutputTypes.put(colIndex, flattenedOutputColumnTypes);
                }
                outputColumnTypes.addAll(flattenedOutputColumnTypes);
}
}
return new BlockSchema(outputColumnTypes.toArray(new ColumnType[0]));
}
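    /**
     * Flattening runs in two phases: first the odometer over the bag columns
     * produces the next combination of bag elements, then any tuple-typed
     * flatten columns of that combination are expanded in place.
     */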
@Override
public Tuple next() throws IOException,
InterruptedException
{
        // Phase 1: flatten bag columns (type == BAG or BAG_TUPLE)
        Tuple tuple1 = flattenBagNext();
        if (tuple1 == null) {
            // past the last input tuple
            return null;
        }
        // Phase 2: flatten tuple columns (type == TUPLE or BAG_TUPLE)
        return tupleFlatten(tuple1);
}
    public Tuple tupleFlatten(Tuple input) throws IOException
    {
        final int count = outSchema.getNumColumns();
        // TODO: can we reuse this tuple?
        Tuple tuple = TupleFactory.getInstance().newTuple(count);
        // for each input position, copy the column value or expand its fields
        int outidx = 0;
        for (int colId = 0; colId < input.size(); colId++)
        {
            Object obj = input.get(colId);
            if (!isFlattenTuple(flattenPositions.get(colId)))
            {
                // not a tuple-flatten column; copy the object through unchanged
                tuple.set(outidx++, obj);
                continue;
            }
            // the object is a nested tuple; expand its fields into the output
            Tuple nested = (Tuple) obj;
            int nColumnFields = this.inputColumnIndexToOutputTypes.get(colId).size();
            if (nested == null || nested.size() == 0)
            {
                // null or empty nested tuples expand into nulls
                for (int i = 0; i < nColumnFields; i++)
                    tuple.set(outidx++, null);
            }
            else
            {
                for (int i = 0; i < nColumnFields; i++)
                    tuple.set(outidx++, nested.get(i));
            }
        }
        if (outidx < count)
            throw new RuntimeException(String.format("FlattenTuple: produced fewer fields than expected: expected=%d, found=%d",
                                                     count,
                                                     outidx));
return tuple;
}
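    /**
     * Returns the next bag-flattened tuple: the next combination of bag
     * elements for the current input tuple, or the seed combination of a
     * freshly fetched input tuple, or null when the input block is exhausted.
     */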
public Tuple flattenBagNext() throws IOException,
InterruptedException
{
        Tuple t; // cached so that currentTupleNext() is invoked only once
        if (outTuple == null || (t = currentTupleNext()) == null)
        {
            Tuple inTuple = inputBlock.next();
            if (inTuple == null)
                return null; // input block exhausted
            initCurrentTuple(inTuple);
            return this.outTuple;
        }
        return t;
}
private boolean isFlattenBag(FlattenType ftype)
{
return (ftype == FlattenType.BAG || ftype == FlattenType.BAG_TUPLE);
}
private boolean isFlattenTuple(FlattenType ftype)
{
return (ftype == FlattenType.TUPLE || ftype == FlattenType.BAG_TUPLE);
}
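    /**
     * Starts a new odometer for the given input tuple: one iterator per
     * bag-flattened column, positioned at the start of its bag (or of the
     * single-null-tuple bag when the bag is null or empty).
     */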
private void initCurrentTuple(Tuple inTuple) throws ExecException
{
this.inTuple = inTuple;
this.odometerIterators.clear();
for (int columnId : columnIndexArray)
{
FlattenType ftype = flattenPositions.get(columnId);
if (ftype == null || !isFlattenBag(ftype))
{
continue;
}
            DataBag dbag = (DataBag) inTuple.get(columnId);
if (dbag == null || dbag.size() == 0)
{
// Null and empty bags are treated as if they contained a single null tuple.
odometerIterators.add(nullBag.iterator());
} else
{
odometerIterators.add(dbag.iterator());
}
}
        seedOutTuple();
}
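    /**
     * Builds the first combination for the current input tuple: non-flattened
     * columns are copied from the input, and every bag-flattened column takes
     * the first element of its iterator.
     */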
private void seedOutTuple() throws ExecException
{
// TODO: Can we re-use this tuple?
this.outTuple = TupleFactory.getInstance().newTuple(inTuple.size());
int oid = 0;
for (int idx = 0; idx < inTuple.size(); idx++)
{
Object o = this.inTuple.get(idx);
FlattenType ftype = flattenPositions.get(idx);
if (ftype == null || !isFlattenBag(ftype))
{
// Non-flatten element. Add it as is.
this.outTuple.set(idx, o);
continue;
}
// Extract first element from oid-th iterator. Iterator guaranteed to have
// at least one element: first element in bag or null.
Iterator<Tuple> tupleIt = odometerIterators.get(oid);
this.outTuple.set(idx, tupleIt.next());
oid++;
}
}
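    /**
     * Advances the odometer by one step. For example (hypothetical bags),
     * flattening two bag columns {(a1), (a2)} and {(b1), (b2)} produces the
     * combinations in the order (a1,b1), (a2,b1), (a1,b2), (a2,b2); null is
     * returned once the cross product is exhausted.
     */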
    private Tuple currentTupleNext() throws ExecException
    {
        int nfields = outTuple.size();
        // TODO: Can we re-use this tuple?
        Tuple resultTuple = TupleFactory.getInstance().newTuple(nfields);
        // copy the current outTuple into resultTuple
        for (int idx = 0; idx < nfields; idx++)
        {
            Object o = outTuple.get(idx);
            resultTuple.set(idx, o);
        }
        // Advance the first iterator that still has elements; every exhausted
        // iterator before it is reset and rolled back to its first element,
        // like the digits of an odometer.
        int oid = 0;
        for (int columnId : columnIndexArray)
        {
            FlattenType ftype = flattenPositions.get(columnId);
            if (ftype == null || !isFlattenBag(ftype))
                continue;
            Iterator<Tuple> tupleIt = odometerIterators.get(oid);
            if (tupleIt.hasNext())
            {
                resultTuple.set(columnId, tupleIt.next());
                outTuple = resultTuple;
                return resultTuple;
            }
            // Exhausted: reset this iterator, roll the column back to its first
            // element, and carry over to the next flattened column.
            DataBag dbag = (DataBag) this.inTuple.get(columnId);
            tupleIt = (dbag == null || dbag.size() == 0) ? nullBag.iterator() : dbag.iterator();
            odometerIterators.set(oid, tupleIt);
            resultTuple.set(columnId, tupleIt.next());
            oid++;
        }
        // all iterators exhausted: the cross product for this input tuple is complete
        return null;
}
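    /**
     * Compile-time contract: the output schema is derived from the input
     * schema and the json; partition and sort keys of the input are passed
     * through unchanged.
     */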
@Override
public PostCondition getPostCondition(Map<String, PostCondition> preConditions,
JsonNode json) throws PreconditionException
{
PostCondition condition = preConditions.values().iterator().next();
BlockSchema inputSchema = condition.getSchema();
this.init(json, inputSchema);
return new PostCondition(outSchema,
condition.getPartitionKeys(),
condition.getSortKeys());
}
}