/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.operator;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;
import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.ColumnType;
import com.linkedin.cubert.block.DataType;
import com.linkedin.cubert.utils.TupleUtils;
/**
 * Tuple operator that flattens a bag column of two-field tuples into two plain columns.
 *
 * <p>
 * The bag is expected to be the last column of the input. Every non-bag column is passed
 * through unchanged and the bag column is replaced by the two fields of a tuple taken
 * from the bag, so the output schema has one more column than the input schema.
 *
 * <p>
 * For each input tuple, one output tuple is emitted for the first tuple in the bag. If the
 * bag holds a second tuple, another output tuple (with the same leading columns) is
 * emitted by the following call to {@link #next()}. Bag tuples beyond the second are not
 * read.
 */
public class MedianFlattenOperator implements TupleOperator
{
    /** Message used whenever the bag column yields no usable first tuple. */
    private static final String EMPTY_BAG_MESSAGE = "Bag should not be empty";

    /** Input block that source tuples are pulled from. */
    private Block block;

    /** Output schema: the input schema with the bag column replaced by two columns. */
    private BlockSchema schema;

    /**
     * Output tuple built from the second bag entry of the most recently read input tuple,
     * waiting to be returned by the next call to {@link #next()}; {@code null} when nothing
     * is pending.
     */
    private Tuple secondOutput;

    /**
     * Binds this operator to its input block and derives the output schema from it.
     *
     * @param input map of input blocks; the first value returned by its iterator is used
     * @param json operator configuration (not read by this operator)
     * @param props block properties (not read by this operator)
     */
    @Override
    public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props) throws IOException,
            InterruptedException
    {
        block = input.values().iterator().next();
        schema = getOutputSchema(block.getProperties().getSchema());
        secondOutput = null;
    }

    /**
     * Builds the output schema: non-bag columns are kept as they are, and a bag column is
     * replaced by the two columns of the tuple type nested inside the bag.
     *
     * <p>
     * The bag is expected to be the last input column: the second flattened column is
     * written at {@code i + 1}, which a later input column would overwrite.
     *
     * @param inputSchema schema of the incoming tuples
     * @return schema with one more column than {@code inputSchema}
     */
    private BlockSchema getOutputSchema(BlockSchema inputSchema)
    {
        int numInputColumns = inputSchema.getColumnNames().length;
        ColumnType[] columns = new ColumnType[numInputColumns + 1];

        for (int i = 0; i < numInputColumns; i++)
        {
            ColumnType column = inputSchema.getColumnType(i);
            if (column.getType() != DataType.BAG)
            {
                columns[i] = column;
            }
            else
            {
                // bag -> its tuple type -> the tuple's two inner columns
                ColumnType tuple = column.getColumnSchema().getColumnType(0);
                ColumnType[] innerColumns = tuple.getColumnSchema().getColumnTypes();
                assert (innerColumns.length == 2);
                columns[i] = innerColumns[0];
                columns[i + 1] = innerColumns[1];
            }
        }

        return new BlockSchema(columns);
    }

    /**
     * Returns the next flattened tuple.
     *
     * <p>
     * If a tuple built from the second bag entry of the previous input is pending, a deep
     * copy of it is returned and the pending slot is cleared. Otherwise the next input
     * tuple is read and flattened.
     *
     * @return the next output tuple, or {@code null} once the input block returns
     *         {@code null}
     */
    @Override
    public Tuple next() throws IOException,
            InterruptedException
    {
        if (secondOutput != null)
        {
            Tuple copy = TupleUtils.getDeepCopy(secondOutput);
            secondOutput = null;
            return copy;
        }

        Tuple t = block.next();
        if (t == null)
        {
            return null;
        }

        // tupleFlatten also sets secondOutput if the bag contains more than one tuple
        return tupleFlatten(t);
    }

    /**
     * Flattens one input tuple using the first tuple of its bag, and stores a second
     * flattened tuple in {@link #secondOutput} when the bag has a second tuple.
     *
     * @param inTuple input tuple whose last column is the bag
     * @return output tuple built from the first bag entry
     * @throws ExecException if reading from or writing to a tuple fails
     * @throws RuntimeException if the bag is missing, empty, or yields a null first tuple
     */
    private Tuple tupleFlatten(Tuple inTuple) throws ExecException
    {
        int outputSchemaSize = schema.getNumColumns();

        // outputSchemaSize is 1 greater than the input size and tuples are zero indexed,
        // so the bag sits at index outputSchemaSize - 2
        DataBag bag = (DataBag) inTuple.get(outputSchemaSize - 2);
        if (bag == null)
        {
            throw new RuntimeException(EMPTY_BAG_MESSAGE);
        }

        // Check hasNext() before next(): an exhausted iterator is allowed to throw
        // instead of returning null, which would bypass the null check below.
        Iterator<Tuple> bagIterator = bag.iterator();
        if (!bagIterator.hasNext())
        {
            throw new RuntimeException(EMPTY_BAG_MESSAGE);
        }

        Tuple firstTuple = bagIterator.next();
        if (firstTuple == null)
        {
            throw new RuntimeException(EMPTY_BAG_MESSAGE);
        }

        Tuple outTuple = buildOutputTuple(inTuple, firstTuple, outputSchemaSize);

        if (bagIterator.hasNext())
        {
            secondOutput = buildOutputTuple(inTuple, bagIterator.next(), outputSchemaSize);
        }

        return outTuple;
    }

    /**
     * Creates an output tuple holding the leading columns of {@code inTuple} followed by
     * the two fields of {@code pair}.
     *
     * @param inTuple input tuple; every column except the last (the bag) is copied
     * @param pair bag entry whose fields 0 and 1 fill the last two output columns
     * @param outputSchemaSize number of columns in the output tuple
     * @return the newly created output tuple
     * @throws ExecException if reading from or writing to a tuple fails
     */
    private static Tuple buildOutputTuple(Tuple inTuple, Tuple pair, int outputSchemaSize) throws ExecException
    {
        Tuple result = TupleFactory.getInstance().newTuple(outputSchemaSize);

        // last column of inTuple is the bag, so it is not copied
        int numLeadingColumns = inTuple.size() - 1;
        for (int i = 0; i < numLeadingColumns; i++)
        {
            result.set(i, inTuple.get(i));
        }

        result.set(outputSchemaSize - 2, pair.get(0));
        result.set(outputSchemaSize - 1, pair.get(1));
        return result;
    }

    /**
     * Computes the post condition from the first available pre condition: the schema is
     * replaced by the flattened output schema, while the partition keys and sort keys are
     * carried over unchanged.
     *
     * @param preConditions pre conditions of the inputs; the first value returned by its
     *        iterator is used
     * @param json operator configuration (not read by this operator)
     * @return post condition describing this operator's output
     */
    @Override
    public PostCondition getPostCondition(Map<String, PostCondition> preConditions,
                                          JsonNode json) throws PreconditionException
    {
        PostCondition condition = preConditions.values().iterator().next();
        BlockSchema inputSchema = condition.getSchema();
        return new PostCondition(getOutputSchema(inputSchema),
                                 condition.getPartitionKeys(),
                                 condition.getSortKeys());
    }
}