/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.operator;
import java.io.IOException;
import java.util.Arrays;
import java.util.Map;
import com.linkedin.cubert.block.RowPivotedBlock;
import com.linkedin.cubert.utils.CommonUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.pig.data.Tuple;
import org.codehaus.jackson.JsonNode;
import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.PivotedBlock;
import com.linkedin.cubert.block.TupleStoreBlock;
import com.linkedin.cubert.utils.JsonUtils;
import com.linkedin.cubert.utils.RawTupleStore;
import com.linkedin.cubert.utils.SerializedTupleStore;
import com.linkedin.cubert.utils.TupleStore;
/**
* Pivots a block on specified keys.
* <p/>
* This is a block operator -- that is, an operator that generates multiple blocks as
* output. The number of output blocks is equal to the number of distinct pivot keys found
* in the input block.
* <p/>
* This operator can operate either in a streaming fashion (reads one tuple at a time from
* the input source), or in a bulk-load manner (loads all input data in memory first). This
* behavior is controlled by the "inMemory" boolean flag in the JSON.
* <p/>
* If the data is bulk loaded in memory (when inMemory=true in JSON), this operator
* exhibits an "auto rewind" behavior. The data can be chosen to be serialized to reduce
* memory usage, or left unserialized for better speed.
* <p/>
* Auto Rewind: In standard use, a block generates tuples (in the next() method) and
* finally, when it cannot generate any more data, it returns null. If the next() method
* were to be called from this point on, the block will keep returning null,
* indicating that there is no data to return. In the auto-rewind case, however, the block
* will rewind to the beginning of the buffer once it has exhausted all tuples. That is, the
* block will first return tuples in the next() method, and when no more data is
* available, it will return null. After returning null, this block will then rewind the
* in-memory buffer. Therefore, if a next() call were to be made now, the first tuple will
* be returned.
* <p/>
* Note that auto-rewind is possible only when this operator bulk loads all input in
* memory.
*
* @author Maneesh Varshney
*/
public class PivotBlockOperator implements BlockOperator
{
    // Block handed out by next(): either the raw input or a pivoted wrapper around it.
    private Block sourceBlock;
    // When true, every output block is copied into a TupleStore before being returned.
    private boolean inMemory = false;
    // True until next() has produced its first block for the current input.
    private boolean firstBlock = true;
    // Selects SerializedTupleStore over RawTupleStore in loadInMemory(); never assigned
    // anywhere in this class, so RawTupleStore is what is used here.
    private boolean serialized = false;
    // True when sourceBlock is a PivotedBlock or a RowPivotedBlock.
    private boolean pivoted = false;
    // True when the JSON configuration requested pivotType=ROW.
    private boolean byRow = false;

    /**
     * Configures this operator from its JSON description.
     * <p/>
     * Reads "pivotBy" and "inMemory" from the JSON, and, when "pivotType" is "ROW"
     * (case-insensitive), also parses "pivotValue" as the row count. The first block
     * in {@code input} is used as the source.
     *
     * @param conf  the Hadoop configuration (not used by this operator)
     * @param input the input blocks; only the first value is consumed
     * @param json  the operator configuration
     * @throws IOException          declared by the {@link BlockOperator} contract
     * @throws InterruptedException declared by the {@link BlockOperator} contract
     */
    @Override
    public void setInput(Configuration conf, Map<String, Block> input, JsonNode json)
            throws IOException, InterruptedException
    {
        Block inputBlock = input.values().iterator().next();
        String[] pivotBy = JsonUtils.asArray(json, "pivotBy");
        boolean loadInMemory = isInMemory(json);

        // Assign (rather than only ever set to true) so that a previous ROW
        // configuration cannot leak into a later non-ROW one.
        byRow = isRowPivot(json);
        long pivotRowCount = byRow ? Long.parseLong(JsonUtils.getText(json, "pivotValue")) : 0L;

        setInput(inputBlock, pivotBy, loadInMemory, pivotRowCount);
    }

    /**
     * Configures this operator programmatically, with a pivot row count of zero.
     *
     * @param block    the source block
     * @param pivotBy  the columns to pivot on; null or empty means no key pivoting
     * @param inMemory whether each output block is bulk loaded in memory
     * @throws IOException          propagated from the four-argument overload
     * @throws InterruptedException propagated from the four-argument overload
     */
    public void setInput(Block block, String[] pivotBy, boolean inMemory) throws IOException, InterruptedException
    {
        setInput(block, pivotBy, inMemory, 0L);
    }

    /**
     * Configures this operator programmatically.
     * <p/>
     * If row pivoting was requested by the JSON configuration, the source is wrapped
     * in a {@link RowPivotedBlock} built with {@code pivotRowCount}. Otherwise, if
     * {@code pivotBy} is non-empty, it is wrapped in a {@link PivotedBlock}. Otherwise
     * the block is used as is and {@link #next()} yields it exactly once.
     *
     * @param block         the source block
     * @param pivotBy       the columns to pivot on; null or empty means no key pivoting
     * @param inMemory      whether each output block is bulk loaded in memory
     * @param pivotRowCount the row count handed to {@link RowPivotedBlock}; only
     *                      consulted when row pivoting is active
     * @throws IOException          if wrapping the block fails
     * @throws InterruptedException if wrapping the block is interrupted
     */
    public void setInput(Block block, String[] pivotBy, boolean inMemory, long pivotRowCount)
            throws IOException, InterruptedException
    {
        // Start from a clean slate: stale flags from an earlier setInput() would
        // otherwise make next() skip the first block or cast a plain block to a
        // pivoted type.
        firstBlock = true;
        pivoted = false;

        if (byRow)
        {
            sourceBlock = new RowPivotedBlock(block, pivotRowCount);
            pivoted = true;
        }
        // getPostCondition() accepts a null pivotBy, so it must be tolerated here too.
        else if (pivotBy != null && pivotBy.length > 0)
        {
            sourceBlock = new PivotedBlock(block, pivotBy);
            pivoted = true;
        }
        else
        {
            sourceBlock = block;
        }
        this.inMemory = inMemory;
    }

    /**
     * Returns the next output block, or null when there are no more.
     * <p/>
     * The first call returns the source block without advancing it. Later calls
     * advance the pivot first and return null once the pivot cannot be advanced, or
     * immediately if the source is not pivoted.
     *
     * @return the next block (bulk loaded when inMemory is set), or null
     * @throws IOException          if reading from the source fails
     * @throws InterruptedException if reading from the source is interrupted
     */
    @Override
    public Block next() throws IOException, InterruptedException
    {
        if (firstBlock)
        {
            firstBlock = false;
        }
        // Only one block to return for the non-pivoted case.
        else if (!pivoted || !advancePivot())
        {
            return null;
        }

        return inMemory ? loadInMemory(sourceBlock) : sourceBlock;
    }

    // Advances the pivoted source block; returns false when it reports no further pivot.
    private boolean advancePivot() throws IOException, InterruptedException
    {
        if (byRow)
        {
            return ((RowPivotedBlock) sourceBlock).advancePivot();
        }
        return ((PivotedBlock) sourceBlock).advancePivot();
    }

    // Bulk loads the data in memory: drains the block into a TupleStore and wraps
    // that store in a TupleStoreBlock carrying the same block properties.
    private Block loadInMemory(Block block) throws IOException, InterruptedException
    {
        TupleStore store = serialized ? new SerializedTupleStore(block.getProperties().getSchema()) : new RawTupleStore(
                block.getProperties().getSchema());

        Tuple tuple;
        while ((tuple = block.next()) != null)
        {
            store.addToStore(tuple);
        }
        return new TupleStoreBlock(store, block.getProperties());
    }

    // Reads the "inMemory" flag; a missing key is treated as false (the field default)
    // instead of failing with a NullPointerException.
    private static boolean isInMemory(JsonNode json)
    {
        return json.has("inMemory") && json.get("inMemory").getBooleanValue();
    }

    // True when the JSON carries pivotType=ROW (case-insensitive).
    private static boolean isRowPivot(JsonNode json)
    {
        return json.has("pivotType") && JsonUtils.getText(json, "pivotType").equalsIgnoreCase("ROW");
    }

    /**
     * Validates the configuration and derives the post condition of this operator.
     * <p/>
     * When "pivotBy" is present, it must be a prefix of the input sort keys (as
     * decided by {@code CommonUtils.isPrefix}). The returned post condition keeps the
     * input schema and partition keys, drops the leading pivot columns from the sort
     * keys, and carries the pivot columns as its last argument. When "pivotBy" is
     * absent, the input condition is returned unchanged.
     *
     * @param preConditions the post conditions of the inputs; only the first is used
     * @param json          the operator configuration
     * @return the post condition of the pivoted output
     * @throws PreconditionException if pivotType=ROW is used without inMemory, or if
     *                               the pivot columns are not a prefix of the sort keys
     */
    @Override
    public PostCondition getPostCondition(Map<String, PostCondition> preConditions, JsonNode json)
            throws PreconditionException
    {
        boolean inMemoryRequested = isInMemory(json);
        boolean rowPivot = isRowPivot(json);

        // Currently we don't allow pivoting BY ROW that is not IN MEMORY
        if (rowPivot && !inMemoryRequested)
        {
            throw new PreconditionException(PreconditionExceptionType.INVALID_CONFIG,
                    "PIVOT BY ROW must be used in conjunction with IN MEMORY");
        }

        PostCondition preCondition = preConditions.values().iterator().next();
        String[] pivotBy = JsonUtils.asArray(json, "pivotBy");
        if (pivotBy == null)
        {
            return preCondition;
        }

        String[] inputSortKeys = preCondition.getSortKeys();
        if (!CommonUtils.isPrefix(inputSortKeys, pivotBy))
        {
            throw new PreconditionException(PreconditionExceptionType.INVALID_SORT_KEYS);
        }

        // Within a pivoted block the pivot columns are constant, so only the
        // trailing sort keys remain.
        String[] sortKeys = Arrays.copyOfRange(inputSortKeys, pivotBy.length, inputSortKeys.length);
        return new PostCondition(preCondition.getSchema(), preCondition.getPartitionKeys(), sortKeys, pivotBy);
    }
}