/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/

package com.linkedin.cubert.block;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;

import com.linkedin.cubert.io.BlockInputStream;
import com.linkedin.cubert.io.BlockSerializationType;
import com.linkedin.cubert.io.rubix.RubixMemoryBlock;
import com.linkedin.cubert.operator.PivotBlockOperator;
import com.linkedin.cubert.utils.Pair;
import com.linkedin.cubert.utils.print;
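
/**
 * Static helpers for materializing {@link Block} objects from rubix files and for
 * building per-column offset indexes over in-memory blocks.
 */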
public class BlockUtils
{
private static final int BLOCK_BUFFER_SIZE = 20 * 1024;
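    // When true, a missing index entry yields an EmptyBlock instead of an IOException.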
    private static final boolean EMPTY_FOR_MISSING = true;
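    // Cache of fully materialized block payloads, keyed by index entry, so repeated
    // in-memory loads of the same block avoid re-reading from the file system.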
private static final Map<IndexEntry, ByteBuffer> inMemoryBlockCache =
new HashMap<IndexEntry, ByteBuffer>();
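    // Cache of column indexes built by generateColumnIndex(), keyed by
    // "<blockId>:<lookupColumn>".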
private static final Map<String, Map<Object, Pair<Integer, Integer>>> columnIndexCache =
new HashMap<String, Map<Object, Pair<Integer, Integer>>>();
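
    /**
     * Loads the block described by {@code indexEntry}.
     * <p>
     * When {@code isInMemoryBlock} is true, the block's bytes are read fully into a
     * cached {@link ByteBuffer} and wrapped in a {@link RubixMemoryBlock}; otherwise
     * the block is streamed as a {@link CubertBlock}, decompressing if a compression
     * codec is configured for the file. A null {@code indexEntry} yields an
     * {@link EmptyBlock} for the missing partition.
     */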
@SuppressWarnings("unchecked")
public static Block loadBlock(BlockProperties props,
IndexEntry indexEntry,
Configuration conf,
JsonNode json,
BlockSerializationType serializationType,
boolean isInMemoryBlock) throws IOException,
ClassNotFoundException,
InstantiationException,
IllegalAccessException,
InterruptedException
{
Block block;
if (indexEntry == null)
{
            if (EMPTY_FOR_MISSING)
            {
                long blockId = props.getBlockId();
                return new EmptyBlock(props, blockId);
            }
            throw new IOException("Index entry is null");
}
// populate props
props.setBlockId(indexEntry.getBlockId());
props.setNumRecords(indexEntry.getNumRecords());
// Open the file and seek to the offset for this block
Path file = new Path(indexEntry.getFile());
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream fsin = fs.open(file, BLOCK_BUFFER_SIZE);
fsin.seek(indexEntry.getOffset());
// Gather information needed to read this block
Class<Tuple> valueClass =
(Class<Tuple>) TupleFactory.getInstance().newTuple().getClass();
CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
// Load the block now
if (isInMemoryBlock)
{
print.f("LOADING IN MEMORY the block %d", indexEntry.getBlockId());
ByteBuffer byteBuffer = inMemoryBlockCache.get(indexEntry);
if (byteBuffer == null)
{
                int read = 0;
                byte[] data = new byte[(int) indexEntry.getLength()];
                while (read != data.length)
                {
                    // Guard against premature EOF: read() returns -1 at end of stream,
                    // which would otherwise loop forever.
                    int bytesRead = fsin.read(data, read, data.length - read);
                    if (bytesRead < 0)
                        throw new IOException("Unexpected end of stream while reading block "
                                + indexEntry.getBlockId());
                    read += bytesRead;
                }
fsin.close();
byteBuffer = ByteBuffer.wrap(data);
inMemoryBlockCache.put(indexEntry, byteBuffer);
}
else
{
print.f("REUSED FROM CACHE!!");
byteBuffer.rewind();
}
block =
new RubixMemoryBlock(props,
conf,
byteBuffer,
valueClass,
codec,
serializationType);
block.configure(json);
return block;
}
else
{
print.f("STREAMING the block %d", indexEntry.getBlockId());
InputStream in = new BlockInputStream(fsin, indexEntry.getLength());
if (codec != null)
{
in = codec.createInputStream(in);
}
block =
new CubertBlock(props, new BlockIterator<Tuple>(conf,
in,
valueClass,
serializationType,
props.getSchema()));
block.configure(json);
print.f("Loaded block id=%d from file=%s offset=%d length=%d",
indexEntry.getBlockId(),
file.toString(),
indexEntry.getOffset(),
indexEntry.getLength());
return block;
}
}
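
    /**
     * Derives a non-negative block id from the hash code of a partition key. The int
     * hash code is widened to long before negation, so Integer.MIN_VALUE is handled
     * correctly.
     */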
public static long getBlockId(Tuple partitionKey)
{
long h = partitionKey.hashCode();
return (h < 0 ? -h : h);
}
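
    /**
     * Builds (and caches) an index mapping each distinct value of
     * {@code lookupColumn} to the [start, end) range of tuple positions it occupies
     * inside {@code inputBlock}. The block is pivoted on the lookup column, so each
     * range corresponds to a consecutive run of equal values; this assumes the
     * block's tuples are grouped (e.g., sorted) on that column.
     */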
public static Map<Object, Pair<Integer, Integer>> generateColumnIndex(RubixMemoryBlock inputBlock,
String lookupColumn) throws IOException,
InterruptedException,
ExecException
{
String indexKey = inputBlock.getProperties().getBlockId() + ":" + lookupColumn;
        Map<Object, Pair<Integer, Integer>> cached = columnIndexCache.get(indexKey);
        if (cached != null)
        {
            System.out.println("CACHED COLUMN INDEX for " + indexKey);
            return cached;
        }
int lookupColumnIndex =
inputBlock.getProperties().getSchema().getIndex(lookupColumn);
PivotBlockOperator pivotedBlock = new PivotBlockOperator();
        String[] lookupColumns = { lookupColumn };
pivotedBlock.setInput(inputBlock, lookupColumns, false);
Map<Object, Pair<Integer, Integer>> coord2offsets =
new HashMap<Object, Pair<Integer, Integer>>();
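        // Walk the pivoted sub-blocks: each one holds the consecutive run of tuples
        // sharing a single value of the lookup column; record the tuple-position
        // range covered by each run.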
while (true)
{
Block thisBlock = pivotedBlock.next();
if (thisBlock == null)
break;
            Tuple tuple = thisBlock.next();
            if (tuple == null)
                continue; // defensive: skip a sub-block that yields no tuples
Object coordinate = tuple.get(lookupColumnIndex);
int start = inputBlock.getCurrentTuplePosition();
            // Run through the rest of this sub-block (in-memory) to advance the position
while (null != (tuple = thisBlock.next()))
;
int end = inputBlock.getCurrentTuplePosition();
if (end != start)
coord2offsets.put(coordinate, new Pair<Integer, Integer>(start, end));
}
columnIndexCache.put(indexKey, coord2offsets);
return coord2offsets;
}
}