/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cn.ac.ncic.mastiff.io.coding;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import FlexibleEncoding.ORC.DynamicByteArray;
import FlexibleEncoding.ORC.InStream;
import FlexibleEncoding.ORC.IntegerReader;
import FlexibleEncoding.ORC.OrcProto;
import FlexibleEncoding.ORC.RunLengthIntegerReader;
import FlexibleEncoding.ORC.RunLengthIntegerReaderV2;
import FlexibleEncoding.ORC.StreamName;
import FlexibleEncoding.Parquet.Binary;
import FlexibleEncoding.Parquet.Utils;
import cn.ac.ncic.mastiff.Chunk;
import cn.ac.ncic.mastiff.ValPair;
import cn.ac.ncic.mastiff.io.MultiChunk;
import cn.ac.ncic.mastiff.io.coding.Compression.Algorithm;
import cn.ac.ncic.mastiff.utils.Bytes;
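/**
 * Decoder for string pages written with a red-black-tree dictionary encoding
 * (ORC-style DICTIONARY_DATA / LENGTH / DATA streams). The page is decoded
 * into an in-memory layout of a 12-byte header, writeUTF-encoded values,
 * and a trailing offset index.
 */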
public class RedBlackTreeStringReader implements Decoder {
static final Log LOG = LogFactory.getLog(RedBlackTreeStringReader.class);
ValPair pair = new ValPair();
int valueLen;
DynamicByteArray dictionaryBuffer;
MultiChunk mvChunk;
MultiChunk shadowChunk = null;
int[] dictionaryOffsets;
/** Decoded page data (header, values, offset index) */
byte[] page = null;
int offset;
int compressedSize;
int decompressedSize;
private IntegerReader reader;
/** statistics of the page */
int numPairs;
int startPos;
ByteBuffer bb;
/** offset of the index area when the cluster is var-length */
int indexOffset;
// used for iteration
int curIdx = 1;
Algorithm compressAlgo;
DataInputBuffer inBuf = new DataInputBuffer();
/**
* Creates a reader for a page of string values.
* @param sortedCol the column on which the data is sorted
* @param valueLen_ the length of each value, or -1 for variable-length values
* @param algorithm the compression algorithm used to compress the data
*/
public RedBlackTreeStringReader(int sortedCol, int valueLen_, Algorithm algorithm) {
valueLen = valueLen_;
compressAlgo = algorithm;
mvChunk = new MultiChunk(sortedCol, true, true, valueLen_);
}
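// A minimal usage sketch (illustrative only; pageBytes is a hypothetical
// buffer produced by the matching writer, and the constructor arguments
// must match how the page was written):
//
//   RedBlackTreeStringReader dec =
//       new RedBlackTreeStringReader(0, -1, Algorithm.NONE);
//   dec.reset(pageBytes, 0, pageBytes.length);
//   while (dec.hashNextChunk()) {
//     Chunk chunk = dec.nextChunk();   // a MultiChunk over the whole page
//     // consume chunk ...
//   }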
@Override
public ValPair begin() throws IOException {
pair.data = page;
pair.offset = offset + 3 * Bytes.SIZEOF_INT;
pair.length = valueLen == -1 ?
Bytes.toInt(page, offset + indexOffset) - 3 * Bytes.SIZEOF_INT : valueLen;
pair.pos = startPos;
return pair;
}
@Override
public int beginPos() {
return startPos;
}
@Override
public ValPair end() throws IOException {
if (numPairs == 1)
return begin();
else {
pair.data = page;
pair.pos = startPos + numPairs - 1;
if (valueLen == -1) {
int lastPairOffset = Bytes.toInt(page, offset + indexOffset + (numPairs - 2) * Bytes.SIZEOF_INT);
pair.offset = offset + lastPairOffset;
pair.length = indexOffset - lastPairOffset;
} else {
pair.offset = offset + 3 * Bytes.SIZEOF_INT + (numPairs - 1) * valueLen;
pair.length = valueLen;
}
return pair;
}
}
@Override
public int endPos() {
return startPos + numPairs - 1;
}
@Override
public byte[] getBuffer() {
return page;
}
@Override
public int getBufferLen() {
return decompressedSize;
}
@Override
public int getNumPairs() {
return numPairs;
}
@Override
public boolean hashNextChunk() {
return curIdx < 1;
}
@Override
public Chunk nextChunk() throws IOException {
if (curIdx >= 1) return null;
if (LOG.isDebugEnabled()) {
LOG.debug("nextChunk: page size " + page.length);
}
mvChunk.setBuffer(page, offset + 3 * Bytes.SIZEOF_INT,
offset + indexOffset, numPairs, startPos);
curIdx++;
return mvChunk;
}
@Override
public Chunk getChunkByPosition(int position) throws IOException {
if (position < startPos || position >= startPos + numPairs)
return null;
if (shadowChunk == null)
shadowChunk = new MultiChunk(0, true, true, valueLen);
if (LOG.isDebugEnabled()) {
LOG.debug("getChunkByPosition: page size " + page.length);
}
shadowChunk.setBuffer(page, offset + 3 * Bytes.SIZEOF_INT,
offset + indexOffset, numPairs, startPos);
return shadowChunk;
}
@Override
public void reset() {
curIdx = 1;
}
@Override
public void reset(byte[] buffer, int offset, int length) throws IOException {
this.offset = offset;
compressedSize = length;
bb = ByteBuffer.wrap(buffer, offset, length);
decompressedSize = bb.getInt();
numPairs = bb.getInt();
startPos = bb.getInt();
curIdx = 0;
indexOffset = valueLen == -1 ? decompressedSize - numPairs * Bytes.SIZEOF_INT : -1;
if (compressAlgo == null || compressAlgo == Algorithm.NONE) {
inBuf.reset(buffer, offset, length);
page = ensureDecompressed();
if (LOG.isDebugEnabled()) {
LOG.debug("reset: page size " + page.length);
}
} else {
decompressedSize = bb.getInt();
inBuf.reset(buffer, offset + 4 * Bytes.SIZEOF_INT, length - 4 * Bytes.SIZEOF_INT);
ensureDecompress();
page = CompressensureDecompressed();
}
}
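// Page header layout consumed by reset() above (inferred from the reads):
//
//   plain page:      [decompressedSize:int][numPairs:int][startPos:int][payload...]
//   compressed page: [size:int][numPairs:int][startPos:int][decompressedSize:int]
//                    [compressed payload...]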
/** Decompresses the payload in {@code inBuf} into a buffer of
* {@code decompressedSize} bytes and points {@code inBuf} at it. */
public void ensureDecompress() throws IOException {
org.apache.hadoop.io.compress.Decompressor decompressor = this.compressAlgo.getDecompressor();
InputStream is = this.compressAlgo.createDecompressionStream(inBuf, decompressor, 0);
ByteBuffer buf = ByteBuffer.allocate(decompressedSize);
IOUtils.readFully(is, buf.array(), 0, buf.capacity());
is.close();
this.compressAlgo.returnDecompressor(decompressor);
// The decompressed bytes start at index 0 of the fresh buffer, not at the
// page's offset into the original compressed buffer.
inBuf.reset(buf.array(), 0, buf.capacity());
}
/** Decodes a DeltaByteArray-encoded page into the in-memory page layout:
* a 12-byte header, the writeUTF-encoded values, then the offset index. */
public byte[] CompressensureDecompressed() throws IOException {
FlexibleEncoding.Parquet.DeltaByteArrayReader reader = new FlexibleEncoding.Parquet.DeltaByteArrayReader();
DataOutputBuffer transfer = new DataOutputBuffer();
transfer.write(inBuf.getData(), 0, inBuf.getLength());
byte[] data = transfer.getData();
transfer.close();
inBuf.close();
Binary[] bin = new Utils().readData(reader, data, numPairs);
if (LOG.isDebugEnabled()) {
LOG.debug("decode: " + data.length + " input bytes, " + bin.length + " values");
}
DataOutputBuffer decoding = new DataOutputBuffer();
DataOutputBuffer offset = new DataOutputBuffer();
decoding.writeInt(decompressedSize);
decoding.writeInt(numPairs);
decoding.writeInt(startPos);
int dataoffset = 12;
String str;
for (int i = 0; i < numPairs; i++) {
str = bin[i].toStringUsingUTF8();
decoding.writeUTF(str);
dataoffset = decoding.size();
offset.writeInt(dataoffset);
}
if (LOG.isDebugEnabled()) {
LOG.debug("decode: values " + decoding.size() + " bytes, index " + offset.size() + " bytes");
}
// Append the offset index after the values.
decoding.write(offset.getData(), 0, offset.size());
offset.close();
return decoding.getData();
}
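// A minimal sketch of how value i could be read back from the decoded page
// returned above (illustrative; 'decoded' and 'i' are hypothetical, and
// indexOffset is the field set in reset()):
//
//   int start = (i == 0) ? 12
//       : Bytes.toInt(decoded, indexOffset + (i - 1) * Bytes.SIZEOF_INT);
//   DataInputBuffer in = new DataInputBuffer();
//   in.reset(decoded, start, decoded.length - start);
//   String value = in.readUTF();   // pairs with the writeUTF() calls above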
@Override
public byte[] ensureDecompressed() throws IOException {
// Strip the 12-byte page header; the remainder holds three length-prefixed
// ORC streams: DICTIONARY_DATA, LENGTH, and DATA.
DataOutputBuffer transfer = new DataOutputBuffer();
transfer.write(inBuf.getData(), 12, inBuf.getLength() - 12);
DataInputBuffer dib = new DataInputBuffer();
dib.reset(transfer.getData(), 0, transfer.getLength());
int dictionarySize = dib.readInt();
int length1 = dib.readInt();
byte[] data = transfer.getData();
transfer.close();
// The stream area starts after the two 4-byte ints just read
// (the original used Integer.SIZE, which counts bits, not bytes).
int base = 2 * Bytes.SIZEOF_INT;
ByteBuffer inBuf1 = ByteBuffer.allocate(length1);
inBuf1.put(data, base, length1);
inBuf1.flip();
InStream in = InStream.create("test1", inBuf1, null, dictionarySize);
if (in.available() > 0) {
dictionaryBuffer = new DynamicByteArray(64, in.available());
dictionaryBuffer.readAll(in);
in.close();
// Read the run-length encoded dictionary entry lengths.
dib.reset(data, base + length1, Bytes.SIZEOF_INT);
int length2 = dib.readInt();
ByteBuffer inBuf2 = ByteBuffer.allocate(length2);
inBuf2.put(data, base + length1 + Bytes.SIZEOF_INT, length2);
inBuf2.flip();
in = InStream.create("test2", inBuf2, null, dictionarySize);
IntegerReader lenReader = createIntegerReader(OrcProto.ColumnEncoding.Kind.DIRECT_V2, in, false);
// Turn the entry lengths into cumulative offsets into dictionaryBuffer.
int offset = 0;
dictionaryOffsets = new int[dictionarySize + 1];
for (int i = 0; i < dictionarySize; ++i) {
dictionaryOffsets[i] = offset;
offset += (int) lenReader.next();
}
dictionaryOffsets[dictionarySize] = offset;
in.close();
// Read the per-row dictionary entry ids.
dib.reset(data, base + length1 + Bytes.SIZEOF_INT + length2, Bytes.SIZEOF_INT);
int length3 = dib.readInt();
ByteBuffer inBuf3 = ByteBuffer.allocate(length3);
inBuf3.put(data, base + length1 + 2 * Bytes.SIZEOF_INT + length2, length3);
inBuf3.flip();
in = InStream.create("test3", inBuf3, null, dictionarySize);
reader = createIntegerReader(OrcProto.ColumnEncoding.Kind.DIRECT_V2,
in, false);
}
inBuf.close();
// Materialize the values into the in-memory page layout: a 12-byte header,
// the writeUTF-encoded values, then the offset index.
DataOutputBuffer decoding = new DataOutputBuffer();
DataOutputBuffer offsets = new DataOutputBuffer();
decoding.writeInt(decompressedSize);
decoding.writeInt(numPairs);
decoding.writeInt(startPos);
int dataoffset = 12;
String str;
for (int i = 0; i < numPairs; i++) {
str = readEachValue(null);
decoding.writeUTF(str);
dataoffset = decoding.size();
offsets.writeInt(dataoffset);
}
if (LOG.isDebugEnabled()) {
LOG.debug("decode: values " + decoding.size() + " bytes, index " + offsets.size()
+ " bytes, last offset " + dataoffset);
}
decoding.write(offsets.getData(), 0, offsets.size());
offsets.close();
dib.close();
return decoding.getData();
}
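// Serialized stream area consumed by ensureDecompressed() above (inferred
// from the reads; each length is a 4-byte int):
//
//   [dictionarySize][length1][DICTIONARY_DATA bytes]
//   [length2][LENGTH stream: RLE v2 entry lengths]
//   [length3][DATA stream: RLE v2 per-row entry ids]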
@Override
public boolean skipToPos(int pos) {
return pos >= startPos && pos < startPos + numPairs;
}
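/** Reads the next entry id from the DATA stream and returns the matching
* dictionary string, reusing {@code previous} as scratch when non-null. */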
public String readEachValue(Text previous) throws IOException{
Text result = null;
int entry = (int) reader.next();
if (previous == null) {
result = new Text();
} else {
result = (Text) previous;
}
int offset = dictionaryOffsets[entry];
int length;
if (entry < dictionaryOffsets.length - 1) {
length = dictionaryOffsets[entry + 1] - offset;
} else {
length = dictionaryBuffer.size() - offset;
}
// If the column contains only empty strings, the dictionary size is zero
// and the buffer will be null; in that case just return the result, which
// defaults to the empty string.
if (dictionaryBuffer != null) {
dictionaryBuffer.setText(result, offset, length);
} else {
result.clear();
}
return result.toString();
}
public IntegerReader createIntegerReader(OrcProto.ColumnEncoding.Kind kind,
InStream in,
boolean signed) throws IOException {
switch (kind) {
case DIRECT_V2:
case DICTIONARY_V2:
return new RunLengthIntegerReaderV2(in, signed);
case DIRECT:
case DICTIONARY:
return new RunLengthIntegerReader(in, signed);
default:
throw new IllegalArgumentException("Unknown encoding " + kind);
}
}
}