/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cn.ac.ncic.mastiff.io.coding;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;

import FlexibleEncoding.ORC.DynamicByteArray;
import FlexibleEncoding.ORC.InStream;
import FlexibleEncoding.ORC.IntegerReader;
import FlexibleEncoding.ORC.OrcProto;
import FlexibleEncoding.ORC.RunLengthIntegerReader;
import FlexibleEncoding.ORC.RunLengthIntegerReaderV2;
import FlexibleEncoding.ORC.StreamName;
import FlexibleEncoding.Parquet.Binary;
import FlexibleEncoding.Parquet.Utils;

import cn.ac.ncic.mastiff.Chunk;
import cn.ac.ncic.mastiff.ValPair;
import cn.ac.ncic.mastiff.io.MultiChunk;
import cn.ac.ncic.mastiff.io.coding.Compression.Algorithm;
import cn.ac.ncic.mastiff.utils.Bytes;

/**
 * A {@link Decoder} for string pages. An uncompressed page is decoded from
 * ORC dictionary streams; a compressed page is decompressed first and then
 * decoded from the Parquet delta-byte-array representation.
 */
public class RedBlackTreeStringReader implements Decoder {

  static final Log LOG = LogFactory.getLog(RedBlackTreeStringReader.class);

  ValPair pair = new ValPair();
  int valueLen;
  DynamicByteArray dictionaryBuffer;
  MultiChunk mvChunk;
  MultiChunk shadowChunk = null;
  int[] dictionaryOffsets;

  /** Compressed data */
  byte[] page = null;
  int offset;
  int compressedSize;
  int decompressedSize;
  private IntegerReader reader;

  /** Statistics of a page */
  int numPairs;
  int startPos;
  ByteBuffer bb;

  /** Offset of the index area if the cluster is var-length */
  int indexOffset;

  // used for iteration
  int curIdx = 1;

  Algorithm compressAlgo;
  DataInputBuffer inBuf = new DataInputBuffer();

  /**
   * @param sortedCol which column is sorted
   * @param valueLen_ the length of each value, or -1 for var-length values
   * @param algorithm which compression algorithm was used to compress the data
   */
  public RedBlackTreeStringReader(int sortedCol, int valueLen_, Algorithm algorithm) {
    valueLen = valueLen_;
    compressAlgo = algorithm;
    mvChunk = new MultiChunk(sortedCol, true, true, valueLen_);
  }
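  /*
   * Layout of a decoded page, as produced by ensureDecompressed() and
   * CompressensureDecompressed() below (a summary of the code, not a
   * separate spec):
   *
   *   int decompressedSize | int numPairs | int startPos
   *   | numPairs values, each written with DataOutput.writeUTF
   *     (i.e. preceded by a 2-byte length)
   *   | var-length pages only: numPairs ints at indexOffset, the end
   *     offset of each value relative to the start of the page
   */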
  @Override
  public ValPair begin() throws IOException {
    pair.data = page;
    pair.offset = offset + 3 * Bytes.SIZEOF_INT;
    pair.length = valueLen == -1
        ? Bytes.toInt(page, offset + indexOffset) - 3 * Bytes.SIZEOF_INT
        : valueLen;
    pair.pos = startPos;
    return pair;
  }

  @Override
  public int beginPos() {
    return startPos;
  }

  @Override
  public ValPair end() throws IOException {
    if (numPairs == 1)
      return begin();
    pair.data = page;
    pair.pos = startPos + numPairs - 1;
    if (valueLen == -1) {
      int lastPairOffset =
          Bytes.toInt(page, offset + indexOffset + (numPairs - 2) * Bytes.SIZEOF_INT);
      pair.offset = offset + lastPairOffset;
      pair.length = indexOffset - lastPairOffset;
    } else {
      pair.offset = offset + 3 * Bytes.SIZEOF_INT + (numPairs - 1) * valueLen;
      pair.length = valueLen;
    }
    return pair;
  }

  @Override
  public int endPos() {
    return startPos + numPairs - 1;
  }

  @Override
  public byte[] getBuffer() {
    return page;
  }

  @Override
  public int getBufferLen() {
    return decompressedSize;
  }

  @Override
  public int getNumPairs() {
    return numPairs;
  }

  @Override
  public boolean hashNextChunk() {
    return curIdx < 1;
  }

  @Override
  public Chunk nextChunk() throws IOException {
    if (curIdx >= 1)
      return null;
    LOG.debug("nextChunk: page size " + page.length);
    mvChunk.setBuffer(page, offset + 3 * Bytes.SIZEOF_INT, offset + indexOffset,
        numPairs, startPos);
    curIdx++;
    return mvChunk;
  }

  @Override
  public Chunk getChunkByPosition(int position) throws IOException {
    if (position < startPos || position >= startPos + numPairs)
      return null;
    if (shadowChunk == null)
      shadowChunk = new MultiChunk(0, true, true, valueLen);
    LOG.debug("getChunkByPosition: page size " + page.length);
    shadowChunk.setBuffer(page, offset + 3 * Bytes.SIZEOF_INT, offset + indexOffset,
        numPairs, startPos);
    return shadowChunk;
  }

  @Override
  public void reset() {
    curIdx = 1;
  }

  @Override
  public void reset(byte[] buffer, int offset, int length) throws IOException {
    this.offset = offset;
    compressedSize = length;
    bb = ByteBuffer.wrap(buffer, offset, length);
    decompressedSize = bb.getInt();
    numPairs = bb.getInt();
    startPos = bb.getInt();
    curIdx = 0;
    indexOffset = valueLen == -1
        ? decompressedSize - numPairs * Bytes.SIZEOF_INT
        : -1;
    if (compressAlgo == null || Algorithm.NONE == compressAlgo) {
      // Uncompressed page: decode the ORC dictionary representation directly.
      inBuf.reset(buffer, offset, length);
      page = ensureDecompressed();
      LOG.debug("reset: page size " + page.length);
    } else {
      // Compressed page: decompress first, then decode the Parquet
      // delta-byte-array representation.
      decompressedSize = bb.getInt();
      inBuf.reset(buffer, offset + 4 * Bytes.SIZEOF_INT, length - 4 * Bytes.SIZEOF_INT);
      ensureDecompress();
      page = CompressensureDecompressed();
    }
  }
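  /*
   * A minimal usage sketch. `pageBytes`, the column index 0, and the
   * var-length value length -1 are assumptions for illustration, not values
   * prescribed by this class:
   *
   *   RedBlackTreeStringReader dec =
   *       new RedBlackTreeStringReader(0, -1, Algorithm.NONE);
   *   dec.reset(pageBytes, 0, pageBytes.length); // decodes the page
   *   ValPair first = dec.begin();               // first value of the page
   *   ValPair last = dec.end();                  // last value of the page
   *   Chunk all = dec.nextChunk();               // the whole decoded chunk
   */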
  public void ensureDecompress() throws IOException {
    org.apache.hadoop.io.compress.Decompressor decompressor =
        this.compressAlgo.getDecompressor();
    InputStream is =
        this.compressAlgo.createDecompressionStream(inBuf, decompressor, 0);
    ByteBuffer buf = ByteBuffer.allocate(decompressedSize);
    IOUtils.readFully(is, buf.array(), 0, buf.capacity());
    is.close();
    this.compressAlgo.returnDecompressor(decompressor);
    inBuf.reset(buf.array(), offset, buf.capacity());
  }

  public byte[] CompressensureDecompressed() throws IOException {
    FlexibleEncoding.Parquet.DeltaByteArrayReader reader =
        new FlexibleEncoding.Parquet.DeltaByteArrayReader();
    DataOutputBuffer transfer = new DataOutputBuffer();
    transfer.write(inBuf.getData(), 0, inBuf.getLength());
    byte[] data = transfer.getData();
    LOG.debug("CompressensureDecompressed: data " + data.length
        + " numPairs " + numPairs);
    inBuf.close();
    // Decode the delta-byte-array encoded values.
    Binary[] bin = new Utils().readData(reader, data, numPairs);
    LOG.debug("CompressensureDecompressed: bin " + bin.length);
    // Rebuild the decoded page: header, values, then the offset index.
    DataOutputBuffer decoding = new DataOutputBuffer();
    DataOutputBuffer offsets = new DataOutputBuffer();
    decoding.writeInt(decompressedSize);
    decoding.writeInt(numPairs);
    decoding.writeInt(startPos);
    int dataoffset = 12;
    String str;
    for (int i = 0; i < numPairs; i++) {
      str = bin[i].toStringUsingUTF8();
      decoding.writeUTF(str);
      dataoffset = decoding.size();
      offsets.writeInt(dataoffset);
    }
    LOG.debug("CompressensureDecompressed: offsets.size() " + offsets.size()
        + " decoding.size() " + decoding.size() + " dataoffset " + dataoffset);
    decoding.write(offsets.getData(), 0, offsets.size());
    inBuf.close();
    offsets.close();
    return decoding.getData();
  }
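  /*
   * Input layout consumed by ensureDecompressed(), after the 12-byte page
   * header is skipped (three length-prefixed ORC streams, as read by the
   * offsets below):
   *
   *   int dictionarySize | int length1 | DICTIONARY_DATA bytes (length1)
   *   | int length2 | LENGTH stream (length2, RLE v2 dictionary entry lengths)
   *   | int length3 | DATA stream (length3, RLE v2 dictionary entry ids)
   */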
  @Override
  public byte[] ensureDecompressed() throws IOException {
    DataOutputBuffer transfer = new DataOutputBuffer();
    // Skip the 12-byte page header (decompressedSize, numPairs, startPos).
    transfer.write(inBuf.getData(), 12, inBuf.getLength() - 12);
    DataInputBuffer dib = new DataInputBuffer();
    dib.reset(transfer.getData(), 0, transfer.getLength());
    int dictionarySize = dib.readInt();
    int length1 = dib.readInt();
    byte[] data = transfer.getData();
    transfer.close();
    // Skip the two 4-byte ints (dictionarySize, length1) just read.
    dib.reset(data, 2 * Bytes.SIZEOF_INT, length1);
    StreamName name = new StreamName(0, OrcProto.Stream.Kind.DICTIONARY_DATA);
    ByteBuffer inBuf1 = ByteBuffer.allocate(length1);
    inBuf1.put(dib.getData(), 0, dib.getLength());
    inBuf1.flip();
    InStream in = InStream.create("test1", inBuf1, null, dictionarySize);
    if (in.available() > 0) {
      dictionaryBuffer = new DynamicByteArray(64, in.available());
      dictionaryBuffer.readAll(in);
      in.close();
      // Read the lengths (Google protocol buffer).
      name = new StreamName(1, OrcProto.Stream.Kind.LENGTH);
      dib.reset(data, 4 + 4 + length1, 4);
      int length2 = dib.readInt();
      dib.reset(data, 4 + 4 + length1 + 4, length2);
      ByteBuffer inBuf2 = ByteBuffer.allocate(length2);
      inBuf2.put(dib.getData(), 0, length2);
      inBuf2.flip();
      in = InStream.create("test2", inBuf2, null, dictionarySize);
      IntegerReader lenReader =
          createIntegerReader(OrcProto.ColumnEncoding.Kind.DIRECT_V2, in, false);
      // Turn the run of lengths into cumulative dictionary offsets.
      int offset = 0;
      dictionaryOffsets = new int[dictionarySize + 1];
      for (int i = 0; i < dictionarySize; ++i) {
        dictionaryOffsets[i] = offset;
        offset += (int) lenReader.next();
      }
      dictionaryOffsets[dictionarySize] = offset;
      in.close();
      name = new StreamName(2, OrcProto.Stream.Kind.DATA);
      dib.reset(data, 4 + 4 + length1 + 4 + length2, 4);
      int length3 = dib.readInt();
      dib.reset(data, 4 + 4 + length1 + 4 + length2 + 4, length3);
      ByteBuffer inBuf3 = ByteBuffer.allocate(length3);
      inBuf3.put(dib.getData(), 0, length3);
      inBuf3.flip();
      in = InStream.create("test3", inBuf3, null, dictionarySize);
      reader = createIntegerReader(OrcProto.ColumnEncoding.Kind.DIRECT_V2, in, false);
    }
    inBuf.close();
    // Rebuild the decoded page: header, values, then the offset index.
    DataOutputBuffer decoding = new DataOutputBuffer();
    DataOutputBuffer offsets = new DataOutputBuffer();
    decoding.writeInt(decompressedSize);
    decoding.writeInt(numPairs);
    decoding.writeInt(startPos);
    int dataoffset = 12;
    String str;
    for (int i = 0; i < numPairs; i++) {
      str = readEachValue(null);
      decoding.writeUTF(str);
      dataoffset = decoding.size();
      offsets.writeInt(dataoffset);
    }
    LOG.debug("ensureDecompressed: offsets.size() " + offsets.size()
        + " decoding.size() " + decoding.size() + " dataoffset " + dataoffset);
    decoding.write(offsets.getData(), 0, offsets.size());
    inBuf.close();
    offsets.close();
    dib.close();
    inBuf1.clear();
    return decoding.getData();
  }

  @Override
  public boolean skipToPos(int pos) {
    return pos >= startPos && pos < startPos + numPairs;
  }

  public String readEachValue(Text previous) throws IOException {
    Text result;
    int entry = (int) reader.next();
    if (previous == null) {
      result = new Text();
    } else {
      result = (Text) previous;
    }
    int offset = dictionaryOffsets[entry];
    int length;
    if (entry < dictionaryOffsets.length - 1) {
      length = dictionaryOffsets[entry + 1] - offset;
    } else {
      length = dictionaryBuffer.size() - offset;
    }
    // If the column is just empty strings, the size will be zero, so the
    // buffer will be null; in that case just return result as it will
    // default to empty.
    if (dictionaryBuffer != null) {
      dictionaryBuffer.setText(result, offset, length);
    } else {
      result.clear();
    }
    return result.toString();
  }

  public IntegerReader createIntegerReader(OrcProto.ColumnEncoding.Kind kind,
      InStream in, boolean signed) throws IOException {
    switch (kind) {
    case DIRECT_V2:
    case DICTIONARY_V2:
      return new RunLengthIntegerReaderV2(in, signed);
    case DIRECT:
    case DICTIONARY:
      return new RunLengthIntegerReader(in, signed);
    default:
      throw new IllegalArgumentException("Unknown encoding " + kind);
    }
  }
}