package org.apache.hadoop.hive.mastiffFlexibleEncoding.parquet;
/*
* adapt from parquet
*
*/
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
/**
* functionality of ByteArrayOutputStream without the memory and copy overhead
*
* It will linearly create a new slab of the initial size when needed (instead of creating a new buffer and copying the data).
* After 10 slabs their size will increase exponentially (similar to {@link ByteArrayOutputStream} behavior) by making the new slab size the size of the existing data.
*
* When reusing a buffer it will adjust the slab size based on the previous data size ({@link CapacityByteArrayOutputStream#reset()})
*
* @author Julien Le Dem
*
*/
public class CapacityByteArrayOutputStream extends OutputStream {
private static final Log LOG = Log.getLog(CapacityByteArrayOutputStream.class);
private static final int MINIMUM_SLAB_SIZE = 64 * 1024;
private static final int EXPONENTIAL_SLAB_SIZE_THRESHOLD = 10;
private int slabSize;
private List<byte[]> slabs = new ArrayList<byte[]>();
private byte[] currentSlab;
private int capacity;
private int currentSlabIndex;
private int currentSlabPosition;
private int size;
/**
* @param initialSize the initialSize of the buffer (also slab size)
*/
public CapacityByteArrayOutputStream(int initialSize) {
Preconditions.checkArgument(initialSize > 0, "initialSize must be > 0");
initSlabs(initialSize);
}
private void initSlabs(int initialSize) {
if (Log.DEBUG) LOG.debug(String.format("initial slab of size %d", initialSize));
this.slabSize = initialSize;
this.slabs.clear();
this.capacity = initialSize;
this.currentSlab = new byte[slabSize];
this.slabs.add(currentSlab);
this.currentSlabIndex = 0;
this.currentSlabPosition = 0;
this.size = 0;
}
private void addSlab(int minimumSize) {
this.currentSlabIndex += 1;
if (currentSlabIndex < this.slabs.size()) {
// reuse existing slab
this.currentSlab = this.slabs.get(currentSlabIndex);
if (Log.DEBUG) LOG.debug(String.format("reusing slab of size %d", currentSlab.length));
if (currentSlab.length < minimumSize) {
if (Log.DEBUG) LOG.debug(String.format("slab size %,d too small for value of size %,d. replacing slab", currentSlab.length, minimumSize));
byte[] newSlab = new byte[minimumSize];
capacity += minimumSize - currentSlab.length;
this.currentSlab = newSlab;
this.slabs.set(currentSlabIndex, newSlab);
}
} else {
if (currentSlabIndex > EXPONENTIAL_SLAB_SIZE_THRESHOLD) {
// make slabs bigger in case we are creating too many of them
// double slab size every time.
this.slabSize = size;
if (Log.DEBUG) LOG.debug(String.format("used %d slabs, new slab size %d", currentSlabIndex, slabSize));
}
if (slabSize < minimumSize) {
if (Log.DEBUG) LOG.debug(String.format("slab size %,d too small for value of size %,d. Bumping up slab size", slabSize, minimumSize));
this.slabSize = minimumSize;
}
if (Log.DEBUG) LOG.debug(String.format("new slab of size %d", slabSize));
this.currentSlab = new byte[slabSize];
this.slabs.add(currentSlab);
this.capacity += slabSize;
}
this.currentSlabPosition = 0;
}
@Override
public void write(int b) {
if (currentSlabPosition == currentSlab.length) {
addSlab(1);
}
currentSlab[currentSlabPosition] = (byte) b;
currentSlabPosition += 1;
size += 1;
}
@Override
public void write(byte b[], int off, int len) {
if ((off < 0) || (off > b.length) || (len < 0) ||
((off + len) - b.length > 0)) {
throw new IndexOutOfBoundsException();
}
if (currentSlabPosition + len >= currentSlab.length) {
final int length1 = currentSlab.length - currentSlabPosition;
System.arraycopy(b, off, currentSlab, currentSlabPosition, length1);
final int length2 = len - length1;
addSlab(length2);
System.arraycopy(b, off + length1, currentSlab, currentSlabPosition, length2);
currentSlabPosition = length2;
} else {
System.arraycopy(b, off, currentSlab, currentSlabPosition, len);
currentSlabPosition += len;
}
size += len;
}
/**
* Writes the complete contents of this buffer to the specified output stream argument. the output
* stream's write method <code>out.write(slab, 0, slab.length)</code>) will be called once per slab.
*
* @param out the output stream to which to write the data.
* @exception IOException if an I/O error occurs.
*/
public void writeTo(OutputStream out) throws IOException {
for (int i = 0; i < currentSlabIndex; i++) {
final byte[] slab = slabs.get(i);
out.write(slab, 0, slab.length);
}
// System.out.println(out.getClass());
out.write(currentSlab, 0, currentSlabPosition);
// File file= new File("/home/wangmeng/file");
// if(file.exists()){
// f
// }
//added by me
// FileOutputStream fos;
// File file=new File("/home/wangmeng/encodingFile");
// if(file.exists()){
// fos =new FileOutputStream(new File("/home/wangmeng/dictionaryFile"));
// }
// else{
// fos =new FileOutputStream(file);
// }
// //added by me
// ((ByteArrayOutputStream) out).writeTo(fos);
// fos.close();
// //added by me
// out.close();
}
/**
* @return the size of the allocated buffer
*/
public int getCapacity() {
return capacity;
}
/**
* When re-using an instance with reset, it will adjust slab size based on previous data size.
* The intent is to reuse the same instance for the same type of data (for example, the same column).
* The assumption is that the size in the buffer will be consistent. Otherwise we fall back to exponentialy double the slab size.
* <ul>
* <li>if we used less than half of the first slab (and it is above the minimum slab size), we will make the slab size smaller.
* <li>if we used more than the slab count threshold (10), we will re-adjust the slab size.
* </ul>
* if re-adjusting the slab size we will make it 1/5th of the previous used size in the buffer so that overhead of extra memory allocation is about 20%
* If we used less than the available slabs we free up the unused ones to reduce memory overhead.
*/
public void reset() {
// heuristics to adjust slab size
if (
// if we have only one slab, make sure it is not way too big (more than twice what we need). Except if the slab is already small
(currentSlabIndex == 0 && currentSlabPosition < currentSlab.length / 2 && currentSlab.length > MINIMUM_SLAB_SIZE)
||
// we want to avoid generating too many slabs.
(currentSlabIndex > EXPONENTIAL_SLAB_SIZE_THRESHOLD)
){
// readjust slab size
initSlabs(Math.max(size / 5, MINIMUM_SLAB_SIZE)); // should make overhead to about 20% without incurring many slabs
if (Log.DEBUG) LOG.debug(String.format("used %d slabs, new slab size %d", currentSlabIndex + 1, slabSize));
} else if (currentSlabIndex < slabs.size() - 1) {
// free up the slabs that we are not using. We want to minimize overhead
this.slabs = new ArrayList<byte[]>(slabs.subList(0, currentSlabIndex + 1));
this.capacity = 0;
for (byte[] slab : slabs) {
capacity += slab.length;
}
}
this.currentSlabIndex = 0;
this.currentSlabPosition = 0;
this.currentSlab = slabs.get(currentSlabIndex);
this.size = 0;
}
/**
* @return the size of the buffered data
*/
public long size() {
return size;
}
/**
* @return the index of the last value being written to this stream, which
* can be passed to {@link #setByte(long, byte)} in order to change it
*/
public long getCurrentIndex() {
Preconditions.checkArgument(size > 0, "This is an empty stream");
return size - 1;
}
/**
* Replace the byte stored at position index in this stream with value
*
* @param index which byte to replace
* @param value the value to replace it with
*/
public void setByte(long index, byte value) {
Preconditions.checkArgument(index < size,
"Index: " + index + " is >= the current size of: " + size);
long seen = 0;
for (int i = 0; i <=currentSlabIndex; i++) {
byte[] slab = slabs.get(i);
if (index < seen + slab.length) {
// ok found index
slab[(int)(index-seen)] = value;
break;
}
seen += slab.length;
}
}
/**
* @param prefix a prefix to be used for every new line in the string
* @return a text representation of the memory usage of this structure
*/
public String memUsageString(String prefix) {
return String.format("%s %s %d slabs, %,d bytes", prefix, getClass().getSimpleName(), slabs.size(), getCapacity());
}
/**
* @return the total count of allocated slabs
*/
int getSlabCount() {
return slabs.size();
}
}