package org.apache.hadoop.hive.mastiff;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryUsage;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.mastiff.SerializeUtil.PageId;
import org.apache.hadoop.hive.mastiffFlexibleEncoding.orc.OutStream;
import org.apache.hadoop.hive.mastiffFlexibleEncoding.orc.RunLengthByteWriter;
import org.apache.hadoop.hive.mastiffFlexibleEncoding.orc.RunLengthIntegerWriter;
import org.apache.hadoop.hive.mastiffFlexibleEncoding.orc.TestInStream;
import org.apache.hadoop.hive.mastiffFlexibleEncoding.parquet.Binary;
import org.apache.hadoop.hive.mastiffFlexibleEncoding.parquet.BytesInput;
import org.apache.hadoop.hive.mastiffFlexibleEncoding.parquet.DeltaBinaryPackingValuesWriter;
import org.apache.hadoop.hive.mastiffFlexibleEncoding.parquet.DeltaByteArrayWriter;
import org.apache.hadoop.hive.mastiffFlexibleEncoding.parquet.OnlyDictionaryValuesWriter.PlainBinaryDictionaryValuesWriter;
import org.apache.hadoop.hive.mastiffFlexibleEncoding.parquet.OnlyDictionaryValuesWriter.PlainIntegerDictionaryValuesWriter;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import FlexibleEncoding.ORC.DynamicIntArray;
import cn.ac.ncic.mastiff.PosChunk;
import cn.ac.ncic.mastiff.ValPair;
import cn.ac.ncic.mastiff.etl.ETLUtils;
import cn.ac.ncic.mastiff.hive.serde.lazy.ClusterAccessor;
import cn.ac.ncic.mastiff.hive.serde.lazy.ClusterAccessor.DataType;
import cn.ac.ncic.mastiff.hive.serde.lazy.Row;
import cn.ac.ncic.mastiff.io.PosRLEChunk;
import cn.ac.ncic.mastiff.io.coding.Compression.Algorithm;
import cn.ac.ncic.mastiff.io.coding.EnDecode;
import cn.ac.ncic.mastiff.io.coding.Encoder;
import cn.ac.ncic.mastiff.io.coding.Encoder.CodingType;
import cn.ac.ncic.mastiff.io.coding.ORCStringEcnodingUtil;
import cn.ac.ncic.mastiff.io.segmentfile.BlockCache;
import cn.ac.ncic.mastiff.io.segmentfile.LruBlockCache;
import cn.ac.ncic.mastiff.io.segmentfile.PageCache;
import cn.ac.ncic.mastiff.io.segmentfile.PageMeta;
import cn.ac.ncic.mastiff.io.segmentfile.PageMeta.ScanMode;
import cn.ac.ncic.mastiff.io.segmentfile.PageMetaList;
import cn.ac.ncic.mastiff.io.segmentfile.PageMetaSection;
import cn.ac.ncic.mastiff.io.segmentfile.SimplePageCache;
import cn.ac.ncic.mastiff.mapred.MastiffMapReduce;
import cn.ac.ncic.mastiff.operators.ExprDesc;
import cn.ac.ncic.mastiff.operators.Predicate;
import cn.ac.ncic.mastiff.utils.Bytes;
import cn.ac.ncic.mastiff.utils.Utils;
/**
* File format for Mastiff.
*
* Copied from {@link cn.ac.ncic.mastiff.io.segmentfile.SegmentFile}
*/
public class SegmentFile {
static final Log LOG = LogFactory.getLog(SegmentFile.class);
public static final String SEGFILE_CACHE_SIZE_KEY = "mastiff.pagecache.size";
private static BlockCache globalPageCache = null;
/**
* SegmentFile Writer.
*/
public static class Writer implements Closeable {
public class TabConfig {
private String[] str = null;
private DeltaByteArrayWriter[] deltaByteArrayStringWriter = null;
private DynamicIntArray[] dynamicIntArray = null;
private PlainBinaryDictionaryValuesWriter[] dictionaryBitPackingRLEZigZarByte = null;
private PlainIntegerDictionaryValuesWriter[] dictionaryBitPackingRLEZigZarInt = null;
private ORCStringEcnodingUtil[] oRCStringEcnodingUtil = null;
private TestInStream.OutputCollector[] collect = null;
private DeltaBinaryPackingValuesWriter[] deltaBianryBitPackingInt = null;
private DeltaByteArrayWriter[] deltaByteArrayWriter = null;
private RunLengthIntegerWriter[] runLengthInteger = null;
private RunLengthByteWriter[] runLengthByte = null;
private boolean[] pageInit, firstNumber;
private int[] pagesizes;
private Algorithm[] algorithms;
public boolean isInit = false, isFirst = true;
private DataType[][] originalTableSchema;
private List<List<DataType>> originalSchema;
private int numFields;
private int[] cluster_pages;
private List<List<DataType>> clusterSchema;
private int numClusters;
private ClusterAccessor[] accessors;
private ClusterAccessor[] backups;
private Row[] rows, prevRows, maxs, mins, segMaxs, segMins;
private Encoder[] compressors;
private CodingType[] codings;
private ValPair[] vps;
private ValPair[] backupVps;
private PageMeta[] pms;
private int[] pageIds;
private long pageIdCount = 0;
private long sgementSize = 0;
private int[] startPoss, numReps;
private PageId segId;
private final int count = 0;
private ArrayList<List<BytesWritable>> clusterValue = null;
private final BytesWritable outValue = new BytesWritable();
private final DataOutputBuffer out = new DataOutputBuffer();
private final long SegmentSize = 536870912 - SerializeUtil.desc.clusterTypes.size() * 131072;
private final int[] tmpLength = new int[1];
private int[][] columnsMapping;
private final DataInputBuffer in = new DataInputBuffer();
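/**
 * Validate the cluster descriptors against the table schema, replace DATE
 * columns with LONG for storage, size the per-cluster pages, and hand off
 * to {@link #preConfigure} to build the per-cluster writers and encoders.
 */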
public void configure(JobConf job, Properties tbl) throws IOException {
int numClusters = SerializeUtil.desc.clusterTypes.size();
if (numClusters != SerializeUtil.desc.clusterAlgos.length) {
throw new RuntimeException("Please check the cluster algorithms, " +
SerializeUtil.desc.clusterAlgos.length + " algorithms provided while there are " +
numClusters + " clusters.");
}
if (numClusters != SerializeUtil.desc.clusterCodingTypes.length) {
throw new RuntimeException("Please check the cluster coding types, " +
SerializeUtil.desc.clusterAlgos.length + " coding types provided while there are " +
numClusters + " clusters.");
}
numFields = SerializeUtil.desc.tableSchema[0].length;
originalTableSchema = SerializeUtil.desc.tableSchema;
DataType[][] storageSchema = new DataType[originalTableSchema.length][];
for (int i = 0; i < originalTableSchema.length; i++) {
storageSchema[i] = new DataType[originalTableSchema[i].length];
for (int j = 0; j < storageSchema[i].length; j++) {
if (originalTableSchema[i][j] == DataType.DATE) {
storageSchema[i][j] = DataType.LONG;
} else {
storageSchema[i][j] = originalTableSchema[i][j];
}
}
}
dynamicIntArray = new DynamicIntArray[numClusters];
oRCStringEcnodingUtil = new ORCStringEcnodingUtil[numClusters];
deltaByteArrayStringWriter = new DeltaByteArrayWriter[numClusters];
runLengthInteger = new RunLengthIntegerWriter[numClusters];
runLengthByte = new RunLengthByteWriter[numClusters];
collect = new TestInStream.OutputCollector[numClusters];
deltaBianryBitPackingInt = new DeltaBinaryPackingValuesWriter[numClusters];
deltaByteArrayWriter = new DeltaByteArrayWriter[numClusters];
dictionaryBitPackingRLEZigZarInt = new PlainIntegerDictionaryValuesWriter[numClusters];
dictionaryBitPackingRLEZigZarByte = new PlainBinaryDictionaryValuesWriter[numClusters];
pageInit = new boolean[numClusters];
firstNumber = new boolean[numClusters];
str = new String[numClusters];
for (int i = 0; i < numClusters; i++) {
pageInit[i] = false;
firstNumber[i] = false;
str[i] = "";
}
// replace DataType.DATE to DataType.LONG
for (int i = 0; i < SerializeUtil.desc.clusterTypes.size(); i++) {
for (int j = 0; j < SerializeUtil.desc.clusterTypes.get(i).size(); j++) {
if (SerializeUtil.desc.clusterTypes.get(i).get(j) == DataType.DATE) {
SerializeUtil.desc.clusterTypes.get(i).set(j, DataType.LONG);
}
}
}
// int pagesize = MastiffMapReduce.getTablePageSize(job);
int pagesize = 131072 * 4 * 10;
cluster_pages = new int[numClusters];
for (int i = 0; i < numClusters; i++) {
cluster_pages[i] = pagesize;
}
preConfigure(job, storageSchema, ETLUtils.getSchema(SerializeUtil.desc.clusterTypes),
cluster_pages, SerializeUtil.desc.clusterAlgos, SerializeUtil.desc.clusterCodingTypes,
SerializeUtil.desc.columnsMapping);
}
public void preConfigure(JobConf job, DataType[][] originalTypes,
DataType[][] clusterTypes, int[] pagesizes, Algorithm[] algorithms,
CodingType[] codings, int[][] columnsMapping) {
this.columnsMapping = columnsMapping;
this.codings = codings;
originalSchema = ETLUtils.getSchema(originalTypes);
numFields = originalSchema.get(0).size();
clusterSchema = ETLUtils.getSchema(clusterTypes);
numClusters = clusterSchema.size();
accessors = new ClusterAccessor[numClusters];
backups = new ClusterAccessor[numClusters];
rows = new Row[numClusters];
prevRows = new Row[numClusters];
maxs = new Row[numClusters];
mins = new Row[numClusters];
segMaxs = new Row[numClusters];
segMins = new Row[numClusters];
compressors = new Encoder[numClusters];
vps = new ValPair[numClusters];
backupVps = new ValPair[numClusters];
pms = new PageMeta[numClusters];
pageIds = new int[numClusters];
startPoss = new int[numClusters];
numReps = new int[numClusters];
this.pagesizes=pagesizes;
this.algorithms=algorithms;
for (int i = 0; i < numClusters; i++) {
accessors[i] = new ClusterAccessor();
accessors[i].init(clusterSchema.get(i));
backups[i] = new ClusterAccessor(accessors[i]);
rows[i] = new Row(clusterSchema.get(i));
prevRows[i] = null;
if (Algorithm.NONE ==SerializeUtil.desc.clusterAlgos[i]) {
algorithms[i] = null;
}
if (accessors[i].getFixedLen() > 0) {
compressors[i]=EnDecode.getEncoder(pagesizes[i],accessors[i].getFixedLen(), 0, algorithms[i],codings[i]);
} else {
compressors[i]=EnDecode.getEncoder(pagesizes[i],0, 0, algorithms[i],codings[i]);
// compressors[i] = new VarLenEncoder(pagesizes[i], 0, algorithms[i]);
}
compressors[i].reset();
vps[i] = new ValPair();
backupVps[i] = new ValPair();
pms[i] = new PageMeta();
pms[i].startPos = 0;
pms[i].numPairs = 0;
pageIds[i] = 0;
startPoss[i] = numReps[i] = 0;
}
segId = new PageId();
segId.setSegmentId(Math.random() * 100 + "");
// String taskId = job.get("mapred.tip.id");
// String taskId = ETLUtils.getTaskId(job);
// segId.setSegmentId(taskId);
}
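/** Merge a page's max/min rows into the running segment-level max/min rows. */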
public void updateMaxMins(Row oldMax, Row oldMin, Row newMax, Row newMin) {
int size = oldMax.size();
for (int i = 0; i < size; i++) {
Comparable maxCp = (Comparable) newMax.get(i).getObject();
if (maxCp.compareTo(oldMax.get(i).getObject()) > 0) {
oldMax.get(i).setObject(newMax.get(i).getPrimitiveObject());
}
Comparable minCp = (Comparable) newMin.get(i).getObject();
if (minCp.compareTo(oldMin.get(i).getObject()) < 0) {
oldMin.get(i).setObject(newMin.get(i).getPrimitiveObject());
}
}
}
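/**
 * Serialize the current page of cluster <code>i</code> and buffer it in
 * <code>clusterValue</code>. Each buffered page record is laid out as:
 * page length (int), page bytes, max row (ValPair), min row (ValPair),
 * startPos (int) and numPairs (int).
 */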
void outputPage(int i) throws IOException {
// 1) prepare the page
byte[] page = compressors[i].getPage();
int pageLen = compressors[i].getPageLen();
if (page == null || pageLen <= 0) {
return;
}
out.reset();
out.writeInt(pageLen);
out.write(page, 0, pageLen);
// 2) write page meta
backupVps[i].data = backups[i].serialize(maxs[i], tmpLength);
backupVps[i].offset = 0;
backupVps[i].length = tmpLength[0];
backupVps[i].write(out);
backupVps[i].data = backups[i].serialize(mins[i], tmpLength);
backupVps[i].length = tmpLength[0];
// write min row
backupVps[i].write(out);
out.writeInt(pms[i].startPos);
out.writeInt(pms[i].numPairs);
// set max & min row of current segment
if (segMaxs[i] == null || segMins[i] == null) {
segMaxs[i] = maxs[i];
segMins[i] = mins[i];
} else {
// Row.updateMaxMins(segMaxs[i], segMins[i], maxs[i], mins[i]);
updateMaxMins(segMaxs[i], segMins[i], maxs[i], mins[i]);
}
// 3) output the page
segId.setPageId(pageIds[i]);
segId.setClusterId(i);
if(!isInit){
isInit=true ;
clusterValue=new ArrayList<List<BytesWritable>>(SerializeUtil.desc.clusterTypes.size());
for (int b = 0; b < numClusters; b++) {
clusterValue.add(new ArrayList<BytesWritable>(512));
}
}
clusterValue.get(i).add(new BytesWritable());
clusterValue.get(i).get(clusterValue.get(i).size() - 1)
.set(out.getData(), 0, out.getLength());
sgementSize = sgementSize + cluster_pages[i];
// 4) reset
pageIds[i]++;
pms[i].startPos += pms[i].numPairs;
pms[i].numPairs = 0;
maxs[i] = mins[i] = null;
compressors[i].reset();
}
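/**
 * Flush the last (possibly partial) page of every cluster according to its
 * coding type, then write out each cluster's segment meta page.
 */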
void SegmentMeta() throws IOException {
for (int i = 0; i < numClusters; i++) {
switch (codings[i]) {
case MV: outputPage(i); break;
case RunLengthEncodingInt: lastPageRunLengthEncodingInt(i); break;
case RunLengthEncodingByte: lastPageRunLengthEncodingByte(i); break;
case RunLengthEncodingLong: lastPageRunLengthEncodingLong(i); break;
case DeltaBinaryArrayZigZarByte: lastPageDeltaBinaryArraysBitPackingByte(i); break;
case DeltaBinaryBitPackingZigZarInt: lastPageDeltaBinaryBitPackingInt(i); break;
case DeltaBinaryBitPackingZigZarLong: lastPageDeltaBinaryBitPackingLong(i); break;
case DictionaryBitPackingRLEByte: lastPageDictionaryBitPackingRLEZigZarByte(i); break;
case DictionaryBitPackingRLEInt: lastPageDictionaryBitPackingRLEZigZarInt(i); break;
case DictionaryBitPackingRLELong: lastPageDictionaryBitPackingRLEZigZarLong(i); break;
case DeltaBinaryPackingString: lastPageDeltaBinaryPackingString(i); break;
case RedBlackTreeString: lastPageRedBlackTreeString(i); break;
default: throw new UnsupportedEncodingException("Unsupported coding type: " + codings[i]);
}
}
// write out the segment meta information for every cluster
for (int i = 0; i < numClusters; i++) {
outputSegmentMeta(i);
}
}
private void lastPageRedBlackTreeString(int i) throws IOException {
oRCStringEcnodingUtil[i].dictionarySize= oRCStringEcnodingUtil[i].dictionary.size();
oRCStringEcnodingUtil[i].dumpOrder=new int[ oRCStringEcnodingUtil[i].dictionary.size()] ;
oRCStringEcnodingUtil[i].init();
oRCStringEcnodingUtil[i].iterator();
oRCStringEcnodingUtil[i].rowoutPut() ;
oRCStringEcnodingUtil[i].flush();
DataOutputBuffer dob=new DataOutputBuffer() ;
dob.writeInt( oRCStringEcnodingUtil[i].dictionarySize);
int length = oRCStringEcnodingUtil[i].collect1.buffer.size();
ByteBuffer inBuf = ByteBuffer.allocate(length);
oRCStringEcnodingUtil[i].collect1.buffer.setByteBuffer(inBuf, 0, length);
inBuf.flip();
dob.writeInt(length);
dob.write(inBuf.array(), 0, length);
length = oRCStringEcnodingUtil[i].collect2.buffer.size();
inBuf = ByteBuffer.allocate(length); // re-allocate: collect2 may be larger than collect1
oRCStringEcnodingUtil[i].collect2.buffer.setByteBuffer(inBuf, 0, length);
inBuf.flip();
dob.writeInt(length);
dob.write(inBuf.array(), 0, length);
length = oRCStringEcnodingUtil[i].collect3.buffer.size();
inBuf = ByteBuffer.allocate(length); // re-allocate for the third collector as well
oRCStringEcnodingUtil[i].collect3.buffer.setByteBuffer(inBuf, 0, length);
inBuf.flip();
dob.writeInt(length);
dob.write(inBuf.array(), 0, length);
vps[i].data=dob.getData() ;
vps[i].offset = 0;
vps[i].length= vps[i].data.length ;
compressors[i].appendPage(vps[i]);
outputPage(i);
LOG.info("dataoffset "+compressors[i].dataOffset);
LOG.info("numPairs "+compressors[i].numPairs);
LOG.info("startPos "+compressors[i].startPos);
pageInit[i]=false;
}
private void lastPageDeltaBinaryPackingString(int i) throws IOException {
vps[i].data=deltaByteArrayStringWriter[i].getBytes().toByteArray();
vps[i].offset = 0;
vps[i].length= vps[i].data.length ;
compressors[i].appendPage(vps[i]);
outputPage(i);
pageInit[i]=false;
deltaByteArrayStringWriter[i].reset();
}
void lastPageRunLengthEncodingInt(int i) throws IOException{
runLengthInteger[i].flush();
ByteBuffer inBuf = ByteBuffer.allocate(collect[i].buffer.size());
byte[] pageBytes=collect[i].buffer.getByteBuffer(inBuf, 0, collect[i].buffer.size()) ;
collect[i].buffer.clear();
inBuf.clear();
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
compressors[i].appendPage(vps[i]);
outputPage(i);
pageInit[i]=false;
}
void lastPageRunLengthEncodingByte(int i) throws IOException{
runLengthByte[i].flush();
ByteBuffer inBuf = ByteBuffer.allocate(collect[i].buffer.size());
byte[] pageBytes=collect[i].buffer.getByteBuffer(inBuf, 0, collect[i].buffer.size()) ;
collect[i].buffer.clear();
inBuf.clear();
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
compressors[i].appendPage(vps[i]);
outputPage(i);
pageInit[i]=false;
}
void lastPageRunLengthEncodingLong(int i) throws IOException{
throw new UnsupportedEncodingException("Not implemented yet (WangMeng, 2014-03-27).");
}
void lastPageDeltaBinaryBitPackingInt(int i) throws IOException{
BytesInput bi=deltaBianryBitPackingInt[i].getBytes() ;
byte[] pageBytes=bi.getBufferSize() ;
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
// the method declares IOException, so let failures propagate
compressors[i].appendPage(vps[i]);
outputPage(i);
pageInit[i] = false;
}
void lastPageDeltaBinaryArraysBitPackingByte(int i) throws IOException{
BytesInput bi=deltaByteArrayWriter[i].getBytes();
byte[] pageBytes=bi.getBufferSize() ;
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
// the method declares IOException, so let failures propagate
compressors[i].appendPage(vps[i]);
outputPage(i);
pageInit[i] = false;
}
void lastPageDeltaBinaryBitPackingLong(int i) throws IOException{
throw new UnsupportedEncodingException("Not implemented yet (WangMeng, 2014-03-27).");
}
void lastPageDictionaryBitPackingRLEZigZarByte(int i) throws IOException{
BytesInput sbi= dictionaryBitPackingRLEZigZarByte[i].getBytes();
byte[] dictionaryBuffer=dictionaryBitPackingRLEZigZarByte[i].getDictionaryBuffer();
byte[] dictionaryID=sbi.getBufferSize() ;
DataOutputBuffer dob=new DataOutputBuffer() ;
dob.writeInt(dictionaryBuffer.length);
dob.write(dictionaryBuffer, 0, dictionaryBuffer.length);
dob.write(dictionaryID, 0, dictionaryID.length);
byte[] pageBytes=dob.getData() ;
dob.close();
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
// the method declares IOException, so let failures propagate
compressors[i].appendPage(vps[i]);
outputPage(i);
pageInit[i] = false;
}
void lastPageDictionaryBitPackingRLEZigZarInt(int i) throws IOException{
BytesInput sbi= dictionaryBitPackingRLEZigZarInt[i].getBytes();
byte[] dictionaryBuffer=dictionaryBitPackingRLEZigZarInt[i].getDictionaryBuffer();
byte[] dictionaryID=sbi.getBufferSize() ;
DataOutputBuffer dob=new DataOutputBuffer() ;
dob.writeInt(dictionaryBuffer.length);
dob.write(dictionaryBuffer, 0, dictionaryBuffer.length);
dob.write(dictionaryID, 0, dictionaryID.length);
byte[] pageBytes=dob.getData() ;
dob.close();
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
// the method declares IOException, so let failures propagate
compressors[i].appendPage(vps[i]);
outputPage(i);
pageInit[i] = false;
}
void lastPageDictionaryBitPackingRLEZigZarLong(int i) throws IOException{
throw new UnsupportedEncodingException("Not implemented yet (WangMeng, 2014-03-27).");
}
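/**
 * Write the meta page of cluster <code>i</code> for the finished segment
 * (segment-level max/min rows, a zero start position and the total pair
 * count), then reset all per-cluster state for the next segment.
 */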
private void outputSegmentMeta(int i)
throws IOException {
out.reset();
// 1) write segment meta
backupVps[i].data = backups[i].serialize(segMaxs[i], tmpLength);
backupVps[i].offset = 0;
backupVps[i].length = tmpLength[0];
backupVps[i].write(out);
backupVps[i].data = backups[i].serialize(segMins[i], tmpLength);
backupVps[i].length = tmpLength[0];
// write min row
backupVps[i].write(out);
out.writeInt(0); // start pos of a segment is zero
out.writeInt(pms[i].startPos);
// 2) output the page
segId.setPageId(-1); // meta data page is -1
segId.setClusterId(i);
outValue.set(out.getData(), 0, out.getLength());
clusterValue.get(i).add(new BytesWritable());
clusterValue.get(i).get(clusterValue.get(i).size() - 1)
.set(out.getData(), 0, out.getLength());
sgementSize = sgementSize + cluster_pages[i];
// segMaxs[i]=null ;
// segMins[i]=null ;
pms[i].startPos =0;
pms[i].numPairs = 0;
// re-create the per-cluster encoder for the next segment
if (accessors[i].getFixedLen() > 0) {
compressors[i]=EnDecode.getEncoder(pagesizes[i],
accessors[i].getFixedLen(), 0, algorithms[i],codings[i]);
compressors[i].reset();
} else {
// compressors[i] = new VarLenEncoder(pagesizes[i], 0, algorithms[i]);
compressors[i]=EnDecode.getEncoder(pagesizes[i],0, 0, algorithms[i],codings[i]);
compressors[i].reset();
}
maxs[i] = mins[i] = null;
segMaxs[i]=null ;
segMins[i]=null ;
pms[i].startPos =0;
pms[i].numPairs = 0;
backups[i] = new ClusterAccessor(accessors[i]);
vps[i] = new ValPair();
backupVps[i] = new ValPair();
pms[i] = new PageMeta();
pageIds[i] = 0;
startPoss[i] = numReps[i] = 0;
}
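/**
 * Replay the buffered pages of a finished segment into the on-disk layout:
 * for each cluster the last buffered record is its segment meta (written by
 * outputSegmentMeta and registered via addSegmentMeta), and every other
 * record is appended as a data page together with its PageMeta.
 */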
void passValue(PageId key, ArrayList<List<BytesWritable>> segmentValue2)
throws IOException {
int m = 0;
beginSegment();
beginCluster();
for (int i = 0; i < segmentValue2.size(); i++) {
for (int j = -1; j < segmentValue2.get(i).size() - 1; j++) {
PageMeta pm = new PageMeta();
BytesWritable bw;
if (j != -1) {
bw = (BytesWritable) segmentValue2.get(i).get(j);
in.reset(bw.getBytes(), 0, bw.getLength());
}
else {
bw = (BytesWritable) segmentValue2.get(i).get(segmentValue2.get(i).size() - 1);
in.reset(bw.getBytes(), 0, bw.getLength());
}
if (m == 0) {
pm.readFields(in);
addSegmentMeta(i, pm);
m++;
}
else {
int length = in.readInt();
in.skip(length);
pm.readFields(in);
Segappend(bw.getBytes(), Bytes.SIZEOF_INT, length, pm);
}
}
finishCluster();
m = 0;
if (i < segmentValue2.size() - 1) {
beginCluster();
}
}
finishSegment();
}
public void runLengthEncodingInt(int i){
if( pageInit[i]==false){
collect[i] = new TestInStream.OutputCollector();
compressors[i].reset();
try {
runLengthInteger[i] = new RunLengthIntegerWriter(new OutStream("test", 1000, null, collect[i]), true);
} catch (IOException e) {
e.printStackTrace();
}
pageInit[i]=true;
}
if(compressors[i].dataOffset + compressors[i].valueLen < compressors[i].pageCapacity){
try {
runLengthInteger[i].write(((Integer)(rows[i].getValue(0))).intValue());
} catch (IOException e) {
e.printStackTrace();
}
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
}
else{
try {
runLengthInteger[i].write(((Integer)(rows[i].getValue(0))).intValue());
} catch (IOException e) {
e.printStackTrace();
}
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
try {
runLengthInteger[i].flush();
} catch (IOException e) {
e.printStackTrace();
}
ByteBuffer inBuf = ByteBuffer.allocate(collect[i].buffer.size());
byte[] pageBytes=collect[i].buffer.getByteBuffer(inBuf, 0, collect[i].buffer.size()) ;
collect[i].buffer.clear();
inBuf.clear();
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
try {
compressors[i].appendPage(vps[i]);
} catch (IOException e) {
e.printStackTrace();
}
try {
outputPage(i);
} catch (IOException e) {
e.printStackTrace();
}
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
LOG.info("dataoffset "+compressors[i].dataOffset);
LOG.info("numPairs "+compressors[i].numPairs);
LOG.info("startPos "+compressors[i].startPos);
pageInit[i]=false;
}
}
public void runLengthEncodingByte(int i){
if( pageInit[i]==false){
collect[i] = new TestInStream.OutputCollector();
compressors[i].reset();
try {
runLengthByte[i] = new RunLengthByteWriter(new OutStream("test", 1000, null, collect[i]));
} catch (IOException e) {
e.printStackTrace();
}
pageInit[i]=true;
}
if(compressors[i].dataOffset + compressors[i].valueLen < compressors[i].pageCapacity){
try {
runLengthByte[i].write(((Byte)(rows[i].getValue(0))).byteValue());
} catch (IOException e) {
e.printStackTrace();
}
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
}
else{
try {
runLengthByte[i].write(((Byte)(rows[i].getValue(0))).byteValue());
} catch (IOException e) {
e.printStackTrace();
}
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
try {
runLengthByte[i].flush();
} catch (IOException e) {
e.printStackTrace();
}
ByteBuffer inBuf = ByteBuffer.allocate(collect[i].buffer.size());
byte[] pageBytes=collect[i].buffer.getByteBuffer(inBuf, 0, collect[i].buffer.size()) ;
collect[i].buffer.clear();
inBuf.clear();
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
try {
compressors[i].appendPage(vps[i]);
} catch (IOException e) {
e.printStackTrace();
}
try {
outputPage(i);
} catch (IOException e) {
e.printStackTrace();
}
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
// pms[i].numPairs++;
pageInit[i]=false;
}
}
public void runLengthEncodingLong(int i) throws UnsupportedEncodingException{
throw new UnsupportedEncodingException("Not implemented yet (WangMeng, 2014-03-27).");
}
public void deltaBinaryArrayZigZarByte(int i) throws IOException{
if( pageInit[i]==false){
// int blockSize = 128;
// int miniBlockNum = 4;
deltaByteArrayWriter[i]=new DeltaByteArrayWriter( pagesizes[i]/4);
compressors[i].reset();
pageInit[i]=true;
}
if(compressors[i].dataOffset + compressors[i].valueLen < compressors[i].pageCapacity){
deltaByteArrayWriter[i].writeBytes(Binary.fromString(""+((Byte)(rows[i].getValue(0))).byteValue()));
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
}
else{
deltaByteArrayWriter[i].writeBytes(Binary.fromString(""+((Byte)(rows[i].getValue(0))).byteValue()));
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
BytesInput bi=deltaByteArrayWriter[i].getBytes();
byte[] pageBytes=bi.getBufferSize() ;
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
// the method declares IOException, so let failures propagate
compressors[i].appendPage(vps[i]);
outputPage(i);
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
LOG.info("dataoffset "+compressors[i].dataOffset);
LOG.info("numPairs "+compressors[i].numPairs);
LOG.info("startPos "+compressors[i].startPos);
// pms[i].numPairs++;
pageInit[i]=false;
}
}
public void deltaBinaryBitPackingZigZarInt(int i) throws IOException{
if( pageInit[i]==false){
int blockSize = 128;
int miniBlockNum = 4;
deltaBianryBitPackingInt[i] = new DeltaBinaryPackingValuesWriter(blockSize, miniBlockNum, pagesizes[i]/4);
compressors[i].reset();
pageInit[i]=true;
}
if(compressors[i].dataOffset + compressors[i].valueLen < compressors[i].pageCapacity){
deltaBianryBitPackingInt[i].writeInteger(((Integer)(rows[i].getValue(0))).intValue());
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
}
else{
deltaBianryBitPackingInt[i].writeInteger(((Integer)(rows[i].getValue(0))).intValue());
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
BytesInput bi=deltaBianryBitPackingInt[i].getBytes() ;
byte[] pageBytes=bi.getBufferSize() ;
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
// the method declares IOException, so let failures propagate
compressors[i].appendPage(vps[i]);
outputPage(i);
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
LOG.info("dataoffset "+compressors[i].dataOffset);
LOG.info("numPairs "+compressors[i].numPairs);
LOG.info("startPos "+compressors[i].startPos);
// pms[i].numPairs++;
pageInit[i]=false;
}
}
public void deltaBinaryBitPackingZigZarLong(int i) throws UnsupportedEncodingException{
throw new UnsupportedEncodingException("Not implemented yet: twitter Parquet does not provide this writer.");
}
public void dictionaryBitPackingRLEByte(int i) throws IOException{
if( pageInit[i]==false){
// deltaBianryBitPackingInt[i] = new DeltaBinaryPackingValuesWriter(blockSize, miniBlockNum, pagesizes[i]/4);
dictionaryBitPackingRLEZigZarByte[i]= new PlainBinaryDictionaryValuesWriter(Integer.MAX_VALUE, pagesizes[i]);
compressors[i].reset();
pageInit[i]=true;
}
if(compressors[i].dataOffset + compressors[i].valueLen < compressors[i].pageCapacity){
dictionaryBitPackingRLEZigZarByte[i].writeBytes(Binary.fromString("" +((Byte)(rows[i].getValue(0))).byteValue()));
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
}
else{
dictionaryBitPackingRLEZigZarByte[i].writeBytes(Binary.fromString("" +((Byte)(rows[i].getValue(0))).byteValue()));
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
BytesInput sbi= dictionaryBitPackingRLEZigZarByte[i].getBytes();
byte[] dictionaryBuffer=dictionaryBitPackingRLEZigZarByte[i].getDictionaryBuffer();
byte[] dictionaryID=sbi.getBufferSize() ;
DataOutputBuffer dob=new DataOutputBuffer() ;
dob.writeInt(dictionaryBuffer.length);
dob.write(dictionaryBuffer, 0, dictionaryBuffer.length);
dob.write(dictionaryID, 0, dictionaryID.length);
byte[] pageBytes=dob.getData() ;
dob.close();
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
// the method declares IOException, so let failures propagate
compressors[i].appendPage(vps[i]);
outputPage(i);
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
LOG.info("dataoffset "+compressors[i].dataOffset);
LOG.info("numPairs "+compressors[i].numPairs);
LOG.info("startPos "+compressors[i].startPos);
// pms[i].numPairs++;
pageInit[i]=false;
}
}
public void dictionaryBitPackingRLEInt(int i) throws IOException{
if( pageInit[i]==false){
// deltaBianryBitPackingInt[i] = new DeltaBinaryPackingValuesWriter(blockSize, miniBlockNum, pagesizes[i]/4);
dictionaryBitPackingRLEZigZarInt[i]= new PlainIntegerDictionaryValuesWriter(Integer.MAX_VALUE, pagesizes[i]);
compressors[i].reset();
pageInit[i]=true;
}
if(compressors[i].dataOffset + compressors[i].valueLen < compressors[i].pageCapacity){
dictionaryBitPackingRLEZigZarInt[i].writeInteger(((Integer)(rows[i].getValue(0))).intValue());
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
}
else{
dictionaryBitPackingRLEZigZarInt[i].writeInteger(((Integer)(rows[i].getValue(0))).intValue());
compressors[i].numPairs++;
compressors[i].dataOffset += compressors[i].valueLen;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
BytesInput sbi= dictionaryBitPackingRLEZigZarInt[i].getBytes();
byte[] dictionaryBuffer=dictionaryBitPackingRLEZigZarInt[i].getDictionaryBuffer();
byte[] dictionaryID=sbi.getBufferSize() ;
DataOutputBuffer dob=new DataOutputBuffer() ;
dob.writeInt(dictionaryBuffer.length);
dob.write(dictionaryBuffer, 0, dictionaryBuffer.length);
dob.write(dictionaryID, 0, dictionaryID.length);
byte[] pageBytes=dob.getData() ;
dob.close();
vps[i].data=pageBytes ;
vps[i].offset = 0;
vps[i].length= pageBytes.length;
// the method declares IOException, so let failures propagate
compressors[i].appendPage(vps[i]);
outputPage(i);
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
LOG.info("dataoffset "+compressors[i].dataOffset);
LOG.info("numPairs "+compressors[i].numPairs);
LOG.info("startPos "+compressors[i].startPos);
// pms[i].numPairs++;
pageInit[i]=false;
}
}
public void dictionaryBitPackingRLELong(int i) throws UnsupportedEncodingException{
throw new UnsupportedEncodingException("Not implemented yet (WangMeng, 2014-03-27).");
}
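/**
 * Append one RowMap: route every cluster's row to its coding-specific
 * writer (or to the generic MV compressor), and flush the whole buffered
 * segment to the output stream once it exceeds SegmentSize.
 */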
public void append(Writable val) throws IOException {
RowMap rowMap = (RowMap) val;
for (int i = 0; i < numClusters; i++) {
rows[i]=rowMap.row[i];
if(codings[i]!=CodingType.MV){
switch (codings[i]){
case RunLengthEncodingByte: runLengthEncodingByte(i); break ;
case RunLengthEncodingInt: runLengthEncodingInt(i); break ;
case RunLengthEncodingLong: runLengthEncodingLong(i); break ;
case DeltaBinaryArrayZigZarByte :deltaBinaryArrayZigZarByte(i); break ;
case DeltaBinaryBitPackingZigZarInt :deltaBinaryBitPackingZigZarInt(i); break ;
case DeltaBinaryBitPackingZigZarLong :deltaBinaryBitPackingZigZarLong(i); break ;
case DictionaryBitPackingRLEByte:dictionaryBitPackingRLEByte(i);break ;
case DictionaryBitPackingRLEInt:dictionaryBitPackingRLEInt(i);break ;
case DictionaryBitPackingRLELong:dictionaryBitPackingRLELong(i);break ;
case DeltaBinaryPackingString:deltaBinaryPackingString(i);break ;
case RedBlackTreeString:redBlackTreeString(i);break ;
default: throw new UnsupportedEncodingException("Unsupported coding type: " + codings[i]);
}
}
else {
vps[i].data = accessors[i].serialize(rows[i], tmpLength);
vps[i].offset = 0;
vps[i].length = tmpLength[0];
vps[i].pos = pms[i].startPos + pms[i].numPairs;
while (!compressors[i].append(vps[i])) { // page is full: flush it and retry
outputPage(i);
}
// update page meta
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
}
// Important: judge whether the buffered segment must be written to disk
if ((sgementSize > SegmentSize) && i == (numClusters - 1)) {
SegmentMeta();
passValue(segId, clusterValue);
outputStream.flush();
segId = new PageId();
pageIdCount++;
segId.setSegmentId("" + pageIdCount);
sgementSize = 0;
isInit = false;
}
}
}
private void redBlackTreeString(int i) throws IOException {
if( pageInit[i]==false){
LOG.debug("compressors[" + i + "].bytesLeft " + compressors[i].bytesLeft);
oRCStringEcnodingUtil[i]=new ORCStringEcnodingUtil() ;
compressors[i].reset();
pageInit[i]=true;
}
if(firstNumber[i]==true){
deltaByteArrayStringWriter[i].writeBytes(Binary.fromString(str[i]));
compressors[i].numPairs++;
compressors[i].dataOffset += str[i].length();
compressors[i].index.writeInt(compressors[i].dataOffset);
compressors[i].bytesLeft -= str[i].length() ;
compressors[i].bytesLeft -= Bytes.SIZEOF_INT;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
firstNumber[i]=false;
}
String str=((String)(rows[i].getValue(0))) ;
if(compressors[i].bytesLeft>=( Bytes.SIZEOF_INT+str.length()+2)){
//deltaByteArrayStringWriter[i].writeBytes(Binary.fromString(str));
oRCStringEcnodingUtil[i].add(str);
compressors[i].numPairs++;
compressors[i].dataOffset += str.length()+2;
//compressors[i].index.writeInt(compressors[i].dataOffset);
compressors[i].bytesLeft -= str.length()+2 ;
compressors[i].bytesLeft -= Bytes.SIZEOF_INT;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
}
else{
firstNumber[i]=true;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
oRCStringEcnodingUtil[i].dictionarySize= oRCStringEcnodingUtil[i].dictionary.size();
oRCStringEcnodingUtil[i].dumpOrder=new int[ oRCStringEcnodingUtil[i].dictionary.size()] ;
oRCStringEcnodingUtil[i].init();
oRCStringEcnodingUtil[i].iterator();
oRCStringEcnodingUtil[i].rowoutPut() ;
oRCStringEcnodingUtil[i].flush();
DataOutputBuffer dob=new DataOutputBuffer() ;
dob.writeInt( oRCStringEcnodingUtil[i].dictionarySize);
int length = oRCStringEcnodingUtil[i].collect1.buffer.size();
ByteBuffer inBuf = ByteBuffer.allocate(length);
oRCStringEcnodingUtil[i].collect1.buffer.setByteBuffer(inBuf, 0, length);
inBuf.flip();
dob.writeInt(length);
dob.write(inBuf.array(), 0, length);
length = oRCStringEcnodingUtil[i].collect2.buffer.size();
inBuf = ByteBuffer.allocate(length); // re-allocate: collect2 may be larger than collect1
oRCStringEcnodingUtil[i].collect2.buffer.setByteBuffer(inBuf, 0, length);
inBuf.flip();
dob.writeInt(length);
dob.write(inBuf.array(), 0, length);
length = oRCStringEcnodingUtil[i].collect3.buffer.size();
inBuf = ByteBuffer.allocate(length); // re-allocate for the third collector as well
oRCStringEcnodingUtil[i].collect3.buffer.setByteBuffer(inBuf, 0, length);
inBuf.flip();
dob.writeInt(length);
dob.write(inBuf.array(), 0, length);
vps[i].data=dob.getData() ;
vps[i].offset = 0;
vps[i].length= vps[i].data.length ;
compressors[i].appendPage(vps[i]);
outputPage(i);
LOG.info("dataoffset "+compressors[i].dataOffset);
LOG.info("numPairs "+compressors[i].numPairs);
LOG.info("startPos "+compressors[i].startPos);
pageInit[i]=false;
}
}
private void deltaBinaryPackingString(int i) throws IOException {
if (pageInit[i] == false) {
deltaByteArrayStringWriter[i] = new DeltaByteArrayWriter(64 * 1024);
compressors[i].reset();
pageInit[i] = true;
}
str[i]=((String)(rows[i].getValue(0))) ;
if(compressors[i].bytesLeft-100>=( Bytes.SIZEOF_INT+str[i].length()+2)){
deltaByteArrayStringWriter[i].writeBytes(Binary.fromString(str[i]));
compressors[i].numPairs++;
compressors[i].dataOffset += str[i].length()+2;
compressors[i].bytesLeft -= str[i].length()+2 ;
compressors[i].bytesLeft -= Bytes.SIZEOF_INT;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
}
else{
deltaByteArrayStringWriter[i].writeBytes(Binary.fromString(str[i]));
compressors[i].numPairs++;
compressors[i].dataOffset += str[i].length();
compressors[i].index.writeInt(compressors[i].dataOffset);
compressors[i].bytesLeft -= str[i].length() ;
compressors[i].bytesLeft -= Bytes.SIZEOF_INT;
firstNumber[i]=true;
if (maxs[i] == null || mins[i] == null) {
maxs[i] = rows[i].duplicate();
mins[i] = rows[i].duplicate();
} else {
rows[i].compareAndSetMaxMin(maxs[i], mins[i]);
}
pms[i].numPairs++;
vps[i].data=deltaByteArrayStringWriter[i].getBytes().toByteArray();
vps[i].offset = 0;
vps[i].length= vps[i].data.length ;
compressors[i].appendPage(vps[i]);
outputPage(i);
LOG.info("dataoffset "+compressors[i].dataOffset);
LOG.info("numPairs "+compressors[i].numPairs);
LOG.info("startPos "+compressors[i].startPos);
// pms[i].numPairs++;
pageInit[i]=false;
}
}
void close() throws IOException {
SegmentMeta();
passValue(segId, clusterValue);
for (int k = 0; k < numClusters; k++) {
pageIds[k] = 0;
pms[k].startPos = 0;
pms[k].numPairs = 0;
maxs[k] = mins[k] = null;
}
segId = new PageId();
segId.setSegmentId(Math.random() * 100 + "");
clusterValue.clear();
// segmentValue.clear();
sgementSize = 0;
}
}
// FileSystem stream to write on.
private FSDataOutputStream outputStream;
// True if we opened the <code>outputStream</code> (and so will close it).
private boolean closeOutputStream;
// Name for this object used when logging or in toString. Is either
// the result of a toString on stream or else toString of passed file Path.
private String name;
private boolean TabConfiInit = false;
private final ArrayList<Long> segOffsets = new ArrayList<Long>();
private final ArrayList<Long> segLengths = new ArrayList<Long>();
private final ArrayList<Long> segPMSOffsets = new ArrayList<Long>();
// Segment Meta
private PageMeta[] curSegMetas;
private final List<PageMeta[]> segMetasList = new ArrayList<PageMeta[]>();
// Page Meta Section
private PageMetaSection pms;
private PageMetaList[] pageMetaLists;
private List<PageMeta> curPageMetaList;
private List<Long> curPMOffsetList;
private TabConfig tabConfig;
// Cluster Offset in current segment
private long[] clusterOffsetInCurSegment;
// May be null if we were passed a stream.
private Path path = null;
private Path tmpPath=null ;
private Path tmpoutputPath=null ;
private int curClusterIdx;
private int curSegIdx;
private int numClusters;
private final boolean withPageMeta;
private FileSystem fs =null ;
/**
* Constructor that takes a Path.
*
* @param fs
* @param path
* @param columns
* @throws IOException
*/
public Writer(FileSystem fs, Path tmpPath, Path path, Path tmpoutputPath,
List<List<DataType>> columns)
throws IOException {
this(fs, tmpPath, path, tmpoutputPath, columns, true);
}
/**
* Constructor that takes a Path
*
* @param fs
* @param path
* @param columns
* @param withPageMeta
* @throws IOException
*/
public Writer(FileSystem fs, Path tmpPath, Path path, Path finalOutPath,
List<List<DataType>> columns,
boolean withPageMeta)
throws IOException {
this(fs.create(tmpPath, true, fs.getConf().getInt("io.file.buffer.size", 4096),
fs.getDefaultReplication(), fs.getDefaultBlockSize()), columns, withPageMeta);
fs.setVerifyChecksum(true);
closeOutputStream = true;
name = path.toString();
this.path = path;
this.tmpPath = tmpPath;
this.fs = fs;
this.tmpoutputPath = finalOutPath; // was a self-assignment: the parameter is named finalOutPath
}
/**
* Constructor that takes a stream.
*
* @param ostream
* Stream to use.
* @param columns
* @param withPageMeta
* @throws IOException
*/
public Writer(final FSDataOutputStream ostream, List<List<DataType>> columns,
boolean withPageMeta)
throws IOException {
// LOG.debug("create a writer...");
this.outputStream = ostream;
this.closeOutputStream = false;
this.name = this.outputStream.toString();
this.withPageMeta = withPageMeta;
init(columns);
}
private void init(List<List<DataType>> columns) {
if (columns != null) {
numClusters = columns.size();
// LOG.debug("Init " + numClusters + " clusters.");
pms = new PageMetaSection(withPageMeta);
pageMetaLists = new PageMetaList[numClusters];
for (int i = 0; i < numClusters; i++) {
pageMetaLists[i] = new PageMetaList(withPageMeta);
pageMetaLists[i].setMetaList(new ArrayList<PageMeta>(),
new ArrayList<Long>());
}
clusterOffsetInCurSegment = new long[numClusters];
}
curSegIdx = 0;
}
public void append(Writable r) throws IOException {
if (TabConfiInit == false) {
tabConfig = new TabConfig();
tabConfig.configure(SerializeUtil.jc, SerializeUtil.tableProperties);
TabConfiInit = true;
}
tabConfig.append(r);
}
private void resetPageMetaSection() {
for (PageMetaList pageMetaList : pageMetaLists) {
pageMetaList.getMetaList().clear();
pageMetaList.getOffsetList().clear();
}
}
public void addSegmentMeta(int clusterId, final PageMeta pm) {
curSegMetas[clusterId] = pm;
}
public void beginSegment() throws IOException {
LOG.info("Begin a new Segment from position " + outputStream.getPos());
// And the segment start offset;
segOffsets.add(outputStream.getPos());
// reinit the arguments of a segment
curClusterIdx = 0;
if (withPageMeta) {
curSegMetas = new PageMeta[numClusters];
}
}
public void finishSegment() throws IOException {
LOG.info("Finish data section in a Segment at position " + outputStream.getPos());
segPMSOffsets.add(outputStream.getPos());
// write the pagemeta section
LOG.info("And write its page meta section.");
pms.setPageMetaLists(pageMetaLists);
pms.write(outputStream);
// write the cluster index
LOG.info("And write its clusters' offsets.");
for (long l : clusterOffsetInCurSegment) {
outputStream.writeLong(l);
}
// record the length of the segment
long length = outputStream.getPos() - segOffsets.get(curSegIdx);
LOG.info("This segment length is " + length);
segLengths.add(length);
if (withPageMeta) {
segMetasList.add(curSegMetas);
curSegMetas = null;
}
resetPageMetaSection();
pageMetaLists = new PageMetaList[numClusters];
for (int i = 0; i < numClusters; i++) {
pageMetaLists[i] = new PageMetaList(withPageMeta);
pageMetaLists[i].setMetaList(new ArrayList<PageMeta>(),
new ArrayList<Long>());
}
curSegIdx++;
}
/**
* @return Path or null if we were passed a stream rather than a Path.
*/
public Path getPath() {
return this.path;
}
@Override
public String toString() {
return "writer=" + this.name;
}
public void beginCluster() throws IOException {
LOG.info("Begin a new Cluster from position " + outputStream.getPos());
curPageMetaList = this.pageMetaLists[curClusterIdx].getMetaList();
curPMOffsetList = this.pageMetaLists[curClusterIdx].getOffsetList();
this.clusterOffsetInCurSegment[curClusterIdx] = outputStream.getPos();
}
public void finishCluster() {
LOG.info("Finish Cluster " + curClusterIdx + ".");
LOG.info("Finish a Cluster while writing " + curPageMetaList.size() + " pages.");
curClusterIdx++;
}
public void Segappend(final byte[] page, final int offset,
final int length, final PageMeta pm) throws IOException {
curPMOffsetList.add(outputStream.getPos());
outputStream.write(page, offset, length);
curPageMetaList.add(pm);
}
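/**
 * Finish the segment file: write the segment index (per segment: data
 * offset, page-meta-section offset, length and, when page meta is enabled,
 * one PageMeta per cluster), then the trailer (numClusters, segment count,
 * index offset), and finally move the temporary file to its final path.
 */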
public void fileClose() throws IOException {
if (outputStream == null) {
return;
}
LOG.info("Finish the segment file by writing its segments' index at position "
+ outputStream.getPos() + " .");
LOG.info("Total segments are " + curSegIdx);
long segIdxOffset = outputStream.getPos();
// Write out the segment index
for (int i = 0; i < curSegIdx; i++) {
outputStream.writeLong(segOffsets.get(i));
outputStream.writeLong(segPMSOffsets.get(i));
outputStream.writeLong(segLengths.get(i));
if (withPageMeta) {
for (int j = 0; j < numClusters; j++) {
PageMeta pageMeta = segMetasList.get(i)[j];
pageMeta.write(outputStream);
}
}
}
// Write the trailer: cluster count, segment count and index offset
outputStream.writeInt(numClusters);
outputStream.writeInt(curSegIdx);
outputStream.writeLong(segIdxOffset);
LOG.info("Finished @ position " + outputStream.getPos());
outputStream.close();
outputStream = null;
// Move the finished temporary file to its final path
if (fs.exists(path)) {
fs.delete(path, true);
}
fs.rename(tmpPath, path);
fs.delete(tmpPath, true);
}
@Override
public synchronized void close() throws IOException {
tabConfig.close();
fileClose();
}
}
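// A minimal usage sketch for the writer (hypothetical paths and schema;
// SerializeUtil.jc, SerializeUtil.tableProperties and SerializeUtil.desc must
// already be populated, since append() lazily builds its TabConfig from them):
//
//   FileSystem fs = FileSystem.get(conf);
//   SegmentFile.Writer writer = new SegmentFile.Writer(fs,
//       new Path("/tmp/seg.tmp"), new Path("/data/seg"),
//       new Path("/tmp/out"), columns);   // columns : List<List<DataType>>
//   for (Writable rowMap : rowMaps) {     // each value is a RowMap
//     writer.append(rowMap);
//   }
//   writer.close();                       // flushes the last segment + index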
public static class SegmentIndexRef {
public int numSegs;
public int numClusters;
public long[] segOffsets;
public long[] segPMSOffsets;
public long[] segLengths;
public PageMeta[][] segMetas;
}
/**
* Segment Index Reader. (read in the segment index for M/R splitting.)
*/
public static class SegmentIndexReader implements Closeable {
long segIndexOffset;
SegmentIndexRef ref = new SegmentIndexRef();
// Stream to read from
private FSDataInputStream istream = null;
private final long fileSize;
private final boolean withPageMeta;
public SegmentIndexReader(FileSystem fs, Path path) throws IOException {
this(fs, path, true);
}
public SegmentIndexReader(FileSystem fs, Path path, boolean withPM)
throws IOException {
istream = fs.open(path);
fileSize = fs.getFileStatus(path).getLen();
this.withPageMeta = withPM;
LOG.info("Open Segment File " + path + " : file length is " + fileSize
+ " , with page meta : " + withPageMeta);
}
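/**
 * Read the fixed-size trailer at the end of the file (numClusters, numSegs
 * and the index offset), then load the per-segment index entries.
 */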
public synchronized void readSegIndex() throws IOException {
istream.seek(fileSize - 2 * Bytes.SIZEOF_INT - Bytes.SIZEOF_LONG);
ref.numClusters = istream.readInt();
ref.numSegs = istream.readInt();
segIndexOffset = istream.readLong();
LOG.info("Trying to read " + ref.numSegs + " segments at position " + segIndexOffset);
ref.segOffsets = new long[ref.numSegs];
ref.segPMSOffsets = new long[ref.numSegs];
ref.segLengths = new long[ref.numSegs];
if (withPageMeta) {
ref.segMetas = new PageMeta[ref.numSegs][];
}
istream.seek(segIndexOffset);
for (int i = 0; i < ref.numSegs; i++) {
ref.segOffsets[i] = istream.readLong();
ref.segPMSOffsets[i] = istream.readLong();
ref.segLengths[i] = istream.readLong();
if (withPageMeta) {
ref.segMetas[i] = new PageMeta[ref.numClusters];
for (int j = 0; j < ref.numClusters; j++) {
ref.segMetas[i][j] = new PageMeta();
ref.segMetas[i][j].readFields(istream);
}
}
}
}
public synchronized long[] getSegOffsets() {
return ref.segOffsets;
}
public synchronized long[] getSegPMSOffsets() {
return ref.segPMSOffsets;
}
public synchronized long[] getSegLengths() {
return ref.segLengths;
}
public synchronized PageMeta[][] getSegMetas() {
return ref.segMetas;
}
public synchronized int getNumSegs() {
return ref.numSegs;
}
public synchronized SegmentIndexRef getRef() {
return ref;
}
@Override
public synchronized void close() throws IOException {
if (istream == null) {
return;
}
istream.close();
istream = null;
}
}
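// A minimal sketch of reading the segment index back (hypothetical path):
//
//   SegmentFile.SegmentIndexReader idx =
//       new SegmentFile.SegmentIndexReader(fs, new Path("/data/seg"));
//   idx.readSegIndex();
//   long[] offsets = idx.getSegOffsets();   // one entry per segment
//   long[] lengths = idx.getSegLengths();   // used for M/R splitting
//   idx.close();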
/**
* Counters to calculate read pages
*/
public static enum SegmentReadPageCounter {
CLUSTER1, CLUSTER2, CLUSTER3, CLUSTER4, CLUSTER5, CLUSTER6, CLUSTER7,
CLUSTER8, CLUSTER9, CLUSTER10, CLUSTER11, CLUSTER12, CLUSTER13,
CLUSTER14, CLUSTER15, CLUSTER16, OTHERCLUSTERS
}
/**
* Counters to calculate skipped pages
*/
public static enum SegmentSkippedPageCounter {
CLUSTER1, CLUSTER2, CLUSTER3, CLUSTER4, CLUSTER5, CLUSTER6, CLUSTER7,
CLUSTER8, CLUSTER9, CLUSTER10, CLUSTER11, CLUSTER12, CLUSTER13,
CLUSTER14, CLUSTER15, CLUSTER16, OTHERCLUSTERS
}
/**
* Counters to calculate the cache hits
*/
public static enum SegmentCacheHitCounter {
CLUSTER1, CLUSTER2, CLUSTER3, CLUSTER4, CLUSTER5, CLUSTER6, CLUSTER7,
CLUSTER8, CLUSTER9, CLUSTER10, CLUSTER11, CLUSTER12, CLUSTER13,
CLUSTER14, CLUSTER15, CLUSTER16, OTHERCLUSTERS
}
/** Counters to calculate the position seek times */
public static enum SegmentPosSeekCounter {
MOVEON, MOVEBACK
}
/**
* <p>
* SegmentReader is used to read a segment. It is not used to read the actual data; a
* <i>SegmentReader</i> processes the segment's page meta section and creates the related
* <i>ClusterReader</i>s that read the actual data.
* </p>
*
*/
public static class SegmentReader implements Closeable {
public static enum READMODE {
/**
* During point query, we use a shared global lru page cache
* to reduce the overhead of random access.
*/
POINTQUERY,
/**
* During m/r scan query, we use a simple queue-based page cache
* to reduce the overhead of random access in a segment split.
*/
MR
}
//
Configuration conf;
// stream to read in
private final FSDataInputStream istream;
// read in the page meta section
PageMetaSection pms = null;
// Segment information
long segmentOffset;
long segmentLength;
long segmentPMSOffset;
// Number of Clusters
int numClusters;
long[] clusterOffsets;
// Cache Pool
// pcp just used in m/r mode
PageCache[] pcp;
// pagecache used in POINTQUERY mode
BlockCache pagecache;
// Statistics of Cache Pool
int pageLoads;
int cacheHits;
Map<Integer, ScanMode[]> scanMap = null;
Reporter reporter = null;
boolean withPageMeta;
READMODE mode;
int segId;
/**
* Create the Segment Reader.
*
* @param fs
*          the file system that stores the segment file
* @param file
*          the path of the segment file
* @param buffersize
*          the buffer size used to read the data
* @param segmentOffset
*          the file offset of the segment in the segment file
* @param segmentLength
*          the length of the segment
* @param segmentPMSOffset
*          the file offset of the segment's page meta section
* @throws IOException
*/
public SegmentReader(Configuration conf, FileSystem fs, Path file, int segId, int buffersize,
long segmentOffset, long segmentLength, long segmentPMSOffset) throws IOException {
this(conf, fs, file, segId, buffersize, segmentOffset, segmentLength, segmentPMSOffset, true,
READMODE.MR);
}
public SegmentReader(Configuration conf, FileSystem fs, Path file, int segId, int buffersize,
long segmentOffset, long segmentLength, long segmentPMSOffset,
boolean withPageMeta) throws IOException {
this(conf, fs, file, segId, buffersize, segmentOffset, segmentLength, segmentPMSOffset,
withPageMeta,
READMODE.MR);
}
/**
* Create the segment reader.
*
* @param conf
* the current configuration
* @param fs
* the file system that stores the segment file
* @param file
* the path of the segment file
* @param segId
* the id of the segment
* @param buffersize
* the buffer size used to read the data
* @param segmentOffset
* the file offset of the segment in the segment file
* @param segmentLength
* the length of the segment
* @param segmentPMSOffset
* the file offset of the page meta section of the segment
* @param withPageMeta
* whether to read the max/min page meta
* @param readMode
* the read mode (MR scan or point query)
* @throws IOException
*/
public SegmentReader(Configuration conf, FileSystem fs, Path file, int segId, int buffersize,
long segmentOffset, long segmentLength, long segmentPMSOffset,
boolean withPageMeta, READMODE readMode) throws IOException {
this.conf = conf;
this.istream = fs.open(file, buffersize);
this.segmentOffset = segmentOffset;
this.segmentLength = segmentLength;
this.segmentPMSOffset = segmentPMSOffset;
this.withPageMeta = withPageMeta;
this.mode = readMode;
this.segId = segId;
}
/**
* Initialize an M/R reporter so we can collect statistics about the activities
* of a segment reader during query processing.
*
* @param reporter
* the M/R reporter
*/
public synchronized void initReporter(Reporter reporter) {
this.reporter = reporter;
}
/**
* Load the page meta section.
* Note: this method must be called before any other activity.
*
* @throws IOException
*/
public synchronized void loadPMS() throws IOException {
// move to the offset of page meta section
istream.seek(segmentPMSOffset);
pms = new PageMetaSection(withPageMeta);
pms.readFields(istream);
// once we have the page meta section, we know the number of clusters
numClusters = pms.getPageMetaLists().length;
LOG.info("Load page meta section with " + numClusters + " clusters.");
if (mode == READMODE.MR) {
pcp = new PageCache[numClusters];
for (int i = 0; i < numClusters; i++) {
pcp[i] = new SimplePageCache();
}
} else if (mode == READMODE.POINTQUERY) {
pagecache = getBlockCache(conf);
}
// read in the cluster offsets
clusterOffsets = new long[numClusters];
for (int i = 0; i < numClusters; i++) {
clusterOffsets[i] = istream.readLong();
}
}
public synchronized void buildScanMap(ExprDesc expr, ClusterAccessor[] accessors) {
if (pms == null) {
return;
}
pms.setClusterAccessors(accessors);
this.scanMap = pms.computeScanMap(expr);
}
/**
* Clear the scan map of the last query so the segment reader can be reused
* by subsequent queries.
*/
public synchronized void clearScanMap() {
scanMap = null;
}
/**
* Create a cluster reader to read the actual data.
*
* @param clusterId
* which cluster to read data from
* @param cachePage
* whether to cache pages in the buffer
* @return the cluster reader
*/
public synchronized ClusterReader newClusterReader(int clusterId, boolean cachePage) {
long clusterLength;
if (clusterId == numClusters - 1) {
clusterLength = segmentPMSOffset - clusterOffsets[clusterId];
} else {
clusterLength = clusterOffsets[clusterId + 1] - clusterOffsets[clusterId];
}
ScanMode[] modes = null;
if (scanMap != null) {
modes = scanMap.get(clusterId);
}
return new ClusterReader(this, clusterId, clusterOffsets[clusterId],
clusterLength, pms.getPageMetaLists()[clusterId], modes, cachePage);
}
/**
* Read the page of cluster <code>clusterId</code> that contains the given row position.
*
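* <p>
* A hedged sketch (the cluster id and position are illustrative):
* </p>
*
* <pre>{@code
* // Fetch (and cache) the page of cluster 0 that covers row position 1000.
* ByteBuffer page = segmentReader.readPage(0, 1000, true);
* }</pre>
*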
* @param clusterId
* which cluster to read from
* @param position
* the row position the returned page must contain
* @param cachePage
* whether to cache the page after reading it
* @return the page data wrapped in a ByteBuffer
* @throws IOException
*/
public synchronized ByteBuffer readPage(int clusterId, int position, boolean cachePage)
throws IOException {
PageMetaList pmList = pms.getPageMetaLists()[clusterId];
int pageId = Utils.findTargetPos(pmList.getMetaList(), 0, pmList.getMetaList().size() - 1,
position);
if (pageId < 0) {
throw new IOException("No page in segment " + this.segId + " contains position " + position);
}
long offset = pmList.getOffsetList().get(pageId);
long length;
if (pageId == pmList.getOffsetList().size() - 1) {
long clusterLength;
if (clusterId == numClusters - 1) {
clusterLength = segmentPMSOffset - clusterOffsets[clusterId];
} else {
clusterLength = clusterOffsets[clusterId + 1] - clusterOffsets[clusterId];
}
length = clusterOffsets[clusterId] + clusterLength - offset;
} else {
length = pmList.getOffsetList().get(pageId + 1) - offset;
}
return readPage(clusterId, pageId, offset, length, cachePage);
}
/**
* Read in a file page.
*
* @param clusterId
* which cluster to read
* @param pageId
* which page to read
* @param offset
* the file offset of the page
* @param length
* the length of the page in bytes
* @param cachePage
* whether to cache the page
* @return the page data wrapped in a ByteBuffer
* @throws IOException
*/
synchronized ByteBuffer readPage(int clusterId, int pageId, long offset, long length,
boolean cachePage)
throws IOException {
pageLoads++;
ByteBuffer cachedPage = null;
if (mode == READMODE.MR) {
cachedPage = pcp[clusterId].getPage(pageId);
} else if (mode == READMODE.POINTQUERY) {
cachedPage = pagecache.getBlock(makePageName(segId, clusterId, pageId));
}
if (cachedPage != null) {
cacheHits++;
if (reporter != null) {
if (clusterId < 16) {
reporter.incrCounter(SegmentCacheHitCounter.values()[clusterId], 1);
} else {
reporter.incrCounter(SegmentCacheHitCounter.OTHERCLUSTERS, 1);
}
}
return cachedPage.duplicate();
}
// Report the related activities to the M/R framework so we know
// what happened during query processing.
if (reporter != null) {
if (clusterId < 16) {
reporter.incrCounter(SegmentReadPageCounter.values()[clusterId], 1);
} else {
reporter.incrCounter(SegmentReadPageCounter.OTHERCLUSTERS, 1);
}
if (istream.getPos() - offset > 0) {
reporter.incrCounter(SegmentPosSeekCounter.MOVEBACK, 1);
} else {
reporter.incrCounter(SegmentPosSeekCounter.MOVEON, 1);
}
}
// Read the page from filesystem
InputStream is = new BoundedRangeFileInputStream(istream, offset, length);
ByteBuffer buf = ByteBuffer.allocate(longToInt(length));
IOUtils.readFully(is, buf.array(), 0, buf.capacity());
is.close();
if (cachePage) {
if (mode == READMODE.MR) {
pcp[clusterId].cachePage(pageId, buf.duplicate());
} else if (mode == READMODE.POINTQUERY) {
pagecache.cacheBlock(makePageName(segId, clusterId, pageId), buf.duplicate(), true);
}
}
return buf;
}
@Override
public synchronized void close() throws IOException {
if (istream != null) {
istream.close();
}
}
}
/**
* A <i>ClusterReader</i> handles the read operations
* of a single cluster.
*/
public static class ClusterReader {
private final SegmentReader sr;
private final int clusterId;
private final long clusterOffset;
private final long clusterLength;
private final List<PageMeta> pmList;
private final List<Long> poList;
private final ScanMode[] scanmode;
private boolean cachePage;
int curPageId;
int numPages;
PosRLEChunk prb = new PosRLEChunk();
Reporter reporter = null;
/**
* ClusterReader Constructor
*
* @param sr
* the parent segment reader
* @param clusterId
* which cluster to read
* @param clusterOffset
* the file offset of the cluster
* @param clusterLength
* the length of the cluster
* @param pml
* the pagemeta list of the cluster
* @param sm
* the scan mode of the query
* @param cachePage
* whether to cache pages for random access
*/
public ClusterReader(SegmentReader sr, int clusterId,
long clusterOffset, long clusterLength, PageMetaList pml,
ScanMode[] sm, boolean cachePage) {
this.sr = sr;
this.clusterId = clusterId;
this.clusterOffset = clusterOffset;
this.clusterLength = clusterLength;
this.pmList = pml.getMetaList();
this.poList = pml.getOffsetList();
this.cachePage = cachePage;
scanmode = sm;
numPages = poList.size();
curPageId = 0;
}
public synchronized void setCachePage(boolean cachePage_) {
this.cachePage = cachePage_;
}
public synchronized void initReporter(Reporter reporter) {
this.reporter = reporter;
}
public synchronized int getNumPages() {
return numPages;
}
public synchronized int getCurPageId() {
return curPageId;
}
public synchronized boolean isPageCached() {
return cachePage;
}
/**
* Read the next page (sequentially).
*
* @return the next page data, or null if there are no more pages
* @throws IOException
*/
public synchronized byte[] nextPage() throws IOException {
if (curPageId < numPages) {
long offset = poList.get(curPageId);
long length = curPageId == numPages - 1 ? clusterOffset + clusterLength - offset :
poList.get(curPageId + 1) - offset;
byte[] page = sr.readPage(clusterId, curPageId, offset, length, cachePage).array();
curPageId++;
return page;
} else {
return null;
}
}
/**
* @deprecated
* @param predicate
* @return the page data
* @throws IOException
*/
@Deprecated
public synchronized byte[] nextPage(Predicate predicate) throws IOException {
return null;
}
/**
* Skip to the page that contains the target position.
* This is useful during position filtering.
*
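* <p>
* A hedged sketch (the position value is illustrative):
* </p>
*
* <pre>{@code
* // Jump straight to the page holding row position 1000; pages in between are
* // skipped and reported to the skipped-page counters.
* byte[] page = clusterReader.skipToPosAndGetPage(1000);
* }</pre>
*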
* @param pos
* position
* @return the page data
* @throws IOException
*/
public synchronized byte[] skipToPosAndGetPage(int pos) throws IOException {
if (curPageId >= numPages) {
return null;
}
int skipToIdx = Utils.findTargetPos(pmList, curPageId, numPages - 1, pos);
if (skipToIdx >= 0) {
if (reporter != null && skipToIdx > curPageId) {
if (clusterId < 16) {
reporter.incrCounter(SegmentSkippedPageCounter.values()[clusterId], skipToIdx
- curPageId - 1);
} else {
reporter
.incrCounter(SegmentSkippedPageCounter.OTHERCLUSTERS, skipToIdx - curPageId - 1);
}
}
curPageId = skipToIdx;
}
return nextPage();
}
/**
* Get the last position of the cluster
*
* @return last position
*/
public synchronized int getLastPos() {
PageMeta pm = pmList.get(numPages - 1);
return pm.startPos + pm.numPairs - 1;
}
/**
* <p>
* Read in the next necessary page according to the predicate, skipping all the negative pages: <br>
* 1) if the page is a rough page, return the rough page and its scan mode; <br>
* 2) if the page is a positive page, return the first positive page and its scan mode, and
* also collect the contiguous position range until we encounter a rough/negative page.
* </p>
*
* <p>
* This method is used to produce the position chunk stream.
* </p>
*
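* <p>
* A hedged sketch of the calling convention (both arrays are length-1 slots that
* the method fills in as out-parameters; variable names are illustrative):
* </p>
*
* <pre>{@code
* ScanMode[] mode = new ScanMode[1];
* PosChunk[] blk = new PosChunk[1];
* byte[] page;
* while ((page = cr.nextPredicatePagePos(mode, blk)) != null) {
*   // mode[0] is Rough (per-row checks still needed) or Positive (all rows match);
*   // blk[0] holds the position range covered.
* }
* }</pre>
*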
* @param modes
* the scan mode of the page (output)
* @param blks
* the position range of the predicate pages (output)
* @return the reference to the page
* @throws IOException
*/
public synchronized byte[] nextPredicatePagePos(ScanMode[] modes, PosChunk[] blks)
throws IOException {
if (scanmode == null) {
modes[0] = ScanMode.Rough;
if (curPageId < numPages) {
PageMeta tmpPm = pmList.get(curPageId);
prb.setTriple(null, tmpPm.startPos, tmpPm.numPairs);
blks[0] = prb;
return nextPage();
} else {
return null;
}
}
byte[] page = null;
// skip all the negative pages
int prevPageId = curPageId;
while (curPageId < numPages && scanmode[curPageId] == ScanMode.Negative) {
curPageId++;
}
if (reporter != null && curPageId > prevPageId) {
if (clusterId < 16) {
reporter.incrCounter(SegmentSkippedPageCounter.values()[clusterId], curPageId
- prevPageId - 1);
} else {
reporter.incrCounter(SegmentSkippedPageCounter.OTHERCLUSTERS, curPageId - prevPageId - 1);
}
}
if (curPageId < numPages) {
if (scanmode[curPageId] == ScanMode.Rough) {
modes[0] = ScanMode.Rough;
PageMeta pm = pmList.get(curPageId);
prb.setTriple(null, pm.startPos, pm.numPairs);
blks[0] = prb;
page = nextPage();
} else if (scanmode[curPageId] == ScanMode.Positive) {
modes[0] = ScanMode.Positive;
PageMeta pm = pmList.get(curPageId);
int startPos = pm.startPos;
int numPairs = pm.numPairs;
page = nextPage();
while (curPageId < numPages &&
scanmode[curPageId] == ScanMode.Positive) {
pm = pmList.get(curPageId);
numPairs += pm.numPairs;
curPageId++;
}
prb.setTriple(null, startPos, numPairs);
blks[0] = prb;
} else {
assert (false);
}
}
return page;
}
/**
* Read the next page selected by the predicates, skipping all negative pages.
*
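* <p>
* A hedged sketch (variable names are illustrative):
* </p>
*
* <pre>{@code
* ScanMode[] mode = new ScanMode[1];
* byte[] page;
* while ((page = cr.nextPredicatePageValue(mode)) != null) {
*   if (mode[0] == ScanMode.Positive) {
*     // every row in this page matches; no per-row evaluation needed
*   } else {
*     // rough page: evaluate the predicate per row
*   }
* }
* }</pre>
*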
* @param modes
* the scan mode of the next page
* @return the next page data
* @throws IOException
*/
public synchronized byte[] nextPredicatePageValue(ScanMode[] modes) throws IOException {
if (scanmode == null) {
modes[0] = ScanMode.Rough;
return nextPage();
}
byte[] page = null;
int prevPageId = curPageId;
// skip all the negative pages
while (curPageId < numPages && scanmode[curPageId] == ScanMode.Negative) {
curPageId++;
}
if (reporter != null && curPageId > prevPageId) {
if (clusterId < 16) {
reporter.incrCounter(SegmentSkippedPageCounter.values()[clusterId], curPageId
- prevPageId - 1);
} else {
reporter.incrCounter(SegmentSkippedPageCounter.OTHERCLUSTERS, curPageId - prevPageId - 1);
}
}
if (curPageId < numPages) {
modes[0] = scanmode[curPageId];
page = nextPage();
}
return page;
}
}
// Utility methods.
/**
* @param l Long to convert to an int.
*
* @return <code>l</code> cast as an int.
*/
static synchronized int longToInt(final long l) {
// Expecting the size() of a block not exceeding 4GB. Assuming the
// size() will wrap to negative integer if it exceeds 2GB (From tfile).
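// e.g. longToInt(0x100000001L) returns 1: the high 32 bits are dropped.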
return (int) (l & 0x00000000ffffffffL);
}
/**
* Return the global block cache, creating it lazily on first use.
*
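* <p>
* A hedged sketch (the 20% figure is an illustrative choice):
* </p>
*
* <pre>{@code
* conf.setFloat(SegmentFile.SEGFILE_CACHE_SIZE_KEY, 0.2f); // 20% of the JVM heap
* BlockCache cache = SegmentFile.getBlockCache(conf);
* }</pre>
*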
* @param conf
* The current configuration
* @return the block cache or null
*/
public static synchronized BlockCache getBlockCache(Configuration conf) {
if (globalPageCache != null) {
return globalPageCache;
}
float cachePercentage = conf.getFloat(SEGFILE_CACHE_SIZE_KEY, 0.0f);
if (cachePercentage == 0.0f) {
return null;
}
if (cachePercentage > 1.0) {
throw new IllegalArgumentException(SEGFILE_CACHE_SIZE_KEY +
" must be between 0.0 and 1.0, not > 1.0");
}
// Calculate the amount of heap to give the cache
MemoryUsage mu = ManagementFactory.getMemoryMXBean().getHeapMemoryUsage();
long cacheSize = (long) (mu.getMax() * cachePercentage);
LOG.info("Allocating LruPageCache with maximum size " +
StringUtils.humanReadableInt(cacheSize));
globalPageCache = new LruBlockCache(cacheSize,
MastiffMapReduce.getTablePageSize(conf));
return globalPageCache;
}
static synchronized String makePageName(int segId, int clusterId, int pageId) {
StringBuilder sb = new StringBuilder();
sb.append(segId).append('-').append(clusterId).append('-').append(pageId);
return sb.toString();
}
}