package org.apache.hadoop.hive.mastiffFlexibleEncoding.parquet; /* * this is used for test various encoding from Parquet and ORC file . also used to test some new combinations encodings of basic encoding ways from Parquet and ORC ,which is combineed by wangmeng * @author wangmeng */ import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; public class TestOnlyIntDictionaryRLEBitPackingZIgzar { static java.util.Calendar c=java.util.Calendar.getInstance(); static java.text.SimpleDateFormat f=new java.text.SimpleDateFormat("yyyy年MM月dd日hh时mm分ss秒"); public static int fileLong =0 ,count =0 ,runcount=0 ,DictionarySize=0; public static long startTime ,encodingReadTime ,encodingTime,encodingWriteTime, RencodingReadTime ,RencodingTime,RencodingWriteTime, finalTime; public static long DeltaencodingReadTime=0 ,DeltaencodingTime,DeltaencodingWriteTime=0, DeltaRencodingReadTime=0 ,DeltaRencodingTime=0,DeltaRencodingWriteTime=0; static int[] initValues ; public static void testIntDictionary(String[] s) throws IOException { // System.out.println("intOnlyDictionaryRLEBitPackingZIgzar : begin write to DictionaryRLE : " +new java.text.SimpleDateFormat("yyyy年MM月dd日hh时mm分ss秒").format(java.util.Calendar.getInstance().getTime())); encodingReadTime=System.currentTimeMillis() ; File file=new File(s[0]) ; FileInputStream fis =new FileInputStream(file); DataInputStream dis=new DataInputStream(fis); fileLong=(int) file.length()/4; // int[] initValues=new int[fileLong] ; initValues=new int[fileLong] ; for (int i = 0; i < fileLong; i++) { //cw.writeInteger(dis.readInt()); initValues[i]=dis.readInt() ; } fis.close(); dis.close(); encodingTime=System.currentTimeMillis() ; DeltaencodingReadTime=DeltaencodingReadTime+encodingTime-encodingReadTime ; final OnlyDictionaryValuesWriter.PlainIntegerDictionaryValuesWriter cw = new OnlyDictionaryValuesWriter.PlainIntegerDictionaryValuesWriter(Integer.MAX_VALUE, fileLong); for (int i = 0; i < fileLong; i++) { cw.writeInteger(initValues[i]); } BytesInput sbi=cw.getBytes(); encodingWriteTime=System.currentTimeMillis() ; String[] str =new String[s.length-1]; for(int i=0 ;i<s.length-1;i++){ str[i]=s[i+1]; } // long tmp=System.currentTimeMillis(); // cw.getBytes().writeToDisk(str); // System.out.println("tmp "+tmp); long tmp=cw.WriteDictionaryToDisk(s); long tmp1= sbi.writeToDisk(str); DeltaencodingTime=DeltaencodingTime+encodingWriteTime-encodingTime +tmp+tmp1 ; // System.out.println("time "+(System.currentTimeMillis()-tmp)); RencodingReadTime=System.currentTimeMillis() ; DeltaencodingWriteTime=DeltaencodingWriteTime+RencodingReadTime-encodingWriteTime-tmp-tmp1 ; //System.out.println("intOnlyDictionaryRLEBitPackingZIgzar : finish write to DictionaryRLE : " +new java.text.SimpleDateFormat("yyyy年MM月dd日hh时mm分ss秒").format(java.util.Calendar.getInstance().getTime())); // System.out.println("intDictionaryRLEBitPackingZIgzar: begin read from DictionaryRLE : " +new java.text.SimpleDateFormat("yyyy年MM月dd日hh时mm分ss秒").format(java.util.Calendar.getInstance().getTime())); byte[] bytes=getDictionaryIdBytes(s); byte[] dictionaryByte= getDictionaryBytes(s[3]); RencodingTime=System.currentTimeMillis() ; DeltaRencodingReadTime=DeltaRencodingReadTime+RencodingTime-RencodingReadTime; //DictionaryValuesReader cr = initDicReader(s,PrimitiveType.PrimitiveTypeName.INT32); DictionaryValuesReader cr = initDicReader(DictionarySize,dictionaryByte,PrimitiveType.PrimitiveTypeName.INT32); cr.initFromPage(fileLong, bytes, 0); int[] result=new int[fileLong]; for (int i = 0; i < fileLong; i++) { result[i]= cr.readInteger(); // System.out.println(back); //if(result[i]==initValues[i]){ count++ ; //} // else{ // System.out.println("wrong ......................................................."); // } } RencodingWriteTime=System.currentTimeMillis() ; DeltaRencodingTime=DeltaRencodingTime+RencodingWriteTime-RencodingTime; FileOutputStream revrseencodingFis=new FileOutputStream(new File(s[4])) ; DataOutputStream revrseencodingDos=new DataOutputStream(revrseencodingFis) ; for(int i=0; i < fileLong; ++i) { //int x = (int) in.next(); revrseencodingDos.writeInt(result[i]); } revrseencodingFis.close(); revrseencodingDos.close(); finalTime=System.currentTimeMillis() ; DeltaRencodingWriteTime=DeltaRencodingWriteTime+finalTime-RencodingWriteTime; // System.out.println("total value "+ count); // System.out.println("intOnlyDictionaryRLEBitPackingZIgzar : finish read from DictionaryRLE : " +new java.text.SimpleDateFormat("yyyy年MM月dd日hh时mm分ss秒").format(java.util.Calendar.getInstance().getTime())); } private static byte[] getDictionaryIdBytes(String[] s) throws IOException{ // BytesInput bit=cw.getBytes(); // System.out.println("67 "+bit.size()); // byte[] byt= bit.toByteArray();////inner write data to disk wm File file=new File(s[1]); DataInputStream dis =new DataInputStream(new FileInputStream(file)); byte[] byt=new byte[(int) file.length()]; dis.readFully(byt); dis.close(); return byt; } private static byte[] getDictionaryBytes(String s) throws IOException{ File file =new File(s); DataInputStream dis =new DataInputStream(new FileInputStream(file)); byte[] bytes=new byte[(int) (file.length()-4)]; //System.out.println(file.length()-4); DictionarySize = dis.readInt(); dis.readFully(bytes); dis.close(); return bytes; } private static DictionaryValuesReader initDicReader(int DictionarySize ,byte[] bytes, PrimitiveType.PrimitiveTypeName type) throws IOException { //need know dictionary size(50) dictionaryBytesSize(50*4) dictionary BytsInput wm // final DictionaryPage dictionaryPage = cw.createDictionaryPage().copy(); // new DictionaryPage(dictionaryEncoder.getBytes(), lastUsedDictionarySize, Encoding.PLAIN_DICTIONARY); // final DictionaryPage dictionaryPage = cw.createDictionaryPage().copy() ; // File file =new File(s[3]); // DataInputStream dis =new DataInputStream(new FileInputStream(file)); // byte[] bytes=new byte[(int) (file.length()-4)]; // //System.out.println(file.length()-4); // // int DictionarySize = dis.readInt(); // dis.readFully(bytes); // dis.close(); CapacityByteArrayOutputStream cbs=new CapacityByteArrayOutputStream(bytes.length); cbs.write(bytes, 0, bytes.length); BytesInput bytesInput =new BytesInput.CapacityBAOSBytesInput(cbs) ; // DictionaryPage dictionaryPage =new DictionaryPage(200,50,Encoding.PLAIN_DICTIONARY); DictionaryPage dictionaryPage =new DictionaryPage( bytesInput,DictionarySize,Encoding.PLAIN_DICTIONARY); // System.out.println(bytesInput.toByteArray().length); final ColumnDescriptor descriptor = new ColumnDescriptor(new String[] {"foo"}, type, 0, 0); Encoding encoding=Encoding.PLAIN_DICTIONARY ; final Dictionary dictionary = encoding.initDictionary(descriptor, dictionaryPage); final DictionaryValuesReader cr = new DictionaryValuesReader(dictionary); return cr; } private void roundTripLong(DictionaryValuesWriter cw, ValuesReader reader, int maxDictionaryByteSize) throws IOException { int fallBackThreshold = maxDictionaryByteSize / 8; for (long i = 0; i < 100; i++) { cw.writeLong(i); if (i < fallBackThreshold) { // assertEquals(cw.getEncoding(), PLAIN_DICTIONARY); } else { // assertEquals(cw.getEncoding(), PLAIN); } } reader.initFromPage(100, cw.getBytes().toByteArray(), 0); for (long i = 0; i < 100; i++) { /// assertEquals(i, reader.readLong()); } } // @Test public void testLongDictionaryFallBack() throws IOException { int slabSize = 100; int maxDictionaryByteSize = 50; final DictionaryValuesWriter cw = new DictionaryValuesWriter.PlainLongDictionaryValuesWriter(maxDictionaryByteSize, slabSize); // Fallbacked to Plain encoding, therefore use PlainValuesReader to read it back ValuesReader reader = new PlainValuesReader.LongPlainValuesReader(); roundTripLong(cw, reader, maxDictionaryByteSize); //simulate cutting the page cw.reset(); // assertEquals(0,cw.getBufferedSize()); cw.resetDictionary(); roundTripLong(cw, reader, maxDictionaryByteSize); } public static void main(String[] args) throws IOException { // TODO Auto-generated method stub File sourcefileDirectory =new File(args[0]); File[] sourcefiles= sourcefileDirectory.listFiles() ; int length=sourcefiles.length; String[] sourcestr=new String[length]; String[] encodinstr=new String[length]; String[] dictionaryStr=new String[length]; String[] revrseEncodingstr=new String[length]; for (int i=0;i<sourcefiles.length;i++){ sourcestr[i]=sourcefiles[i].getAbsolutePath() ; encodinstr[i]=args[1]+"/"+i ; dictionaryStr[i]=args[3]+"/"+i ; revrseEncodingstr[i]=args[4]+"/"+i ; // System.out.println(finalstr[i]); } String[] runstr=new String[5] ; runstr[2]=args[2]; startTime=System.currentTimeMillis() ; System.out.println("ORCRLE : begin ORCRLE : " +new java.text.SimpleDateFormat("yyyy年MM月dd日hh时mm分ss秒").format(java.util.Calendar.getInstance().getTime())); for (int i=0;i<length;i++){ runstr[0]=sourcestr[i]; runstr[1]=encodinstr[i] ; runstr[3]=dictionaryStr[i]; runstr[4]=revrseEncodingstr[i] ; testIntDictionary(runstr); runcount++ ; } System.out.println("ORCRLE : finish ORCRLE : " +new java.text.SimpleDateFormat("yyyy年MM月dd日hh时mm分ss秒").format(java.util.Calendar.getInstance().getTime())); System.out.println("totoal run :"+runcount+" times"); System.out.println("totoal valuecount :"+count); //DeltaencodingReadTime=0 ,DeltaencodingTime,DeltaencodingWriteTime=0, DeltaRencodingReadTime=0 ,DeltaRencodingTime=0,DeltaRencodingWriteTime=0; System.out.println("DeltaencodingReadTime :"+DeltaencodingReadTime+" mis"); System.out.println("DeltaencodingTime :"+DeltaencodingTime+" mis"); System.out.println("DeltaencodingWriteTime :"+DeltaencodingWriteTime+" mis"); System.out.println("DeltaRencodingReadTime :"+DeltaRencodingReadTime+" mis"); System.out.println("DeltaRencodingTime :"+DeltaRencodingTime+" mis"); System.out.println("DeltaRencodingWriteTime :"+DeltaRencodingWriteTime+" mis"); System.out.println("total time : "+(finalTime-startTime)+" mis"); long encodingtotalLong=0 , revrsetotalLong=0 ; for(int j=0 ;j<length;j++){ revrsetotalLong= revrsetotalLong+new File(revrseEncodingstr[j]).length(); encodingtotalLong=encodingtotalLong+new File(encodinstr[j]).length()+new File(dictionaryStr[j]).length(); } System.out.println("encodingtotalLong : "+encodingtotalLong+" /1024/1024 "+encodingtotalLong/1024/1024); System.out.println("revrsetotalLong : "+revrsetotalLong+" /1024/1024 "+revrsetotalLong/1024/1024); } }