package me.lemire.integercompression.benchmarktools; import me.lemire.integercompression.*; import me.lemire.integercompression.differential.IntegratedBinaryPacking; import me.lemire.integercompression.differential.IntegratedByteIntegerCODEC; import me.lemire.integercompression.differential.IntegratedComposition; import me.lemire.integercompression.differential.IntegratedIntegerCODEC; import me.lemire.integercompression.differential.IntegratedVariableByte; import java.io.*; import java.util.ArrayList; import java.util.Arrays; /** * This will run benchmarks using a set of posting lists stored as CSV files. * * @author lemire * */ public class BenchmarkCSV { static IntegratedIntegerCODEC codecs[] = { new IntegratedComposition(new IntegratedBinaryPacking(), new IntegratedVariableByte()) }; static IntegratedByteIntegerCODEC bcodecs[] = { new IntegratedVariableByte() }; static IntegerCODEC regcodecs[] = { new Composition(new FastPFOR128(), new VariableByte()), new Composition(new FastPFOR(), new VariableByte()), new Composition(new BinaryPacking(), new VariableByte()) }; static ByteIntegerCODEC regbcodecs[] = { new VariableByte() }; private static ArrayList<int[]> loadIntegers(final String filename, final Format f) throws IOException { int misparsed = 0; if (f == Format.ONEARRAYPERLINE) { ArrayList<int[]> answer = new ArrayList<int[]>(); BufferedReader br = new BufferedReader(new FileReader( filename)); String s; while ((s = br.readLine()) != null) { String[] numbers = s.split("[,;;]"); // that's // slow int[] a = new int[numbers.length]; for (int k = 0; k < numbers.length; ++k) { try { a[k] = Integer .parseInt(numbers[k] .trim()); } catch (java.lang.NumberFormatException nfe) { if (misparsed == 0) System.err.println(nfe); ++misparsed; } } answer.add(a); } if (misparsed > 0) System.out.println("Failed to parse " + misparsed + " entries"); br.close(); return answer; } else if (f == Format.ONEARRAYPERFILE) { ArrayList<Integer> answer = new ArrayList<Integer>(); BufferedReader br = new BufferedReader(new FileReader( filename)); String s; while ((s = br.readLine()) != null) { String[] numbers = s.split("[,;;]");// that's // slow for (int k = 0; k < numbers.length; ++k) { try { answer.add(Integer .parseInt(numbers[k] .trim())); } catch (java.lang.NumberFormatException nfe) { if (misparsed == 0) System.err.println(nfe); ++misparsed; } } } int[] actualanswer = new int[answer.size()]; for (int i = 0; i < answer.size(); ++i) actualanswer[i] = answer.get(i); ArrayList<int[]> wrap = new ArrayList<int[]>(); wrap.add(actualanswer); if (misparsed > 0) System.out.println("Failed to parse " + misparsed + " entries"); br.close(); return wrap; } else { ArrayList<Integer> answer = new ArrayList<Integer>(); BufferedReader br = new BufferedReader(new FileReader( filename)); String s; while ((s = br.readLine()) != null) { try { answer.add(Integer.parseInt(s.trim())); } catch (java.lang.NumberFormatException nfe) { if (misparsed == 0) System.err.println(nfe); ++misparsed; } } int[] actualanswer = new int[answer.size()]; for (int i = 0; i < answer.size(); ++i) actualanswer[i] = answer.get(i); ArrayList<int[]> wrap = new ArrayList<int[]>(); wrap.add(actualanswer); if (misparsed > 0) System.out.println("Failed to parse " + misparsed + " entries"); br.close(); return wrap; } } private enum Format { ONEARRAYPERLINE, ONEARRAYPERFILE, ONEINTPERLINE } private enum CompressionMode { AS_IS, DELTA } /** * @param args command-line arguments * @throws IOException when some IO error occurs */ public static void main(final String[] args) throws IOException { Format myformat = Format.ONEARRAYPERLINE; CompressionMode cm = CompressionMode.DELTA; ArrayList<String> files = new ArrayList<String>(); for (String s : args) { if (s.startsWith("-")) {// it is a flag if (s.equals("--onearrayperfile")) myformat = Format.ONEARRAYPERFILE; else if (s.equals("--nodelta")) cm = CompressionMode.AS_IS; else if (s.equals("--oneintperline")) myformat = Format.ONEINTPERLINE; else throw new RuntimeException( "I don't understand: " + s); } else {// it is a filename files.add(s); } } if (myformat == Format.ONEARRAYPERFILE) System.out.println("Treating each file as one array."); else if (myformat == Format.ONEARRAYPERLINE) System.out .println("Each line of each file is an array: use --onearrayperfile or --oneintperline to change."); else if (myformat == Format.ONEINTPERLINE) System.out .println("Treating each file as one array, with one integer per line."); if (cm == CompressionMode.AS_IS) System.out .println("Compressing the integers 'as is' (no differential coding)"); else System.out .println("Using differential coding (arrays will be sorted): use --nodelta to prevent sorting"); ArrayList<int[]> data = new ArrayList<int[]>(); for (String fn : files) for (int[] x : loadIntegers(fn, myformat)) data.add(x); System.out.println("Loaded " + data.size() + " array(s)"); if (cm == CompressionMode.DELTA) { System.out .println("Sorting the arrray(s) because you are using differential coding"); for (int[] x : data) Arrays.sort(x); } bench(data, cm, false); bench(data, cm, false); bench(data, cm, true); bytebench(data, cm, false); bytebench(data, cm, false); bytebench(data, cm, true); } private static void bench(ArrayList<int[]> postings, CompressionMode cm, boolean verbose) { int maxlength = 0; for (int[] x : postings) if (maxlength < x.length) maxlength = x.length; if (verbose) System.out.println("Max array length: " + maxlength); int[] compbuffer = new int[2 * maxlength + 1024]; int[] decompbuffer = new int[maxlength]; if (verbose) System.out.println("Scheme -- bits/int -- speed (mis)"); for (IntegerCODEC c : (cm == CompressionMode.DELTA ? codecs : regcodecs)) { long bef = 0; long aft = 0; long decomptime = 0; long volumein = 0; long volumeout = 0; int[][] compdata = new int[postings.size()][]; for (int k = 0; k < postings.size(); ++k) { int[] in = postings.get(k); IntWrapper inpos = new IntWrapper(0); IntWrapper outpos = new IntWrapper(0); c.compress(in, inpos, in.length, compbuffer, outpos); int clength = outpos.get(); inpos = new IntWrapper(0); outpos = new IntWrapper(0); c.uncompress(compbuffer, inpos, clength, decompbuffer, outpos); volumein += in.length; volumeout += clength; if (outpos.get() != in.length) throw new RuntimeException("bug"); for (int z = 0; z < in.length; ++z) if (in[z] != decompbuffer[z]) throw new RuntimeException( "bug"); compdata[k] = Arrays .copyOf(compbuffer, clength); } bef = System.nanoTime(); for (int[] cin : compdata) { IntWrapper inpos = new IntWrapper(0); IntWrapper outpos = new IntWrapper(0); c.uncompress(cin, inpos, cin.length, decompbuffer, outpos); if (inpos.get() != cin.length) throw new RuntimeException("bug"); } aft = System.nanoTime(); decomptime += (aft - bef); double bitsPerInt = volumeout * 32.0 / volumein; double decompressSpeed = volumein * 1000.0 / (decomptime); if (verbose) System.out.println(c.toString() + "\t" + String.format("\t%1$.2f\t%2$.2f", bitsPerInt, decompressSpeed)); } } private static void bytebench(ArrayList<int[]> postings, CompressionMode cm, boolean verbose) { int maxlength = 0; for (int[] x : postings) if (maxlength < x.length) maxlength = x.length; if (verbose) System.out.println("Max array length: " + maxlength); byte[] compbuffer = new byte[6 * (maxlength + 1024)]; int[] decompbuffer = new int[maxlength]; if (verbose) System.out.println("Scheme -- bits/int -- speed (mis)"); for (ByteIntegerCODEC c : (cm == CompressionMode.DELTA ? bcodecs : regbcodecs)) { long bef = 0; long aft = 0; long decomptime = 0; long volumein = 0; long volumeout = 0; byte[][] compdata = new byte[postings.size()][]; for (int k = 0; k < postings.size(); ++k) { int[] in = postings.get(k); IntWrapper inpos = new IntWrapper(0); IntWrapper outpos = new IntWrapper(0); c.compress(in, inpos, in.length, compbuffer, outpos); int clength = outpos.get(); inpos = new IntWrapper(0); outpos = new IntWrapper(0); c.uncompress(compbuffer, inpos, clength, decompbuffer, outpos); volumein += in.length; volumeout += clength; if (outpos.get() != in.length) throw new RuntimeException("bug"); for (int z = 0; z < in.length; ++z) if (in[z] != decompbuffer[z]) throw new RuntimeException( "bug"); compdata[k] = Arrays .copyOf(compbuffer, clength); } bef = System.nanoTime(); for (byte[] cin : compdata) { IntWrapper inpos = new IntWrapper(0); IntWrapper outpos = new IntWrapper(0); c.uncompress(cin, inpos, cin.length, decompbuffer, outpos); if (inpos.get() != cin.length) throw new RuntimeException("bug"); } aft = System.nanoTime(); decomptime += (aft - bef); double bitsPerInt = volumeout * 8.0 / volumein; double decompressSpeed = volumein * 1000.0 / (decomptime); if (verbose) System.out.println(c.toString() + "\t" + String.format("\t%1$.2f\t%2$.2f", bitsPerInt, decompressSpeed)); } } }