package com.yahoo.glimmer.util;

import it.unimi.dsi.fastutil.io.BinIO;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.itadaki.bzip2.BZip2BitInputStream;
import org.itadaki.bzip2.BZip2BlockDecompressor;
import org.itadaki.bzip2.BZip2Constants;
import org.itadaki.bzip2.BZip2InputStream;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;

/**
 * Tool for BZip2 .blockOffsets file creation and querying.
 *
 * <p>
 * Depending on the combination of command line options, {@link #main(String[])}
 * either generates a block offsets file from a bzip2 stream, prints
 * information stored in an existing block offsets file, or writes the
 * uncompressed content of a single bzip2 block to stdout.
 *
 * <p>
 * {@link #writeBlockOffsets(InputStream, OutputStream)} keeps scratch state in
 * static fields, so calls to it must not overlap.
 */
public class BZip2BlockOffsetTool {
    private static final String BZ2_FILE_ARG = "bz2-file";
    private static final String OFFSETS_FILE_ARG = "offsets-file";
    private static final String DOC_ID_ARG = "doc-id";
    private static final String BLOCK_INDEX_ARG = "block-index";

    private static final int MAX_DOC_ID_DIGITS = 19; // A long.

    // Scratch state shared between writeBlockOffsets() and the
    // BZip2InputStream.Callback it registers. Reset at the start of every
    // writeBlockOffsets() call.
    private static long lastBlockStartBitOffset;
    private static long lastFirstDocId;
    private static int bytesReadFromBlock;

    /**
     * Command line entry point.
     *
     * <ul>
     * <li>Offsets file only: print the block index for a doc id, the bit
     * offsets of a block index, or (with neither) the whole offsets file.</li>
     * <li>Offsets file, bzip2 file and a doc id or block index: write the
     * uncompressed content of that block to stdout.</li>
     * <li>Otherwise: read a bzip2 stream (file or stdin) and write its block
     * offsets (to a file or stdout).</li>
     * </ul>
     *
     * Exits with status 1 when the command line cannot be parsed or the bzip2
     * file header / block header is not as expected.
     *
     * @param args
     *            the command line arguments.
     * @throws IOException
     *             if reading or writing any of the streams fails.
     * @throws ClassNotFoundException
     *             if the block offsets file cannot be deserialized.
     * @throws JSAPException
     *             if the command line parser cannot be set up.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, JSAPException {
        SimpleJSAP jsap = new SimpleJSAP(BZip2BlockOffsetTool.class.getName(), "Tool for BZip2 .blockOffsets file creation and querying.",
                new Parameter[] {
                        new FlaggedOption(BZ2_FILE_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                                Character.toUpperCase(BZ2_FILE_ARG.charAt(0)), BZ2_FILE_ARG, "The .bz2 file to read. Omition reads from stdin."),
                        new FlaggedOption(OFFSETS_FILE_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                                Character.toUpperCase(OFFSETS_FILE_ARG.charAt(0)), OFFSETS_FILE_ARG,
                                "The .blockOffsets file to read/write. Omition writes to stdout."),
                        new FlaggedOption(DOC_ID_ARG, JSAP.LONG_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, DOC_ID_ARG.charAt(0), DOC_ID_ARG,
                                "The doc id."),
                        new FlaggedOption(BLOCK_INDEX_ARG, JSAP.LONG_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, BLOCK_INDEX_ARG.charAt(0),
                                BLOCK_INDEX_ARG, "The block index.") });

        JSAPResult jsapResult = jsap.parse(args);

        // check whether the command line was valid, and if it wasn't,
        // display usage information and exit.
        if (!jsapResult.success()) {
            printUsage(jsap);
            System.exit(1);
        }

        String bzip2Filename = null;
        if (jsapResult.contains(BZ2_FILE_ARG)) {
            bzip2Filename = jsapResult.getString(BZ2_FILE_ARG);
        }

        String blockOffsetsFilename = null;
        if (jsapResult.contains(OFFSETS_FILE_ARG)) {
            blockOffsetsFilename = jsapResult.getString(OFFSETS_FILE_ARG);
        }

        Long docId = null;
        if (jsapResult.contains(DOC_ID_ARG)) {
            docId = jsapResult.getLong(DOC_ID_ARG);
        }

        Long blockIndex = null;
        if (jsapResult.contains(BLOCK_INDEX_ARG)) {
            blockIndex = jsapResult.getLong(BLOCK_INDEX_ARG);
        }

        if (blockOffsetsFilename != null && bzip2Filename == null) {
            // Query mode: only the offsets file is needed.
            printBlockOffsetsInfo(loadBlockOffsets(blockOffsetsFilename), docId, blockIndex);
            return;
        }

        if (blockOffsetsFilename != null && bzip2Filename != null && (docId != null || blockIndex != null)) {
            // Extract mode: an explicit block index takes precedence over a doc id.
            BlockOffsets blockOffsets = loadBlockOffsets(blockOffsetsFilename);
            if (blockIndex == null) {
                blockIndex = blockOffsets.getBlockIndex(docId);
            }
            writeUncompressedBlock(bzip2Filename, blockOffsets, blockIndex);
            return;
        }

        // Generate mode: build the offsets from the bzip2 stream.
        InputStream input = System.in;
        OutputStream output = System.out;
        try {
            if (bzip2Filename != null) {
                System.err.println("Reading bzip2 stream from " + bzip2Filename);
                input = new FileInputStream(bzip2Filename);
            }
            if (blockOffsetsFilename != null) {
                System.err.println("Writting block offsets to " + blockOffsetsFilename);
                output = new FileOutputStream(blockOffsetsFilename);
            }
            writeBlockOffsets(input, output);
        } finally {
            // Only close streams this method opened; stdin/stdout stay open.
            try {
                if (input != System.in) {
                    input.close();
                }
            } finally {
                if (output != System.out) {
                    output.close();
                }
            }
        }
    }

    /** Prints the parser usage to stderr and example invocations to stdout. */
    private static void printUsage(SimpleJSAP jsap) {
        final String toolName = BZip2BlockOffsetTool.class.getCanonicalName();

        System.err.println();
        System.err.println("Usage: java " + BZip2BlockOffsetTool.class.getName());
        System.err.println(" " + jsap.getUsage());
        System.err.println();

        System.out.println("To generate block offsets files:");
        System.out.println(" " + toolName + " < bySubject.bz2 > bySubject.blockOffsets");
        System.out.println(" or " + toolName + " -B bySubject.bz2 > bySubject.blockOffsets");
        System.out.println(" or " + toolName + " -B bySubject.bz2 -O bySubject.blockOffsets");
        System.out.println("To dump all block offsets from file:");
        System.out.println(" " + toolName + " -O bySubject.blockOffsets");
        System.out.println("To get the block index for a doc id:");
        System.out.println(" " + toolName + " -O bySubject.blockOffsets -d <doc id>");
        System.out.println("To get the block start and end bit offsets by block index:");
        System.out.println(" " + toolName + " -O bySubject.blockOffsets -b <block index>");
        System.out.println("To get the uncompressed block content by doc id:");
        System.out.println(" " + toolName + " -B bySubject.bz2 -O bySubject.blockOffsets -d <doc id>");
        System.out.println("To get the uncompressed block content by block index:");
        System.out.println(" " + toolName + " -B bySubject.bz2 -O bySubject.blockOffsets -b <block index>");
    }

    /** Deserializes a BlockOffsets object from the given file, closing the file afterwards. */
    private static BlockOffsets loadBlockOffsets(String blockOffsetsFilename) throws IOException, ClassNotFoundException {
        System.err.println("Reading offsets from " + blockOffsetsFilename);
        FileInputStream offsetsInputStream = new FileInputStream(blockOffsetsFilename);
        try {
            return (BlockOffsets) BinIO.loadObject(offsetsInputStream);
        } finally {
            offsetsInputStream.close();
        }
    }

    /**
     * Prints to stdout the block index of docId if given, else the bit offsets
     * of blockIndex if given, else the whole offsets object.
     */
    private static void printBlockOffsetsInfo(BlockOffsets blockOffsets, Long docId, Long blockIndex) throws IOException {
        if (docId != null) {
            Long docBlockIndex = blockOffsets.getBlockIndex(docId);
            System.out.println("The block index for Doc ID " + docId + " is " + docBlockIndex);
        } else if (blockIndex != null) {
            long startBitOffset = blockOffsets.getBlockStartBitOffset(blockIndex);
            // The block ends one bit before the following block starts.
            long endBitOffset = blockOffsets.getBlockStartBitOffset(blockIndex + 1) - 1;
            System.out.println("The block start and end bit offsets for block index " + blockIndex + " are " + startBitOffset + " and " + endBitOffset);
        } else {
            blockOffsets.printTo(System.out);
        }
    }

    /**
     * Decompresses the single bzip2 block with the given index and writes its
     * content to stdout. Exits with status 1 if the file header or the block
     * header is invalid.
     */
    private static void writeUncompressedBlock(String bzip2Filename, BlockOffsets blockOffsets, Long blockIndex) throws IOException {
        long blockStartBitOffset = blockOffsets.getBlockStartBitOffset(blockIndex);

        FileInputStream fileInputStream = new FileInputStream(bzip2Filename);
        try {
            if (fileInputStream.read() != 'B' || fileInputStream.read() != 'Z' || fileInputStream.read() != 'h') {
                System.err.println("The given bzip2 file doesn't have a BZip2 header.");
                System.exit(1);
            }
            // The fourth header byte is the block size as an ASCII digit.
            int blockSize = fileInputStream.read() - '0';
            if (blockSize < 1 || blockSize > 9) {
                System.err.println("Invalid blocksize in the given bzip2 file. " + blockSize);
                System.exit(1);
            }

            // Four header bytes have already been consumed. Move to the byte
            // containing the first bit of the block, then consume the
            // remaining bits within that byte.
            skipFully(fileInputStream, (blockStartBitOffset / 8) - 4);
            BZip2BitInputStream bZip2BitInputStream = new BZip2BitInputStream(fileInputStream);
            bZip2BitInputStream.readBits((int) (blockStartBitOffset % 8));

            final int marker1 = bZip2BitInputStream.readBits(24);
            final int marker2 = bZip2BitInputStream.readBits(24);
            if (marker1 != BZip2Constants.BLOCK_HEADER_MARKER_1 || marker2 != BZip2Constants.BLOCK_HEADER_MARKER_2) {
                System.err.println("Invalid Bzip2 block header at bit offset " + blockStartBitOffset);
                System.exit(1);
            }

            BZip2BlockDecompressor bZip2BlockDecompressor = new BZip2BlockDecompressor(bZip2BitInputStream, blockSize * 100000);
            int b;
            while ((b = bZip2BlockDecompressor.read()) != -1) {
                System.out.write(b);
            }
            System.out.flush();
        } finally {
            fileInputStream.close();
        }
    }

    /**
     * Skips exactly byteCount bytes. InputStream.skip() is allowed to skip
     * fewer bytes than requested, so it is retried, falling back to read()
     * to tell a short skip from the end of the stream.
     */
    private static void skipFully(InputStream in, long byteCount) throws IOException {
        long remaining = byteCount;
        while (remaining > 0) {
            long skipped = in.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (in.read() != -1) {
                remaining--;
            } else {
                throw new IOException("Unexpected end of stream with " + remaining + " bytes still to skip.");
            }
        }
    }

    /**
     * Reads a bzip2 compressed stream of records, builds the block offsets for
     * it, prints them to stderr and saves them to the given output.
     *
     * <p>
     * The uncompressed data is scanned byte by byte. Each record is expected
     * to start with a decimal doc id terminated by
     * BySubjectRecord.FIELD_DELIMITER, records are separated by
     * BySubjectRecord.RECORD_DELIMITER, and doc ids must not decrease.
     *
     * <p>
     * Uses static scratch fields, so calls must not overlap.
     *
     * @param input
     *            the bzip2 compressed stream to read. The decompressing
     *            wrapper around it is closed before this method returns.
     * @param output
     *            where the block offsets are saved. It is flushed but not
     *            explicitly closed by this method.
     * @throws IOException
     *             if reading or writing fails.
     * @throws IllegalArgumentException
     *             if doc ids decrease or a doc id of 0 starts a block.
     * @throws RuntimeException
     *             if a doc id is too long, is non-numeric, or a record
     *             delimiter appears before the doc id is terminated.
     */
    public static void writeBlockOffsets(InputStream input, OutputStream output) throws IOException {
        // Start from a clean slate so that a previous call in the same JVM
        // cannot leak its state into this one.
        lastBlockStartBitOffset = 0;
        lastFirstDocId = 0;
        bytesReadFromBlock = 0;

        final BlockOffsets.Builder blockOffsetsBuilder = new BlockOffsets.Builder();

        BZip2InputStream uncompressedInputStream = new BZip2InputStream(input, false, new BZip2InputStream.Callback() {
            @Override
            public void blockStart(long blockStartBitOffset) {
                if (lastBlockStartBitOffset != 0) {
                    // No doc id was completed in the previous block, so it is
                    // registered with the most recent first doc id.
                    if (lastFirstDocId == 0) {
                        throw new IllegalArgumentException("lastFirstDocId is 0.");
                    }
                    blockOffsetsBuilder.setBlockStart(lastBlockStartBitOffset, lastFirstDocId);
                }
                lastBlockStartBitOffset = blockStartBitOffset;
                bytesReadFromBlock = 0;
            }

            @Override
            public void noMoreBlocks(long totalBitsRead) {
                blockOffsetsBuilder.close(totalBitsRead);
            }
        });

        long docCount = 0;
        long lastDocId = 0;
        // Number of doc id digits read so far in the current record, or -1
        // once the doc id has been terminated by a field delimiter.
        int docIdDigitIndex = 0;
        long docId = 0L;

        System.err.println("Processing record 0");

        try {
            int b;
            while ((b = uncompressedInputStream.read()) != -1) {
                bytesReadFromBlock++;
                if (b == BySubjectRecord.RECORD_DELIMITER) {
                    if (docIdDigitIndex != -1) {
                        throw new RuntimeException("Got unexpected RECORD_START in record " + docCount);
                    }
                    docIdDigitIndex = 0;
                    docId = 0L;
                    docCount++;
                    if (docCount % 100000 == 0) {
                        System.err.println("Processing doc " + docCount);
                    }
                } else if (docIdDigitIndex >= MAX_DOC_ID_DIGITS) {
                    throw new RuntimeException("Doc ID longer than " + MAX_DOC_ID_DIGITS + " at record " + docCount);
                } else if (docIdDigitIndex >= 0) {
                    if (b == BySubjectRecord.FIELD_DELIMITER) {
                        if (lastDocId > docId) {
                            throw new IllegalArgumentException("lastDocId(" + lastDocId + ") is greater that the current docId(" + docId + ")");
                        }
                        // Register the pending block only if more bytes than
                        // this doc id's digits have been read from it.
                        if (lastBlockStartBitOffset != 0 && bytesReadFromBlock > docIdDigitIndex) {
                            if (docId == 0) {
                                throw new IllegalArgumentException("docId is 0.");
                            }
                            blockOffsetsBuilder.setBlockStart(lastBlockStartBitOffset, docId);
                            lastBlockStartBitOffset = 0;
                            lastFirstDocId = docId;
                        }
                        lastDocId = docId;
                        docIdDigitIndex = -1;
                    } else if (b >= '0' && b <= '9') {
                        docId = docId * 10 + (b - '0');
                        docIdDigitIndex++;
                    } else {
                        throw new RuntimeException("Non-numeric in Doc Id at record " + docCount);
                    }
                }
            }
        } finally {
            // Closed on both the normal and the error path.
            uncompressedInputStream.close();
        }

        BlockOffsets blockOffsets = blockOffsetsBuilder.build(docCount, lastDocId);
        blockOffsets.printTo(System.err);
        blockOffsets.save(output);
        // Flush the stream that was actually written to.
        output.flush();
    }
}