package org.commoncrawl.util; import java.io.BufferedOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.LinkedList; import java.util.concurrent.Semaphore; import java.util.zip.Deflater; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.commoncrawl.async.EventLoop; import org.commoncrawl.io.NIOBufferList; import org.commoncrawl.io.NIOHttpConnection; import org.commoncrawl.protocol.shared.ArcFileHeaderItem; import org.commoncrawl.protocol.shared.ArcFileItem; /** * Utility class used to retrieve a specific document from an ArcFile stored on S3 * * @author rana * */ @SuppressWarnings("static-access") public class ArcFileItemFetcher { private static final Log LOG = LogFactory.getLog(ArcFileItemFetcher.class); public static ArcFileItem retrieveItem(String awsKey,String awsSecret,String awsBucket,String arcFilePath,long arcFileOffset,int arcItemSize) throws IOException { EventLoop eventLoop = new EventLoop(); eventLoop.start(); ArcFileItem itemOut = null; try { itemOut = retrieveItem(eventLoop, awsKey, awsSecret, awsBucket, arcFilePath, arcFileOffset, arcItemSize); } finally { eventLoop.stop(); } return itemOut; } public static ArcFileItem retrieveItem(EventLoop eventLoop,String awsKey,String awsSecret,String awsBucket,String arcFilePath,long arcFileOffset,int arcItemSize) throws IOException { S3Downloader downloader = new S3Downloader(awsBucket,awsKey,awsSecret,false); // now activate the segment log ... final Semaphore downloadCompleteSemaphore = new Semaphore(0); final StreamingArcFileReader arcFileReader = new StreamingArcFileReader(false); //arcFileReader.setArcFileHasHeaderItemFlag(false); // create a buffer list we will append incoming content into ... final LinkedList<ByteBuffer> bufferList = new LinkedList<ByteBuffer>(); downloader.initialize(new S3Downloader.Callback() { @Override public boolean contentAvailable(NIOHttpConnection connection,int itemId, String itemKey,NIOBufferList contentBuffer) { LOG.info("ContentQuery contentAvailable called for Item:" + itemKey + " totalBytesAvailable:" + contentBuffer.available()); try { while (contentBuffer.available() != 0) { bufferList.add(contentBuffer.read()); } return true; } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); return false; } } @Override public void downloadComplete(NIOHttpConnection connection,int itemId, String itemKey) { LOG.info("S3 Download Complete for item:" + itemKey); downloadCompleteSemaphore.release(); } @Override public void downloadFailed(NIOHttpConnection connection,int itemId, String itemKey, String errorCode) { LOG.info("S3 Download Failed for item:" + itemKey); downloadCompleteSemaphore.release(); } @Override public boolean downloadStarting(NIOHttpConnection connection,int itemId, String itemKey,long contentLength) { LOG.info("ContentQuery DownloadStarting for Item:" + itemKey + " contentLength:" + contentLength); return true; } },eventLoop); LOG.info("Starting request for Item:" + arcFilePath); //TODO: FIX RANGE OFFSET TO BE A LONG IN DOWNLOADER downloader.fetchPartialItem(arcFilePath, (int)arcFileOffset, arcItemSize); downloadCompleteSemaphore.acquireUninterruptibly(); if (bufferList.size() == 0) { return null; } ByteBuffer firstBuffer = bufferList.getFirst(); if (firstBuffer != null) { int offsetToGZIPHeader = scanForGZIPHeader(firstBuffer.duplicate()); if (offsetToGZIPHeader != -1) { firstBuffer.position(offsetToGZIPHeader); LOG.info("*** Offset to GZIP Header:" + offsetToGZIPHeader); } else { LOG.error("*** Failed to find GZIP Header offset"); } } // now try to decode content if possible for (ByteBuffer buffer : bufferList) { LOG.info("Adding Buffer of Size:" + buffer.remaining() + " Position:" + buffer.position() + " Limit:" + buffer.limit()); arcFileReader.available(buffer); } ArcFileItem item = arcFileReader.getNextItem(); if (item != null) { LOG.info("Request Returned item:" + item.getUri()); LOG.info("Uncompressed Size:" + item.getContent().getCount()); } return item; } static int scanForGZIPHeader(ByteBuffer byteBuffer) throws IOException { LOG.info("*** SCANNING FOR GZIP MAGIC Bytes:" + Byte.toString((byte)StreamingArcFileReader.GZIP_MAGIC) + " " + Byte.toString((byte)(StreamingArcFileReader.GZIP_MAGIC >> 8)) + " BufferSize is:" + byteBuffer.limit() + " Remaining:" + byteBuffer.remaining()); int limit = byteBuffer.limit(); while (byteBuffer.position() + 2 < limit) { //LOG.info("Reading Byte At:"+ byteBuffer.position()); int b = byteBuffer.get(); //LOG.info("First Byte is:"+ b); if (b == (byte)(StreamingArcFileReader.GZIP_MAGIC)) { byteBuffer.mark(); byte b2 = byteBuffer.get(); //LOG.info("Second Byte is:"+ b2); if (b2 == (byte)(StreamingArcFileReader.GZIP_MAGIC >> 8)) { byte b3 = byteBuffer.get(); if (b3 == Deflater.DEFLATED) { LOG.info("Found GZip Magic at:" + (byteBuffer.position() - 3)); return byteBuffer.position() - 3; } } byteBuffer.reset(); } } LOG.error("Failed to Find GZIP Magic!!"); //LOG.error(Arrays.toString(byteBuffer.array())); return -1; } static Options options = new Options(); static { options.addOption( OptionBuilder.withArgName("awsKey").hasArg().withDescription("AWS Key").isRequired().create("awsKey")); options.addOption( OptionBuilder.withArgName("awsSecret").hasArg().withDescription("AWS Secret").isRequired().create("awsSecret")); options.addOption( OptionBuilder.withArgName("bucket").hasArg().withDescription("S3 bucket name").isRequired().create("bucket")); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("S3 path prefix").isRequired().create("path")); options.addOption( OptionBuilder.withArgName("offset").hasArg().withDescription("Offset of item in file").isRequired().create("offset")); options.addOption( OptionBuilder.withArgName("length").hasArg().withDescription("Compressed length of item in file").isRequired().create("length")); } static void printUsage() { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp( "ArcFileItemFetcher", options ); } public static void main(String[] args) { CommandLineParser parser = new GnuParser(); try { // parse the command line arguments CommandLine cmdLine = parser.parse( options, args ); try { ArcFileItem item = retrieveItem( cmdLine.getOptionValue("awsKey"), cmdLine.getOptionValue("awsSecret"), cmdLine.getOptionValue("bucket"), cmdLine.getOptionValue("path"), Integer.parseInt(cmdLine.getOptionValue("offset")), Integer.parseInt(cmdLine.getOptionValue("length"))); if (item != null) { OutputStreamWriter writer = new OutputStreamWriter(System.out, Charset.forName("UTF-8")); try { writer.write(item.getUri()); writer.write('\n'); for (ArcFileHeaderItem header : item.getHeaderItems()) { writer.write(header.getItemKey()+":"+header.getItemValue()); writer.write("\r\n"); } writer.write("\r\n"); writer.flush(); System.out.write(item.getContent().getReadOnlyBytes(),item.getContent().getOffset(),item.getContent().getCount()); System.out.flush(); System.exit(0); } finally { writer.close(); } } else { System.err.println("Unable to retrieve/decode item!"); System.exit(1); } } catch (IOException e) { LOG.error("Failed to retrieve item with Error:" + CCStringUtils.stringifyException(e)); } } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); printUsage(); } } }