/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.queryserver.master;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.LinkedList;
import java.util.concurrent.Semaphore;
import java.util.zip.Deflater;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.io.NIOBufferList;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.protocol.ArchiveInfo;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.S3Downloader;
import org.commoncrawl.util.StreamingArcFileReader;
public class S3Helper {
private static final Log LOG = LogFactory.getLog(S3Helper.class);
private static SimpleDateFormat S3_TIMESTAMP_FORMAT = new SimpleDateFormat("yyyy/MM/dd/");
private static String hdfsNameToS3ArcFileName(long arcFileDate,int arcFilePartNo) {
String arcFileName = Long.toString(arcFileDate) + "_" + arcFilePartNo + ".arc.gz";
synchronized (S3_TIMESTAMP_FORMAT) {
return S3_TIMESTAMP_FORMAT.format(new Date(arcFileDate)) +arcFilePartNo + "/" + arcFileName;
}
}
public static ArcFileItem retrieveArcFileItem(ArchiveInfo archiveInfo,EventLoop eventLoop) throws IOException {
// the default bucket id
String bucketId = "commoncrawl-crawl-002";
//ok, see if we need to switch buckets
if (archiveInfo.getCrawlNumber() == 1) {
bucketId = "commoncrawl";
}
S3Downloader downloader = new S3Downloader(bucketId,"","",false);
// now activate the segment log ...
final Semaphore downloadCompleteSemaphore = new Semaphore(0);
final StreamingArcFileReader arcFileReader = new StreamingArcFileReader(false);
//arcFileReader.setArcFileHasHeaderItemFlag(false);
// create a buffer list we will append incoming content into ...
final LinkedList<ByteBuffer> bufferList = new LinkedList<ByteBuffer>();
downloader.initialize(new S3Downloader.Callback() {
@Override
public boolean contentAvailable(NIOHttpConnection connection,int itemId, String itemKey,NIOBufferList contentBuffer) {
LOG.info("ContentQuery contentAvailable called for Item:" + itemKey + " totalBytesAvailable:" + contentBuffer.available());
try {
while (contentBuffer.available() != 0) {
bufferList.add(contentBuffer.read());
}
return true;
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
return false;
}
}
@Override
public void downloadComplete(NIOHttpConnection connection,int itemId, String itemKey) {
LOG.info("S3 Download Complete for item:" + itemKey);
downloadCompleteSemaphore.release();
}
@Override
public void downloadFailed(NIOHttpConnection connection,int itemId, String itemKey, String errorCode) {
LOG.info("S3 Download Failed for item:" + itemKey);
downloadCompleteSemaphore.release();
}
@Override
public boolean downloadStarting(NIOHttpConnection connection,int itemId, String itemKey,long contentLength) {
LOG.info("ContentQuery DownloadStarting for Item:" + itemKey + " contentLength:" + contentLength);
return true;
}
},eventLoop);
LOG.info("Starting request for Item:" + hdfsNameToS3ArcFileName(archiveInfo.getArcfileDate(),archiveInfo.getArcfileIndex()) + " Offset:" + archiveInfo.getArcfileOffset());
int sizeToRetrieve = (archiveInfo.getCompressedSize() != 0) ? archiveInfo.getCompressedSize() : 30000;
sizeToRetrieve += 10;
downloader.fetchPartialItem(hdfsNameToS3ArcFileName(archiveInfo.getArcfileDate(),archiveInfo.getArcfileIndex()), archiveInfo.getArcfileOffset()-10, sizeToRetrieve);
downloadCompleteSemaphore.acquireUninterruptibly();
if (bufferList.size() == 0) {
return null;
}
ByteBuffer firstBuffer = bufferList.getFirst();
if (firstBuffer != null) {
int offsetToGZIPHeader = scanForGZIPHeader(firstBuffer.duplicate());
if (offsetToGZIPHeader != -1) {
firstBuffer.position(offsetToGZIPHeader);
LOG.info("*** Offset to GZIP Header:" + offsetToGZIPHeader);
}
else {
LOG.error("*** Failed to find GZIP Header offset");
}
}
// now try to decode content if possible
for (ByteBuffer buffer : bufferList) {
LOG.info("Adding Buffer of Size:" + buffer.remaining() + " Position:" + buffer.position() + " Limit:" + buffer.limit());
arcFileReader.available(buffer);
}
ArcFileItem item = arcFileReader.getNextItem();
if (item != null) {
LOG.info("Request Returned item:" + item.getUri());
LOG.info("Uncompressed Size:" + item.getContent().getCount());
}
return item;
}
static int scanForGZIPHeader(ByteBuffer byteBuffer) throws IOException {
LOG.info("*** SCANNING FOR GZIP MAGIC Bytes:" + Byte.toString((byte)StreamingArcFileReader.GZIP_MAGIC) + " " + Byte.toString((byte)(StreamingArcFileReader.GZIP_MAGIC >> 8)) + " BufferSize is:" + byteBuffer.limit() + " Remaining:" + byteBuffer.remaining());
int limit = byteBuffer.limit();
while (byteBuffer.position() + 2 < limit) {
//LOG.info("Reading Byte At:"+ byteBuffer.position());
int b = byteBuffer.get();
//LOG.info("First Byte is:"+ b);
if (b == (byte)(StreamingArcFileReader.GZIP_MAGIC)) {
byteBuffer.mark();
byte b2 = byteBuffer.get();
//LOG.info("Second Byte is:"+ b2);
if (b2 == (byte)(StreamingArcFileReader.GZIP_MAGIC >> 8)) {
byte b3 = byteBuffer.get();
if (b3 == Deflater.DEFLATED) {
LOG.info("Found GZip Magic at:" + (byteBuffer.position() - 3));
return byteBuffer.position() - 3;
}
}
byteBuffer.reset();
}
}
LOG.error("Failed to Find GZIP Magic!!");
//LOG.error(Arrays.toString(byteBuffer.array()));
return -1;
}
public static void main(String[] args) {
File arcFile = new File(args[0]);
long offset = Long.parseLong(args[1]);
long contentSize = Long.parseLong(args[2]);
try {
RandomAccessFile fileHandle = new RandomAccessFile(arcFile, "r");
fileHandle.seek(Math.max(offset - 10,0));
byte data[] = new byte[(int)contentSize + 10];
fileHandle.readFully(data);
ByteBuffer buffer = ByteBuffer.wrap(data);
buffer.position(0);
int position = scanForGZIPHeader(buffer.slice());
buffer.position(position);
StreamingArcFileReader reader = new StreamingArcFileReader(false);
reader.available(buffer);
ArcFileItem nextItem = reader.getNextItem();
System.out.println(nextItem.getUri());
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}