package org.commoncrawl.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.google.common.collect.Lists;
/**
* Collect stats about the public dataset bucket
* @author rana
*
*/
public class S3CollectStats {

  /** Logger for this tool (used to report unexpected runtime failures). */
  public static final Log LOG = LogFactory.getLog(S3CollectStats.class);

  /** Command line options accepted by {@link #main(String[])}. */
  static Options options = new Options();

  static {
    options.addOption(
        OptionBuilder.withArgName("awsKey").hasArg().withDescription("AWS Key").isRequired().create("awsKey"));
    options.addOption(
        OptionBuilder.withArgName("awsSecret").hasArg().withDescription("AWS Secret").isRequired().create("awsSecret"));
  }

  /** Bucket that holds the public dataset. */
  private static final String DATASET_BUCKET = "aws-publicdatasets";

  // NOTE(review): keys are written without a leading '/'. A list prefix is
  // compared literally against the stored key names, so a leading slash would
  // only match keys that really start with '/' -- confirm against the bucket.

  /** Key of the text file listing one valid segment id per line. */
  private static final String VALID_SEGMENTS_KEY = "common-crawl/parse-output/valid_segments.txt";

  /** Key prefix under which each segment's files are stored ({@code <prefix><segmentId>/...}). */
  private static final String SEGMENT_KEY_PREFIX = "common-crawl/parse-output/segment/";

  /** Prints the command line usage to standard output. */
  static void printUsage() {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp( "S3CollectStats", options );
  }

  /**
   * Downloads the valid-segments file and parses it into a list of segment ids.
   * Blank (empty or whitespace-only) lines are skipped.
   *
   * @param s3Client client used to fetch the file from {@link #DATASET_BUCKET}
   * @return the segment ids in file order
   * @throws IOException if the file cannot be read or a non-blank line is not a
   *         valid long
   */
  private static List<Long> getValidSegmentIds(AmazonS3Client s3Client) throws IOException {
    S3Object validSegmentsObject = s3Client.getObject(DATASET_BUCKET, VALID_SEGMENTS_KEY);
    // Decode with an explicit charset instead of the platform default.
    BufferedReader reader = new BufferedReader(
        new InputStreamReader(validSegmentsObject.getObjectContent(), "UTF-8"));
    List<Long> segmentIds = new ArrayList<Long>();
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        // Trim before the emptiness check so whitespace-only lines are
        // skipped rather than reported as invalid ids.
        String trimmed = line.trim();
        if (trimmed.length() == 0) {
          continue;
        }
        try {
          segmentIds.add(Long.parseLong(trimmed));
        }
        catch (NumberFormatException e) {
          throw new IOException("Invalid Segment Id Encountered:" + line, e);
        }
      }
    }
    finally {
      reader.close();
    }
    return segmentIds;
  }

  /** Running totals for the files of a single segment. */
  private static class SegmentStats {
    final long segmentId;
    int arcFileCount;
    long arcFileSizeTotal;
    int metadataCount;
    long metadataSizeTotal;
    long textSizeTotal;

    public SegmentStats(long segmentId) {
      this.segmentId = segmentId;
    }

    /**
     * Adds one listed object to the totals.
     *
     * @param key  full S3 key of the object
     * @param size object size in bytes
     */
    void record(String key, long size) {
      // Listed keys are full paths, so the "metadata-" / "textData-" name
      // prefixes must be tested against the last path component only.
      String fileName = key.substring(key.lastIndexOf('/') + 1);
      if (fileName.endsWith("arc.gz")) {
        arcFileCount++;
        arcFileSizeTotal += size;
      }
      else if (fileName.startsWith("metadata-")) {
        metadataCount++;
        metadataSizeTotal += size;
      }
      else if (fileName.startsWith("textData-")) {
        textSizeTotal += size;
      }
    }

    /** Formats the totals as the tab separated {@code @@@Stats} report line. */
    String toReportLine() {
      return "@@@Stats\t" +
          segmentId + "\t" +
          arcFileCount + "\t" +
          arcFileSizeTotal + "\t" +
          metadataCount + "\t" +
          metadataSizeTotal + "\t" +
          textSizeTotal;
    }
  }

  /**
   * Lists every object stored under one segment and accumulates its stats,
   * following truncated listings until the last batch has been processed.
   *
   * @param s3Client  client used to list {@link #DATASET_BUCKET}
   * @param segmentId id of the segment to scan
   * @return the accumulated totals for the segment
   */
  private static SegmentStats collectSegmentStats(AmazonS3Client s3Client, long segmentId) {
    SegmentStats stats = new SegmentStats(segmentId);
    // A list prefix is matched literally (it is not a glob), so it must end
    // at the segment directory separator with no trailing wildcard.
    ObjectListing listing = s3Client.listObjects(DATASET_BUCKET, SEGMENT_KEY_PREFIX + segmentId + "/");
    while (true) {
      for (S3ObjectSummary summary : listing.getObjectSummaries()) {
        System.out.println("Updating Stats For:" + summary.getKey());
        stats.record(summary.getKey(), summary.getSize());
      }
      if (!listing.isTruncated()) {
        return stats;
      }
      listing = s3Client.listNextBatchOfObjects(listing);
    }
  }

  /**
   * Entry point. Requires {@code -awsKey} and {@code -awsSecret}; prints one
   * {@code @@@Stats} line per valid segment to standard output.
   *
   * @param args command line arguments
   */
  public static void main(String[] args) {
    CommandLineParser parser = new GnuParser();
    CommandLine cmdLine;
    try {
      // parse the command line arguments
      cmdLine = parser.parse( options, args );
    }
    catch (ParseException exp) {
      // only genuine argument problems are reported as parse failures
      System.err.println( "Parsing failed. Reason: " + exp.getMessage() );
      printUsage();
      return;
    }

    try {
      BasicAWSCredentials credentials = new BasicAWSCredentials(
          cmdLine.getOptionValue("awsKey"),
          cmdLine.getOptionValue("awsSecret"));
      AmazonS3Client s3Client = new AmazonS3Client(credentials);

      // read valid segments file, then report stats segment by segment
      for (long segmentId : getValidSegmentIds(s3Client)) {
        System.out.println(collectSegmentStats(s3Client, segmentId).toReportLine());
      }
    }
    catch (Exception exp) {
      // S3 / IO failures are not usage errors: keep the stack trace and do
      // not print the usage text.
      LOG.error("Failed to collect S3 stats", exp);
      System.err.println( "Stats collection failed. Reason: " + exp.getMessage() );
    }
  }
}