/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.util;

import java.io.IOException;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.concurrent.ConcurrentSkipListMap;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.io.NIOBufferList;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.util.Tuples.Pair;

import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.google.common.collect.ImmutableList;
import com.google.gson.JsonArray;
import com.google.gson.JsonParser;
import com.google.gson.JsonPrimitive;
import com.google.gson.stream.JsonReader;

/**
 * Utility used to transfer data from S3 down to the colo in bulk.
 *
 * @author rana
 */
public class S3BulkTransferUtil implements S3Downloader.Callback {

  private static final Log LOG = LogFactory.getLog(S3BulkTransferUtil.class);

  S3Downloader  _downloader;
  Configuration _conf;
  FileSystem    _fs;

  int _totalQueuedItemsCount;
  int _totalCompletedItemsCount = 0;

  /** maps an S3 key to the destination path its contents should land at */
  ConcurrentSkipListMap<String, Path> _pathMapping = new ConcurrentSkipListMap<String, Path>();

  S3BulkTransferUtil(String bucketName, String s3AccessKeyId, String s3SecretKey, JsonArray pathList, final Path outputPath) throws IOException {
    _conf = new Configuration();
    _fs = FileSystem.get(_conf);

    LOG.info("Initializing Downloader");
    _downloader = new S3Downloader(bucketName, s3AccessKeyId, s3SecretKey, false);
    _downloader.setMaxParallelStreams(150);
    _downloader.initialize(this);

    LOG.info("Got JSON Array with:" + pathList.size() + " elements");
    for (int i = 0; i < pathList.size(); ++i) {
      LOG.info("Collecting files from path:" + pathList.get(i).toString());
      List<S3ObjectSummary> metadataFiles = getPaths(s3AccessKeyId, s3SecretKey, bucketName, pathList.get(i).getAsString());
      LOG.info("Got:" + metadataFiles.size() + " total files");

      for (S3ObjectSummary metadataFile : metadataFiles) {
        Path s3Path = new Path("/" + metadataFile.getKey());
        Path finalPath = new Path(outputPath, s3Path.getName());

        FileStatus fileStatus = null;
        try {
          fileStatus = _fs.getFileStatus(finalPath);
        } catch (Exception e) {
          // file not present at the destination; fileStatus stays null and
          // the item gets queued for download below
        }
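        // if a copy already exists but its size disagrees with the S3
        // object, treat the local copy as corrupt: delete it and re-queue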
"SRC-DEST SIZE MISMATCH!! SRC:" + metadataFile + " SRC-SIZE:" + metadataFile.getSize() + " DEST:" + finalPath + " DEST-SIZE:" + fileStatus.getLen()); // ok delete the destination _fs.delete(finalPath,false); // null file status so that the item gets requeued ... fileStatus = null; } if (fileStatus == null) { LOG.info("Queueing Item:" + metadataFile); ++_totalQueuedItemsCount; _pathMapping.put(metadataFile.getKey(),finalPath); _downloader.fetchItem(metadataFile.getKey()); } else { LOG.info("Skipping Already Download Item:" + metadataFile + " Found at:" + finalPath); } } } LOG.info("Waiting for shutdown event"); _downloader.waitForCompletion(); } public static List<S3ObjectSummary> getPaths(String s3AccessKeyId,String s3SecretKey,String bucketName,String segmentPath) throws IOException { AmazonS3Client s3Client = new AmazonS3Client(new BasicAWSCredentials(s3AccessKeyId,s3SecretKey)); ImmutableList.Builder<S3ObjectSummary> listBuilder = new ImmutableList.Builder<S3ObjectSummary>(); ObjectListing response = s3Client.listObjects(new ListObjectsRequest().withBucketName(bucketName).withPrefix(segmentPath)); do { LOG.info("Response Key Count:" + response.getObjectSummaries().size()); for (S3ObjectSummary entry : response.getObjectSummaries()) { listBuilder.add(entry); } if (response.isTruncated()) { response = s3Client.listNextBatchOfObjects(response); } else { break; } } while (true); return listBuilder.build(); } ConcurrentSkipListMap<String, Pair<Path,FSDataOutputStream>> _pathToStreamMap = new ConcurrentSkipListMap<String, Pair<Path,FSDataOutputStream>>(); @Override public boolean contentAvailable(NIOHttpConnection connection,int itemId, String itemKey,NIOBufferList contentBuffer) { Pair<Path,FSDataOutputStream> downloadTuple = _pathToStreamMap.get(itemKey); if (downloadTuple != null) { try { while (contentBuffer.available() != 0) { ByteBuffer bufferForRead = contentBuffer.read(); if (bufferForRead != null) { //LOG.info("Writing: " + bufferForRead.remaining() + " bytes for Key:"+ itemKey); downloadTuple.e1.write(bufferForRead.array(),bufferForRead.position(),bufferForRead.remaining()); } } return true; } catch (Exception e) { LOG.error("Error during contentAvailable for Key:" + itemKey + " Exception:" + CCStringUtils.stringifyException(e)); } } return false; } static Path finalSegmentOutputDir = new Path("crawl/ec2Import/segment"); @Override public void downloadComplete(NIOHttpConnection connection,int itemId, String itemKey) { LOG.info("Received Download Complete Event for Key:" + itemKey); Pair<Path,FSDataOutputStream> downloadTuple = _pathToStreamMap.remove(itemKey); boolean downloadSuccessful = false; if (downloadTuple == null) { LOG.error("Excepected Download Tuple for key:" + itemKey + " GOT NULL!"); } else { try { // ok close the stream first ... LOG.info("Flushing Stream for key:" + itemKey); downloadTuple.e1.flush(); downloadTuple.e1.close(); downloadTuple.e1 = null; downloadSuccessful = true; } catch (Exception e) { LOG.error("Error completing download for item:" + itemKey + " Exception:"+ CCStringUtils.stringifyException(e)); } finally { if (downloadTuple.e1 != null) { try { downloadTuple.e1.close(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } } if (!downloadSuccessful) { LOG.error("Download for Key:" + itemKey + " Unsuccessful. 
Requeueing"); try { _downloader.fetchItem(itemKey); } catch (IOException e) { LOG.fatal("Failed to Requeue Item:" + itemKey); } } } @Override public void downloadFailed(NIOHttpConnection connection,int itemId, String itemKey, String errorCode) { LOG.info("Received Download Failed Event for Key:" + itemKey); Pair<Path,FSDataOutputStream> downloadTuple = _pathToStreamMap.remove(itemKey); if (downloadTuple == null) { LOG.error("Excepected Download Tuple for Failed Download key:" + itemKey + " GOT NULL!"); } else { try { if (downloadTuple.e1 != null) { downloadTuple.e1.close(); downloadTuple.e1 = null; } LOG.info("Deleting Temp File:" + downloadTuple.e0 + " for Key:" + itemKey); _fs.delete(downloadTuple.e0,false); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } LOG.error("Download for Key:" + itemKey + " Unsuccessful. Requeueing"); try { _downloader.fetchItem(itemKey); } catch (IOException e) { LOG.fatal("Failed to Requeue Item:" + itemKey); } } @Override public boolean downloadStarting(NIOHttpConnection connection,int itemId, String itemKey, long contentLength) { LOG.info("Received Download Start Event for Key:" + itemKey); boolean continueDownload = false; Path outputFilePath = _pathMapping.get(itemKey); if (outputFilePath != null) { try { _fs.mkdirs(outputFilePath.getParent()); Pair<Path,FSDataOutputStream> tupleOut = new Pair<Path, FSDataOutputStream>(outputFilePath,_fs.create(outputFilePath)); LOG.info("Created Stream for Key:"+ itemKey +" temp Path:" + outputFilePath); _pathToStreamMap.put(itemKey, tupleOut); continueDownload = true; } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } else { LOG.error("Unable to extract metadata filename parts from name:" + itemKey); } return continueDownload; } static Options options = new Options(); static { options.addOption( OptionBuilder.withArgName("awsKey").hasArg().withDescription("AWS Key").isRequired().create("awsKey")); options.addOption( OptionBuilder.withArgName("awsSecret").hasArg().withDescription("AWS Secret").isRequired().create("awsSecret")); options.addOption( OptionBuilder.withArgName("bucket").hasArg().withDescription("S3 bucket name").isRequired().create("bucket")); options.addOption( OptionBuilder.withArgName("outputPath").hasArg().isRequired().withDescription("HDFS output path").create("outputPath")); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("S3 path prefix").create("path")); options.addOption( OptionBuilder.withArgName("paths").hasArg().withDescription("S3 paths as a JSON Array").create("paths")); } static void printUsage() { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp( "S3BulkTransferUtil", options ); } public static void main(String[] args)throws IOException { CommandLineParser parser = new GnuParser(); try { // parse the command line arguments CommandLine cmdLine = parser.parse( options, args ); String s3AccessKey = cmdLine.getOptionValue("awsKey"); String s3Secret = cmdLine.getOptionValue("awsSecret"); String s3Bucket = cmdLine.getOptionValue("bucket"); Path hdfsOutputPath = new Path(cmdLine.getOptionValue("outputPath")); JsonArray paths = new JsonArray(); if (cmdLine.hasOption ("path")) { String values[] = cmdLine.getOptionValues("path"); for (String value : values) { paths.add( new JsonPrimitive(value)); } } if (cmdLine.hasOption("paths")) { JsonParser jsonParser = new JsonParser(); JsonReader reader = new JsonReader(new StringReader(cmdLine.getOptionValue("paths"))); reader.setLenient(true); JsonArray array = 
        JsonArray array = jsonParser.parse(reader).getAsJsonArray();
        if (array != null) {
          paths.addAll(array);
        }
      }

      if (paths.size() == 0) {
        throw new IOException("No Input Paths Specified!");
      }

      LOG.info("Bucket:" + s3Bucket + " Target Paths:" + paths.toString());

      new S3BulkTransferUtil(s3Bucket, s3AccessKey, s3Secret, paths, hdfsOutputPath);
    } catch (Exception e) {
      LOG.error(CCStringUtils.stringifyException(e));
      printUsage();
    }
  }
}
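// Example invocation, a sketch only: the jar name and placeholder values are
// hypothetical, but the option names match the Options defined above.
// Assumes the class is launched via `hadoop jar` so the default FileSystem
// (HDFS) is picked up from the cluster configuration:
//
//   hadoop jar commoncrawl.jar org.commoncrawl.util.S3BulkTransferUtil \
//     -awsKey <ACCESS_KEY_ID> -awsSecret <SECRET_KEY> \
//     -bucket <bucket-name> \
//     -paths '["<prefix-one>/","<prefix-two>/"]' \
//     -outputPath <hdfs-output-dir>
//
// Either -path (repeatable, one prefix per occurrence) or -paths (a JSON
// array of prefixes) must be supplied; every S3 object under each prefix is
// downloaded into the output directory, and items whose sizes already match
// the S3 listing are skipped.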