/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.util; import java.io.IOException; import java.net.URL; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.text.SimpleDateFormat; import java.util.Date; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.TimeZone; import java.util.TreeMap; import java.util.Vector; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.commoncrawl.async.EventLoop; import org.commoncrawl.io.NIOBufferList; import org.commoncrawl.io.NIOHttpConnection; import org.commoncrawl.io.NIOHttpHeaders; import org.commoncrawl.util.BandwidthUtils.BandwidthStats; import org.commoncrawl.util.GZIPUtils.UnzipResult; /** * * @author rana * */ public class S3Uploader implements NIOHttpConnection.DataSource { /** logging **/ static final Log LOG = LogFactory.getLog(S3Uploader.class); private static final int MAX_QUEUED_READ_SIZE = 10 * 1024 * 1024; EventLoop _eventLoop; FSDataInputStream _inputStream; FileSystem _fileSystem; Path _uploadTarget; String _s3Bucket; String _s3Key; String _s3ACL; String _contentType; long _contentLength; String _s3AccessId; String _s3SecretKey; IOException _exception = null; AtomicReference<Thread> _loaderThread = new AtomicReference<Thread>(); NIOBufferList _writeBuffer = new NIOBufferList(); ReentrantLock _readLock = new ReentrantLock(); Condition _readEvent = _readLock.newCondition(); boolean _abort = false; boolean _loadComplete = false; NIOHttpConnection _connection = null; S3Utils.CallingFormat _callingFormat = S3Utils.CallingFormat.getSubdomainCallingFormat(); int _slot; int _bandWidthLimit; int _Id; long _bytesUploaded = 0; static IOException failureExceptionFromContent(NIOHttpConnection theConnection) { String errorDescription = null; if (theConnection.getContentBuffer().available() != 0) { NIOBufferList contentBuffer = theConnection.getContentBuffer(); try { // now check headers to see if it is gzip encoded int keyIndex = theConnection.getResponseHeaders().getKey("Content-Encoding"); if (keyIndex != -1) { String encoding = theConnection.getResponseHeaders().getValue(keyIndex); byte data[] = new byte[contentBuffer.available()]; // and read it from the niobuffer contentBuffer.read(data); if (encoding.equalsIgnoreCase("gzip")) { UnzipResult result = GZIPUtils.unzipBestEffort(data,256000); if (result != null) { contentBuffer.reset(); contentBuffer.write(result.data.get(), 0, result.data.getCount()); contentBuffer.flush(); } } } byte data[] = new byte[contentBuffer.available()]; contentBuffer.read(data); ByteBuffer bb = ByteBuffer.wrap(data); StringBuffer buf = new StringBuffer(); buf.append(Charset.forName("ASCII").decode(bb)); errorDescription = buf.toString(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } if (errorDescription == null) { errorDescription = "UNKNOWN ERROR"; } return new IOException(errorDescription); } BandwidthUtils.RateLimiter _rateLimiter = null; public S3Uploader(int uploaderId,EventLoop eventLoop,FileSystem fileSystem,Path uploadTarget,int bandWidthLimit,String s3Bucket,String s3Key,String contentMimeType,String s3AccessId,String s3SecretKey,String acl) { _Id = uploaderId; _eventLoop = eventLoop; _fileSystem = fileSystem; _uploadTarget = uploadTarget; _s3Bucket = s3Bucket; _s3Key = s3Key; _s3ACL = acl; _s3AccessId = s3AccessId; _s3SecretKey = s3SecretKey; _contentType = contentMimeType; _bandWidthLimit = bandWidthLimit; _rateLimiter = new BandwidthUtils.RateLimiter(_bandWidthLimit); _writeBuffer.setMinBufferSize(65536 * 2); } public int getSlot() { return _slot; } public void setSlot(int index) { _slot = index; } Object _context; public void setContext(Object o) { _context = o; } public Object getContext() { return _context; } public Path getPath() { return _uploadTarget; } private void startLoader() throws IOException { _contentLength = _fileSystem.getFileStatus(_uploadTarget).getLen(); LOG.info("startLoader - Content Size is:" + _contentLength); _inputStream = _fileSystem.open(_uploadTarget); _abort = false; _loaderThread.set(new Thread(new Runnable() { public void run() { try { while (!_abort && _inputStream.available() != 0) { // first see if have reached a read threshold ... if (_writeBuffer.available() >= MAX_QUEUED_READ_SIZE) { //LOG.info("Connection:[" + _connection.getId() + "] Load Thread for Path:" + _uploadTarget.getName() + " Detected Queue Full. Grabbing Read Lock"); try { // acquire the read lock _readLock.lock(); // set up our read event ... _writeBuffer.setReadEvent(_readLock,_readEvent); // LOG.info("Connection:[" + _connection.getId() + "] Load Thread for Path:" + _uploadTarget.getName() + " Waiting on Read Event"); // and wait on read event ... try { _readEvent.await(); } catch (InterruptedException e) { //LOG.info("Connection:[" + _connection.getId() + "] Load Thread for Target:" + _uploadTarget.toString() + " Interrupted"); _abort = true; } //LOG.info("Connection:[" + _connection.getId() + "] Load Thread for Path:" + _uploadTarget.getName() + " Woke up from Wait on Read Event"); } finally { _readLock.unlock(); } } if (!_abort) { ByteBuffer buffer = _writeBuffer.getWriteBuf(); if (buffer == null) { _exception = new IOException("Out Of Memory Error"); LOG.error(CCStringUtils.stringifyException(_exception)); break; } int bytesRead = _inputStream.read(buffer.array(),buffer.position(),buffer.remaining()); buffer.position(buffer.position() + bytesRead); // LOG.info("Connection:[" + _connection.getId() + "] loadThread for Path:" + _uploadTarget.getName() + " Read:" + bytesRead); } } // LOG.info("loadThread for Path:" + _uploadTarget.getName() +" Done"); if (_abort && _exception == null) { _exception = new IOException("Transfer Explicitly Aborted"); LOG.error(CCStringUtils.stringifyException(_exception)); } } catch (IOException e) { _exception = e; LOG.error(CCStringUtils.stringifyException(e)); } finally { try { if (_inputStream != null) { _inputStream.close(); } _inputStream = null; } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } _loaderThread.set(null); } if (!_abort && _exception == null) { synchronized (_writeBuffer) { _writeBuffer.flush(); // LOG.info("Reader Thread Bytes Available:" + _writeBuffer.available()); _loadComplete = true; } } else if(_exception != null) { if (_connection != null) _connection.close(); } //LOG.info("Connection:[" + _connection.getId() + "] Load Thread for Target:" + _uploadTarget.toString() + " EXITING"); } })); _loaderThread.get().start(); } public void shutdown() { if (_connection != null) { _connection.close(); _connection.setContext(null); _connection.setListener(null); _connection = null; } _abort = true; // signal the read event incase the read thread is locked on it... if (_readLock != null) { _readLock.lock(); if (_readEvent != null) { _readEvent.signal(); } _readLock.unlock(); } while(_loaderThread.get() != null) { try { // LOG.info("Waiting for Loader Thread for Target:" + _uploadTarget + " to Exit"); Thread thread = _loaderThread.get(); if (thread != null) { thread.join(10); } // LOG.info("Returned from Wait on Loader Thread for Target:" + _uploadTarget); } catch (InterruptedException e) { LOG.error(CCStringUtils.stringifyException(e)); } } _writeBuffer.reset(); } /** * Generate an rfc822 date for use in the Date HTTP header. */ private static String httpDate() { final String DateFormat = "EEE, dd MMM yyyy HH:mm:ss "; SimpleDateFormat format = new SimpleDateFormat( DateFormat, Locale.US ); format.setTimeZone( TimeZone.getTimeZone( "GMT" ) ); return format.format( new Date() ) + "GMT"; } private static void addToAmazonHeader(String key,String value,Map amazonHeaders) { List<String> list = (List<String>) amazonHeaders.get(key); if (list == null) { list = new Vector<String>(); amazonHeaders.put(key, list); } list.add(value); } private static String normalizeACLString(String targetString) { StringBuffer buffer =new StringBuffer(); boolean lastCharWasWhitepsace = false; for( char c : targetString.toCharArray()) { if (c == ' ' || c == '\t' || c == '\n') { // if (!lastCharWasWhitepsace) { buffer.append(' '); // } lastCharWasWhitepsace = true; } else { if (c == '<') buffer.append("<"); else if (c == '>') buffer.append(">"); else buffer.append(c); lastCharWasWhitepsace = false; } } return buffer.toString(); } public void startUpload(NIOHttpConnection.Listener listener) throws IOException { // start the file loader ... startLoader(); // construct the s3 url ... URL theURL = _callingFormat.getURL(false, S3Utils.DEFAULT_HOST, S3Utils.INSECURE_PORT, _s3Bucket, _s3Key, null); // allocate an http connection _connection = new NIOHttpConnection(theURL,_eventLoop.getSelector(),_eventLoop.getResolver(),null); _connection.setId(_Id); LOG.info("Connection for Path:" + _uploadTarget.getName() + " is:" + _connection.getId()); // set the back pointer to us ... _connection.setContext(this); // set rate limit policy ... _connection.setUploadRateLimiter(_rateLimiter); // specify that we will populate our own request headers ... _connection.setPopulateDefaultHeaderItems(false); // set up the data source ... _connection.setDataSource(this); // get at headers ... NIOHttpHeaders headers = _connection.getRequestHeaders(); // populate http request string headers.prepend("PUT" + " " + theURL.getFile() +" " + "HTTP/1.1", null); if (theURL.getPort() != -1 && theURL.getPort() != 80) { headers.set("Host",theURL.getHost() +":"+String.valueOf(theURL.getPort())); } else { headers.set("Host",theURL.getHost()); } // create a tree map in parallel (to pass to canonicalization routine for s3 auth) Map amazonHeaders = new TreeMap(); // set mime type header entry ... headers.set("Content-Type", _contentType); // and add content type to amazon headers as well .. addToAmazonHeader("Content-Type", _contentType,amazonHeaders); // and add content length ... headers.set("Content-Length", ((Long)_contentLength).toString()); // add date ... String theDate = httpDate(); headers.set("Date", theDate); addToAmazonHeader("Date", theDate, amazonHeaders); // set reduced redundancy flag headers.set("x-amz-storage-class", "REDUCED_REDUNDANCY"); addToAmazonHeader("x-amz-storage-class", "REDUCED_REDUNDANCY", amazonHeaders); // specify reduced redundancy storage // and if acl is specified... if (_s3ACL != null) { String aclStringNormalized = normalizeACLString(_s3ACL); // add it to the list of headers //headers.set("x-amz-acl", _s3ACL); // and to the list of headers used to canonacalize the url ... //addToAmazonHeader("x-amz-acl", aclStringNormalized, amazonHeaders); } String canonicalString = S3Utils.makeCanonicalString("PUT", _s3Bucket, _s3Key, null,amazonHeaders ); //LOG.info("Headers for Request:" + headers.toString()); //LOG.info("Cannonica for Request:" + canonicalString); String encodedCanonical = S3Utils.encode(_s3SecretKey, canonicalString, false); // add auth string to headers ... headers.set("Authorization","AWS " + _s3AccessId + ":" + encodedCanonical); // add cache control pragmas ... headers.set ("Connection", "close"); headers.set("Cache-Control", "no-cache"); headers.set("Pragma", "no-cache"); // ready to roll ... // set the listener ... _connection.setListener(listener); // and open the connection _connection.open(); } public void abortUpload() { _abort = true; _connection.close(); } public IOException getException() { return _exception; } public boolean read(NIOHttpConnection connection,NIOBufferList dataBuffer)throws IOException { ByteBuffer buffer = null; if ((buffer = _writeBuffer.read()) != null) { _bytesUploaded += buffer.remaining(); BandwidthStats stats = new BandwidthStats(); _rateLimiter.getStats(stats); System.out.println("[" + _slot + "]ID:" + _Id + " read Callback for S3Uploader for Path:" + _uploadTarget.getName() + " returned:" + buffer.remaining() + " Bytes TotalBytesRead:" + _bytesUploaded +" Rate:" + stats.scaledBitsPerSecond + " " + stats.scaledBitsUnits ); buffer.position(buffer.limit()); dataBuffer.write(buffer); dataBuffer.flush(); } boolean eof = false; synchronized(_writeBuffer) { eof = _writeBuffer.available() == 0 && _loadComplete; } return eof; } @Override public void finsihedWriting(NIOHttpConnection connection,ByteBuffer thisBuffer) throws IOException { //NOOP } }