package org.commoncrawl.util; /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * **/ import java.io.FileOutputStream; import java.io.IOException; import java.net.URI; import java.nio.ByteBuffer; import java.sql.Connection; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.commoncrawl.async.Callback; import org.commoncrawl.async.Timer; import org.commoncrawl.io.NIOBufferList; import org.commoncrawl.io.NIOBufferListInputStream; import org.commoncrawl.io.NIOHttpConnection; /** * * An InputStream that fetches data from S3 by using an * S3Downloader instance to fetch/buffer data in a background thread. * * @author rana * */ public class S3InputStream extends NIOBufferListInputStream implements S3Downloader.Callback, Timer.Callback { /** logging **/ private static final Log LOG = LogFactory.getLog(S3InputStream.class); URI uri; S3Downloader downloader = null; AtomicReference<Exception> _exception = new AtomicReference<Exception>(); ReentrantLock _writeLock = new ReentrantLock(); AtomicReference<Condition> _writeEvent = new AtomicReference<Condition>(_writeLock.newCondition()); long _waitStartTime = -1; boolean _inWaitState = false; AtomicBoolean _eofCondition = new AtomicBoolean(); AtomicReference<NIOHttpConnection> pausedConnection = new AtomicReference<NIOHttpConnection>(); AtomicReference<NIOHttpConnection> activeConnection = new AtomicReference<NIOHttpConnection>(); int activeItemId = -1; String activeItemKey = null; int MAX_BUFFER_SIZE = 1048576 * 5; Timer timeoutTimer; /** * Initiate the stream with specified s3/s3n uri. * @param uri s3/s3n uri that points to an s3 object * @param s3AccessKey * @param s3Secret * @param bufferSize set this to be at least 1MB or higher to ensure decent performance * @throws IOException */ public S3InputStream(URI uri,String s3AccessKey,String s3Secret,int bufferSize,long seekPos) throws IOException { super(new NIOBufferList()); this.uri = uri; downloader = new S3Downloader(uri.getHost(), s3AccessKey,s3Secret, false); // we are download a single stream ... downloader.setMaxParallelStreams(1); // initialize the callback downloader.initialize(this); // initiate the download LOG.info("Fetching:" + uri.getPath() + " seekPos:" + seekPos); if (seekPos == 0) { downloader.fetchItem(uri.getPath().substring(1)); } else { downloader.fetchPartialItem(uri.getPath().substring(1), seekPos,-1L); } timeoutTimer = new Timer(5000,true,this); } @Override protected void ensureBuffer() throws IOException { do { super.ensureBuffer(); if (_activeBuf == null) { // ok, unpause the connection in case it is in a paused state before going into a wait state ... unpauseConnection(); //System.out.println("Read from Main Thread for Path:" + uri + ". Checking for EOF or Error"); _writeLock.lock(); try { if (_eofCondition.get()) { if (_exception.get() != null) { LOG.error("Read from Main Thread for Path:" + uri + " detected Exception"); throw new IOException(_exception.get()); } else { LOG.info("Read from Main Thread for Path:" + uri + " detected EOF"); return; } } else { _writeEvent.set(_writeLock.newCondition()); _inWaitState = true; _waitStartTime = System.currentTimeMillis(); //long nanoTimeStart = System.nanoTime(); //System.out.println("Read from Main Thread for Path:" + uri + " Waiting on Write"); try { _writeEvent.get().await(); _waitStartTime = -1L; //long nanoTimeEnd = System.nanoTime(); //System.out.println("Read from Main Thread for Path:" + uri + " Returned from Wait Took:" + (nanoTimeEnd-nanoTimeStart)); } catch (InterruptedException e) { LOG.error("Read from Main Thread for Path:" + uri + " was Interrupted. Exiting"); throw new IOException(e); } } } finally { _inWaitState = false; _writeLock.unlock(); } } } while(_activeBuf == null); if (_bufferQueue.available() < MAX_BUFFER_SIZE) { unpauseConnection(); } } void unpauseConnection() { if (pausedConnection.get() != null) { downloader.getEventLoop().queueAsyncCallback(new Callback() { @Override public void execute() { final NIOHttpConnection connection = pausedConnection.get(); pausedConnection.set(null); if (connection != null) { LOG.info("*** RESUMING DOWNLOADS FOR:" + connection.getURL() + "***"); try { connection.enableReads(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } }); } } @Override public void close() throws IOException { downloader.shutdown(); } @Override public boolean downloadStarting(NIOHttpConnection connection,int itemId, String itemKey,long contentLength) { activeConnection.set(connection); downloader.getEventLoop().setTimer(timeoutTimer); activeItemId = itemId; activeItemKey = itemKey; return true; } @Override public boolean contentAvailable(NIOHttpConnection theConnection,int itemId, String itemKey,NIOBufferList contentBuffer) { ByteBuffer buffer = null; IOException exception = null; //int receivedBytes = 0; try { while ((buffer = contentBuffer.read()) != null) { if (buffer.position() != 0) { buffer = buffer.slice(); } //receivedBytes += buffer.remaining(); buffer.position(buffer.limit()); _bufferQueue.write(buffer); } _bufferQueue.flush(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); exception = e; } if (_bufferQueue.available() >= MAX_BUFFER_SIZE) { LOG.info("*** PAUSING DOWNLOADS FOR:" + theConnection.getURL()); theConnection.disableReads(); pausedConnection.set(theConnection); } //long nanoTimeStart = System.nanoTime(); _writeLock.lock(); //long nanoTimeEnd = System.nanoTime(); //System.out.println("Received: " + receivedBytes + "for URI:" + uri + " Lock took:" + (nanoTimeEnd-nanoTimeStart)); try { Condition writeCondition = _writeEvent.getAndSet(null); if (exception != null) { _eofCondition.set(true); _exception.set(exception); } if (writeCondition != null) { writeCondition.signal(); } } finally { _writeLock.unlock(); } return true; } @Override public void downloadFailed(NIOHttpConnection connection,int itemId, String itemKey, String errorCode) { LOG.error("Download Failed for URI:" + S3InputStream.this.uri); _writeLock.lock(); try { _exception.set(new IOException(errorCode)); _eofCondition.set(true); Condition writeCondition = _writeEvent.getAndSet(null); if (writeCondition != null) { writeCondition.signal(); } } finally { _writeLock.unlock(); } downloader.getEventLoop().cancelTimer(timeoutTimer); activeConnection.set(null); } @Override public void downloadComplete(NIOHttpConnection connection,int itemId, String itemKey) { LOG.info("Download Complete for URI:" + S3InputStream.this.uri); _writeLock.lock(); try { _exception.set(null); _eofCondition.set(true); Condition writeCondition = _writeEvent.getAndSet(null); if (writeCondition != null) { writeCondition.signal(); } } finally { _writeLock.unlock(); } downloader.getEventLoop().cancelTimer(timeoutTimer); activeConnection.set(null); } private static final int WAIT_LOCK_TIMEOUT = 5 * 60000; @Override public void timerFired(Timer timer) { LOG.info("timeout timer fired"); boolean timedOut = false; NIOHttpConnection connection = activeConnection.get(); if (connection != null) { if (pausedConnection.get() == null) { if (connection.checkForTimeout()) { LOG.info("*** TIMEOUT detected via HTTPConnection for stream:" + connection.getURL()); timedOut = true; } } } if (!timedOut) { _writeLock.lock(); try { if (_inWaitState) { if (System.currentTimeMillis() - _waitStartTime >= WAIT_LOCK_TIMEOUT) { LOG.info("*** TIMEOUT detected via LOCKWAIT time for stream:" + connection.getURL()); timedOut = true; } } } finally { _writeLock.unlock(); } } if (timedOut) { downloader.shutdown(); downloadFailed(activeConnection.get(), activeItemId, activeItemKey, "TIMEOUT"); } } }