/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.util; import java.io.IOException; import java.nio.ByteBuffer; import java.util.HashMap; import java.util.Map; import java.util.concurrent.LinkedBlockingQueue; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.util.StringUtils; import org.commoncrawl.io.NIOBufferList; import org.commoncrawl.io.NIOHttpConnection; import org.commoncrawl.protocol.shared.ArcFileItem; /** * * @author rana * */ public class S3ArcFileReader implements S3Downloader.Callback { private static final int MAX_BUFFERS_ENQUEUED_PER_THREAD = 10000; private static final int MAX_DECODER_THREADS = 2; private static final Log LOG = LogFactory.getLog(S3ArcFileReader.class); private static class QueuedArcFileItem { String _source = null; ArcFileItem _item = null; public QueuedArcFileItem(String source,ArcFileItem item) { _source = source; _item = item; } } private LinkedBlockingQueue<QueuedArcFileItem> _items = new LinkedBlockingQueue<QueuedArcFileItem>(); private QueuedArcFileItem _currentItem = null; // the single download instance ... private S3Downloader _downloader; private static class QueuedBufferItem { public QueuedBufferItem(String key,ByteBuffer buffer) { _key = key; _buffer = buffer; } public String _key; public ByteBuffer _buffer; } // the blocking input buffer queue private LinkedBlockingQueue<QueuedBufferItem> _bufferQueues[] = new LinkedBlockingQueue[MAX_DECODER_THREADS]; // the map from arcfile key to ArcFileState private Map<String,StreamingArcFileReader> _decodeStateMaps[] = new Map[MAX_DECODER_THREADS]; //decoder thread ... Thread _decoderThreads[] = new Thread[MAX_DECODER_THREADS]; // active stream count int _activeStreamCounts[] = new int[MAX_DECODER_THREADS]; // item list String _keys[] = null; public S3ArcFileReader(String bucketName,String s3AccessId, String s3SecretKey,String[] arcFileNames, int maxParallelStreams)throws IOException { _downloader = new S3Downloader(bucketName,s3AccessId,s3SecretKey,false); _downloader.setMaxParallelStreams(maxParallelStreams); _keys = arcFileNames; for (int i=0;i<MAX_DECODER_THREADS;++i) { _bufferQueues[i] = new LinkedBlockingQueue<QueuedBufferItem>(MAX_BUFFERS_ENQUEUED_PER_THREAD); _decodeStateMaps[i] = new HashMap<String,StreamingArcFileReader>(); _activeStreamCounts[i] = 0; } } public void start() throws IOException { //LOG.info("ArcFileReader->start"); //LOG.info("ArcFileReader-> Download.start"); _downloader.initialize(this); int keyCount =0; for (String key : _keys) { try { //LOG.info("Fetching Key:" + key); // start fetch ... int itemId = _downloader.fetchItem(key); // set key to decoder affinity ... int threadIdx = itemId % MAX_DECODER_THREADS; // increment per thread stream count ... _activeStreamCounts[threadIdx]++; // increment key count keyCount++; } catch (IOException e) { LOG.error("Failed to Queue Item:" + key +" for Fetch"); } } // start decoders ... for (int threadIdx=0;threadIdx<MAX_DECODER_THREADS;++threadIdx) { _decoderThreads[threadIdx] = startDecoderThread(threadIdx,_bufferQueues[threadIdx]); } } private Thread startDecoderThread(final int threadIdx,final LinkedBlockingQueue<QueuedBufferItem> attachedQueue) { Thread decoderThread = new Thread( new Runnable() { public void run() { //LOG.info("Decoder Thread Start"); while(true) { try { QueuedBufferItem bufferItem = attachedQueue.take(); if (bufferItem._key == null) { //this is our signal to shutdown and exit //LOG.info("Decoder Thread received shutdown signal"); _items.add(new QueuedArcFileItem(null,null)); return; } // LOG.info("Decoder Thread Got Buffer Item for Key:" + bufferItem._key); StreamingArcFileReader decoder = null; synchronized (_decodeStateMaps[threadIdx]) { decoder = _decodeStateMaps[threadIdx].get(bufferItem._key); } if (decoder == null) { LOG.error("No Decode State found for Key:" + bufferItem._key); } else { if (bufferItem._buffer != null) { decoder.available(bufferItem._buffer); } else { //LOG.info("Calling Decoder Finished For Key:" + bufferItem._key); decoder.finished(); } try { int itemsProduced = 0; StreamingArcFileReader.TriStateResult result = decoder.hasMoreItems(); while (result == StreamingArcFileReader.TriStateResult.MoreItems) { ArcFileItem item = decoder.getNextItem(); if (item != null) { itemsProduced++; //LOG.info("Decoder Got Item:" + item.getUri() + " For Key:" + bufferItem._key); _items.offer(new QueuedArcFileItem(bufferItem._key,item)); } else { break; } } // LOG.info("Stream:" + bufferItem._key + " Bytes:" + bufferBytes + " Items:" + itemsProduced); if (result == StreamingArcFileReader.TriStateResult.NoMoreItems) { decoder.resetState(); synchronized (_decodeStateMaps[threadIdx]) { _decodeStateMaps[threadIdx].remove(bufferItem._key); } } } catch (IOException e) { decoder.resetState(); synchronized (_decodeStateMaps[threadIdx]) { _decodeStateMaps[threadIdx].remove(bufferItem._key); } LOG.error(StringUtils.stringifyException(e)); } } } catch (InterruptedException e) { } } } }); decoderThread.start(); return decoderThread; } public void stop() throws IOException { _downloader.shutdown(); for (int threadIdx=0;threadIdx < MAX_DECODER_THREADS;++threadIdx) { _bufferQueues[threadIdx].clear(); if (_decoderThreads[threadIdx]!= null) { _bufferQueues[threadIdx].add(new QueuedBufferItem(null,null)); try { _decoderThreads[threadIdx].join(); } catch (InterruptedException e) { } _decoderThreads[threadIdx] = null; } _activeStreamCounts[threadIdx] = 0; } } public boolean hasMoreItems() throws IOException { try { _currentItem = _items.take(); } catch (InterruptedException e) { } return _currentItem._item != null; } public ArcFileItem getNextItem()throws IOException { return _currentItem._item; } @Override public boolean downloadStarting(NIOHttpConnection connection,int itemId,String itemKey,long contentLength) { // set key to decoder affinity ... int threadIdx = itemId % MAX_DECODER_THREADS; synchronized(_decodeStateMaps[threadIdx]) { _decodeStateMaps[threadIdx].put(itemKey, new StreamingArcFileReader(true)); } return true; } @Override public boolean contentAvailable(NIOHttpConnection connection,int itemId,String itemKey,NIOBufferList contentBuffer) { // set key to decoder affinity ... int threadIdx = itemId % MAX_DECODER_THREADS; boolean continueDownload = true; while (continueDownload && contentBuffer.available() != 0) { try { ByteBuffer buffer = contentBuffer.read(); try { _bufferQueues[threadIdx].put(new QueuedBufferItem(itemKey,buffer)); // LOG.info("BQueue[" + threadIdx + "] Count:" +_bufferQueues[threadIdx].size() + " Stream:" + itemKey); } catch (InterruptedException e) { } } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); continueDownload = false; } } return continueDownload; } @Override public void downloadFailed(NIOHttpConnection connection,int itemId,String itemKey,String errorCode) { // set key to decoder affinity ... int threadIdx = itemId % MAX_DECODER_THREADS; _activeStreamCounts[threadIdx]--; LOG.error("Download Failed for Item:" + itemKey + " ReasonCode:" + errorCode); // add item termination packet try { _bufferQueues[threadIdx].put(new QueuedBufferItem(itemKey,null)); } catch (InterruptedException e) { } // add thread termination packet if (_activeStreamCounts[threadIdx] == 0) { try { _bufferQueues[threadIdx].put(new QueuedBufferItem(null,null)); } catch (InterruptedException e) { } } } @Override public void downloadComplete(NIOHttpConnection connection,int itemId,String itemKey) { // set key to decoder affinity ... int threadIdx = itemId % MAX_DECODER_THREADS; _activeStreamCounts[threadIdx]--; // LOG.info("Download Complete for Item:" + itemKey); // add item termination packet try { _bufferQueues[threadIdx].put(new QueuedBufferItem(itemKey,null)); } catch (InterruptedException e) { } // add thread termination packet if (_activeStreamCounts[threadIdx] == 0) { try { _bufferQueues[threadIdx].put(new QueuedBufferItem(null,null)); } catch (InterruptedException e) { } } } }