package org.commoncrawl.service.crawlhistoryV2; import java.io.IOException; import java.text.NumberFormat; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.LinkedBlockingQueue; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.SequenceFile; import org.commoncrawl.async.CallbackWithResult; import org.commoncrawl.protocol.URLFPV2; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.URLFPBloomFilter; public class ShardThread implements Runnable { private static final Log LOG = LogFactory.getLog(ShardThread.class); private static final int FSYNC_INTERVAL = 100; private static final int ROLL_INTERVAL = 1000000; CrawlHistoryServer _server; int _shardId; FileSystem _fs; Configuration _conf; LinkedBlockingDeque<Request> _requestQueue = new LinkedBlockingDeque<Request>(); SequenceFile.Writer _logWriter; long _logFileId; int _logEntries; Path _tlogBasePath; URLFPBloomFilter _filter; final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); public static class Request { enum RequestType { ROLL_LOG, SINGLE_FP_UPDATE, MULTIPLE_FP_UPDATE, SHUTDOWN } RequestType _type; long _requestTime; boolean _complete; IOException _lastError; public URLFPV2 _singleRequestFP; public DataOutputBuffer _multiReqBuffer = null; CallbackWithResult<Request> _completionCallback; public Request(RequestType requestType,long requestTime,CallbackWithResult<Request> completionCallback) { _type = requestType; _requestTime = requestTime; _completionCallback = completionCallback; } } public ShardThread(CrawlHistoryServer server,FileSystem fs,Configuration conf,Path tlogBasePath,int shardId,URLFPBloomFilter filter) throws IOException { _server = server; _fs = fs; _conf = conf; _shardId = shardId; _filter = filter; NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); _tlogBasePath = tlogBasePath; } @Override public void run() { LOG.info(getLogPrefix()+"Thread Started"); outer: while (true) { try { Request request = _requestQueue.take(); try { switch (request._type) { case ROLL_LOG: { try { LOG.info(getLogPrefix()+"GOT ROLL_LOG CMD"); rollTransactionLog(); request._complete = true; LOG.info(getLogPrefix()+"FINISHED ROLL_LOG CMD"); } catch (IOException e) { LOG.error(getLogPrefix()+"Failed to RollLog with Exception:" + CCStringUtils.stringifyException(e)); } } break; case SINGLE_FP_UPDATE: { try { _filter.add(request._singleRequestFP); appendLogFileRecord(request._singleRequestFP,request._requestTime); request._complete = true; } catch (IOException e) { LOG.error(getLogPrefix()+ "SINGLE FP REQUEST FAILED with Exception:" + CCStringUtils.stringifyException(e)); request._lastError = e; } } break; case MULTIPLE_FP_UPDATE: { try { if (request._multiReqBuffer != null) { DataInputBuffer inputBuffer = new DataInputBuffer(); inputBuffer.reset( request._multiReqBuffer.getData(), 0, request._multiReqBuffer.getLength()); int items = inputBuffer.readInt(); URLFPV2 fp = new URLFPV2(); for (int i=0;i<items;++i) { } } } catch (IOException e) { } } break; case SHUTDOWN: { try { LOG.info(getLogPrefix()+"GOT SHUTDOWN ROLLING LOG"); rollTransactionLog(); LOG.info(getLogPrefix()+"GOT SHUTDOWN ROLLED LOG"); } catch (IOException e){ LOG.error(getLogPrefix()+"Failed to RollLog with Exception:" + CCStringUtils.stringifyException(e)); } break outer; } } } finally { request._completionCallback.execute(request); } } catch (Exception e) { LOG.error(getLogPrefix() + "UnhandledException: " +CCStringUtils.stringifyException(e)); try { Thread.sleep(5000); } catch (InterruptedException e1) { } } } LOG.info(getLogPrefix()+"THREAD EXITING"); } void appendLogFileRecords(DataInputBuffer stream,long timestamp) throws IOException { URLFPV2 fp = new URLFPV2(); int recordCount = stream.readInt(); { for (int i=0;i<recordCount;++i) { fp.setRootDomainHash(stream.readLong()); fp.setDomainHash(stream.readLong()); fp.setUrlHash(stream.readLong()); try { SequenceFile.Writer writer = ensureWriter(); if (writer != null) { writer.append(fp, timestamp); _logEntries++; if (_logEntries >= ROLL_INTERVAL) { rollTransactionLog(); } else if (_logEntries % FSYNC_INTERVAL == 0) { try { writer.syncFs(); } catch (IOException e) { LOG.error(getLogPrefix() + "Failed to FSYNC File:" + _logFileId + " with Exception:" + CCStringUtils.stringifyException(e)); // force roll the log rollTransactionLog(); } } } else { } } catch (IOException e) { LOG.error(getLogPrefix()+"APPEND FAILURE " + " RH: " + fp.getRootDomainHash() + " DH:" + fp.getDomainHash() + " UH:" + fp.getUrlHash()); LOG.error(getLogPrefix()+"APPEND Exception: " + CCStringUtils.stringifyException(e)); rollTransactionLog(); } } } } void appendLogFileRecord(URLFPV2 fp,long timestamp) throws IOException { SequenceFile.Writer writer = ensureWriter(); if (writer != null) { try { writer.append(fp, timestamp); _logEntries++; if (_logEntries >= ROLL_INTERVAL) { rollTransactionLog(); } else if (_logEntries % FSYNC_INTERVAL == 0) { try { writer.syncFs(); } catch (IOException e) { LOG.error(getLogPrefix() + "Failed to FSYNC File:" + _logFileId + " with Exception:" + CCStringUtils.stringifyException(e)); // force roll the log rollTransactionLog(); throw e; } } } catch (IOException e) { LOG.error(getLogPrefix()+"APPEND FAILURE " + " RH: " + fp.getRootDomainHash() + " DH:" + fp.getDomainHash() + " UH:" + fp.getUrlHash()); LOG.error(getLogPrefix()+"APPEND Exception: " + CCStringUtils.stringifyException(e)); rollTransactionLog(); throw e; } } } SequenceFile.Writer ensureWriter()throws IOException { if (_logWriter == null) { long fileId = System.currentTimeMillis(); _logWriter = new SequenceFile.Writer(_fs,_conf,getTLogFilePathGivenId(fileId),URLFPV2.class,LongWritable.class); _logFileId = fileId; _logEntries = 0; } return _logWriter; } Path getTLogFilePathGivenId(long fileId) { return new Path(_tlogBasePath,NUMBER_FORMAT.format(_shardId)+"-"+fileId); } void rollTransactionLog()throws IOException { if (_logWriter != null) { try { _logWriter.close(); } catch (IOException e) { LOG.error(getLogPrefix() + " Threw Exception during close:" + CCStringUtils.stringifyException(e)); } _logWriter = null; _logFileId = -1; _logEntries = 0; } } String getLogPrefix() { return "SHARD[" + NUMBER_FORMAT.format(_shardId)+"]"; } }