/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.listcrawler;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.nio.ByteBuffer;
import java.rmi.server.UID;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.TreeSet;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.zip.CRC32;
import java.util.zip.CheckedInputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.record.Buffer;
import org.commoncrawl.async.ConcurrentTask;
import org.commoncrawl.async.ConcurrentTask.CompletionCallback;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.async.Timer;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CacheItem;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.service.listcrawler.HDFSFlusherThread.IndexDataFileTriple;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.SessionIDURLNormalizer;
import org.commoncrawl.util.URLFingerprint;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.ArcFileItemUtils;
import org.commoncrawl.util.CCStringUtils;
import org.junit.Assert;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
/**
* Class that encapsulates most of the serving side of the crawler
*
* @author rana
*
*/
public class CacheManager {
private HashMultimap<Long,Long> _fingerprintToLocalLogPos = HashMultimap.create();
private LinkedList<CacheItem> _writeQueue = new LinkedList<CacheItem>() ;
private LinkedBlockingQueue<CacheWriteRequest> _writeRequestQueue = new LinkedBlockingQueue<CacheWriteRequest>();
private LinkedBlockingQueue<CacheFlushRequest> _hdfsFlushRequestQueue = new LinkedBlockingQueue<CacheFlushRequest>();
private Vector<HDFSFileIndex> _hdfsIndexList = new Vector<HDFSFileIndex>();
// private LinkedBlockingQueue<CacheLoadRequest> _loadRequestQueue = new LinkedBlockingQueue<CacheLoadRequest>();
private static final int DEFAULT_DISK_READER_THREADS = 8 * 4;
private static final int HDFS_READER_THREADS = 3*8*4;
private static final int LOG_ACCESS_SEMAPHORE_COUNT = 100 + 1;
static final int ITEM_RECORD_TRAILING_BYTES = 4;
// flush the local cache to hdfs once the local cache document count exceeds this number
public static final int LOCAL_CACHE_FLUSH_THRESHOLD = 10000;
// proxy cache location in hdfs
private static final String PROXY_CACHE_LOCATION = "crawl/proxy/cache";
static final String PROXY_CACHE_FILE_DATA_PREFIX = "cacheData";
static final String PROXY_CACHE_FILE_INDEX_PREFIX = "cacheIndex";
private static final int CACHE_POLL_TIMER_INTERVAL = 10000;
public static final Log LOG = LogFactory.getLog(CacheManager.class);
/** log file header **/
private LocalLogFileHeader _header = new LocalLogFileHeader();
/** local data directory **/
File _localDataDirectory;
/** remote data directory **/
Path _remoteDataDirectory;
/** event loop **/
EventLoop _eventLoop;
/** reader thread pool **/
ExecutorService _cacheLoadThreadPool = null;
/** hdfs loader thread pool **/
ExecutorService _hdfsLoaderPool = null;
/** number of cache writer threads **/
public static final int CACHE_WRITER_THREADS = 8;
/** local cache writer thread(S) **/
Thread _writerThreads[] = null;
/** hdfs flusher thread **/
Thread _hdfsFlusherThread = null;
/** hdfs flusher active indicator **/
boolean _hdfsFlusherActive = false;
/** file system object **/
FileSystem _remoteFileSystem = null;
/** local log virtual offset **/
long _localLogStartOffset = 0;
/** local log access semaphore **/
Semaphore _localLogAccessSemaphore = new Semaphore(LOG_ACCESS_SEMAPHORE_COUNT);
/** local log write mutex **/
Semaphore _localLogWriteAccessSemaphore = new Semaphore(1);
/** session id normalizer **/
SessionIDURLNormalizer _sessionIdNormalizer = new SessionIDURLNormalizer();
// cache flush timer
Timer _cacheFlushTimer;
/** cache flush threshold **/
int _cacheFlushThreshold = LOCAL_CACHE_FLUSH_THRESHOLD;
/** internal constructor for test purposes
*
*/
private CacheManager(EventLoop eventLoop) {
_localDataDirectory = new File("/tmp/proxy/localData");
_localDataDirectory.mkdirs();
_remoteDataDirectory = new Path("/tmp/proxy/remoteData");
if (eventLoop == null) {
_eventLoop = new EventLoop();
_eventLoop.start();
}
else {
_eventLoop = eventLoop;
}
Configuration conf = new Configuration();
conf.addResource("nutch-default.xml");
conf.addResource("nutch-site.xml");
conf.addResource("hadoop-default.xml");
conf.addResource("hadoop-site.xml");
conf.addResource("commoncrawl-default.xml");
conf.addResource("commoncrawl-site.xml");
CrawlEnvironment.setHadoopConfig(conf);
try {
_remoteFileSystem = FileSystem.getLocal(conf);
_remoteFileSystem.mkdirs(_remoteDataDirectory);
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
throw new RuntimeException("Could not initialize hdfs connection");
}
}
/** CacheManager - proper constructor
*
*/
public CacheManager(FileSystem remoteFileSystem,File dataDirectory,EventLoop eventLoop) {
_localDataDirectory = dataDirectory;
_eventLoop = eventLoop;
_remoteFileSystem = remoteFileSystem;
_remoteDataDirectory = new Path(PROXY_CACHE_LOCATION);
}
public static final int INIT_FLAG_SKIP_CACHE_WRITER_INIT = 1;
public static final int INIT_FLAG_SKIP_HDFS_WRITER_INIT = 2;
public static final int INIT_FLAG_SKIP_INDEX_LOAD = 4;
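/* A minimal construction sketch (hypothetical paths, exception handling elided);
* pass 0 or a combination of the INIT_FLAG_* values above to initialize:
*
*   Configuration conf = new Configuration();
*   FileSystem fs = FileSystem.get(conf);
*   EventLoop eventLoop = new EventLoop();
*   eventLoop.start();
*   CacheManager manager = new CacheManager(fs, new File("/data/proxy/localData"), eventLoop);
*   manager.initialize(0);
*/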
/**
* set cache flush threshold
* best to call this before initialize
*/
public void setCacheFlushThreshold(int cacheFlushThreshold) {
_cacheFlushThreshold = cacheFlushThreshold;
}
public int getCacheFlushThreshold() {
return _cacheFlushThreshold;
}
/**
* initialize the CacheManager
*
* @throws IOException
*/
public void initialize(int initFlags)throws IOException {
initializeActiveLog();
if ((initFlags & INIT_FLAG_SKIP_INDEX_LOAD) == 0) {
loadHDFSIndexFiles();
}
if ((initFlags & INIT_FLAG_SKIP_CACHE_WRITER_INIT) == 0) {
startCacheWriterThread();
}
if ((initFlags & INIT_FLAG_SKIP_HDFS_WRITER_INIT) == 0) {
startHDFSFlusherThread();
}
_cacheLoadThreadPool = Executors.newFixedThreadPool(DEFAULT_DISK_READER_THREADS);
_hdfsLoaderPool = Executors.newFixedThreadPool(HDFS_READER_THREADS);
_cacheFlushTimer = new Timer(CACHE_POLL_TIMER_INTERVAL,true,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
if (!_hdfsFlusherActive) {
synchronized(_header) {
// check status ...
if (_header._itemCount >= _cacheFlushThreshold) {
LOG.info("Local Cache Item Count:" + _header._itemCount + " Exceeded Cache Flush Threshold. Scheduling HDFS Flush");
CacheFlushRequest flushRequest = new CacheFlushRequest(_header._fileSize - LocalLogFileHeader.SIZE,(int)_header._itemCount);
_hdfsFlushRequestQueue.add(flushRequest);
_hdfsFlusherActive = true;
}
}
}
}
});
_eventLoop.setTimer(_cacheFlushTimer);
LOG.info("Initialization Complete");
}
/**
* shut down the CacheManager
*
*/
public void shutdown() {
if (_cacheFlushTimer != null) {
_eventLoop.cancelTimer(_cacheFlushTimer);
_cacheFlushTimer = null;
}
if (_writerThreads != null) {
LOG.info("Shuting down write threads");
for (int i=0;i<CACHE_WRITER_THREADS;++i)
_writeRequestQueue.add(new CacheWriteRequest());
for (int i=0;i<CACHE_WRITER_THREADS;++i) {
try {
_writerThreads[i].join();
} catch (InterruptedException e1) {
}
}
_writerThreads = null;
}
if (_cacheLoadThreadPool != null) {
LOG.info("write thread terminated. shuting down reader threads");
_cacheLoadThreadPool.shutdown();
try {
while (!_cacheLoadThreadPool.awaitTermination(5000, TimeUnit.MILLISECONDS)) {
LOG.info("waiting ... ");
}
} catch (InterruptedException e) {
}
LOG.info("reader thread terminated");
_cacheLoadThreadPool = null;
}
if (_hdfsLoaderPool != null) {
LOG.info("shuting down hdfs loader threads");
_hdfsLoaderPool.shutdown();
try {
while (!_hdfsLoaderPool.awaitTermination(5000, TimeUnit.MILLISECONDS)) {
LOG.info("waiting ... ");
}
} catch (InterruptedException e) {
}
LOG.info("hdfs loads threads terminated");
_hdfsLoaderPool = null;
}
}
/********************************************************************************************************/
// CacheItemCheckCallback
/********************************************************************************************************/
public static interface CacheItemCheckCallback {
public void cacheItemAvailable(String url,CacheItem item);
public void cacheItemNotFound(String url);
}
/**
* check if the specified url is present in the cache
*
* @param url
* @param callback
*/
public void checkCacheForItem(final String url,final CacheItemCheckCallback callback) {
try {
// get the normalized url
String normalizedURL = normalizeURL(url);
// get the fingerprint
long urlfp = URLFingerprint.generate64BitURLFPrint(normalizedURL);
// delegate to properly qualified method
checkCacheForItem(normalizedURL,urlfp,callback);
}
catch (MalformedURLException e) {
LOG.error(CCStringUtils.stringifyException(e));
queueCacheItemNotFoundCallback(callback, url);
}
}
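/* A minimal usage sketch (hypothetical caller) for the async lookup above; the
* callback is dispatched later via the event loop, so it should not block:
*
*   manager.checkCacheForItem("http://www.example.com/", new CacheItemCheckCallback() {
*     public void cacheItemAvailable(String url, CacheItem item) {
*       // item.getContent() holds the cached document bytes
*     }
*     public void cacheItemNotFound(String url) {
*       // fall back to a live fetch
*     }
*   });
*/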
/**
* check if the specified url is present in the cache - blocking variant for worker threads
*
* @param url
* @return the cached item, or null if not found
*/
public CacheItem checkCacheForItemInWorkerThread(final String url) {
try {
// get the normalized url
String normalizedURL = normalizeURL(url);
// get the fingerprint
long urlfp = URLFingerprint.generate64BitURLFPrint(normalizedURL);
// delegate to properly qualified method
return checkCacheForItemInWorkerThread(normalizedURL,urlfp);
}
catch (MalformedURLException e) {
LOG.error(CCStringUtils.stringifyException(e));
return null;
}
}
private static long dateFromCacheItem(CacheItem cacheItem) {
NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(cacheItem.getHeaderItems());
CrawlURLMetadata metadata = new CrawlURLMetadata();
try {
HttpHeaderInfoExtractor.parseHeaders(headers,metadata);
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
return 0;
}
if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPDATE)) {
return metadata.getHttpDate();
}
else if (metadata.isFieldDirty(CrawlURLMetadata.Field_LASTMODIFIEDTIME)) {
return metadata.getLastModifiedTime();
}
else if (metadata.isFieldDirty(CrawlURLMetadata.Field_EXPIRES)) {
return metadata.getExpires();
}
return 0;
}
/** check cache via fingerprint - this call blocks and should not be used in an async context
*
* @param urlFingerprint
* @param returnDate if true, return the approximate document date rather than a hit flag
* @return 0 if no matching document exists; otherwise the approximate document date (when returnDate is true) or 1
*/
public long checkCacheForFingerprint(long urlFingerprint,boolean returnDate) {
synchronized(this) {
for (CacheItem item : _writeQueue) {
if (item.getUrlFingerprint() == urlFingerprint) {
if (returnDate) {
long dateOut = dateFromCacheItem(item);
// if no date found, use current date as an approximate...
return (dateOut != 0) ? dateOut : System.currentTimeMillis();
}
else return 1;
}
}
}
synchronized (this) {
if (_fingerprintToLocalLogPos.get(urlFingerprint).size() != 0) {
// assume recent date as an approximate
return System.currentTimeMillis();
}
}
// now check hdfs indexes
ImmutableList<HDFSFileIndex> indexList = null;
synchronized(CacheManager.this) {
indexList = ImmutableList.copyOf(_hdfsIndexList);
}
// collect candidate dates from the hdfs indexes ...
TreeSet<Long> cachedItems = new TreeSet<Long>();
for (HDFSFileIndex index : Lists.reverse(indexList)) {
try {
CacheItem itemFound = index.findItem(urlFingerprint,!returnDate);
if (itemFound != null) {
if (returnDate) {
// get item date from headers ...
long itemDate = dateFromCacheItem(itemFound);
if (itemDate == 0) {
itemDate = index.getIndexTimestamp();
// if item date still 0, this is BAD !!!
if (itemDate == 0) {
LOG.error("!!!!!!UNABLE TO PARSE INDEX TIMESTAMP:" + index.getIndexDataPath());
itemDate = 1L;
}
}
// ok add it to the set ...
cachedItems.add(itemDate);
}
else {
return 1;
}
}
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
if (returnDate && cachedItems.size() != 0) {
return cachedItems.last();
}
return 0;
}
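/* A minimal usage sketch (hypothetical values, exception handling elided); this
* probe blocks on HDFS index reads, so call it from a worker thread, never from
* the event loop:
*
*   long fp = CacheManager.getFingerprintGivenURL("http://www.example.com/");
*   long result = manager.checkCacheForFingerprint(fp, true);
*   if (result != 0) {
*     // cached; result approximates the document date ("now" when unknown)
*   }
*/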
/**
* check cache for url by fingerprint
*
* @param normalizedURL
* @param urlFingerprint
* @param callback
*/
public void checkCacheForItem(final String normalizedURL,final long urlFingerprint,final CacheItemCheckCallback callback) {
// first check local item cache ...
CacheItem cachedItemOut = null;
synchronized(this) {
for (CacheItem item : _writeQueue) {
if (item.getUrlFingerprint() == urlFingerprint) {
cachedItemOut = item;
break;
}
}
}
// if found initiate immediate callback
if (cachedItemOut != null) {
queueCacheItemFoundCallback(callback,cachedItemOut);
}
else {
Long[] fpToItemCache = new Long[0];
synchronized(this) {
// now check local cache first ...
fpToItemCache = _fingerprintToLocalLogPos.get(urlFingerprint).toArray(fpToItemCache);
}
if (fpToItemCache.length != 0) {
queueLocalCacheLoadRequest(new CacheLoadRequest(normalizedURL,fpToItemCache,callback));
}
else {
// if not found ... check hdfs cache ...
queueHDFSCacheLoadRequest(new CacheLoadRequest(normalizedURL,urlFingerprint,callback));
}
}
}
/**
* check cache for url by fingerprint - blocking variant for worker threads
*
* @param normalizedURL
* @param urlFingerprint
* @return the cached item, or null if not found
*/
public CacheItem checkCacheForItemInWorkerThread(final String normalizedURL,final long urlFingerprint) {
// first check local item cache ...
CacheItem cachedItemOut = null;
synchronized(this) {
for (CacheItem item : _writeQueue) {
if (item.getUrlFingerprint() == urlFingerprint) {
cachedItemOut = item;
break;
}
}
}
// if found initiate immediate callback
if (cachedItemOut != null) {
// callback.cacheItemAvailable(cachedItemOut.getUrl(), cachedItemOut);
return cachedItemOut;
}
else {
Long[] fpToItemCache = new Long[0];
synchronized(this) {
// now check local cache first ...
fpToItemCache = _fingerprintToLocalLogPos.get(urlFingerprint).toArray(fpToItemCache);
}
if (fpToItemCache.length != 0) {
return queueLocalCacheLoadRequestInWorkerThread(new CacheLoadRequest(normalizedURL,fpToItemCache,null));
}
else {
// if not found ... check hdfs cache ...
return queueHDFSCacheLoadRequestInWorkerThread(new CacheLoadRequest(normalizedURL,urlFingerprint,null));
}
}
}
/**
* cache this item
*
* @param item - the item to cache
* @param optionalSemaphore an optional semaphore that will be passed back in completion callback when io operations complete
*/
public void cacheItem(CacheItem item,Semaphore optionalSemaphore) {
synchronized (this) {
_writeQueue.add(item);
_writeRequestQueue.add(new CacheWriteRequest(item.getUrlFingerprint(),item,optionalSemaphore));
}
}
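/* A minimal usage sketch (hypothetical caller): the optional semaphore is released
* in writeRequestComplete/writeRequestFailed once a writer thread finishes the io,
* so a caller can block for write durability like this:
*
*   Semaphore writeComplete = new Semaphore(0);
*   manager.cacheItem(item, writeComplete);
*   writeComplete.acquireUninterruptibly();
*/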
EventLoop getEventLoop() {
return _eventLoop;
}
long getLocalLogFilePos() {
long filePosOut = 0;
synchronized (_header) {
filePosOut = _header._fileSize;
}
return filePosOut;
}
byte[] getLocalLogSyncBytes() {
return _header._sync;
}
LinkedBlockingQueue<CacheWriteRequest> getWriteRequestQueue() {
return _writeRequestQueue;
}
File getLocalDataDirectory() {
return _localDataDirectory;
}
Path getRemoteDataDirectory() {
return _remoteDataDirectory;
}
/** startCacheWriterThread
*
*/
private void startCacheWriterThread() {
_writerThreads = new Thread[CACHE_WRITER_THREADS];
for (int i=0;i<CACHE_WRITER_THREADS;++i) {
_writerThreads[i] = new Thread(new CacheWriterThread(this));
_writerThreads[i].start();
}
}
/** startHDFSFlusherThread
*
*/
private void startHDFSFlusherThread() {
_hdfsFlusherThread = new Thread(new HDFSFlusherThread(this));
_hdfsFlusherThread.start();
}
/** load hdfs indexes
*
*/
private synchronized void loadHDFSIndexFiles() throws IOException {
//scan remote file system for index files ...
FileStatus indexFiles[] = getRemoteFileSystem().globStatus(new Path(getRemoteDataDirectory(),PROXY_CACHE_FILE_INDEX_PREFIX+"*"));
// iterate files
for (FileStatus indexFile : indexFiles) {
LOG.info("Found Remote Index File:" + indexFile.getPath() + " Scanning for valid local copy");
File localPath = new File(getLocalDataDirectory(),indexFile.getPath().getName());
if (!localPath.exists() || localPath.length() != indexFile.getLen()) {
LOG.info("Local Index File:" + localPath.getAbsolutePath() + " Not Found. Copying...");
getRemoteFileSystem().copyToLocalFile(indexFile.getPath(), new Path(localPath.getAbsolutePath()));
LOG.info("Remote Index File:" + indexFile.getPath() + " copied to:" + localPath.getAbsolutePath());
}
// extract timestamp from index name
long indexTimestamp = Long.parseLong(indexFile.getPath().getName().substring(PROXY_CACHE_FILE_INDEX_PREFIX.length() + 1));
// construct data file path
Path remoteDataPath = new Path(getRemoteDataDirectory(),PROXY_CACHE_FILE_DATA_PREFIX +"-"+indexTimestamp);
// now load the index ...
LOG.info("Loading Index from:" + localPath.getAbsolutePath() + " Data Path:" + remoteDataPath);
HDFSFileIndex indexObject = new HDFSFileIndex(getRemoteFileSystem(),localPath,remoteDataPath);
LOG.info("Loaded Index from:" + localPath.getAbsolutePath());
_hdfsIndexList.add(indexObject);
}
}
// shrink the log file by the desired amount and update the header
private final void flushLocalLog(final long bytesToRemove,final int itemsToRemove,final List<FingerprintAndOffsetTuple> flushedTupleList,final ArrayList<IndexDataFileTriple> tempFileTriples) {
LOG.info("Acquiring Log Access Semaphores");
// first boost this thread's priority ...
int originalThreadPriority = Thread.currentThread().getPriority();
Thread.currentThread().setPriority(Thread.MAX_PRIORITY);
// next acquire all permits to the local access log ... block until we get there ...
getLocalLogAccessSemaphore().acquireUninterruptibly(LOG_ACCESS_SEMAPHORE_COUNT);
// now that we have all the semaphores we need, reduce the thread's priority to normal
Thread.currentThread().setPriority(originalThreadPriority);
LOG.info("Acquired ALL Log Access Semaphores");
long timeStart = System.currentTimeMillis();
// now we have exclusive access to the local transaction log ...
File activeLogFilePath = getActiveLogFilePath();
File checkpointLogFilePath = getCheckpointLogFilePath();
try {
// delete checkpoint file if it existed ...
checkpointLogFilePath.delete();
// now rename activelog to checkpoint path
activeLogFilePath.renameTo(checkpointLogFilePath);
long logFileConsolidationStartTime = System.currentTimeMillis();
// now trap for exceptions in case something fails
try {
// fix up the header ...
_header._fileSize -= bytesToRemove;
_header._itemCount -= itemsToRemove;
// open the old and new files
RandomAccessFile newFile = new RandomAccessFile(activeLogFilePath,"rw");
RandomAccessFile oldFile = new RandomAccessFile(checkpointLogFilePath,"r");
LOG.info("Opened new and old files. New Header FileSize is:" + _header._fileSize + " ItemCount:" + _header._itemCount);
try {
// write out header ...
long bytesRemainingInLogFile = _header._fileSize;
LOG.info("Writing Header to New File. Bytes Remaining for Data are:" + bytesRemainingInLogFile);
// write header to new file ...
_header.writeHeader(newFile);
// decrement bytes available ...
bytesRemainingInLogFile -= LocalLogFileHeader.SIZE;
if (bytesRemainingInLogFile != 0) {
byte transferBuffer[] = new byte[(1 << 20) * 16];
LOG.info("Seeking old file past flushed data (pos:" + LocalLogFileHeader.SIZE + bytesToRemove + ")");
// seek past old data ...
oldFile.seek(LocalLogFileHeader.SIZE + bytesToRemove);
// and copy across remaining data
while (bytesRemainingInLogFile != 0) {
int bytesToReadWriteThisIteration = Math.min((int)bytesRemainingInLogFile,transferBuffer.length);
// readFully guards against short reads corrupting the copy
oldFile.readFully(transferBuffer, 0, bytesToReadWriteThisIteration);
newFile.write(transferBuffer,0,bytesToReadWriteThisIteration);
LOG.info("Copied " + bytesToReadWriteThisIteration + " from Old to New");
bytesRemainingInLogFile -= bytesToReadWriteThisIteration;
}
}
}
finally {
if (newFile != null) {
newFile.close();
}
if (oldFile != null) {
oldFile.close();
}
}
// if we reached here then the checkpoint was successful ...
LOG.info("Checkpoint - Log Consolidation Successful! TOOK:" + (System.currentTimeMillis() - logFileConsolidationStartTime));
LOG.info("Loading Index Files");
for (IndexDataFileTriple triple : tempFileTriples) {
LOG.info("Loading Index File:" + triple._localIndexFilePath);
final HDFSFileIndex fileIndex = new HDFSFileIndex(_remoteFileSystem,triple._localIndexFilePath,triple._dataFilePath);
LOG.info("Loaded Index File");
// update hdfs index list ...
synchronized (CacheManager.this) {
LOG.info("Adding HDFS Index to list");
_hdfsIndexList.addElement(fileIndex);
}
}
// create a semaphore to wait on
final Semaphore semaphore = new Semaphore(0);
LOG.info("Scheduling Async Event");
// now we need to schedule an async call to main thread to update data structures safely ...
_eventLoop.setTimer(new Timer(0,false,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
LOG.info("Cleaning Map");
synchronized (CacheManager.this) {
// walk tuples
for (FingerprintAndOffsetTuple tuple : flushedTupleList) {
//TODO: HACK!
// remove from collection ...
_fingerprintToLocalLogPos.removeAll(tuple._fingerprint);
}
}
LOG.info("Increment Offset Info");
// finally increment locallog offset by bytes removed ...
_localLogStartOffset += bytesToRemove;
LOG.info("Releasing Wait Semaphore");
// release wait semaphore
semaphore.release();
}
}));
LOG.info("Waiting for Async Event to Complete");
//wait for async operation to complete ...
semaphore.acquireUninterruptibly();
LOG.info("Async Event to Completed");
}
catch (IOException e){
LOG.error("Checkpoint Failed with Exception:" + CCStringUtils.stringifyException(e));
// delete new file ...
activeLogFilePath.delete();
// and rename checkpoint file to active file ...
checkpointLogFilePath.renameTo(activeLogFilePath);
}
}
finally {
LOG.info("Releasing ALL Log Access Semaphores. HELD FOR:" + (System.currentTimeMillis() - timeStart));
getLocalLogAccessSemaphore().release(LOG_ACCESS_SEMAPHORE_COUNT);
}
}
/** called by hdfs log flusher thread when a cache flush is complete
*
* @param request
* @param tupleListOut
* @param localIndexFileName
* @param remoteDataFileName
* @throws IOException
*/
void hdfsCacheFlushRequestComplete(CacheFlushRequest request,List<FingerprintAndOffsetTuple> tupleListOut,ArrayList<IndexDataFileTriple> tempFileList)throws IOException {
// ok we have been called from the hdfs worker thread, which is in the middle of a flush transaction
// and wants us to atomically update our cache based on its recent successful flush operation
flushLocalLog(request._bytesToFlush,request._itemsToFlush,tupleListOut,tempFileList);
_eventLoop.setTimer(new Timer(0,false,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
// reset hdfs flusher thread variable
_hdfsFlusherActive = false;
}
}));
}
/** called by hdfs log flusher thread if a cache flush request fails
*
* @param request
*/
public void hdfsCacheFlushRequestFailed(CacheFlushRequest request) {
LOG.error("HDFS Cache Flush Failed");
_eventLoop.setTimer(new Timer(0,false,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
// reset hdfs flusher thread variable
_hdfsFlusherActive = false;
}
}));
}
Semaphore getLocalLogAccessSemaphore() {
return _localLogAccessSemaphore;
}
Semaphore getLocalLogWriteAccessSemaphore() {
return _localLogWriteAccessSemaphore;
}
FileSystem getRemoteFileSystem() {
return _remoteFileSystem;
}
LinkedBlockingQueue<CacheFlushRequest> getHDFSFlushRequestQueue() {
return _hdfsFlushRequestQueue;
}
void writeRequestFailed(final CacheWriteRequest request,final IOException e) {
LOG.error("Failed to complete write request for Item:+ "+ request._item.getUrl() + " with Exception:" + CCStringUtils.stringifyException(e));
synchronized (CacheManager.this) {
// ok time to find this item in the write queue and move it to the long term position queue ...
_writeQueue.remove(request._item);
}
// ok finally, if completion semaphore is set... release it
if (request._optionalSemaphore != null) {
request._optionalSemaphore.release();
}
}
void writeRequestComplete(final CacheWriteRequest request,final long absoluteFilePosition) {
synchronized (CacheManager.this) {
// ok time to find this item in the write queue and move it to the long term position queue ...
_writeQueue.remove(request._item);
// now ... push it into long term lookup map ...
_fingerprintToLocalLogPos.put(request._itemFingerprint,_localLogStartOffset + absoluteFilePosition);
}
// ok finally, if completion semaphore is set... release it
if (request._optionalSemaphore != null) {
request._optionalSemaphore.release();
}
}
/**
* queueCacheItemFoundCallback - helper method used to dispatch async callback
*
* @param callback
* @param itemFound
*/
private void queueCacheItemFoundCallback(final CacheItemCheckCallback callback,final CacheItem itemFound) {
_eventLoop.setTimer(new Timer(0,false,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
callback.cacheItemAvailable(itemFound.getUrl(), itemFound);
}
}));
}
/**
* queueCacheItemNotFoundCallback - helper method used to dispatch async callback
*
* @param callback
* @param url
*/
private void queueCacheItemNotFoundCallback(final CacheItemCheckCallback callback,final String url) {
_eventLoop.setTimer(new Timer(0,false,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
callback.cacheItemNotFound(url);
}
}));
}
/**
* loadCacheItemFromDisk - load a single cache item from disk
*
* @param file
* @param optTargetURL
* @param location
* @return
* @throws IOException
*/
private CacheItem loadCacheItemFromDisk(FileInputStream file,String optTargetURL,long location) throws IOException {
long timeStart = System.currentTimeMillis();
// and read out the Item Header ...
CacheItemHeader itemHeader= new CacheItemHeader();
itemHeader.readHeader(new DataInputStream(file));
// see if it is valid ...
if (!Arrays.equals(itemHeader._sync, _header._sync)) {
LOG.error("### Item Lookup for URL:" + optTargetURL + " Record at:" + location + " failed - corrupt sync bytes detected!!!");
}
else {
CRC32 crc32 = new CRC32();
// ok deserialize the bytes ...
CacheItem item = new CacheItem();
CheckedInputStream checkedStream = new CheckedInputStream(file,crc32);
DataInputStream itemStream = new DataInputStream(checkedStream);
item.readFields(itemStream);
// read the content buffer length
int contentBufferLen = itemStream.readInt();
if (contentBufferLen != 0) {
byte data[] = new byte[contentBufferLen];
// readFully: a plain read() may return fewer bytes than requested
itemStream.readFully(data);
item.setContent(new Buffer(data));
}
// cache crc
long crcValueComputed = crc32.getValue();
// read disk crc
long crcValueOnDisk = itemStream.readLong();
// validate
if (crcValueComputed == crcValueOnDisk) {
String canonicalURL = URLUtils.canonicalizeURL(item.getUrl(),true);
if (optTargetURL.length() == 0 || optTargetURL.equals(canonicalURL)) {
if (isValidCacheItem(item)) {
LOG.info("### Item Lookup for URL:" + optTargetURL + " Record at:" + location + " completed in:" + (System.currentTimeMillis()-timeStart));
return item;
}
else {
LOG.info("### Item Lookup for URL:" + optTargetURL + " Record at:" + location + " failed with invalid result code");
}
}
else {
LOG.info("### Item Lookup for URL:" + optTargetURL + " Record at:" + location+ " failed with url mismatch. record url:" + item.getUrl());
}
}
else {
LOG.error("### Item Lookup for URL:" + optTargetURL + " Record at:" + location + " failed - crc mismatch!!!");
}
}
return null;
}
/** is this a valid cache item for servicing a query request
*
*/
private static boolean isValidCacheItem(CacheItem item) {
if ((item.getFlags() & (CacheItem.Flags.Flag_IsPermanentRedirect|CacheItem.Flags.Flag_IsTemporaryRedirect)) != 0) {
return true;
}
// parse response code in headers ...
CrawlURLMetadata metadata = new CrawlURLMetadata();
HttpHeaderInfoExtractor.parseStatusLine(item.getHeaderItems().get(0).getItemValue(), metadata);
if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPRESULTCODE)) {
if (metadata.getHttpResultCode() == 200 /*|| (metadata.getHttpResultCode()>=400 && metadata.getHttpResultCode() <500)*/) {
return true;
}
else {
LOG.info("Rejecting Cache Item:" + item.getUrl() + " With Invalid Result Code:" + metadata.getHttpResultCode()
+ " ActualValue:" + item.getHeaderItems().get(0).getItemValue());
}
}
return false;
}
/** queue up an hdfs cache load request
*
*/
private void queueHDFSCacheLoadRequest(final CacheLoadRequest loadRequest) {
_hdfsLoaderPool.submit(new ConcurrentTask<CacheItem>(_eventLoop,new Callable<CacheItem>() {
@Override
public CacheItem call() throws Exception {
LOG.info("Executing HDFS Index Search Thread for URL:" + loadRequest._targetURL);
ImmutableList<HDFSFileIndex> indexList = null;
synchronized(CacheManager.this) {
long timeStart = System.currentTimeMillis();
indexList = ImmutableList.copyOf(_hdfsIndexList);
long timeEnd = System.currentTimeMillis();
// LOG.info("#### TIMER - indexList Copy Took:" + (timeEnd-timeStart));
}
long timeStart = System.currentTimeMillis();
LOG.info("Starting Search of:" + indexList.size() + " hdfs indexes for fp:" + loadRequest._fingerprint);
for (HDFSFileIndex index : Lists.reverse(indexList)) {
CacheItem item = index.findItem(loadRequest._fingerprint,false);
if (item != null) {
LOG.info("Found Hit for fingerprint:" + loadRequest._fingerprint + " URL:" + item.getUrl() + " IN:" + (System.currentTimeMillis() - timeStart));
return item;
}
}
LOG.info("FAILED TO FIND Hit during for fingerprint:" + loadRequest._fingerprint + " IN:" + (System.currentTimeMillis() - timeStart));
return null;
}
},
new CompletionCallback<CacheItem>() {
@Override
public void taskComplete(CacheItem loadResult) {
if (loadResult != null && isValidCacheItem(loadResult)) {
LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Succeeded. Initiating Callback");
// reset pending to zero so no other load requests satisfy callback
loadRequest._callback.cacheItemAvailable(loadRequest._targetURL,loadResult);
}
else{
LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with No-Item-Found");
// if pending zero ... initiate failure callback
loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
}
}
@Override
public void taskFailed(Exception e) {
LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with FAILURE Reason:" + CCStringUtils.stringifyException(e));
// if pending zero ... initiate failure callback
loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
}
}));
}
/** execute an hdfs cache load request inline (worker thread variant)
*
*/
private CacheItem queueHDFSCacheLoadRequestInWorkerThread(final CacheLoadRequest loadRequest) {
// LOG.info("#### ENTERING queueHDFSCacheLoadRequestInWorkerThread");
try {
CacheItem loadResult = null;
// LOG.info("Executing HDFS Index Search Thread for URL:" + loadRequest._targetURL);
ImmutableList<HDFSFileIndex> indexList = null;
synchronized(CacheManager.this) {
indexList = ImmutableList.copyOf(_hdfsIndexList);
}
long timeStart = System.currentTimeMillis();
// LOG.info("Starting Search of:" + indexList.size() + " hdfs indexes for fp:" + loadRequest._fingerprint);
for (HDFSFileIndex index : Lists.reverse(indexList)) {
CacheItem item = index.findItem(loadRequest._fingerprint,false);
if (item != null) {
LOG.info("Found Hit for fingerprint:" + loadRequest._fingerprint + " URL:" + item.getUrl() + " IN:" + (System.currentTimeMillis() - timeStart));
loadResult = item;
break;
}
}
LOG.info("FAILED TO FIND Hit during for fingerprint:" + loadRequest._fingerprint + " IN:" + (System.currentTimeMillis() - timeStart));
if (loadResult != null && isValidCacheItem(loadResult)) {
// LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Succeeded. Initiating Callback");
// reset pending to zero so no other load requests satisfy callback
// loadRequest._callback.cacheItemAvailable(loadRequest._targetURL,loadResult);
return loadResult;
}
else{
// LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with No-Item-Found");
// if pending zero ... initiate failure callback
//loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
}
}
catch (Exception e) {
LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with FAILURE Reason:" + CCStringUtils.stringifyException(e));
// if pending zero ... initiate failure callback
// loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
}
finally {
//LOG.info("#### EXITING queueHDFSCacheLoadRequestInWorkerThread");
}
return null;
}
/**
* queue a cache load request via a background thread
*
* @param loadRequest
*/
private void queueLocalCacheLoadRequest(final CacheLoadRequest loadRequest) {
// queue up requests into the thread pool executor (for now)
for (final Long location : loadRequest._loacations) {
_cacheLoadThreadPool.submit(new ConcurrentTask<CacheItem>(_eventLoop, new Callable<CacheItem>() {
@Override
public CacheItem call() throws Exception {
//LOG.info("### Local Cache Loader Called. Acquiring Semaphore");
getLocalLogAccessSemaphore().acquireUninterruptibly();
//LOG.info("### Local Cache Loader Called. Acquired Semaphore");
// now set up an exception handler block to ensure that we release the semaphore
try {
//LOG.info("### Item Loading Item for URL:" + loadRequest._targetURL + " at Pos:" + location.longValue());
// now that we have acquired the semaphore ... validate position against current log file offset ...
if (location < _localLogStartOffset) {
LOG.error("### Load Request for Potentially Flushed Item. Location Request:" + location + " Current LogStartOffset:" + _localLogStartOffset);
}
else {
long timeStart = System.currentTimeMillis();
// we got a location ... initiate a disk read to fetch the serialized CacheItem
FileInputStream file = new FileInputStream(getActiveLogFilePath());
try {
// seek to item location ...
file.skip(location.longValue() - _localLogStartOffset);
return loadCacheItemFromDisk(file,loadRequest._targetURL,location.longValue());
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
finally {
// file.getFD().sync();
file.close();
}
}
return null;
}
finally {
//LOG.info("### Local Cache Loader Releasing Semaphore");
getLocalLogAccessSemaphore().release();
}
}
},new CompletionCallback<CacheItem>() {
@Override
public void taskComplete(CacheItem loadResult) {
if (loadResult != null) {
// LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Succeeded. Initiating Callback");
// reset pending to zero so no other load requests satisfy callback
loadRequest._pendingItemCount = 0;
loadRequest._callback.cacheItemAvailable(loadRequest._targetURL,loadResult);
}
else{
// on failure reduce pending count ...
loadRequest._pendingItemCount--;
if (loadRequest._pendingItemCount == 0) {
// LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with No-Item-Found");
// if pending zero ... initiate failure callback
loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
}
}
}
@Override
public void taskFailed(Exception e) {
// on failure reduce pending count ...
loadRequest._pendingItemCount--;
if (loadRequest._pendingItemCount == 0) {
// LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with FAILURE Reason:" + CCStringUtils.stringifyException(e));
// if pending zero ... initiate failure callback
loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
}
}
}));
}
}
/**
* execute a cache load request inline (worker thread variant)
*
* @param loadRequest
*/
private CacheItem queueLocalCacheLoadRequestInWorkerThread(final CacheLoadRequest loadRequest) {
// LOG.info("#### ENTERING queueLocalCacheLoadRequestInWorkerThread");
try {
CacheItem loadResult = null;
// queue up requests into the thread pool executor (for now)
for (final Long location : loadRequest._loacations) {
LOG.info("### Local Cache Loader Called. Acquiring Semaphore");
getLocalLogAccessSemaphore().acquireUninterruptibly();
LOG.info("### Local Cache Loader Called. Acquired Semaphore");
// now set up an exception handler block to ensure that we release the semaphore
try {
// LOG.info("### Item Loading Item for URL:" + loadRequest._targetURL + " at Pos:" + location.longValue());
// now that we have acquired the semaphore ... validate position against current log file offset ...
if (location < _localLogStartOffset) {
LOG.error("### Load Request for Potentially Flushed Item. Location Request:" + location + " Current LogStartOffset:" + _localLogStartOffset);
}
else {
long timeStart = System.currentTimeMillis();
// we got a location ... initiate a disk read to fetch the serialized CacheItem
FileInputStream file = new FileInputStream(getActiveLogFilePath());
try {
// seek to item location ...
file.skip(location.longValue() - _localLogStartOffset);
loadResult = loadCacheItemFromDisk(file,loadRequest._targetURL,location.longValue());
if (loadResult != null) {
break;
}
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
finally {
// file.getFD().sync();
file.close();
}
}
}
finally {
LOG.info("### Local Cache Loader Releasing Semaphore");
getLocalLogAccessSemaphore().release();
}
}
if (loadResult != null) {
LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Succeeded. Initiating Callback");
// reset pending to zero so no other load requests satisfy callback
loadRequest._pendingItemCount = 0;
// loadRequest._callback.cacheItemAvailable(loadRequest._targetURL,loadResult);
return loadResult;
}
else{
// on failure reduce pending count ...
loadRequest._pendingItemCount--;
if (loadRequest._pendingItemCount == 0) {
LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with No-Item-Found");
// if pending zero ... initiate failure callback
// loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
return null;
}
}
}
catch (Exception e) {
LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with FAILURE Reason:" + CCStringUtils.stringifyException(e));
// if pending zero ... initiate failure callback
//loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
}
finally {
//LOG.info("#### EXITING queueLocalCacheLoadRequestInWorkerThread");
}
return null;
}
/**
* normalize a url to be canonical
*
* @param url
* @return
* @throws MalformedURLException
*/
public static String normalizeURL(String url) throws MalformedURLException {
return URLUtils.canonicalizeURL(url,true);
}
/**
* get fingerprint given url
*/
public static long getFingerprintGivenURL(String url) throws MalformedURLException {
// get the normalized url
String normalizedURL = normalizeURL(url);
// get the fingerprint
return URLFingerprint.generate64BitURLFPrint(normalizedURL);
}
/**
* getActiveLogFilePath - get active log file path
*
* @return the active log file
*/
File getActiveLogFilePath() {
return new File(_localDataDirectory,"ActiveLog");
}
File getCheckpointLogFilePath() {
return new File(_localDataDirectory,"Checkpoint");
}
/**
* updateLogFileHeader - update the log file header
* called via the log file writer thread ...
* @throws IOException
*/
void updateLogFileHeader(File logFileName, long newlyAddedItemsCount, long newItemsFileSize)throws IOException {
RandomAccessFile file = new RandomAccessFile(logFileName,"rw");
try {
synchronized(_header) {
// update cached header ...
_header._fileSize += newItemsFileSize;
_header._itemCount += newlyAddedItemsCount;
// set the position at zero ..
file.seek(0);
// and write header to disk ...
_header.writeHeader(file);
}
}
finally {
// major bottleneck ...
// file.getFD().sync();
file.close();
}
}
/**
* initializeEmptyLogFile - init an empty log file header
*
* @param stream
* @return
* @throws IOException
*/
private static LocalLogFileHeader initializeEmptyLogFile(DataOutput stream) throws IOException {
LocalLogFileHeader header = new LocalLogFileHeader();
header.writeHeader(stream);
return header;
}
/**
* readLogFileHeader - from File
*
* @param logFileName
* @return
* @throws IOException
*/
private static LocalLogFileHeader readLogFileHeader(File logFileName) throws IOException {
LocalLogFileHeader headerOut = new LocalLogFileHeader();
RandomAccessFile file = new RandomAccessFile(logFileName,"r");
try {
headerOut = readLogFileHeader(file);
}
finally {
file.close();
}
return headerOut;
}
/**
* readLogFileHeader - from Stream
*
* @param reader
* @return
* @throws IOException
*/
private static LocalLogFileHeader readLogFileHeader(DataInput reader)throws IOException {
LocalLogFileHeader headerOut = new LocalLogFileHeader();
headerOut.readHeader(reader);
return headerOut;
}
/** initializeActiveLog - init local cache log
*
*
* **/
private void initializeActiveLog()throws IOException {
File activeLogPath = getActiveLogFilePath();
if (!activeLogPath.exists()) {
DataOutputStream outputStream = new DataOutputStream(new FileOutputStream(activeLogPath));
try {
_header = initializeEmptyLogFile(outputStream);
}
finally {
outputStream.close();
}
}
else {
_header = new LocalLogFileHeader();
DataInputStream inputStream = new DataInputStream(new FileInputStream(activeLogPath));
try {
_header.readHeader(inputStream);
}
finally {
inputStream.close();
}
if (_header._itemCount != 0) {
loadCache(activeLogPath,_header);
}
}
}
/**
* loadCache - load local cache from disk
* @param activeLogPath
* @param logFileHeader
* @throws IOException
*/
private synchronized void loadCache(File activeLogPath,LocalLogFileHeader logFileHeader) throws IOException {
RandomAccessFile file = new RandomAccessFile(getActiveLogFilePath(),"rw");
byte [] syncCheck = new byte[_header._sync.length];
try {
long lastValidPos = LocalLogFileHeader.SIZE;
long currentPos = lastValidPos;
long endPos = file.length();
CacheItemHeader itemHeader = new CacheItemHeader();
// start read
while (currentPos < endPos) {
if ((endPos - currentPos) < LocalLogFileHeader.SYNC_BYTES_SIZE)
break;
// seek to current position ...
file.seek(currentPos);
boolean headerLoadFailed = false;
try {
// read the item header ... assuming things are good so far ...
itemHeader.readHeader(file);
}
catch (IOException e) {
LOG.error("### Item Header Load Failed With Exception:" + CCStringUtils.stringifyException(e));
headerLoadFailed = true;
}
if (headerLoadFailed) {
LOG.error("### Item File Corrupt at position:" + currentPos +" Seeking Next Sync Point");
currentPos += LocalLogFileHeader.SYNC_BYTES_SIZE;
}
// if header sync bytes don't match .. then seek to next sync position ...
if (headerLoadFailed || !Arrays.equals(itemHeader._sync, _header._sync)) {
LOG.error("### Item File Corrupt at position:" + currentPos +" Seeking Next Sync Point");
// reseek to current pos
file.seek(currentPos);
// read in a sync.length buffer amount
file.readFully(syncCheck);
int syncLen = _header._sync.length;
// start scan for next sync position ...
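// (syncCheck acts as a rolling window over the stream: compare the sync pattern
// at each byte alignment and read one more byte per failed attempt, in the style
// of a SequenceFile recovery scan)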
for (int i = 0; file.getFilePointer() < endPos; i++) {
int j = 0;
for (; j < syncLen; j++) {
if (_header._sync[j] != syncCheck[(i+j)%syncLen])
break;
}
if (j == syncLen) {
file.seek(file.getFilePointer() - LocalLogFileHeader.SYNC_BYTES_SIZE); // position before sync
break;
}
syncCheck[i%syncLen] = file.readByte();
}
// whatever happened, the file pointer now marks the current position
currentPos = file.getFilePointer();
if (currentPos < endPos) {
LOG.info("### Item Loader Found another sync point at:" + currentPos);
}
else {
LOG.error("### No more sync points found!");
}
}
else {
// ok figure out next steps based on header ...
// for now, just add item to our list ...
_fingerprintToLocalLogPos.put(itemHeader._fingerprint,_localLogStartOffset + currentPos);
// now seek past data
currentPos += CacheItemHeader.SIZE + itemHeader._dataLength + ITEM_RECORD_TRAILING_BYTES;
}
}
}
finally {
if (file != null) {
file.close();
}
}
}
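/**
* readVLongFromByteBuffer - ByteBuffer analog of WritableUtils.readVLong; decodes a
* value written in Hadoop's variable-length integer format
*/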
public static long readVLongFromByteBuffer(ByteBuffer source) {
byte firstByte = source.get();
int len = WritableUtils.decodeVIntSize(firstByte);
if (len == 1) {
return firstByte;
}
long i = 0;
for (int idx = 0; idx < len-1; idx++) {
byte b = source.get();
i = i << 8;
i = i | (b & 0xFF);
}
return (WritableUtils.isNegativeVInt(firstByte) ? (i ^ -1L) : i);
}
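/**
* writeVLongToByteBuffer - ByteBuffer analog of WritableUtils.writeVLong; the first
* byte either holds the value itself (-112..127) or encodes the sign and the count
* of big-endian value bytes that follow
*/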
public static void writeVLongToByteBuffer(ByteBuffer stream, long i) throws IOException {
if (i >= -112 && i <= 127) {
stream.put((byte)i);
return;
}
int len = -112;
if (i < 0) {
i ^= -1L; // take one's complement
len = -120;
}
long tmp = i;
while (tmp != 0) {
tmp = tmp >> 8;
len--;
}
stream.put((byte)len);
len = (len < -120) ? -(len + 120) : -(len + 112);
for (int idx = len; idx != 0; idx--) {
int shiftbits = (idx - 1) * 8;
long mask = 0xFFL << shiftbits;
stream.put((byte)((i & mask) >> shiftbits));
}
}
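/* A minimal round-trip sketch of the two VLong helpers above (hypothetical values;
* the declared IOException is never actually thrown when the buffer has room):
*
*   ByteBuffer buffer = ByteBuffer.allocate(16);
*   CacheManager.writeVLongToByteBuffer(buffer, 1234567L);
*   buffer.flip();
*   long decoded = CacheManager.readVLongFromByteBuffer(buffer); // decoded == 1234567L
*/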
/********************************************************************************************************/
// TEST CODE
/********************************************************************************************************/
public static void main(String[] args) {
final EventLoop eventLoop = new EventLoop();
eventLoop.start();
final CacheManager manager = new CacheManager(eventLoop);
// delete active log if it exists ...
manager.getActiveLogFilePath().delete();
try {
manager.initialize(INIT_FLAG_SKIP_CACHE_WRITER_INIT | INIT_FLAG_SKIP_HDFS_WRITER_INIT);
} catch (IOException e1) {
LOG.error(CCStringUtils.stringifyException(e1));
return;
}
MessageDigest digester;
try {
digester = MessageDigest.getInstance("MD5");
} catch (NoSuchAlgorithmException e1) {
LOG.error(CCStringUtils.stringifyException(e1));
return;
}
final byte [] randomBytes = new byte[1 << 15];
LOG.info("Building Random Digest");
for (int i=0;i<randomBytes.length;i+=16) {
long time = System.nanoTime();
digester.update((new UID()+"@"+time).getBytes());
System.arraycopy(digester.digest(),0,randomBytes,i,16);
}
final Semaphore semaphore = new Semaphore(0);
if (args[0].equals("populate")) {
manager.startCacheWriterThread();
manager.startHDFSFlusherThread();
try {
LOG.info("Done Building Random Digest");
LOG.info("Writing Items To Disk");
for (int i=0;i<1000000;++i) {
if (i % 1000 == 0) {
LOG.info("Wrote:" + i + " entries");
}
final CacheItem item1 = new CacheItem();
item1.setUrl(normalizeURL("http://www.domain.com/foobar/" + i));
item1.setContent(new Buffer(randomBytes));
item1.setUrlFingerprint(URLFingerprint.generate64BitURLFPrint(item1.getUrl()));
manager.cacheItem(item1,null);
Thread.sleep(1);
if (i != 0 && i % 10000 == 0) {
LOG.info("Hit 10000 items.. sleeping for 20 seconds");
Thread.sleep(20 * 1000);
}
}
Thread.sleep(30000);
for (int i=0;i<1000000;++i) {
final String url = "http://www.domain.com/foobar/"+i;
manager.checkCacheForItem(url, new CacheItemCheckCallback() {
@Override
public void cacheItemAvailable(String url, CacheItem item) {
Assert.assertTrue(item.getUrl().equals(url));
String itemIndex = url.substring("http://www.domain.com/foobar/".length());
int itemNumber = Integer.parseInt(itemIndex);
if (itemNumber == 999999) {
semaphore.release();
}
}
@Override
public void cacheItemNotFound(String url) {
Assert.assertTrue(false);
}
});
}
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
catch (InterruptedException e2) {
}
}
else if (args[0].equals("read")) {
try {
final CacheItem item1 = new CacheItem();
item1.setUrl(normalizeURL("http://www.domain.com/barz/"));
item1.setUrlFingerprint(URLFingerprint.generate64BitURLFPrint(item1.getUrl()));
item1.setContent(new Buffer(randomBytes));
manager.cacheItem(item1,null);
// queue up cache load requests ....
for (int i=0;i<10000;++i) {
final String url = "http://www.domain.com/foobar/"+i;
eventLoop.setTimer(new Timer(1,false,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
manager.checkCacheForItem(url, new CacheItemCheckCallback() {
@Override
public void cacheItemAvailable(String url, CacheItem item) {
LOG.info("FOUND Item for URL:" + url + " ContentSize:" + item.getContent().getCount());
}
@Override
public void cacheItemNotFound(String url) {
LOG.info("DIDNOT Find Item for URL:" + url);
}
});
}
}));
}
eventLoop.setTimer(new Timer(1,false,new Timer.Callback() {
@Override
public void timerFired(Timer timer) {
manager.checkCacheForItem(item1.getUrl(), new CacheItemCheckCallback() {
@Override
public void cacheItemAvailable(String url, CacheItem item) {
LOG.info("FOUND Item for URL:" + url + " ContentSize:" + item.getContent().getCount());
}
@Override
public void cacheItemNotFound(String url) {
LOG.info("DIDNOT Find Item for URL:" + url);
}
});
}
}));
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
semaphore.acquireUninterruptibly();
}
}