/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.listcrawler;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.zip.CRC32;
import java.util.zip.CheckedOutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.apache.hadoop.record.Buffer;
import org.apache.log4j.BasicConfigurator;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.ProxyCrawlHistoryItem;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.rpc.base.shared.BinaryProtocol;
import org.commoncrawl.service.crawler.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CRC16;
import org.commoncrawl.util.FileUtils;
import org.junit.Assert;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
/**
* Class that encapsulates the state necessary to manage long term crawl history
*
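* A typical lifecycle, as a sketch (the paths and flags below are
* illustrative only):
*
* <pre>{@code
* EventLoop eventLoop = new EventLoop();
* eventLoop.start();
* FileSystem fs = FileSystem.get(CrawlEnvironment.getHadoopConfig());
* CrawlHistoryManager mgr = new CrawlHistoryManager(fs,
*     new Path("/crawl/history"), new File("/tmp/history"), eventLoop, 0);
* long listId = mgr.loadList(new File("/tmp/urls.txt"), 0);
* // ... the proxy calls mgr.crawlComplete(url) as fetches finish ...
* mgr.shutdown();
* }</pre>
*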
* @author rana
*
*/
public class CrawlHistoryManager implements CrawlHistoryStorage {
public static final Log LOG = LogFactory.getLog(CrawlHistoryManager.class);
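/** generic wrapper used to pass items (or a null shutdown sentinel) through the blocking queues **/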
private static class QueueItem<Type> {
public QueueItem(Type item) {
_item = item;
}
public Type _item;
}
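/**
* callback invoked by syncList for each history item matching the supplied
* fingerprint criteria. A minimal sketch (the result map is an assumed
* caller-owned collection):
*
* <pre>{@code
* final TreeMap<URLFP, ProxyCrawlHistoryItem> results = new TreeMap<URLFP, ProxyCrawlHistoryItem>();
* ItemUpdater collector = new ItemUpdater() {
*   public void updateItemState(URLFP fp, ProxyCrawlHistoryItem item) {
*     results.put(fp, item);
*   }
* };
* }</pre>
*/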
public static interface ItemUpdater {
public void updateItemState(URLFP fingerprint, ProxyCrawlHistoryItem item)
throws IOException;
}
/** path to log file directory **/
File _localLogFileDir;
/** remote data directory **/
Path _remoteDataDirectory;
/** event loop **/
EventLoop _eventLoop;
/** file system object **/
FileSystem _remoteFileSystem = null;
/** queue of pending crawl history updates **/
LinkedBlockingQueue<HistoryUpdateRequest> _historyUpdateQueue = new LinkedBlockingQueue<HistoryUpdateRequest>();
/** list loader queue **/
LinkedBlockingQueue<QueueItem<CrawlList>> _listLoaderQueue = new LinkedBlockingQueue<QueueItem<CrawlList>>();
/** crawl queue **/
LinkedBlockingQueue<QueueItem<CrawlList>> _queueLoaderQueue = new LinkedBlockingQueue<QueueItem<CrawlList>>();
/** shutdown flag **/
boolean _shutdown = false;
/** queue loader shutdown flag **/
boolean _queueLoaderShutdown = false;
/** cache writer thread **/
Thread _writerThread;
/** list loader thread **/
Thread _listLoaderThread;
/** queue loader thread **/
Thread _queueLoaderThread;
/** checkpoint access semaphore **/
Semaphore _checkpointSemaphore = new Semaphore(1);
/** last checkpoint time **/
long _lastCheckpointTime = -1;
/** crc16 - used to calculate individual payload crcs **/
private CRC16 _crc16in = new CRC16();
/** buffer used to store sync byte data during payload scan **/
private byte _syncByteBuffer[] = new byte[LocalLogFileHeader.SYNC_BYTES_SIZE];
/** payload Buffer object used to accumulate payload data for writes **/
Buffer _payloadBuffer = new Buffer();
/** data input buffer reused to read payload data **/
DataInputBuffer _payloadInputStream = new DataInputBuffer();
/** lists **/
TreeMap<Long, CrawlList> _crawlLists = new TreeMap<Long, CrawlList>();
public static final String CRAWL_HISTORY_HDFS_LOGFILE_PREFIX = "historyData-";
public static final String CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX = "historyBloomFilter-";
/** log file header **/
private LocalLogFileHeader _header = new LocalLogFileHeader();
/** local log item map **/
TreeMap<URLFP, ProxyCrawlHistoryItem> _localLogItems = new TreeMap<URLFP, ProxyCrawlHistoryItem>();
public static final int INIT_FLAG_SKIP_ACTIVE_LOG_FILE_INIT = 1;
public static final int INIT_FLAG_SKIP_LOAD_EXISTING_LISTS = 2;
public static final int INIT_FLAG_SKIP_LOG_WRITER_THREAD_INIT = 4;
public static final int INIT_FLAG_SKIP_LIST_LOADER_THREAD_INIT = 8;
public static final int INIT_FLAG_DISABLE_CHECKPOINTS = 16;
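/**
* active log record layout (see appendItemToLog / readItem): sync bytes,
* crc16 of the payload stored as an int (4 bytes), payload length stored as
* a short (2 bytes), then the payload itself
**/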
private static final int LOG_ITEM_HEADER_SIZE = LocalLogFileHeader.SYNC_BYTES_SIZE + 4 + 2;
private static final int POLL_WAIT_TIME = 5000;
public static final int DEFAULT_LOCAL_ITEM_CHECKPOINT_THRESHOLD = 100000;
private static int _checkpointThreshold = DEFAULT_LOCAL_ITEM_CHECKPOINT_THRESHOLD;
/**
* constructor
*
* @param remoteFileSystem file system used to store checkpointed history data
* @param remoteLogFileDir remote (HDFS) directory for checkpoint files
* @param localLogFileDir the path to the local log file directory
* @param eventLoop the async event loop
* @param initFlags a combination of the INIT_FLAG_* constants
* @throws IOException
*/
public CrawlHistoryManager(FileSystem remoteFileSystem,
Path remoteLogFileDir, File localLogFileDir, EventLoop eventLoop,
int initFlags) throws IOException {
this._eventLoop = eventLoop;
this._remoteFileSystem = remoteFileSystem;
this._remoteDataDirectory = remoteLogFileDir;
this._localLogFileDir = localLogFileDir;
LOG.info("*** LOCAL DATA DIR:" + _localLogFileDir);
initialize(initFlags);
}
private void initialize(int initFlags) throws IOException {
// initialize the local log file ...
if ((initFlags & INIT_FLAG_SKIP_ACTIVE_LOG_FILE_INIT) == 0) {
initializeActiveLog();
}
// load pre-existing lists
if ((initFlags & INIT_FLAG_SKIP_LOAD_EXISTING_LISTS) == 0) {
loadExistingLists();
}
// start log writer thread ...
if ((initFlags & INIT_FLAG_SKIP_LOG_WRITER_THREAD_INIT) == 0) {
startLogWriterThread(initFlags);
}
// start list loader thread ...
if ((initFlags & INIT_FLAG_SKIP_LIST_LOADER_THREAD_INIT) == 0) {
startListLoaderThread();
}
}
/**
* shutdown the log properly
*/
public void shutdown() {
_shutdown = true;
stopQueueLoaderThread();
_historyUpdateQueue.add(new HistoryUpdateRequest());
_listLoaderQueue.add(new QueueItem<CrawlList>(null));
try {
if (_writerThread != null) {
_writerThread.join();
}
if (_listLoaderThread != null) {
_listLoaderThread.join();
}
} catch (InterruptedException e1) {
}
_writerThread = null;
_listLoaderThread = null;
_historyUpdateQueue.clear();
_listLoaderQueue.clear();
_shutdown = false;
}
/**
*
* @return the local data directory
*/
public File getLocalDataDir() {
return _localLogFileDir;
}
/**
* add a new url list to the queue
*
* @param dataFilePath
* - the path to the file containing a list of urls
* @param refreshInterval
* - the refresh interval to associate with the list
* @return a unique list id that can be used to identify the list
* @throws IOException
*/
public long loadList(File dataFilePath, int refreshInterval) throws IOException {
long listId = System.currentTimeMillis();
// create a placeholder list
CrawlList list = CrawlList.createListLoadingInLoadingState(this, listId,
dataFilePath,refreshInterval);
// add it to the map
synchronized (_crawlLists) {
_crawlLists.put(listId, list);
}
// add it to the loader queue ...
_listLoaderQueue.add(new QueueItem<CrawlList>(list));
return listId;
}
/**
* retrieve the list object associated with the given id
*
* @param listId
* @return
*/
public CrawlList getList(long listId) {
synchronized (_crawlLists) {
return _crawlLists.get(listId);
}
}
/**
*
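* Replays persisted crawl history (the HDFS checkpoint files plus the
* in-memory local log) against the supplied fingerprint set, invoking the
* updater once per match. A minimal usage sketch (the list id and the url
* are illustrative only):
*
* <pre>{@code
* TreeSet<URLFP> wanted = new TreeSet<URLFP>();
* wanted.add(URLUtils.getURLFPFromURL("http://example.com/", true));
* historyManager.syncList(42L, wanted, updater);
* }</pre>
*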
* @param listId the id of the list being synced
* @param matchCriteria set of url fingerprints to match against
* @param targetList the updater invoked for each matching history item
* @throws IOException
*/
@Override
public void syncList(final long listId, TreeSet<URLFP> matchCriteria,
ItemUpdater targetList) throws IOException {
// first grab last update time ...
long lastUpdateTimePreScan = -1;
Set<Long> processedItems = new HashSet<Long>();
LOG.info("LIST:" + listId
+ " iterateCrawlHistoryLog - iterating hdfs log files");
boolean exitLoop = false;
do {
synchronized (this) {
lastUpdateTimePreScan = _lastCheckpointTime;
}
// ok now start to iterate items in the checkpoint directory
Path wildcardPattern = new Path(_remoteDataDirectory,
CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + "*");
FileStatus candidates[] = _remoteFileSystem.globStatus(wildcardPattern);
for (FileStatus candidate : candidates) {
Path candidatePath = candidate.getPath();
String candidateName = candidatePath.getName();
long candidateTimestamp = Long.parseLong(candidateName
.substring(CRAWL_HISTORY_HDFS_LOGFILE_PREFIX.length()));
if (candidateTimestamp <= lastUpdateTimePreScan
|| lastUpdateTimePreScan == -1) {
if (!processedItems.contains(candidateTimestamp)) {
LOG.info("LIST:" + listId
+ " iterateCrawlHistoryLog - iterating hdfs file:"
+ candidateName);
// go ahead and process this candidate ...
iterateHDFSCrawlHistoryLog(listId, candidateTimestamp,
matchCriteria, targetList);
// add to set
processedItems.add(candidateTimestamp);
}
}
}
// now acquire checkpoint semaphore
LOG.info("LIST:" + listId
+ " iterateCrawlHistoryLog - acquiring semaphore");
_checkpointSemaphore.acquireUninterruptibly();
try {
// check to see if checkpoint time has not changed
if (_lastCheckpointTime == lastUpdateTimePreScan) {
// ok checkpoint time has not changed since our previous attempt to
// check it
exitLoop = true;
if (_localLogItems.size() != 0) {
// go ahead and process any in memory items against the criteria ...
for (URLFP candidate : matchCriteria) {
ProxyCrawlHistoryItem item = _localLogItems.get(candidate);
// if found call match action
if (item != null) {
targetList.updateItemState(candidate, item);
}
}
}
}
} finally {
_checkpointSemaphore.release();
}
} while (!exitLoop);
}
// take a remote crawl history log file and cache it locally
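// cached file layout (as written by the code below): a leading crc32 of the
// body (8 bytes), then repeated records of [url hash (8 bytes)]
// [vint uncompressed-value length][value bytes]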
private void cacheCrawlHistoryLog(File localCacheDir, long timestamp)
throws IOException {
Path mapFilePath = new Path(_remoteDataDirectory,
CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
Path dataFilePath = new Path(mapFilePath, "data");
File cacheFilePath = new File(localCacheDir,
CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
SequenceFile.Reader dataReader = new SequenceFile.Reader(
_remoteFileSystem, dataFilePath, CrawlEnvironment.getHadoopConfig());
ValueBytes valueBytes = dataReader.createValueBytes();
DataOutputBuffer keyBytes = new DataOutputBuffer();
DataInputBuffer keyBuffer = new DataInputBuffer();
DataOutputBuffer finalOutputStream = new DataOutputBuffer();
DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer();
URLFP fp = new URLFP();
try {
while (dataReader.nextRaw(keyBytes, valueBytes) != -1) {
keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength());
// read fingerprint ...
fp.readFields(keyBuffer);
// write hash only
finalOutputStream.writeLong(fp.getUrlHash());
uncompressedValueBytes.reset();
// write value bytes to intermediate buffer ...
valueBytes.writeUncompressedBytes(uncompressedValueBytes);
// write out uncompressed length
WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes
.getLength());
// write out bytes
finalOutputStream.write(uncompressedValueBytes.getData(), 0,
uncompressedValueBytes.getLength());
// reset the key buffer - nextRaw appends to it, so keys would otherwise
// accumulate across iterations
keyBytes.reset();
}
// delete existing ...
cacheFilePath.delete();
// compute crc ...
CRC32 crc = new CRC32();
crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength());
// open final output stream
DataOutputStream fileOutputStream = new DataOutputStream(
new BufferedOutputStream(new FileOutputStream(cacheFilePath)));
try {
fileOutputStream.writeLong(crc.getValue());
fileOutputStream.write(finalOutputStream.getData(), 0,
finalOutputStream.getLength());
fileOutputStream.flush();
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
fileOutputStream.close();
fileOutputStream = null;
cacheFilePath.delete();
throw e;
} finally {
if (fileOutputStream != null) {
fileOutputStream.close();
}
}
} finally {
dataReader.close();
}
}
private void iterateHDFSCrawlHistoryLog(long listId, long timestamp,
TreeSet<URLFP> criteria, ItemUpdater targetList) throws IOException {
// ok copy stuff locally if possible ...
File localIndexPath = new File(getLocalDataDir(),
CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".index");
File localDataPath = new File(getLocalDataDir(),
CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".data");
File localBloomFilterPath = new File(getLocalDataDir(),
CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".bloom");
SequenceFile.Reader reader = null;
Path mapFilePath = new Path(_remoteDataDirectory,
CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
Path indexFilePath = new Path(mapFilePath, "index");
Path dataFilePath = new Path(mapFilePath, "data");
Path bloomFilePath = new Path(_remoteDataDirectory,
CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + timestamp);
// ok copy local first
if (!localIndexPath.exists()) {
LOG.info("LIST:" + listId + " Copying Index File:" + indexFilePath
+ " to Local:" + localIndexPath.getAbsolutePath());
try {
_remoteFileSystem.copyToLocalFile(indexFilePath, new Path(
localIndexPath.getAbsolutePath()));
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
localIndexPath.delete();
throw e;
}
}
if (!localDataPath.exists()) {
LOG.info("LIST:" + listId + " Copying Data File:" + dataFilePath
+ " to Local:" + localDataPath.getAbsolutePath());
try {
_remoteFileSystem.copyToLocalFile(dataFilePath, new Path(localDataPath
.getAbsolutePath()));
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
localDataPath.delete();
throw e;
}
}
if (!localBloomFilterPath.exists()) {
LOG.info("LIST:" + listId + " Copying Bloom File:" + bloomFilePath
+ " to Local:" + localBloomFilterPath.getAbsolutePath());
try {
_remoteFileSystem.copyToLocalFile(bloomFilePath, new Path(
localBloomFilterPath.getAbsolutePath()));
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
localBloomFilterPath.delete();
throw e;
}
}
// ok open local
FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment
.getHadoopConfig());
SequenceFile.Reader indexReader = new SequenceFile.Reader(localFileSystem,
new Path(localIndexPath.getAbsolutePath()), CrawlEnvironment
.getHadoopConfig());
try {
URLFP firstIndexKey = null;
URLFP lastIndexKey = new URLFP();
LongWritable position = new LongWritable();
while (indexReader.next(lastIndexKey, position)) {
if (firstIndexKey == null) {
try {
firstIndexKey = (URLFP) lastIndexKey.clone();
} catch (CloneNotSupportedException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
LOG.info("LIST:" + listId + " ### Index First Domain:"
+ firstIndexKey.getDomainHash() + " URLHash:"
+ firstIndexKey.getUrlHash() + " Last Domain:"
+ lastIndexKey.getDomainHash() + " URLHash:"
+ lastIndexKey.getUrlHash());
URLFP criteriaFirstKey = criteria.first();
URLFP criteriaLastKey = criteria.last();
if (firstIndexKey.compareTo(criteriaLastKey) > 0
|| lastIndexKey.compareTo(criteriaFirstKey) < 0) {
LOG.info("LIST:" + listId + " Entire Index is Out of Range. Skipping!");
LOG.info("LIST:" + listId + " ### Criteria First Domain:"
+ criteriaFirstKey.getDomainHash() + " URLHash:"
+ criteriaFirstKey.getUrlHash() + " Last Domain:"
+ criteriaLastKey.getDomainHash() + " URLHash:"
+ criteriaLastKey.getUrlHash());
return;
}
} finally {
indexReader.close();
}
LOG.info("LIST:" + listId + " ### Index:" + timestamp
+ " Passed Test. Doing Full Scan");
// load bloom filter
FSDataInputStream bloomFilterStream = localFileSystem.open(new Path(
localBloomFilterPath.getAbsolutePath()));
int hitCount = 0;
try {
URLFPBloomFilter filter = URLFPBloomFilter.load(bloomFilterStream);
URLFP fpOut = new URLFP();
ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer();
ValueBytes valueBytes = null;
DataInputBuffer valueReader = new DataInputBuffer();
DataOutputBuffer keyBytes = new DataOutputBuffer();
DataInputBuffer keyReader = new DataInputBuffer();
URLFP lastFP = null;
outerLoop:
// now iterate each item in the criteria
for (URLFP targetFP : criteria) {
// if fingerprint is present in filter ...
if (filter.isPresent(targetFP)) {
// check to see if reader is initialized ...
if (reader == null) {
LOG.info("LIST:" + listId
+ " BloomFilter First Hit. Initializing Reader for file at:"
+ localDataPath.getAbsolutePath());
reader = new SequenceFile.Reader(localFileSystem, new Path(
localDataPath.getAbsolutePath()), CrawlEnvironment
.getHadoopConfig());
LOG.info("LIST:" + listId
+ " BloomFilter First Hit. Initialized Reader for file at:"
+ localDataPath.getAbsolutePath());
valueBytes = reader.createValueBytes();
}
// if last read fingerprint was not null ...
if (lastFP != null) {
// does it match the current item
if (lastFP.compareTo(targetFP) == 0) {
// decompress value bytes ...
valueBytesUncompressed.reset();
valueBytes.writeUncompressedBytes(valueBytesUncompressed);
// init valueReader
valueReader.reset(valueBytesUncompressed.getData(),
valueBytesUncompressed.getLength());
itemOut.readFields(valueReader);
LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:"
+ +lastFP.getUrlHash() + " File:" + dataFilePath);
// if so, null out last fp
lastFP = null;
// and update item state ...
targetList.updateItemState(targetFP, itemOut);
hitCount++;
continue;
}
}
// ok at this point .. read the next item in the list ...
lastFP = null;
while (reader.nextRaw(keyBytes, valueBytes) != -1) {
// init reader ...
keyReader.reset(keyBytes.getData(), keyBytes.getLength());
// read key
fpOut.readFields(keyReader);
// reset output buffer
keyBytes.reset();
// LOG.info("LIST:" + listId +" nextRaw Returned DH:" +
// fpOut.getDomainHash() + " UH:" + fpOut.getUrlHash() + " TDH:" +
// targetFP.getDomainHash() + " TUH:" + targetFP.getUrlHash());
// compare it to target ...
int result = fpOut.compareTo(targetFP);
// ok does it match .. ?
if (result == 0) {
// decompress value bytes ...
valueBytesUncompressed.reset();
valueBytes.writeUncompressedBytes(valueBytesUncompressed);
// init valueReader
valueReader.reset(valueBytesUncompressed.getData(),
valueBytesUncompressed.getLength());
itemOut.readFields(valueReader);
LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:"
+ fpOut.getUrlHash() + " File:" + dataFilePath);
// update item state ...
targetList.updateItemState(targetFP, itemOut);
hitCount++;
// and break to outer loop
continue outerLoop;
} else if (result > 0) {
// LOG.info("LIST:" + listId +
// " FP Comparison Returned Positive. Going to OuterLoop");
// update last FP
lastFP = fpOut;
// continue outer loop
continue outerLoop;
} else {
// otherwise skip
}
}
// ok if we got here .. we are done reading the sequence file and did
// not find a trailing match
LOG.warn("LIST:" + listId
+ " ### Reached End Of File Searching for item in MapFile while BloomFilter returned positive result (DomainHash:"
+ targetFP.getDomainHash() + " FP:" + targetFP.getUrlHash() + ")");
// break out of outer loop
break;
}
}
} finally {
bloomFilterStream.close();
if (reader != null) {
reader.close();
}
LOG.info("LIST:" + listId + " File:" + dataFilePath + " DONE. HitCount:"
+ hitCount);
}
}
/**
* callback initiated by the proxy server on a per url basis; translates the
* crawl result into one history item, or two when the crawl was redirected
*
* @param url
* the completed crawl url object
*/
public void crawlComplete(CrawlURL url) {
ProxyCrawlHistoryItem historyItem;
if ((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
historyItem = new ProxyCrawlHistoryItem();
historyItem.setCrawlStatus(0);
historyItem.setLastModifiedTime(System.currentTimeMillis());
historyItem.setOriginalURL(url.getUrl());
historyItem.setHttpResultCode(url.getOriginalResultCode());
historyItem.setRedirectURL(url.getRedirectURL());
if (url.isFieldDirty(CrawlURL.Field_RESULTCODE))
historyItem.setRedirectHttpResult(url.getResultCode());
historyItem
.setRedirectStatus((url.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0
: url.getLastAttemptFailureReason());
// add the original url pointing to the final url to the queue
_historyUpdateQueue.add(new HistoryUpdateRequest(historyItem));
// ok, now create an entry for the redirected url
historyItem = new ProxyCrawlHistoryItem();
historyItem
.setCrawlStatus((url.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0
: url.getLastAttemptFailureReason());
historyItem.setLastModifiedTime(System.currentTimeMillis());
historyItem.setOriginalURL(url.getRedirectURL());
if (url.isFieldDirty(CrawlURL.Field_RESULTCODE))
historyItem.setHttpResultCode(url.getResultCode());
// and add it to the queue
_historyUpdateQueue.add(new HistoryUpdateRequest(historyItem));
} else {
// not redirected ... create a single entry for the url
historyItem = new ProxyCrawlHistoryItem();
historyItem
.setCrawlStatus((url.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0
: url.getLastAttemptFailureReason());
historyItem.setLastModifiedTime(System.currentTimeMillis());
historyItem.setOriginalURL(url.getUrl());
if (url.isFieldDirty(CrawlURL.Field_RESULTCODE))
historyItem.setHttpResultCode(url.getResultCode());
// and add it to the queue
_historyUpdateQueue.add(new HistoryUpdateRequest(historyItem));
}
}
/**
* startLogWriterThread - start the log writer thread
*
*/
private void startLogWriterThread(int initFlags) {
_writerThread = new Thread(new LogWriterThread(initFlags));
_writerThread.start();
}
/**
* getActiveLogFilePath - get the active log file path
*
* @return the active log file within the local data directory
*/
private File getActiveLogFilePath() {
return new File(_localLogFileDir, "ActiveLog");
}
/**
* initializeEmptyLogFile - init an empty log file header
*
* @param stream
* @return
* @throws IOException
*/
private static LocalLogFileHeader initializeEmptyLogFile(DataOutput stream)
throws IOException {
LocalLogFileHeader header = new LocalLogFileHeader();
header.writeHeader(stream);
return header;
}
/**
* initializeActiveLog - init the local cache log
*
*
* **/
private void initializeActiveLog() throws IOException {
File activeLogPath = getActiveLogFilePath();
if (!activeLogPath.exists()) {
DataOutputStream outputStream = new DataOutputStream(
new FileOutputStream(activeLogPath));
try {
_header = initializeEmptyLogFile(outputStream);
} finally {
outputStream.close();
}
} else {
_header = new LocalLogFileHeader();
DataInputStream inputStream = new DataInputStream(new FileInputStream(
activeLogPath));
try {
_header.readHeader(inputStream);
} finally {
inputStream.close();
}
if (_header._itemCount != 0) {
_localLogItems = loadLocalLogItemMap();
}
}
}
/**
* updateLogFileHeader - update the log file header; called from the log
* writer thread ...
*
* @throws IOException
*/
void updateLogFileHeader(File logFileName, long newlyAddedItemsCount,
long newItemsFileSize) throws IOException {
RandomAccessFile file = new RandomAccessFile(logFileName, "rw");
try {
synchronized (_header) {
// update cached header ...
_header._fileSize += newItemsFileSize;
_header._itemCount += newlyAddedItemsCount;
// set the position at zero ..
file.seek(0);
// and write header to disk ...
_header.writeHeader(file);
}
} finally {
// note: file.getFD().sync() here proved to be a major bottleneck, so we
// rely on close() alone
// file.getFD().sync();
file.close();
}
}
/**
* get local log position according to cached header
*
* @return
*/
long getLocalLogFilePos() {
long filePosOut = 0;
synchronized (_header) {
filePosOut = _header._fileSize;
}
return filePosOut;
}
/**
* return the header sync bytes
*
* @return header sync bytes
*/
byte[] getLocalLogSyncBytes() {
return _header._sync;
}
/**
*
* @return a sorted map of urlfp to item
* @throws IOException
*/
TreeMap<URLFP, ProxyCrawlHistoryItem> loadLocalLogItemMap()
throws IOException {
TreeMap<URLFP, ProxyCrawlHistoryItem> itemMap = new TreeMap<URLFP, ProxyCrawlHistoryItem>();
LOG.info("Reading Local Log File");
RandomAccessFile file = new RandomAccessFile(getActiveLogFilePath(), "rw");
// valid length indicator ...
long validLength = 0;
try {
// skip header ...
file.seek(LocalLogFileHeader.SIZE);
validLength = file.getFilePointer();
// ok walk n items ...
for (int itemIdx = 0; itemIdx < _header._itemCount
&& file.getChannel().position() <= _header._fileSize; ++itemIdx) {
try {
ProxyCrawlHistoryItem item = readItem(file);
// update valid length ...
validLength = file.getFilePointer();
// ok compute fingerprint for item ...
URLFP fingerprintObject = URLUtils.getURLFPFromURL(item
.getOriginalURL(), true);
if (fingerprintObject == null) {
LOG.error("Could not compute fingerprint for URL:"
+ item.getOriginalURL());
} else {
itemMap.put(fingerprintObject, item);
}
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
try {
if (!seekToNextSyncBytesPos(file)) {
LOG.error("Hit EOF While Seeking for next SyncByte Sequence!");
break;
} else {
LOG.info("Seek to Next SyncByte Succeeded! Continuing Load");
}
} catch (IOException e2) {
LOG.error(CCStringUtils.stringifyException(e2));
LOG.error("Got IO Exception Reading SyncBytes - Bailing!");
break;
}
}
}
} finally {
if (file.length() > validLength) {
LOG.warn("File Length is:" + file.length() + " Truncating Length to:"
+ validLength);
file.setLength(validLength);
}
file.close();
}
LOG.info("Done Reading Local Log File");
return itemMap;
}
private ProxyCrawlHistoryItem readItem(RandomAccessFile fileStream)
throws IOException {
try {
// read sync bytes (readFully so a short read surfaces as an EOFException) ...
fileStream.readFully(_syncByteBuffer);
// validate ...
if (!Arrays.equals(_header._sync, _syncByteBuffer)) {
throw new IOException("Error Reading Sync Bytes for Item In Checkpoint");
}
int checksum = fileStream.readInt();
int payloadSize = fileStream.readShort();
if (payloadSize == 0) {
throw new IOException("Invalid Payload Size Reading Item In Checkpoint");
}
// read the payload in full
_payloadBuffer.setCapacity(payloadSize);
fileStream.readFully(_payloadBuffer.get(), 0, payloadSize);
_crc16in.reset();
_crc16in.update(_payloadBuffer.get(), 0, payloadSize);
// if computed checksum does not match file checksum !!!
if (_crc16in.getValue() != (long) checksum) {
throw new IOException("Checksum Mismatch Expected:" + checksum
+ " got:" + _crc16in.getValue() + " while Reading Item");
}
_payloadInputStream.reset(_payloadBuffer.get(), 0, payloadSize);
ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
itemOut.deserialize(_payloadInputStream, new BinaryProtocol());
return itemOut;
} catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
throw new IOException(e);
}
}
/**
* seek out the next instance of the sync bytes in the file input stream
*
* @param file
* @return true if the sync bytes were found, false if EOF was hit first
* @throws IOException
*/
private boolean seekToNextSyncBytesPos(RandomAccessFile file)
throws IOException {
// read in a sync.length buffer amount
file.read(_syncByteBuffer);
int syncLen = _header._sync.length;
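// the scan treats _syncByteBuffer as a ring buffer: on iteration i the
// logical window starts at index (i % syncLen), so the header's sync
// pattern is compared against the window via modular indexing and, on a
// miss, the oldest byte is overwritten with the next byte from the file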
// start scan for next sync position ...
for (int i = 0; file.getFilePointer() < _header._fileSize; i++) {
int j = 0;
for (; j < syncLen; j++) {
if (_header._sync[j] != _syncByteBuffer[(i + j) % syncLen])
break;
}
if (j == syncLen) {
// found matching sync bytes - reset file pos to before sync bytes
file.seek(file.getFilePointer() - LocalLogFileHeader.SYNC_BYTES_SIZE);
return true;
}
_syncByteBuffer[i % syncLen] = file.readByte();
}
return false;
}
void doCheckpoint() {
LOG.info("Starting Checkpoint Process");
try {
LOG.info("Writing HDFS Log File");
// write local file contents to hdfs
// we don't need to lock the map here because only the log writer thread
// (the current thread) modifies the map ...
long checkpointTimestamp = writeMapFileToHDFS(_localLogItems);
// ok that worked .. delete local log
getActiveLogFilePath().delete();
// ok now we DO NEED TO lock the map
synchronized (_localLogItems) {
_localLogItems.clear();
}
LOG.info("Regenerating Local Log");
// and regenerate the file
initializeActiveLog();
synchronized (CrawlHistoryManager.this) {
// ok now update checkpoint timestamp variable
_lastCheckpointTime = checkpointTimestamp;
}
LOG.info("Checkpoint Done");
} catch (IOException e) {
LOG.error("CrawlHistoryLog Checkpoint Failed with Exception:"
+ CCStringUtils.stringifyException(e));
}
}
long writeMapFileToHDFS(TreeMap<URLFP, ProxyCrawlHistoryItem> itemMap)
throws IOException {
long checkpointTimestamp = System.currentTimeMillis();
Path mapOutputPath = new Path(_remoteDataDirectory,
CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + checkpointTimestamp);
Path filterOutputPath = new Path(_remoteDataDirectory,
CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + checkpointTimestamp);
writeCheckpoint(itemMap, CrawlEnvironment.getHadoopConfig(),
_remoteFileSystem, mapOutputPath, filterOutputPath);
return checkpointTimestamp;
}
public static void writeCheckpoint(
TreeMap<URLFP, ProxyCrawlHistoryItem> itemMap, Configuration conf,
FileSystem remoteFileSystem, Path mapOutputPath, Path filterOutputPath)
throws IOException {
try {
LOG.info("Generating Map File at Location:" + mapOutputPath
+ " Filter At:" + filterOutputPath);
// open a temporary hdfs streams ...
MapFile.Writer writer = new MapFile.Writer(conf, remoteFileSystem,
mapOutputPath.toString(), URLFP.class, ProxyCrawlHistoryItem.class);
// create a bloom filter
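// the first argument scales the filter to twice the checkpoint threshold;
// the remaining arguments are the filter's tuning parameters, kept as used
// elsewhere in this service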
URLFPBloomFilter filter = new URLFPBloomFilter(_checkpointThreshold * 2,
10, 11);
try {
for (Map.Entry<URLFP, ProxyCrawlHistoryItem> entry : itemMap.entrySet()) {
LOG.info("Writing Key to Map. DomainHash:"
+ entry.getKey().getDomainHash() + " URLHash:"
+ entry.getKey().getUrlHash());
filter.add(entry.getKey());
writer.append(entry.getKey(), entry.getValue());
}
} finally {
writer.close();
}
LOG.info("Done generating Map File");
LOG.info("Writing Bloom Filter Data");
// ok now also flush the bloom filter
FSDataOutputStream bloomFilterOutputStream = remoteFileSystem
.create(filterOutputPath);
try {
filter.serialize(bloomFilterOutputStream);
} finally {
bloomFilterOutputStream.flush();
bloomFilterOutputStream.close();
}
} catch (IOException e) {
// delete all relevant files ...
remoteFileSystem.delete(mapOutputPath, true);
remoteFileSystem.delete(filterOutputPath, false);
// throw exception back out ...
throw e;
}
}
DataOutputBuffer _outputBuffer = new DataOutputBuffer();
private CRC16 _crc16Out = new CRC16();
/**
* append a ProxyCrawlHistoryItem to the active log
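*
* each record is written as: sync bytes, crc16 of the payload (as an int),
* payload length (as a short), then the serialized item payload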
*
* @param item
* @throws IOException
*/
void appendItemToLog(ProxyCrawlHistoryItem item) throws IOException {
// open the log file ...
DataOutputStream logStream = new DataOutputStream(new FileOutputStream(
getActiveLogFilePath(), true));
try {
// reset crc calculator (single thread so no worries on synchronization)
_crc16Out.reset();
// reset output stream
_outputBuffer.reset();
// create checked stream
CheckedOutputStream checkedStream = new CheckedOutputStream(
_outputBuffer, _crc16Out);
DataOutputStream dataOutputStream = new DataOutputStream(checkedStream);
// write out item
item.serialize(dataOutputStream, new BinaryProtocol());
dataOutputStream.flush();
// ok now write out sync,crc,length then data
logStream.write(getLocalLogSyncBytes());
logStream.writeInt((int) checkedStream.getChecksum().getValue());
logStream.writeShort((short) _outputBuffer.getLength());
logStream.write(_outputBuffer.getData(), 0, _outputBuffer.getLength());
logStream.flush();
logStream.close();
logStream = null;
// now we need to update the file header
updateLogFileHeader(getActiveLogFilePath(), 1, LOG_ITEM_HEADER_SIZE
+ _outputBuffer.getLength());
URLFP fingerprint = URLUtils.getURLFPFromURL(item.getOriginalURL(),
true);
// update local log
synchronized (_localLogItems) {
if (fingerprint != null) {
_localLogItems.put(fingerprint, item);
}
}
ImmutableSet<CrawlList> lists = null;
// and now walk lists updating them as necessary
synchronized (_crawlLists) {
lists = new ImmutableSet.Builder<CrawlList>().addAll(
_crawlLists.values()).build();
}
for (CrawlList list : lists) {
try {
list.updateItemState(fingerprint, item);
} catch (Exception e) {
// ok, IF an error occurs updating the list metadata .. we need to
// continue along.
// it is critical for this thread to not die in such a circumstance
LOG.fatal("Error Updating List(" + list.getListId() + "):"
+ CCStringUtils.stringifyException(e));
System.out.println("Exception in List Update(" + list.getListId()
+ "):" + CCStringUtils.stringifyException(e));
}
}
} finally {
if (logStream != null) {
logStream.close();
}
}
}
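/**
* background thread that drains _historyUpdateQueue, appends incoming items
* to the active local log, and triggers a checkpoint to HDFS once the local
* item count crosses _checkpointThreshold (unless checkpoints were disabled
* via init flags)
*/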
class LogWriterThread implements Runnable {
int _initFlags;
public LogWriterThread(int initFlags) {
_initFlags = initFlags;
}
@Override
public void run() {
boolean shutdown = false;
while (!shutdown) {
try {
final HistoryUpdateRequest request = _historyUpdateQueue.poll(
POLL_WAIT_TIME, TimeUnit.MILLISECONDS);
if (request != null) {
switch (request._requestType) {
case ExitThreadRequest: {
// shutdown condition ...
LOG.info("Log Writer Thread Received Shutdown. Exiting!");
shutdown = true;
}
break;
case UpdateRequest: {
try {
appendItemToLog(request._item);
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
break;
}
}
// now check if we can perform a checkpoint
long localItemCount = 0;
synchronized (_header) {
localItemCount = _header._itemCount;
}
// LOG.info("$$$$ LOCAL ITEM COUNT IS:" + localItemCount);
if (localItemCount >= _checkpointThreshold) {
// if checkpoints were not disabled during initialization ...
if ((_initFlags & INIT_FLAG_DISABLE_CHECKPOINTS) == 0) {
LOG.info("$$$$ LOCAL ITEM COUNT EXCEEDS THRESHOLD:"
+ _checkpointThreshold);
// see if can start a checkpoint ..
if (_checkpointSemaphore.tryAcquire(100, TimeUnit.MILLISECONDS)) {
try {
// ok we can exclusively touch the local log file
doCheckpoint();
} finally {
_checkpointSemaphore.release();
}
} else {
LOG.warn("$$$$ FAILED TO ACQUIRE SEMAPHORE FOR CHECKPOINT!");
}
} else {
LOG.warn("$$$$ CHECKPOINTS DISABLED. SKIPPING POTENTIAL CHECKPOINT");
}
}
} catch (InterruptedException e) {
}
}
}
}
static class HistoryUpdateRequest {
public enum RequestType {
UpdateRequest, ExitThreadRequest
}
public HistoryUpdateRequest(ProxyCrawlHistoryItem item) {
_item = item;
_requestType = RequestType.UpdateRequest;
}
public HistoryUpdateRequest() {
_requestType = RequestType.ExitThreadRequest;
}
public ProxyCrawlHistoryItem _item = null;
public RequestType _requestType;
}
private void loadExistingLists() throws IOException {
// scan data directory for list id pattern
FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment
.getHadoopConfig());
FileStatus loadTargets[] = localFileSystem.globStatus(new Path(
_localLogFileDir.getAbsolutePath(), CrawlList.LIST_URL_DATA_PREFIX
+ "*"));
// sort list so that we load newer lists first ...
Arrays.sort(loadTargets, new Comparator<FileStatus>() {
@Override
public int compare(FileStatus o1, FileStatus o2) {
return ((Long) o2.getModificationTime()).compareTo(o1
.getModificationTime());
}
});
for (FileStatus loadTarget : loadTargets) {
// extract timestamp ...
long listId = Long.parseLong(loadTarget.getPath().getName().substring(
CrawlList.LIST_URL_DATA_PREFIX.length()));
LOG.info("Found List Data for List:" + listId);
// validate
if (CrawlList.allFilesPresent(_localLogFileDir, listId)) {
LOG.info("List looks valid. Loading");
try {
CrawlList list = new CrawlList(this, listId);
synchronized (_crawlLists) {
CrawlList oldList = _crawlLists.get(listId);
if (oldList != null) {
list.setEventListener(oldList.getEventListener());
}
_crawlLists.put(listId, list);
}
LOG.info("Loaded List:" + listId + " Scheduling for Queueing");
_queueLoaderQueue.add(new QueueItem<CrawlList>(list));
} catch (IOException e) {
LOG.error("Failed to load list:" + listId + " Exception:"
+ CCStringUtils.stringifyException(e));
synchronized (_crawlLists) {
_crawlLists.put(listId, CrawlList.createListWithLoadErrorState(
this, listId, e));
}
}
}
}
}
private void startListLoaderThread() {
_listLoaderThread = new Thread(new Runnable() {
@Override
public void run() {
LOG.info("Starting List Loader Thread");
while (true) {
try {
QueueItem<CrawlList> listItem = _listLoaderQueue.take();
if (listItem._item == null || _shutdown) {
break;
} else {
try {
// mark the ui list as really loading ...
listItem._item.markListAsReallyLoading();
LOG.info("Attempting to load List:"
+ listItem._item.getListId());
CrawlList listToLoad = new CrawlList(CrawlHistoryManager.this,
listItem._item.getListId(), listItem._item
.getListURLDataFile(),listItem._item.getMetadata().getRefreshInterval());
LOG.info("Successfully loaded List:"
+ listItem._item.getListId() + " Sending to QueueLoader");
synchronized (_crawlLists) {
CrawlList oldList = _crawlLists.get(listToLoad.getListId());
if (oldList != null) {
listToLoad.setEventListener(oldList.getEventListener());
}
_crawlLists.put(listToLoad.getListId(), listToLoad);
}
// add to queue loader ...
if (!_shutdown) {
_queueLoaderQueue.add(new QueueItem<CrawlList>(listToLoad));
}
} catch (Exception e) {
LOG.error("Failed to load List:" + listItem._item.getListId()
+ " with Exception:" + CCStringUtils.stringifyException(e));
synchronized (_crawlLists) {
_crawlLists.put(listItem._item.getListId(), CrawlList
.createListWithLoadErrorState(CrawlHistoryManager.this,
listItem._item.getListId(), e));
}
}
}
} catch (InterruptedException e) {
}
}
LOG.info("Exiting List Loader Thread");
}
});
_listLoaderThread.start();
}
/**
* start the queue loader thread
*
* @param loader
* - the passed in queue loader callback
*/
public void startQueueLoaderThread(final CrawlQueueLoader loader) {
_queueLoaderThread = new Thread(new Runnable() {
@Override
public void run() {
LOG.info("Starting Qeueue Loader Thread");
try {
while (true) {
try {
QueueItem<CrawlList> listItem = _queueLoaderQueue.take();
if (listItem._item == null || _queueLoaderShutdown) {
break;
} else {
try {
LOG.info("Attempting to queue List:"
+ listItem._item.getListId());
listItem._item.queueUnCrawledItems(loader);
LOG.info("Finished queueing List:" + listItem._item.getListId());
} catch (Exception e) {
LOG.error("Failed to queue List:" + listItem._item.getListId()
+ " with Exception:" + CCStringUtils.stringifyException(e));
}
}
} catch (InterruptedException e) {
}
}
}
finally {
LOG.info("Exiting Queue Loader Thread");
}
}
});
_queueLoaderThread.start();
}
public void stopQueueLoaderThread() {
if (_queueLoaderThread != null) {
_queueLoaderShutdown = true;
_queueLoaderQueue.add(new QueueItem<CrawlList>(null));
try {
_queueLoaderThread.join();
} catch (InterruptedException e) {
}
_queueLoaderThread = null;
_queueLoaderShutdown = false;
}
}
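/**
* test helper - reconstitutes the CrawlURL that would have produced the
* given history item (approximately the inverse of crawlComplete's mapping)
*/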
private static CrawlURL proxyCrawlHistoryItemToCrawlURL(
ProxyCrawlHistoryItem item) {
CrawlURL url = new CrawlURL();
url.setUrl(item.getOriginalURL());
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_CRAWLSTATUS)) {
if (item.getCrawlStatus() == 0) {
url.setLastAttemptResult((byte) CrawlURL.CrawlResult.SUCCESS);
} else {
url.setLastAttemptResult((byte) CrawlURL.CrawlResult.FAILURE);
url.setLastAttemptFailureReason((byte) item.getCrawlStatus());
}
}
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_HTTPRESULTCODE)) {
url.setResultCode(item.getHttpResultCode());
}
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)) {
// move original result code over to the appropriate location in CrawlURL
url.setOriginalResultCode(url.getResultCode());
url.setFieldClean(CrawlURL.Field_RESULTCODE);
url.setFlags(CrawlURL.Flags.IsRedirected);
url.setRedirectURL(item.getRedirectURL());
if (item.getRedirectStatus() == 0) {
url.setLastAttemptResult((byte) CrawlURL.CrawlResult.SUCCESS);
} else {
url.setLastAttemptResult((byte) CrawlURL.CrawlResult.FAILURE);
url.setLastAttemptFailureReason((byte) item.getRedirectStatus());
}
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTHTTPRESULT)) {
url.setResultCode(item.getRedirectHttpResult());
}
}
return url;
}
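/**
* collect metadata for every list whose id appears in the predicate set;
* thread priority is boosted while holding the _crawlLists lock to keep the
* critical section as short as possible
*/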
public Map<Long, CrawlListMetadata> collectListMetadata(Set<Long> predicate) {
TreeMap<Long, CrawlListMetadata> metadataOut = new TreeMap<Long, CrawlListMetadata>();
int normalPriority = Thread.currentThread().getPriority();
try {
LOG.info("### BOOSTING THREAD PRIORITY");
Thread.currentThread().setPriority(Thread.MAX_PRIORITY);
LOG.info("### ATTEMPTING LOCK");
synchronized (_crawlLists) {
LOG.info("### GOT LOCK");
for (CrawlList list : _crawlLists.values()) {
if (predicate.contains(list.getListId())) {
metadataOut.put(list.getListId(), list.getMetadata());
}
}
}
LOG.info("### RELEASING LOCK");
} finally {
Thread.currentThread().setPriority(normalPriority);
}
return metadataOut;
}
/**********************************************************************************************/
// TEST CODE
/**********************************************************************************************/
private static void generateTestURLFile(File outputFile, String[] urlList)
throws IOException {
PrintWriter writer = new PrintWriter(outputFile, "UTF-8");
for (String url : urlList) {
writer.println(url);
}
writer.flush();
writer.close();
}
static final String[] urlList1 = { "http://www.google.com/1",
"http://www.someotherdomain.com/1", "http://www.google.com/2",
"http://www.google.com/3",
"http://www.someotherdomain.com/triggerSemaphore" };
static final String[] urlList2 = { "http://www.google.com/1",
"http://www.google.com/2", "http://www.someotherdomain.com/2",
"http://www.google.com/4", "http://www.google.com/5", };
public static void main(String[] args) {
// initialize ...
Configuration conf = new Configuration();
conf.addResource("nutch-default.xml");
conf.addResource("nutch-site.xml");
conf.addResource("core-site.xml");
conf.addResource("hdfs-site.xml");
conf.addResource("mapred-site.xml");
BasicConfigurator.configure();
conf.set("mapred.output.compression.codec",
"org.apache.hadoop.io.compress.GzipCodec");
conf.set("mapred.map.output.compression.codec",
"org.apache.hadoop.io.compress.GzipCodec");
conf.set("io.seqfile.compression.type", "NONE");
CrawlEnvironment.setHadoopConfig(conf);
CrawlEnvironment.setDefaultHadoopFSURI("file:///");
EventLoop eventLoop = new EventLoop();
eventLoop.start();
testWriteMapFileToHDFS(eventLoop);
// launchInTestMode();
}
private static void testWriteMapFileToHDFS(EventLoop eventLoop) {
try {
// initialize log manager
CrawlHistoryManager logManager = initializeTestLogManager(eventLoop, true);
// initialize item list
TreeMap<URLFP, ProxyCrawlHistoryItem> items = buildTestList(urlList1);
final TreeMap<String, URLFP> urlToURLFPMap = new TreeMap<String, URLFP>();
for (Map.Entry<URLFP, ProxyCrawlHistoryItem> item : items.entrySet()) {
urlToURLFPMap.put(item.getValue().getOriginalURL(), item.getKey());
}
// add to local item map in log manager
for (ProxyCrawlHistoryItem item : items.values()) {
logManager.appendItemToLog(item);
}
// ok shutdown log manager ...
logManager.shutdown();
// restart - reload log file ...
logManager = initializeTestLogManager(eventLoop, false);
// write to 'hdfs'
logManager.doCheckpoint();
syncAndValidateItems(items, logManager);
logManager.shutdown();
// restart
logManager = initializeTestLogManager(eventLoop, false);
// tweak original items
updateTestItemStates(items);
// ok append items
for (ProxyCrawlHistoryItem item : items.values()) {
logManager.appendItemToLog(item);
}
syncAndValidateItems(items, logManager);
// ok now checkpoint the items
logManager.doCheckpoint();
// ok now validate one last time
syncAndValidateItems(items, logManager);
// shutown
logManager.shutdown();
logManager = null;
{
// start from scratch ...
final CrawlHistoryManager logManagerTest = initializeTestLogManager(
eventLoop, true);
// create a final version of the tree map reference
final TreeMap<URLFP, ProxyCrawlHistoryItem> itemList = items;
// create filename
File urlInputFile = new File(logManagerTest.getLocalDataDir(),
"testURLS-" + System.currentTimeMillis());
// ok create a crawl list from urls
CrawlList.generateTestURLFile(urlInputFile, urlList1);
long listId = logManagerTest.loadList(urlInputFile,0);
CrawlList listObject = logManagerTest.getList(listId);
final Semaphore listCompletionSemaphore = new Semaphore(-(itemList
.size() - 1));
listObject.setEventListener(new CrawlList.CrawlListEvents() {
@Override
public void itemUpdated(URLFP itemFingerprint) {
listCompletionSemaphore.release();
}
});
// ok start the appropriate threads
logManagerTest.startLogWriterThread(0);
logManagerTest.startListLoaderThread();
logManagerTest.startQueueLoaderThread(new CrawlQueueLoader() {
@Override
public void queueURL(URLFP urlfp, String url) {
logManagerTest
.crawlComplete(proxyCrawlHistoryItemToCrawlURL(itemList
.get(urlToURLFPMap.get(url))));
}
@Override
public void flush() {
// no-op for this test
}
});
LOG.info("Waiting for Release");
// and wait for the finish
listCompletionSemaphore.acquireUninterruptibly();
LOG.info("Got Here");
}
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
private static void syncAndValidateItems(
TreeMap<URLFP, ProxyCrawlHistoryItem> items,
CrawlHistoryManager logManager) throws IOException {
// ok now sync the list
final TreeMap<URLFP, ProxyCrawlHistoryItem> syncedItemList = new TreeMap<URLFP, ProxyCrawlHistoryItem>();
try {
logManager.syncList(0L, Sets.newTreeSet(items.keySet()),
new ItemUpdater() {
@Override
public void updateItemState(URLFP fingerprint,
ProxyCrawlHistoryItem item) throws IOException {
try {
syncedItemList.put((URLFP) fingerprint.clone(),
(ProxyCrawlHistoryItem) item.clone());
} catch (CloneNotSupportedException e) {
e.printStackTrace();
}
}
});
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
Assert.assertTrue(false);
}
// assert that the key set is equal
Assert.assertEquals(items.keySet(), syncedItemList.keySet());
// ok now validate that the values are equal
for (Map.Entry<URLFP, ProxyCrawlHistoryItem> item : items.entrySet()) {
ProxyCrawlHistoryItem other = syncedItemList.get(item.getKey());
Assert.assertEquals(item.getValue(), other);
}
}
private static CrawlHistoryManager initializeTestLogManager(
EventLoop eventLoop, boolean fromScratch) throws IOException {
File baseTestDir = new File("/tmp/logManagerTest");
File remoteDir = new File(baseTestDir, "remote");
File localDir = new File(baseTestDir, "local");
if (fromScratch) {
FileUtils.recursivelyDeleteFile(baseTestDir);
baseTestDir.mkdir();
remoteDir.mkdir();
localDir.mkdir();
}
FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment
.getHadoopConfig());
int initFlags = INIT_FLAG_SKIP_LOAD_EXISTING_LISTS
| INIT_FLAG_SKIP_LOG_WRITER_THREAD_INIT
| INIT_FLAG_SKIP_LIST_LOADER_THREAD_INIT;
return new CrawlHistoryManager(localFileSystem, new Path(remoteDir
.getAbsolutePath()), localDir, eventLoop, initFlags);
}
private static TreeMap<URLFP, ProxyCrawlHistoryItem> buildTestList(
String... urls) {
TreeMap<URLFP, ProxyCrawlHistoryItem> mapOut = new TreeMap<URLFP, ProxyCrawlHistoryItem>();
for (String url : urls) {
URLFP fp = URLUtils.getURLFPFromURL(url, true);
if (fp != null) {
ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
item.setCrawlStatus(0);
item.setOriginalURL(url);
item.setHttpResultCode(200);
mapOut.put(fp, item);
}
}
return mapOut;
}
private static void updateTestItemStates(
TreeMap<URLFP, ProxyCrawlHistoryItem> items) {
for (ProxyCrawlHistoryItem item : items.values()) {
item.setHttpResultCode(301);
item.setRedirectURL(item.getOriginalURL() + "/redirect");
item.setRedirectStatus(0);
item.setRedirectHttpResult(200);
}
}
private static void launchInTestMode() {
File baseTestDir = new File("/tmp/logManagerTest");
FileUtils.recursivelyDeleteFile(baseTestDir);
baseTestDir.mkdir();
File remoteDir = new File(baseTestDir, "remote");
File localDir = new File(baseTestDir, "local");
remoteDir.mkdir();
localDir.mkdir();
final TreeMap<String, URLFP> urlToFPMap = new TreeMap<String, URLFP>();
final TreeMap<URLFP, String> urlFPToString = new TreeMap<URLFP, String>();
Set<String> list1 = Sets.newHashSet(urlList1);
Set<String> list2 = Sets.newHashSet(urlList2);
final Set<String> combined = Sets.union(list1, list2);
Set<String> difference = Sets.difference(list1, list2);
final Set<String> completedURLS = new HashSet<String>();
for (String url : combined) {
URLFP fingerprint = URLUtils.getURLFPFromURL(url, true);
urlToFPMap.put(url, fingerprint);
urlFPToString.put(fingerprint, url);
}
File testInputFile1 = new File(localDir, "INPUT_LIST-"
+ System.currentTimeMillis());
File testInputFile2 = new File(localDir, "INPUT_LIST-"
+ (System.currentTimeMillis() + 1));
try {
generateTestURLFile(testInputFile1, urlList1);
generateTestURLFile(testInputFile2, urlList2);
FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment
.getHadoopConfig());
EventLoop eventLoop = new EventLoop();
eventLoop.start();
final CrawlHistoryManager logManager = new CrawlHistoryManager(
localFileSystem, new Path(remoteDir.getAbsolutePath()), localDir,
eventLoop, 0);
final LinkedBlockingQueue<ProxyCrawlHistoryItem> queue = new LinkedBlockingQueue<ProxyCrawlHistoryItem>();
final Semaphore initialListComplete = new Semaphore(0);
logManager.startQueueLoaderThread(new CrawlQueueLoader() {
@Override
public void queueURL(URLFP urlfp, String url) {
ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
item.setOriginalURL(url);
queue.add(item);
}
@Override
public void flush() {
// no-op for this test
}
});
Thread queueTestThread = new Thread(new Runnable() {
@Override
public void run() {
while (true) {
try {
ProxyCrawlHistoryItem item = queue.take();
if (item.getOriginalURL().length() == 0) {
break;
} else {
System.out.println("Got:" + item.getOriginalURL());
CrawlURL urlObject = new CrawlURL();
Assert.assertTrue(!completedURLS
.contains(item.getOriginalURL()));
completedURLS.add(item.getOriginalURL());
urlObject
.setLastAttemptResult((byte) CrawlURL.CrawlResult.SUCCESS);
urlObject.setUrl(item.getOriginalURL());
urlObject.setResultCode(200);
logManager.crawlComplete(urlObject);
if (completedURLS.equals(combined)) {
System.out.println("Hit Trigger URL. Releasing InitialListComplete Semaphore");
initialListComplete.release(1);
}
}
} catch (InterruptedException e) {
}
}
}
});
queueTestThread.start();
logManager.loadList(testInputFile1,0);
logManager.loadList(testInputFile2,0);
System.out.println("Waiting for Initial List to Complete");
initialListComplete.acquireUninterruptibly();
System.out.println("Woke Up");
try {
eventLoop.getEventThread().join();
} catch (InterruptedException e) {
e.printStackTrace();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}