/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.service.listcrawler;

import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.zip.CRC32;
import java.util.zip.CheckedOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.apache.hadoop.record.Buffer;
import org.apache.log4j.BasicConfigurator;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.ProxyCrawlHistoryItem;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.rpc.base.shared.BinaryProtocol;
import org.commoncrawl.service.crawler.util.URLFPBloomFilter;
import org.commoncrawl.service.listcrawler.CrawlListMetadata;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CRC16;
import org.commoncrawl.util.FileUtils;
import org.junit.Assert;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;

/**
 * Class that encapsulates the state necessary to manage long term crawl history
 *
 * @author rana
 *
 */
public class CrawlHistoryManager implements CrawlHistoryStorage {

  public static final Log LOG = LogFactory.getLog(CrawlHistoryManager.class);

  private static class QueueItem<Type> {
    public QueueItem(Type item) {
      _item = item;
    }

    public Type _item;
  }

  public static interface ItemUpdater {
    public void updateItemState(URLFP fingerprint, ProxyCrawlHistoryItem item)
        throws IOException;
  }

  /** path to log file directory **/
  File _localLogFileDir;
  /** remote data directory **/
  Path _remoteDataDirectory;
  /** event loop **/
  EventLoop _eventLoop;
  /** file system object **/
  FileSystem _remoteFileSystem = null;
  /** queue of pending crawl
history updates **/ LinkedBlockingQueue<HistoryUpdateRequest> _historyUpdateQueue = new LinkedBlockingQueue<HistoryUpdateRequest>(); /** list loader queue **/ LinkedBlockingQueue<QueueItem<CrawlList>> _listLoaderQueue = new LinkedBlockingQueue<QueueItem<CrawlList>>(); /** crawl queue **/ LinkedBlockingQueue<QueueItem<CrawlList>> _queueLoaderQueue = new LinkedBlockingQueue<QueueItem<CrawlList>>(); /** shutdown flag **/ boolean _shutdown = false; /** queue loader shutdown flag **/ boolean _queueLoaderShutdown = false; /** cache writer thread **/ Thread _writerThread; /** list loader thread **/ Thread _listLoaderThread; /** queue loader thread **/ Thread _queueLoaderThread; /** checkpoint access semaphore **/ Semaphore _checkpointSemaphore = new Semaphore( 1); /** last checkpoint time **/ long _lastCheckpointTime = -1; /** crc16 - used to calculate individual payload crcs **/ private CRC16 _crc16in = new CRC16(); /** buffer used to store sync byte data during payload scan **/ private byte _syncByteBuffer[] = new byte[LocalLogFileHeader.SYNC_BYTES_SIZE]; /** payload Buffer object used to accumulate payload data for writes **/ Buffer _payloadBuffer = new Buffer(); /** data input buffer reused to read payload data **/ DataInputBuffer _payloadInputStream = new DataInputBuffer(); /** lists **/ TreeMap<Long, CrawlList> _crawlLists = new TreeMap<Long, CrawlList>(); public static final String CRAWL_HISTORY_HDFS_LOGFILE_PREFIX = "historyData-"; public static final String CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX = "historyBloomFilter-"; /** log file header **/ private LocalLogFileHeader _header = new LocalLogFileHeader(); /** local log item map **/ TreeMap<URLFP, ProxyCrawlHistoryItem> _localLogItems = new TreeMap<URLFP, ProxyCrawlHistoryItem>(); public static final int INIT_FLAG_SKIP_ACTIVE_LOG_FILE_INIT = 1; public static final int INIT_FLAG_SKIP_LOAD_EXISTING_LISTS = 2; public static final int INIT_FLAG_SKIP_LOG_WRITER_THREAD_INIT = 4; public static final int INIT_FLAG_SKIP_LIST_LOADER_THREAD_INIT = 8; public static final int INIT_FLAG_DISABLE_CHECKPOINTS = 16; private static final int LOG_ITEM_HEADER_SIZE = LocalLogFileHeader.SYNC_BYTES_SIZE + 4 + 2; private static final int POLL_WAIT_TIME = 5000; public static final int DEFAULT_LOCAL_ITEM_CHECKPOINT_THRESHOLD = 100000; private static int _checkpointThreshold = DEFAULT_LOCAL_ITEM_CHECKPOINT_THRESHOLD; /** * constuctor * * @param logFileDir * the path to the local log file directory * @throws IOException */ public CrawlHistoryManager(FileSystem remoteFileSystem, Path remoteLogFileDir, File localLogFileDir, EventLoop eventLoop, int initFlags) throws IOException { this._eventLoop = eventLoop; this._remoteFileSystem = remoteFileSystem; this._remoteDataDirectory = remoteLogFileDir; this._localLogFileDir = localLogFileDir; LOG.info("*** LOCAL DATA DIR:" + _localLogFileDir); initialize(initFlags); } private void initialize(int initFlags) throws IOException { // initialize the local log file ... if ((initFlags & INIT_FLAG_SKIP_ACTIVE_LOG_FILE_INIT) == 0) { initializeActiveLog(); } // load pre-existing lists if ((initFlags & INIT_FLAG_SKIP_LOAD_EXISTING_LISTS) == 0) { loadExistingLists(); } // start log writer thread ... if ((initFlags & INIT_FLAG_SKIP_LOG_WRITER_THREAD_INIT) == 0) { startLogWriterThread(initFlags); } // start list loader thread ... 
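    // Note: the INIT_FLAG_* constants are bit flags and are meant to be OR'ed
    // together by callers (initializeTestLogManager() below, for example,
    // combines SKIP_LOAD_EXISTING_LISTS, SKIP_LOG_WRITER_THREAD_INIT and
    // SKIP_LIST_LOADER_THREAD_INIT). INIT_FLAG_DISABLE_CHECKPOINTS is not
    // checked here; it is consumed by the log writer thread when deciding
    // whether to run doCheckpoint().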
if ((initFlags & INIT_FLAG_SKIP_LIST_LOADER_THREAD_INIT) == 0) { startListLoaderThread(); } } /** * shutdown the log properly */ public void shutdown() { _shutdown = true; stopQueueLoaderThread(); _historyUpdateQueue.add(new HistoryUpdateRequest()); _listLoaderQueue.add(new QueueItem(null)); try { if (_writerThread != null) { _writerThread.join(); } if (_listLoaderThread != null) { _listLoaderThread.join(); } } catch (InterruptedException e1) { } _writerThread = null; _listLoaderThread = null; _historyUpdateQueue.clear(); _listLoaderQueue.clear(); _shutdown = false; } /** * * @return the local data directory */ public File getLocalDataDir() { return _localLogFileDir; } /** * add a new url list to the queue * * @param dataFilePath * - the path to the file containing a list of urls * @return a unique list id that can be used to identify the list * @throws IOException */ public long loadList(File dataFilePath,int refreshInterval) throws IOException { long listId = System.currentTimeMillis(); // create a placeholder list CrawlList list = CrawlList.createListLoadingInLoadingState(this, listId, dataFilePath,refreshInterval); // add it to the map synchronized (_crawlLists) { _crawlLists.put(listId, list); } // add to to the loader queue . _listLoaderQueue.add(new QueueItem<CrawlList>(list)); return listId; } /** * retrieve the list object associated with the given id * * @param listId * @return */ public CrawlList getList(long listId) { synchronized (_crawlLists) { return _crawlLists.get(listId); } } /** * * @param matchCriteria * - set of url fingerprints to match against * @param updater * - the action to perform with each match */ @Override public void syncList(final long listId, TreeSet<URLFP> matchCriteria, ItemUpdater targetList) throws IOException { // first grab last update time ... long lastUpdateTimePreScan = -1; Set<Long> processedItems = new HashSet<Long>(); LOG.info("LIST:" + listId + " iterateCrawlHistoryLog - iterating hdfs log files"); boolean exitLoop = false; do { synchronized (this) { lastUpdateTimePreScan = _lastCheckpointTime; } // ok now start to iterate item in checkpoint directory Path wildcardPattern = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + "*"); FileStatus candidates[] = _remoteFileSystem.globStatus(wildcardPattern); for (FileStatus candidate : candidates) { Path candidatePath = candidate.getPath(); String candidateName = candidatePath.getName(); long candidateTimestamp = Long.parseLong(candidateName .substring(CRAWL_HISTORY_HDFS_LOGFILE_PREFIX.length())); if (candidateTimestamp <= lastUpdateTimePreScan || lastUpdateTimePreScan == -1) { if (!processedItems.contains(candidateTimestamp)) { LOG.info("LIST:" + listId + " iterateCrawlHistoryLog - iterating hdfs file:" + candidateName); // go ahead and process this candidate ... iterateHDFSCrawlHistoryLog(listId, candidateTimestamp, matchCriteria, targetList); // add to set processedItems.add(candidateTimestamp); } } } // now acquire checkpoint semaphore LOG.info("LIST:" + listId + " iterateCrawlHistoryLog - acquiring semaphore"); _checkpointSemaphore.acquireUninterruptibly(); try { // check to see if checkpoint time has not changed if (_lastCheckpointTime == lastUpdateTimePreScan) { // ok checkpoint time has not changed since our previous attempt to // check it exitLoop = true; if (_localLogItems.size() != 0) { // go ahead and process any in memory items against the criteria ... 
for (URLFP candidate : matchCriteria) { ProxyCrawlHistoryItem item = _localLogItems.get(candidate); // if found call match action if (item != null) { targetList.updateItemState(candidate, item); } } } } } finally { _checkpointSemaphore.release(); } } while (!exitLoop); } // take a remote crawl history log file and cache it locally private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException { SequenceFile.Reader reader = null; Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp); Path indexFilePath = new Path(mapFilePath, "index"); Path dataFilePath = new Path(mapFilePath, "data"); File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp); SequenceFile.Reader indexReader = new SequenceFile.Reader( _remoteFileSystem, dataFilePath, CrawlEnvironment.getHadoopConfig()); ValueBytes valueBytes = indexReader.createValueBytes(); DataOutputBuffer keyBytes = new DataOutputBuffer(); DataInputBuffer keyBuffer = new DataInputBuffer(); DataOutputBuffer finalOutputStream = new DataOutputBuffer(); DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer(); URLFP fp = new URLFP(); try { while (indexReader.nextRaw(keyBytes, valueBytes) != -1) { keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength()); // read fingerprint ... fp.readFields(keyBuffer); // write hash only finalOutputStream.writeLong(fp.getUrlHash()); uncompressedValueBytes.reset(); // write value bytes to intermediate buffer ... valueBytes.writeUncompressedBytes(uncompressedValueBytes); // write out uncompressed length WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes .getLength()); // write out bytes finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength()); } // delete existing ... cacheFilePath.delete(); // compute crc ... CRC32 crc = new CRC32(); crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength()); // open final output stream DataOutputStream fileOutputStream = new DataOutputStream( new BufferedOutputStream(new FileOutputStream(cacheFilePath))); try { fileOutputStream.writeLong(crc.getValue()); fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength()); fileOutputStream.flush(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); fileOutputStream.close(); fileOutputStream = null; cacheFilePath.delete(); throw e; } finally { if (fileOutputStream != null) { fileOutputStream.close(); } } } finally { if (indexReader != null) { indexReader.close(); } } } private void iterateHDFSCrawlHistoryLog(long listId, long timestamp, TreeSet<URLFP> criteria, ItemUpdater targetList) throws IOException { // ok copy stuff locally if possible ... 
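    // The scan below proceeds in three stages:
    //   1. copy the checkpoint's index, data and bloom filter files into the
    //      local data dir if they are not already cached,
    //   2. use the index's first/last keys to skip the checkpoint entirely when
    //      the criteria set falls outside its key range,
    //   3. otherwise walk the sorted criteria set against the bloom filter and,
    //      on a hit, advance a raw SequenceFile scan until the matching
    //      fingerprint (or a larger one) is found.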
File localIndexPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".index"); File localDataPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".data"); File localBloomFilterPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".bloom"); SequenceFile.Reader reader = null; Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp); Path indexFilePath = new Path(mapFilePath, "index"); Path dataFilePath = new Path(mapFilePath, "data"); Path bloomFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + timestamp); // ok copy local first if (!localIndexPath.exists()) { LOG.info("LIST:" + listId + " Copying Index File:" + indexFilePath + " to Local:" + localIndexPath.getAbsolutePath()); try { _remoteFileSystem.copyToLocalFile(indexFilePath, new Path( localIndexPath.getAbsolutePath())); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); localIndexPath.delete(); throw e; } } if (!localDataPath.exists()) { LOG.info("LIST:" + listId + " Copying Data File:" + dataFilePath + " to Local:" + localDataPath.getAbsolutePath()); try { _remoteFileSystem.copyToLocalFile(dataFilePath, new Path(localDataPath .getAbsolutePath())); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); localDataPath.delete(); throw e; } } if (!localBloomFilterPath.exists()) { LOG.info("LIST:" + listId + " Copying Bloom File:" + bloomFilePath + " to Local:" + localBloomFilterPath.getAbsolutePath()); try { _remoteFileSystem.copyToLocalFile(bloomFilePath, new Path( localBloomFilterPath.getAbsolutePath())); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); localBloomFilterPath.delete(); throw e; } } // ok open local FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment .getHadoopConfig()); SequenceFile.Reader indexReader = new SequenceFile.Reader(localFileSystem, new Path(localIndexPath.getAbsolutePath()), CrawlEnvironment .getHadoopConfig()); try { URLFP firstIndexKey = null; URLFP lastIndexKey = new URLFP(); LongWritable position = new LongWritable(); while (indexReader.next(lastIndexKey, position)) { if (firstIndexKey == null) { try { firstIndexKey = (URLFP) lastIndexKey.clone(); } catch (CloneNotSupportedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } LOG.info("LIST:" + listId + " ### Index First Domain:" + firstIndexKey.getDomainHash() + " URLHash:" + firstIndexKey.getUrlHash() + " Last Domain:" + lastIndexKey.getDomainHash() + " URLHash:" + lastIndexKey.getUrlHash()); URLFP criteriaFirstKey = criteria.first(); URLFP criteriaLastKey = criteria.last(); if (firstIndexKey.compareTo(criteriaLastKey) > 0 || lastIndexKey.compareTo(criteriaFirstKey) < 0) { LOG.info("LIST:" + listId + " Entire Index is Out of Range. Skipping!"); LOG.info("LIST:" + listId + " ### Criteria First Domain:" + criteriaFirstKey.getDomainHash() + " URLHash:" + criteriaFirstKey.getUrlHash() + " Last Domain:" + criteriaLastKey.getDomainHash() + " URLHash:" + criteriaLastKey.getUrlHash()); return; } } finally { indexReader.close(); } LOG.info("LIST:" + listId + " ### Index:" + timestamp + " Passed Test. 
Doing Full Scan"); // load bloom filter FSDataInputStream bloomFilterStream = localFileSystem.open(new Path( localBloomFilterPath.getAbsolutePath())); int hitCount = 0; try { URLFPBloomFilter filter = URLFPBloomFilter.load(bloomFilterStream); URLFP fpOut = new URLFP(); ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem(); DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer(); ValueBytes valueBytes = null; DataInputBuffer valueReader = new DataInputBuffer(); DataOutputBuffer keyBytes = new DataOutputBuffer(); DataInputBuffer keyReader = new DataInputBuffer(); URLFP lastFP = null; outerLoop: // now iterate each item in the criteria for (URLFP targetFP : criteria) { // if fingerprint is present in filter ... if (filter.isPresent(targetFP)) { // check to see if reader is initialzied ... if (reader == null) { LOG.info("LIST:" + listId + " BloomFilter First Hit. Initializing Reader for file at:" + localDataPath.getAbsolutePath()); reader = new SequenceFile.Reader(localFileSystem, new Path( localDataPath.getAbsolutePath()), CrawlEnvironment .getHadoopConfig()); LOG.info("LIST:" + listId + " BloomFilter First Hit. Initialized Reader for file at:" + localDataPath.getAbsolutePath()); valueBytes = reader.createValueBytes(); } // if last read fingerprint was not null ... if (lastFP != null) { // does it match the current item if (lastFP.compareTo(targetFP) == 0) { // decompress value bytes ... valueBytesUncompressed.reset(); valueBytes.writeUncompressedBytes(valueBytesUncompressed); // init valueReader valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength()); itemOut.readFields(valueReader); LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + +lastFP.getUrlHash() + " File:" + dataFilePath); // if so, null out last fp lastFP = null; // and update item state ... targetList.updateItemState(targetFP, itemOut); hitCount++; continue; } } // ok at this point .. read the next item in the list ... lastFP = null; while (reader.nextRaw(keyBytes, valueBytes) != -1) { // init reader ... keyReader.reset(keyBytes.getData(), keyBytes.getLength()); // read key fpOut.readFields(keyReader); // reset output buffer keyBytes.reset(); // LOG.info("LIST:" + listId +" nextRaw Returned DH:" + // fpOut.getDomainHash() + " UH:" + fpOut.getUrlHash() + " TDH:" + // targetFP.getDomainHash() + " TUH:" + targetFP.getUrlHash()); // compare it to target ... int result = fpOut.compareTo(targetFP); // ok does it match .. ? if (result == 0) { // decompress value bytes ... valueBytesUncompressed.reset(); valueBytes.writeUncompressedBytes(valueBytesUncompressed); // init valueReader valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength()); itemOut.readFields(valueReader); LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + fpOut.getUrlHash() + " File:" + dataFilePath); // update item state ... targetList.updateItemState(targetFP, itemOut); hitCount++; // and break to outer loop continue outerLoop; } else if (result == 1) { // LOG.info("LIST:" + listId + // " FP Comparison Returned 1. Going to OuterLoop"); // update last FP lastFP = fpOut; // continue outer loop continue outerLoop; } else { // otherwise skip } } // ok if we got here .. 
we are done reading the sequence file and did // not find a trailing match LOG .warn("LIST:" + listId + " ### Reached End Of File Searching for item in MapFile while BloomFilter returned positivie result (DomainHash:" + targetFP.getDomainHash() + "FP:" + targetFP.getUrlHash() + ")"); // break out of outer loop break; } } } finally { bloomFilterStream.close(); if (reader != null) { reader.close(); } LOG.info("LIST:" + listId + " File:" + dataFilePath + " DONE. HitCount:" + hitCount); } } // Path mapOutputPath = new // Path(_remoteDataDirectory,CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + // checkpointTimestamp); // Path filterOutputPath = new // Path(_remoteDataDirectory,CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + // checkpointTimestamp); /** * callback initiated by proxy server on a per url basis * * @param connection * @param url * @param optTargetObj * @param successOrFailure */ public void crawlComplete(CrawlURL url) { ProxyCrawlHistoryItem historyItem; if ((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) { historyItem = new ProxyCrawlHistoryItem(); historyItem.setCrawlStatus(0); historyItem.setLastModifiedTime(System.currentTimeMillis()); historyItem.setOriginalURL(url.getUrl()); historyItem.setHttpResultCode(url.getOriginalResultCode()); historyItem.setRedirectURL(url.getRedirectURL()); if (url.isFieldDirty(CrawlURL.Field_RESULTCODE)) historyItem.setRedirectHttpResult(url.getResultCode()); historyItem .setRedirectStatus((url.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : url.getLastAttemptFailureReason()); // add the original url pointing to the final url to the queue _historyUpdateQueue.add(new HistoryUpdateRequest(historyItem)); // ok, now create an entry for the redirected url historyItem = new ProxyCrawlHistoryItem(); historyItem .setCrawlStatus((url.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : url.getLastAttemptFailureReason()); historyItem.setLastModifiedTime(System.currentTimeMillis()); historyItem.setOriginalURL(url.getRedirectURL()); if (url.isFieldDirty(CrawlURL.Field_RESULTCODE)) historyItem.setHttpResultCode(url.getResultCode()); // and add it to the queue _historyUpdateQueue.add(new HistoryUpdateRequest(historyItem)); } else { // if not redirected ... create an entry for the redirected url historyItem = new ProxyCrawlHistoryItem(); historyItem .setCrawlStatus((url.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 
0 : url.getLastAttemptFailureReason()); historyItem.setLastModifiedTime(System.currentTimeMillis()); historyItem.setOriginalURL(url.getUrl()); if (url.isFieldDirty(CrawlURL.Field_RESULTCODE)) historyItem.setHttpResultCode(url.getResultCode()); // and add it to the queue _historyUpdateQueue.add(new HistoryUpdateRequest(historyItem)); } } /** * startCacheWriterThread * */ private void startLogWriterThread(int initFlags) { _writerThread = new Thread(new LogWriterThread(initFlags)); _writerThread.start(); } /** * getLogFilePath - get active log file path * * @param directoryRoot * @return */ private File getActiveLogFilePath() { return new File(_localLogFileDir, "ActiveLog"); } /** * initializeEmptyLogFile - init an empty log file header * * @param stream * @return * @throws IOException */ private static LocalLogFileHeader initializeEmptyLogFile(DataOutput stream) throws IOException { LocalLogFileHeader header = new LocalLogFileHeader(); header.writeHeader(stream); return header; } /** * initiailizeActiveLog - init local cache log * * * **/ private void initializeActiveLog() throws IOException { File activeLogPath = getActiveLogFilePath(); if (!activeLogPath.exists()) { DataOutputStream outputStream = new DataOutputStream( new FileOutputStream(activeLogPath)); try { _header = initializeEmptyLogFile(outputStream); } finally { outputStream.close(); } } else { _header = new LocalLogFileHeader(); DataInputStream inputStream = new DataInputStream(new FileInputStream( activeLogPath)); try { _header.readHeader(inputStream); } finally { inputStream.close(); } if (_header._itemCount != 0) { _localLogItems = loadLocalLogItemMap(); } } } /** * updateLogFileHeader - update the log file header called via the log file * writer thread ... * * @throws IOException */ void updateLogFileHeader(File logFileName, long newlyAddedItemsCount, long newItemsFileSize) throws IOException { RandomAccessFile file = new RandomAccessFile(logFileName, "rw"); try { synchronized (_header) { // update cached header ... _header._fileSize += newItemsFileSize; _header._itemCount += newlyAddedItemsCount; // set the position at zero .. file.seek(0); // and write header to disk ... _header.writeHeader(file); } } finally { // major bottle neck.. // file.getFD().sync(); file.close(); } } /** * get local log position according to cached header * * @return */ long getLocalLogFilePos() { long filePosOut = 0; synchronized (_header) { filePosOut = _header._fileSize; } return filePosOut; } /** * return the header sync bytes * * @return header sync bytes */ byte[] getLocalLogSyncBytes() { return _header._sync; } /** * * @return a sorted map of urlfp to item * @throws IOException */ TreeMap<URLFP, ProxyCrawlHistoryItem> loadLocalLogItemMap() throws IOException { TreeMap<URLFP, ProxyCrawlHistoryItem> itemMap = new TreeMap<URLFP, ProxyCrawlHistoryItem>(); LOG.info("Reading Local Log File"); RandomAccessFile file = new RandomAccessFile(getActiveLogFilePath(), "rw"); // valid length indicator ... long validLength = 0; try { // skip header ... file.seek(LocalLogFileHeader.SIZE); validLength = file.getFilePointer(); // ok walk n items ... for (int itemIdx = 0; itemIdx < _header._itemCount && file.getChannel().position() <= _header._fileSize; ++itemIdx) { try { ProxyCrawlHistoryItem item = readItem(file); // update valid length ... validLength = file.getFilePointer(); // ok compute fingerprint for item ... 
URLFP fingerprintObject = URLUtils.getURLFPFromURL(item .getOriginalURL(), true); if (fingerprintObject == null) { LOG.error("Could not compute fingerprint for URL:" + item.getOriginalURL()); } else { itemMap.put(fingerprintObject, item); } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); try { if (!seekToNextSyncBytesPos(file)) { LOG.error("Hit EOF While Seeking for next SyncByte Sequence!"); break; } else { LOG.info("Seek to Next SyncByte Succeeded! Continuing Load"); } } catch (IOException e2) { LOG.error(CCStringUtils.stringifyException(e2)); LOG.error("Got IO Exception Reading SyncBytes - Bailing!"); break; } } } } finally { if (file.length() > validLength) { LOG.warn("File Length is:" + file.length() + " Truncating Length to:" + validLength); file.setLength(validLength); } file.close(); } LOG.info("Done Reading Local Log File"); return itemMap; } private ProxyCrawlHistoryItem readItem(RandomAccessFile fileStream) throws IOException { try { // read sync bytes ... fileStream.read(_syncByteBuffer); // validate ... if (!Arrays.equals(_header._sync, _syncByteBuffer)) { throw new IOException("Error Reading Sync Bytes for Item In Checkpoint"); } int checksum = fileStream.readInt(); int payloadSize = fileStream.readShort(); if (payloadSize == 0) { throw new IOException("Invalid Payload Size Reading Item In Checkpoint"); } // read the payload _payloadBuffer.setCapacity(payloadSize); fileStream.read(_payloadBuffer.get(), 0, payloadSize); _crc16in.reset(); _crc16in.update(_payloadBuffer.get(), 0, payloadSize); // if computed checksum does not match file checksum !!! if (_crc16in.getValue() != (long) checksum) { throw new IOException("Checksum Mismatch Expected:" + checksum + " got:" + _crc16in.getValue() + " while Reading Item"); } _payloadInputStream.reset(_payloadBuffer.get(), 0, payloadSize); ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem(); itemOut.deserialize(_payloadInputStream, new BinaryProtocol()); return itemOut; } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); throw new IOException(e); } } /** * seek out next instance of sync bytes in the file input stream * * @param file * @throws IOException */ private boolean seekToNextSyncBytesPos(RandomAccessFile file) throws IOException { // read in a sync.length buffer amount file.read(_syncByteBuffer); int syncLen = _header._sync.length; // start scan for next sync position ... for (int i = 0; file.getFilePointer() < _header._fileSize; i++) { int j = 0; for (; j < syncLen; j++) { if (_header._sync[j] != _syncByteBuffer[(i + j) % syncLen]) break; } if (j == syncLen) { // found matching sync bytes - reset file pos to before sync bytes file.seek(file.getFilePointer() - LocalLogFileHeader.SYNC_BYTES_SIZE); // position // before // sync return true; } _syncByteBuffer[i % syncLen] = file.readByte(); } return false; } void doCheckpoint() { LOG.info("Starting Checkpoint Process"); try { LOG.info("Writing HDFS Log File"); // write local file contents to hdfs // we don't need to lock the map here becuase only the local log thread // (current thread) modified the map ... long checkpointTimestamp = writeMapFileToHDFS(_localLogItems); // ok that worked .. 
delete local log getActiveLogFilePath().delete(); // ok now we DO NEED TO lock the map synchronized (_localLogItems) { _localLogItems.clear(); } LOG.info("Regenerating Local Log"); // and regenerate the file initializeActiveLog(); synchronized (CrawlHistoryManager.this) { // ok now update checkpoint timestamp variable _lastCheckpointTime = checkpointTimestamp; } LOG.info("Checkpoint Done"); } catch (IOException e) { LOG.error("CrawlHistoryLog Checkpoint Failed with Exception:" + CCStringUtils.stringifyException(e)); } } long writeMapFileToHDFS(TreeMap<URLFP, ProxyCrawlHistoryItem> itemMap) throws IOException { long checkpointTimestamp = System.currentTimeMillis(); Path mapOutputPath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + checkpointTimestamp); Path filterOutputPath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + checkpointTimestamp); writeCheckpoint(itemMap, CrawlEnvironment.getHadoopConfig(), _remoteFileSystem, mapOutputPath, filterOutputPath); return checkpointTimestamp; } public static void writeCheckpoint( TreeMap<URLFP, ProxyCrawlHistoryItem> itemMap, Configuration conf, FileSystem remoteFileSystem, Path mapOutputPath, Path filterOutputPath) throws IOException { try { LOG.info("Generating Map File at Location:" + mapOutputPath + " Filter At:" + filterOutputPath); // open a temporary hdfs streams ... MapFile.Writer writer = new MapFile.Writer(conf, remoteFileSystem, mapOutputPath.toString(), URLFP.class, ProxyCrawlHistoryItem.class); // create a bloom filter URLFPBloomFilter filter = new URLFPBloomFilter(_checkpointThreshold * 2, 10, 11); try { for (Map.Entry<URLFP, ProxyCrawlHistoryItem> entry : itemMap.entrySet()) { LOG.info("Writing Key to Map. DomainHash:" + entry.getKey().getDomainHash() + " URLHash:" + entry.getKey().getUrlHash()); filter.add(entry.getKey()); writer.append(entry.getKey(), entry.getValue()); } } finally { writer.close(); } LOG.info("Done generating Map File"); LOG.info("Writing Bloom Filter Data"); // ok now also flush the bloom filter FSDataOutputStream bloomFilterOutputStream = remoteFileSystem .create(filterOutputPath); try { filter.serialize(bloomFilterOutputStream); } finally { bloomFilterOutputStream.flush(); bloomFilterOutputStream.close(); } } catch (IOException e) { // delete all relevant files ... remoteFileSystem.delete(mapOutputPath, true); remoteFileSystem.delete(filterOutputPath, false); // throw exception back out ... throw e; } } DataOutputBuffer _outputBuffer = new DataOutputBuffer(); private CRC16 _crc16Out = new CRC16(); /** * append a ProxyCrawlHistoryItem to the active log * * @param item * @throws IOException */ void appendItemToLog(ProxyCrawlHistoryItem item) throws IOException { try { // open the log file ... 
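    // On-disk record layout for the active log (written below, read back by
    // readItem(), and matching LOG_ITEM_HEADER_SIZE):
    //   [sync bytes][CRC16 of payload, written as a 4-byte int][2-byte payload length][payload]
    // where the payload is the ProxyCrawlHistoryItem serialized via BinaryProtocol.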
DataOutputStream logStream = new DataOutputStream(new FileOutputStream( getActiveLogFilePath(), true)); try { // reset crc calculator (single thread so no worries on synchronization) _crc16Out.reset(); // reset output stream _outputBuffer.reset(); // create checked stream CheckedOutputStream checkedStream = new CheckedOutputStream( _outputBuffer, _crc16Out); DataOutputStream dataOutputStream = new DataOutputStream(checkedStream); // write out item item.serialize(dataOutputStream, new BinaryProtocol()); dataOutputStream.flush(); // ok now write out sync,crc,length then data logStream.write(getLocalLogSyncBytes()); logStream.writeInt((int) checkedStream.getChecksum().getValue()); logStream.writeShort((short) _outputBuffer.getLength()); logStream.write(_outputBuffer.getData(), 0, _outputBuffer.getLength()); logStream.flush(); logStream.close(); logStream = null; // now we need to update the file header updateLogFileHeader(getActiveLogFilePath(), 1, LOG_ITEM_HEADER_SIZE + _outputBuffer.getLength()); URLFP fingerprint = URLUtils.getURLFPFromURL(item.getOriginalURL(), true); // update local log synchronized (_localLogItems) { if (fingerprint != null) { _localLogItems.put(fingerprint, item); } } ImmutableSet<CrawlList> lists = null; // and now walk lists updating them as necessary synchronized (_crawlLists) { lists = new ImmutableSet.Builder<CrawlList>().addAll( _crawlLists.values()).build(); } for (CrawlList list : lists) { try { list.updateItemState(fingerprint, item); } catch (Exception e) { // ok, IF an error occurs updating the list metadata.. we need to // coninue along. // it is critical for this thread to not die in such a circumstane LOG.fatal("Error Updating List(" + list.getListId() + "):" + CCStringUtils.stringifyException(e)); System.out.println("Exception in List Update(" + list.getListId() + "):" + CCStringUtils.stringifyException(e)); } } } finally { if (logStream != null) { logStream.close(); } } } finally { } } class LogWriterThread implements Runnable { int _initFlags; public LogWriterThread(int initFlags) { _initFlags = initFlags; } @Override public void run() { boolean shutdown = false; while (!shutdown) { try { final HistoryUpdateRequest request = _historyUpdateQueue.poll( POLL_WAIT_TIME, TimeUnit.MILLISECONDS); if (request != null) { switch (request._requestType) { case ExitThreadRequest: { // shutdown condition ... LOG.info("Log Writer Thread Received Shutdown. Exiting!"); shutdown = true; } break; case UpdateRequest: { try { appendItemToLog(request._item); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } break; } } // now check if we can perform a checkpoint long localItemCount = 0; synchronized (_header) { localItemCount = _header._itemCount; } // LOG.info("$$$$ LOCAL ITEM COUNT IS:" + localItemCount); if (localItemCount >= _checkpointThreshold) { // if checkpoints were not disabled during initialization ... if ((_initFlags & INIT_FLAG_DISABLE_CHECKPOINTS) == 0) { LOG.info("$$$$ LOCAL ITEM COUNT EXCEEDS THRESHOLD:" + _checkpointThreshold); // see if can start a checkpoint .. if (_checkpointSemaphore.tryAcquire(100, TimeUnit.MILLISECONDS)) { try { // ok we can exclusivey touch the local log file doCheckpoint(); } finally { _checkpointSemaphore.release(); } } else { LOG.warn("$$$$ FAILED TO ACQUIRE SEMAPHORE FOR CHECKPOINT!"); } } else { LOG .warn("$$$$ CHECKPOINTS DISABLED. 
SKIPPING POTENTIAL CHECKPOINT"); } } } catch (InterruptedException e) { } } } } static class HistoryUpdateRequest { public enum RequestType { UpdateRequest, ExitThreadRequest } public HistoryUpdateRequest(ProxyCrawlHistoryItem item) { _item = item; _requestType = RequestType.UpdateRequest; } public HistoryUpdateRequest() { _requestType = RequestType.ExitThreadRequest; } public ProxyCrawlHistoryItem _item = null; public RequestType _requestType; } private void loadExistingLists() throws IOException { // scan data directory for list id pattern FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment .getHadoopConfig()); FileStatus loadTargets[] = localFileSystem.globStatus(new Path( _localLogFileDir.getAbsolutePath(), CrawlList.LIST_URL_DATA_PREFIX + "*")); // sort list so that we load newer lists first ... Arrays.sort(loadTargets, new Comparator<FileStatus>() { @Override public int compare(FileStatus o1, FileStatus o2) { return ((Long) o2.getModificationTime()).compareTo(o1 .getModificationTime()); } }); for (FileStatus loadTarget : loadTargets) { // extract timestamp ... long listId = Long.parseLong(loadTarget.getPath().getName().substring( CrawlList.LIST_URL_DATA_PREFIX.length())); LOG.info("Found List Data for List:" + listId); // validate if (CrawlList.allFilesPresent(_localLogFileDir, listId)) { LOG.info("List looks valid. Loading"); try { CrawlList list = new CrawlList(this, listId); synchronized (_crawlLists) { CrawlList oldList = _crawlLists.get(listId); if (oldList != null) { list.setEventListener(oldList.getEventListener()); } _crawlLists.put(listId, list); } LOG.info("Loaded List:" + listId + " Scheduling for Queueing"); _queueLoaderQueue.add(new QueueItem<CrawlList>(list)); } catch (IOException e) { LOG.error("Failed to load list:" + listId + " Exception:" + CCStringUtils.stringifyException(e)); synchronized (_crawlLists) { _crawlLists.put(listId, CrawlList.createListWithLoadErrorState( this, listId, e)); } } } } } private void startListLoaderThread() { _listLoaderThread = new Thread(new Runnable() { @Override public void run() { LOG.info("Starting List Loader Thread"); while (true) { try { QueueItem<CrawlList> listItem = _listLoaderQueue.take(); if (listItem._item == null || _shutdown) { break; } else { try { // mark the ui list as really loading ... listItem._item.markListAsReallyLoading(); LOG.info("Attempting to load List:" + listItem._item.getListId()); CrawlList listToLoad = new CrawlList(CrawlHistoryManager.this, listItem._item.getListId(), listItem._item .getListURLDataFile(),listItem._item.getMetadata().getRefreshInterval()); LOG.info("Successfully loaded List:" + listItem._item.getListId() + " Sending to QueueLoader"); synchronized (_crawlLists) { CrawlList oldList = _crawlLists.get(listToLoad.getListId()); if (oldList != null) { listToLoad.setEventListener(oldList.getEventListener()); } _crawlLists.put(listToLoad.getListId(), listToLoad); } // add to queue loader ... 
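            // Hand the freshly loaded list off to the queue loader thread; the
            // queue loader (see startQueueLoaderThread) replays the list's
            // un-crawled items through the supplied CrawlQueueLoader callback
            // via queueUnCrawledItems().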
if (!_shutdown) { _queueLoaderQueue.add(new QueueItem<CrawlList>(listToLoad)); } } catch (Exception e) { LOG.error("Failed to load List:" + listItem._item.getListId() + " with Exception:" + CCStringUtils.stringifyException(e)); synchronized (_crawlLists) { _crawlLists.put(listItem._item.getListId(), CrawlList .createListWithLoadErrorState(CrawlHistoryManager.this, listItem._item.getListId(), e)); } } } } catch (InterruptedException e) { } } LOG.info("Exiting List Loader Thread"); } }); _listLoaderThread.start(); } /** * start the queue loader thread * * @param loader * - the passed in queue loader callback */ public void startQueueLoaderThread(final CrawlQueueLoader loader) { _queueLoaderThread = new Thread(new Runnable() { @Override public void run() { LOG.info("Starting Qeueue Loader Thread"); try { while (true) { try { QueueItem<CrawlList> listItem = _queueLoaderQueue.take(); if (listItem._item == null || _queueLoaderShutdown) { break; } else { try { LOG.info("Attempting to queue List:" + listItem._item.getListId()); listItem._item.queueUnCrawledItems(loader); LOG .info("Finished queueing List:" + listItem._item.getListId()); } catch (Exception e) { LOG.error("Failed to queue List:" + listItem._item.getListId() + " with Exception:" + CCStringUtils.stringifyException(e)); } } } catch (InterruptedException e) { } } } finally { LOG.info("Exiting Queue Loader Thread"); } } }); _queueLoaderThread.start(); } public void stopQueueLoaderThread() { if (_queueLoaderThread != null) { _queueLoaderShutdown = true; _queueLoaderQueue.add(new QueueItem(null)); try { _queueLoaderThread.join(); } catch (InterruptedException e) { } _queueLoaderThread = null; _queueLoaderShutdown = false; } } private static CrawlURL proxyCrawlHitoryItemToCrawlURL( ProxyCrawlHistoryItem item) { CrawlURL url = new CrawlURL(); url.setUrl(item.getOriginalURL()); if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_CRAWLSTATUS)) { if (item.getCrawlStatus() == 0) { url.setLastAttemptResult((byte) CrawlURL.CrawlResult.SUCCESS); } else { url.setLastAttemptResult((byte) CrawlURL.CrawlResult.FAILURE); url.setLastAttemptFailureReason((byte) item.getCrawlStatus()); } } if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_HTTPRESULTCODE)) { url.setResultCode(item.getHttpResultCode()); } if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)) { // move original result code over to appropiate location in CrawlURL url.setOriginalResultCode(url.getResultCode()); url.setFieldClean(CrawlURL.Field_RESULTCODE); url.setFlags(CrawlURL.Flags.IsRedirected); url.setRedirectURL(item.getRedirectURL()); if (item.getRedirectStatus() == 0) { url.setLastAttemptResult((byte) CrawlURL.CrawlResult.SUCCESS); } else { url.setLastAttemptResult((byte) CrawlURL.CrawlResult.FAILURE); url.setLastAttemptFailureReason((byte) item.getRedirectStatus()); } if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTHTTPRESULT)) { url.setResultCode(item.getRedirectHttpResult()); } } return url; } public Map<Long, CrawlListMetadata> collectListMetadata(Set<Long> predicate) { TreeMap<Long, CrawlListMetadata> metadataOut = new TreeMap<Long, CrawlListMetadata>(); int normalPriority = Thread.currentThread().getPriority(); try { LOG.info("### BOOSTING THREAD PRIORITY"); Thread.currentThread().setPriority(Thread.MAX_PRIORITY); LOG.info("### ATTEMPTING LOCK"); synchronized (_crawlLists) { LOG.info("### GOT LOCK"); for (CrawlList list : _crawlLists.values()) { if (predicate.contains(list.getListId())) { metadataOut.put(list.getListId(), list.getMetadata()); } } } LOG.info("### 
RELEASING LOCK"); } finally { Thread.currentThread().setPriority(normalPriority); } return metadataOut; } /**********************************************************************************************/ // TEST CODE /**********************************************************************************************/ private static void generateTestURLFile(File outputFile, String[] urlList) throws IOException { PrintWriter writer = new PrintWriter(outputFile, "UTF-8"); for (String url : urlList) { writer.println(url); } writer.flush(); writer.close(); } static final String[] urlList1 = { "http://www.google.com/1", "http://www.someotherdomain.com/1", "http://www.google.com/2", "http://www.google.com/3", "http://www.someotherdomain.com/triggerSemaphore" }; static final String[] urlList2 = { "http://www.google.com/1", "http://www.google.com/2", "http://www.someotherdomain.com/2", "http://www.google.com/4", "http://www.google.com/5", }; public static void main(String[] args) { // initialize ... Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("core-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("mapred-site.xml"); BasicConfigurator.configure(); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); conf.set("io.seqfile.compression.type", "NONE"); CrawlEnvironment.setHadoopConfig(conf); CrawlEnvironment.setDefaultHadoopFSURI("file:///"); EventLoop eventLoop = new EventLoop(); eventLoop.start(); testWriteMapFileToHDFS(eventLoop); // launchInTestMode(); } private static void testWriteMapFileToHDFS(EventLoop eventLoop) { try { // initialize log manager CrawlHistoryManager logManager = initializeTestLogManager(eventLoop, true); // initialize item list TreeMap<URLFP, ProxyCrawlHistoryItem> items = buildTestList(urlList1); final TreeMap<String, URLFP> urlToURLFPMap = new TreeMap<String, URLFP>(); for (Map.Entry<URLFP, ProxyCrawlHistoryItem> item : items.entrySet()) { urlToURLFPMap.put(item.getValue().getOriginalURL(), item.getKey()); } // add to local item map in log manager for (ProxyCrawlHistoryItem item : items.values()) { logManager.appendItemToLog(item); } // ok shutdown log manager ... logManager.shutdown(); // restart - reload log file ... logManager = initializeTestLogManager(eventLoop, false); // write to 'hdfs' logManager.doCheckpoint(); syncAndValidateItems(items, logManager); logManager.shutdown(); // restart logManager = initializeTestLogManager(eventLoop, false); // tweak original items updateTestItemStates(items); // ok append items for (ProxyCrawlHistoryItem item : items.values()) { logManager.appendItemToLog(item); } syncAndValidateItems(items, logManager); // ok now checkpoint the items logManager.doCheckpoint(); // ok now validate one last time syncAndValidateItems(items, logManager); // shutown logManager.shutdown(); logManager = null; { // start from scratch ... 
final CrawlHistoryManager logManagerTest = initializeTestLogManager( eventLoop, true); // create a final version of the tree map reference final TreeMap<URLFP, ProxyCrawlHistoryItem> itemList = items; // create filename File urlInputFile = new File(logManagerTest.getLocalDataDir(), "testURLS-" + System.currentTimeMillis()); // ok create a crawl list from urls CrawlList.generateTestURLFile(urlInputFile, urlList1); long listId = logManagerTest.loadList(urlInputFile,0); CrawlList listObject = logManagerTest.getList(listId); final Semaphore listCompletionSemaphore = new Semaphore(-(itemList .size() - 1)); listObject.setEventListener(new CrawlList.CrawlListEvents() { @Override public void itemUpdated(URLFP itemFingerprint) { // TODO Auto-generated method stub listCompletionSemaphore.release(); } }); // ok start the appropriate threads logManagerTest.startLogWriterThread(0); logManagerTest.startListLoaderThread(); logManagerTest.startQueueLoaderThread(new CrawlQueueLoader() { @Override public void queueURL(URLFP urlfp, String url) { logManagerTest .crawlComplete(proxyCrawlHitoryItemToCrawlURL(itemList .get(urlToURLFPMap.get(url)))); } @Override public void flush() { // TODO Auto-generated method stub } }); LOG.info("Waiting for Release"); // and wait for the finish listCompletionSemaphore.acquireUninterruptibly(); LOG.info("Got Here"); } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } private static void syncAndValidateItems( TreeMap<URLFP, ProxyCrawlHistoryItem> items, CrawlHistoryManager logManager) throws IOException { // ok now sync the list final TreeMap<URLFP, ProxyCrawlHistoryItem> syncedItemList = new TreeMap<URLFP, ProxyCrawlHistoryItem>(); try { logManager.syncList(0L, Sets.newTreeSet(items.keySet()), new ItemUpdater() { @Override public void updateItemState(URLFP fingerprint, ProxyCrawlHistoryItem item) throws IOException { try { syncedItemList.put((URLFP) fingerprint.clone(), (ProxyCrawlHistoryItem) item.clone()); } catch (CloneNotSupportedException e) { e.printStackTrace(); } } }); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); Assert.assertTrue(false); } // assert that the key set is equal Assert.assertEquals(items.keySet(), syncedItemList.keySet()); // ok now validate that the values are equal for (Map.Entry<URLFP, ProxyCrawlHistoryItem> item : items.entrySet()) { ProxyCrawlHistoryItem other = syncedItemList.get(item.getKey()); Assert.assertEquals(item.getValue(), other); } } private static CrawlHistoryManager initializeTestLogManager( EventLoop eventLoop, boolean fromScratch) throws IOException { File baseTestDir = new File("/tmp/logManagerTest"); File remoteDir = new File(baseTestDir, "remote"); File localDir = new File(baseTestDir, "local"); if (fromScratch) { FileUtils.recursivelyDeleteFile(baseTestDir); baseTestDir.mkdir(); remoteDir.mkdir(); localDir.mkdir(); } FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment .getHadoopConfig()); int initFlags = INIT_FLAG_SKIP_LOAD_EXISTING_LISTS | INIT_FLAG_SKIP_LOG_WRITER_THREAD_INIT | INIT_FLAG_SKIP_LIST_LOADER_THREAD_INIT; return new CrawlHistoryManager(localFileSystem, new Path(remoteDir .getAbsolutePath()), localDir, eventLoop, initFlags); } private static TreeMap<URLFP, ProxyCrawlHistoryItem> buildTestList( String... 
urls) { TreeMap<URLFP, ProxyCrawlHistoryItem> mapOut = new TreeMap<URLFP, ProxyCrawlHistoryItem>(); for (String url : urls) { URLFP fp = URLUtils.getURLFPFromURL(url, true); if (fp != null) { ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem(); item.setCrawlStatus(0); item.setOriginalURL(url); item.setHttpResultCode(200); mapOut.put(fp, item); } } return mapOut; } private static void updateTestItemStates( TreeMap<URLFP, ProxyCrawlHistoryItem> items) { for (ProxyCrawlHistoryItem item : items.values()) { item.setHttpResultCode(301); item.setRedirectURL(item.getOriginalURL() + "/redirect"); item.setRedirectStatus(0); item.setRedirectHttpResult(200); } } private static void launchInTestMode() { File baseTestDir = new File("/tmp/logManagerTest"); FileUtils.recursivelyDeleteFile(baseTestDir); baseTestDir.mkdir(); File remoteDir = new File(baseTestDir, "remote"); File localDir = new File(baseTestDir, "local"); remoteDir.mkdir(); localDir.mkdir(); final TreeMap<String, URLFP> urlToFPMap = new TreeMap<String, URLFP>(); final TreeMap<URLFP, String> urlFPToString = new TreeMap<URLFP, String>(); Set<String> list1 = Sets.newHashSet(urlList1); Set<String> list2 = Sets.newHashSet(urlList2); final Set<String> combined = Sets.union(list1, list2); Set<String> difference = Sets.difference(list1, list2); final Set<String> completedURLS = new HashSet<String>(); for (String url : combined) { URLFP fingerprint = URLUtils.getURLFPFromURL(url, true); urlToFPMap.put(url, fingerprint); urlFPToString.put(fingerprint, url); } File testInputFile1 = new File(localDir, "INPUT_LIST-" + System.currentTimeMillis()); File testInputFile2 = new File(localDir, "INPUT_LIST-" + (System.currentTimeMillis() + 1)); try { generateTestURLFile(testInputFile1, urlList1); generateTestURLFile(testInputFile2, urlList2); FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment .getHadoopConfig()); EventLoop eventLoop = new EventLoop(); eventLoop.start(); final CrawlHistoryManager logManager = new CrawlHistoryManager( localFileSystem, new Path(remoteDir.getAbsolutePath()), localDir, eventLoop, 0); final LinkedBlockingQueue<ProxyCrawlHistoryItem> queue = new LinkedBlockingQueue<ProxyCrawlHistoryItem>(); final Semaphore initialListComplete = new Semaphore(0); logManager.startQueueLoaderThread(new CrawlQueueLoader() { @Override public void queueURL(URLFP urlfp, String url) { ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem(); item.setOriginalURL(url); queue.add(item); } @Override public void flush() { // TODO Auto-generated method stub } }); Thread queueTestThread = new Thread(new Runnable() { @Override public void run() { while (true) { try { ProxyCrawlHistoryItem item = queue.take(); if (item.getOriginalURL().length() == 0) { break; } else { System.out.println("Got:" + item.getOriginalURL()); CrawlURL urlObject = new CrawlURL(); Assert.assertTrue(!completedURLS .contains(item.getOriginalURL())); completedURLS.add(item.getOriginalURL()); urlObject .setLastAttemptResult((byte) CrawlURL.CrawlResult.SUCCESS); urlObject.setUrl(item.getOriginalURL()); urlObject.setResultCode(200); logManager.crawlComplete(urlObject); if (completedURLS.equals(combined)) { System.out .println("Hit Trigger URL. 
Releasing InitialListComplete Sempahore"); initialListComplete.release(1); } } } catch (InterruptedException e) { } } } }); queueTestThread.start(); logManager.loadList(testInputFile1,0); logManager.loadList(testInputFile2,0); System.out.println("Waiting for Initial List to Complete"); initialListComplete.acquireUninterruptibly(); System.out.println("Woke Up"); try { eventLoop.getEventThread().join(); } catch (InterruptedException e) { e.printStackTrace(); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
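
/*
 * Typical lifecycle, as exercised by launchInTestMode() above. The paths and
 * the CrawlQueueLoader wiring in this sketch are illustrative placeholders
 * only:
 *
 *   CrawlHistoryManager mgr = new CrawlHistoryManager(
 *       localFileSystem, new Path(remoteDir.getAbsolutePath()), localDir,
 *       eventLoop, 0);
 *   mgr.startQueueLoaderThread(queueLoader);    // replay un-crawled list items
 *   long listId = mgr.loadList(urlListFile, 0); // schedule a list of urls
 *   // ... call mgr.crawlComplete(url) as the proxy finishes each fetch ...
 *   mgr.shutdown();
 */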