/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.listcrawler;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.WritableUtils;
import org.apache.log4j.BasicConfigurator;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawKeyValueComparator;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.mapred.ProxyCrawlHistoryItem;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.rpc.base.shared.BinaryProtocol;
import org.commoncrawl.service.crawler.util.URLFPBloomFilter;
import org.commoncrawl.service.listcrawler.CrawlListDomainItem;
import org.commoncrawl.service.listcrawler.CrawlListMetadata;
import org.commoncrawl.service.listcrawler.CrawlHistoryManager.ItemUpdater;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.URLFingerprint;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CRC16;
import org.commoncrawl.util.FileUtils;
import org.junit.Assert;
import com.google.gson.stream.JsonWriter;
/**
* A list of urls that need to be crawled
* @author rana
*
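* <p>A minimal usage sketch - assuming some concrete CrawlHistoryStorage
* implementation and CrawlQueueLoader are available (the storage, urlFile and
* queueLoader names here are illustrative, not prescribed by this class):</p>
* <pre>
*   CrawlHistoryStorage storage = ...; // some concrete implementation
*   // build a new list from a file of urls (one url per line, '#' starts a comment)
*   CrawlList list = new CrawlList(storage, System.currentTimeMillis(), urlFile, 0);
*   // later, queue any uncrawled (or stale) items via a CrawlQueueLoader
*   list.queueUnCrawledItems(queueLoader);
* </pre>
*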
*/
public final class CrawlList implements ItemUpdater {
// default refresh interval is 60 days ...
public static final int DEFAULT_REFRESH_INTERVAL_IN_SECS = 86400 * 60;
/**
* events generated by the CrawlList
*
* @author rana
*
*/
public static interface CrawlListEvents {
public void itemUpdated(URLFP itemFingerprint);
}
public static final Log LOG = LogFactory.getLog(CrawlList.class);
public static final int ValueFlag_HasRedirect = 1 << 0;
File _listURLDataFile = null;
File _fixedDataFile = null;
File _variableDataFile = null;
File _bloomFilterData = null;
File _listMetadataFile = null;
File _subDomainMetadataFile = null;
URLFPBloomFilter _bloomFilter = null;
long _listId;
CrawlHistoryStorage _manager;
CrawlListMetadata _metadata = new CrawlListMetadata();
CrawlListEvents _eventListener;
byte[] _tempFixedDataBuffer = null;
int _tempFixedDataBufferSize = 0;
DataOutputBuffer _tempOutputBuffer = new DataOutputBuffer(OnDiskCrawlHistoryItem.ON_DISK_SIZE);
TreeMap<Long,CrawlListMetadata> _transientSubDomainStats = new TreeMap<Long,CrawlListMetadata>();
DataOutputBuffer _offsetLookupTable = null;
Exception _exception;
public enum LoadState {
UNINITIALIZED,
QUEUED_FOR_LOADING,
REALLY_LOADING,
LOADED,
ERROR
}
LoadState _listState = LoadState.UNINITIALIZED;
public enum QueueState {
WAITING,
QUEUEING,
QUEUED,
ERROR
}
QueueState _queueState = QueueState.WAITING;
/**
* internal factory constructor
*/
private CrawlList(CrawlHistoryStorage manager,long listId, LoadState state ) {
_manager = manager;
//establish file names
initializeListFileNames();
_listId = listId;
_listState = state;
}
/**
* internal factory constructor
*/
private CrawlList(CrawlHistoryStorage manager,long listId, Exception e) {
_manager = manager;
//establish file names
initializeListFileNames();
_listId = listId;
_listState = LoadState.ERROR;
_exception = e;
}
/** is list loaded
*
*/
public boolean isListLoaded() {
return _listState == LoadState.LOADED;
}
/** mark list as loading
*
*
*/
public void markListAsReallyLoading() {
_listState = LoadState.REALLY_LOADING;
}
// get the list's load state
public LoadState getLoadState() {
return _listState;
}
// get the last caught exception (if list is in error state)
public Exception getLastException() {
return _exception;
}
/** get the list id
*
*/
public long getListId() {
return _listId;
}
/** set the event listener hook
*
* @param eventListener
*/
public synchronized void setEventListener(CrawlListEvents eventListener) {
_eventListener = eventListener;
}
public synchronized CrawlListEvents getEventListener() {
return _eventListener;
}
/** get a snapshot (clone) of the current list metadata
*
*/
public CrawlListMetadata getMetadata() {
CrawlListMetadata metadataOut = null;
synchronized (_metadata) {
try {
metadataOut = (CrawlListMetadata) _metadata.clone();
} catch (CloneNotSupportedException e) {
}
}
return metadataOut;
}
/**
*
* @return the path to the url data file (source for the urls in this list)
*/
public File getListURLDataFile() {
return _listURLDataFile;
}
/**
* Initialize a CrawlList in an error state ..
*/
public static CrawlList createListWithLoadErrorState(CrawlHistoryStorage manager,long listId,Exception e) {
return new CrawlList(manager,listId,e);
}
/**
* Initialize a CrawlList in a loading state ..
*/
public static CrawlList createListLoadingInLoadingState(CrawlHistoryStorage manager,long listId,File dataFile,int refreshInterval) {
CrawlList listOut = new CrawlList(manager,listId,LoadState.QUEUED_FOR_LOADING);
// set the interval on the list's own metadata - getMetadata() returns a clone, so mutating the clone would be lost
synchronized (listOut._metadata) {
listOut._metadata.setRefreshInterval(refreshInterval);
}
listOut._listURLDataFile = dataFile;
return listOut;
}
/**
* Load a CrawlList from previously stored disk state
*
* @param storage - reference to the crawl list history storage
* @param listId - the list id (the timestamp) for the given list to load from disk state
*/
public CrawlList(CrawlHistoryStorage storage, long listId) throws IOException {
_listId = listId;
_manager = storage;
//establish file names
initializeListFileNames();
LOG.info("Initilaizing pre-existing List with Id:" + listId);
LOG.info("Loading BloomFilterData for List:" + listId);
FileInputStream bloomFilterData = new FileInputStream(_bloomFilterData);
try {
// load bloom filter
_bloomFilter = URLFPBloomFilter.load(bloomFilterData);
}
finally {
bloomFilterData.close();
}
// load list metadata from disk
loadMetadataFromDisk();
// reset queued counts ...
_metadata.setQueuedItemCount(0);
// write it back
writeMetadataToDisk();
// load sub domain metadata from disk ...
loadSubDomainMetadataFromDisk();
// reset queued count ...
resetSubDomainCounts();
_listState = LoadState.LOADED;
}
/**
* Initialize a new CrawlList object from a given file of urls
*
* @param manager - reference to the crawl history log manager
* @param listId - the id to assign to the new list
* @param sourceURLFile - the file containing the list of urls that we should add to this list ...
* @param refreshInterval - refresh interval (in seconds) for items in this list
* @throws IOException
*/
public CrawlList(CrawlHistoryStorage manager,long listId,File sourceURLFile,int refreshInterval) throws IOException {
_manager = manager;
_listState = LoadState.REALLY_LOADING;
// initialize a new list id
_listId = listId;
LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath());
//establish file names
initializeListFileNames();
sourceURLFile.renameTo(_listURLDataFile);
FileInputStream urlInputStream = new FileInputStream(_listURLDataFile);
try {
// set we will use to hold all fingerprints generated
TreeSet<URLFP> urlSet = new TreeSet<URLFP>();
// create temp files ...
File spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId));
// create mergesortspillwriter
SequenceFileSpillWriter<URLFP,ProxyCrawlHistoryItem> spillwriter
= new SequenceFileSpillWriter<URLFP,ProxyCrawlHistoryItem>(
FileSystem.getLocal(
CrawlEnvironment.getHadoopConfig()),
CrawlEnvironment.getHadoopConfig(),
new Path(spillOutputFile.getAbsolutePath()),
URLFP.class,
ProxyCrawlHistoryItem.class,
null,false);
try {
MergeSortSpillWriter<URLFP,ProxyCrawlHistoryItem> merger
= new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>(
CrawlEnvironment.getHadoopConfig(),
spillwriter,
FileSystem.getLocal(
CrawlEnvironment.getHadoopConfig()),
new Path(manager.getLocalDataDir().getAbsolutePath()),
null,
new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() {
DataInputBuffer _key1Buffer = new DataInputBuffer();
DataInputBuffer _key2Buffer = new DataInputBuffer();
@Override
public int compareRaw(byte[] key1Data, int key1Offset,
int key1Length, byte[] key2Data, int key2Offset,
int key2Length, byte[] value1Data, int value1Offset,
int value1Length, byte[] value2Data, int value2Offset,
int value2Length) throws IOException {
_key1Buffer.reset(key1Data,key1Offset,key1Length);
_key2Buffer.reset(key2Data,key2Offset,key2Length);
_key1Buffer.skip(2); // skip version byte and 1 byte field id
_key2Buffer.skip(2); // skip version byte and 1 byte field id
int domainHash1 = WritableUtils.readVInt(_key1Buffer);
int domainHash2 = WritableUtils.readVInt(_key2Buffer);
_key1Buffer.skip(1); // skip 1 byte id
_key2Buffer.skip(1); // skip 1 byte id
long fingerprint1= WritableUtils.readVLong(_key1Buffer);
long fingerprint2= WritableUtils.readVLong(_key2Buffer);
int result = ((Integer)domainHash1).compareTo(domainHash2);
if (result == 0) {
result = ((Long)fingerprint1).compareTo(fingerprint2);
}
return result;
}
@Override
public int compare(URLFP key1, ProxyCrawlHistoryItem value1,URLFP key2, ProxyCrawlHistoryItem value2) {
return key1.compareTo(key2);
}
},
URLFP.class,
ProxyCrawlHistoryItem.class,false,null);
try {
LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List");
BufferedReader reader= new BufferedReader(new InputStreamReader(urlInputStream,Charset.forName("UTF-8")));
String line = null;
int lineNumber = 0;
ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
while ((line = reader.readLine()) != null) {
++lineNumber;
if (line.length() != 0 && !line.startsWith("#")) {
URLFP fingerprint = URLUtils.getURLFPFromURL(line, true);
if (fingerprint != null) {
if (!urlSet.contains(fingerprint)) {
// and add fingerprint to set
urlSet.add(fingerprint);
// initialize item
item.clear();
item.setOriginalURL(line);
// and spill to merger / sorter ..
merger.spillRecord(fingerprint, item);
}
}
else {
LOG.error("*** LIST:" + getListId() + " Invalid URL Encounered at Line:" + lineNumber + " URL" + line);
}
}
}
LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS");
}
finally {
merger.close();
}
}
finally {
if (spillwriter != null)
spillwriter.close();
}
LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys");
// generate bloom filter ...
_bloomFilter = new URLFPBloomFilter(urlSet.size(),7,10);
for (URLFP fingerprint : urlSet) {
_bloomFilter.add(fingerprint);
}
LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter");
// serialize it
FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData);
try {
_bloomFilter.serialize(bloomFilterStream);
}
finally {
bloomFilterStream.flush();
bloomFilterStream.close();
}
LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile);
// now initialize value map and string maps based on output sequence file ...
SequenceFile.Reader reader = new SequenceFile.Reader(
FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());
LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
// OK, Allocate room for fixed data file upfront
DataOutputBuffer valueStream = new DataOutputBuffer(urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED");
try {
//DataOutputStream valueStream = new DataOutputStream(new FileOutputStream(_fixedDataFile));
RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw");
try {
URLFP urlFP = new URLFP();
ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
// read fingerprints ...
while (reader.next(urlFP, item)) {
// write out fixed data structure and strings
writeInitialOnDiskItem(urlFP,item,valueStream,stringsStream);
}
}
finally {
//valueStream.flush();
//valueStream.close();
stringsStream.close();
}
}
finally {
reader.close();
}
LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk");
LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength() + " EXCEPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) {
throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength() + " != URLSetSize:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
}
// initialize temp data buffer variables
_tempFixedDataBuffer = valueStream.getData();
_tempFixedDataBufferSize = valueStream.getLength();
// update metadata
_metadata.setRefreshInterval(refreshInterval);
_metadata.setUrlCount(urlSet.size());
// setup version
_metadata.setVersion(1);
// and write to disk
writeMetadataToDisk();
// mark state as loaded ...
_listState = LoadState.LOADED;
LOG.info("*** LIST:" + getListId() + " SYNCING");
// reconcile with history log
_manager.syncList(this.getListId(),urlSet,this);
LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE");
// write metadata to disk again
writeMetadataToDisk();
LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA");
// and finally flush fixed data to disk
FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile);
try {
synchronized (this) {
int blockSize = 1 << 20;
long bytesCopied = 0;
for (int offset=0;offset<_tempFixedDataBufferSize;offset += blockSize) {
int bytesToCopy = Math.min(blockSize,_tempFixedDataBufferSize - offset);
finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy);
bytesCopied += bytesToCopy;
}
// validate bytes copied
if (bytesCopied != _tempFixedDataBufferSize) {
throw new IOException("Buffer Size:" + _tempFixedDataBufferSize + " Does not Match BytesCopied:" + bytesCopied);
}
// ok release the buffer
_tempFixedDataBuffer = null;
_tempFixedDataBufferSize = 0;
LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE");
}
}
finally {
finalDataStream.flush();
finalDataStream.close();
}
// load sub domain metadata from disk ...
loadSubDomainMetadataFromDisk();
}
catch (IOException e) {
LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:" + CCStringUtils.stringifyException(e));
_fixedDataFile.delete();
_variableDataFile.delete();
_bloomFilterData.delete();
_listState = LoadState.ERROR;
throw e;
}
finally {
urlInputStream.close();
}
}
/**
* update list state of a recently crawled item
*
* @param fingerprint - the fingerprint of the updated item
* @param newData - the updated crawl history data for the given item
* @throws IOException
*/
@Override
public void updateItemState(URLFP fingerprint,ProxyCrawlHistoryItem newData)throws IOException {
if (_listState == LoadState.LOADED) {
// check for membership ...
if (_bloomFilter.isPresent(fingerprint)) {
//LOG.info("UpdateItemState Called for URL:" + newData.getOriginalURL() + " List:" + getListId());
//LOG.info("UpdateItemState Loading OnDisk Item for URL:" + newData.getOriginalURL() + " List:" + getListId());
// extract existing item from disk
OnDiskCrawlHistoryItem originalItem = loadOnDiskItemForURLFP(fingerprint);
// if present (null indicates a false-positive hit in the bloom filter)
if (originalItem != null) {
// build an on disk item data structure for any potential changes ...
OnDiskCrawlHistoryItem newItem = onDiskItemFromHistoryItem(fingerprint,newData);
// set initial offset information
newItem._fileOffset = originalItem._fileOffset;
newItem._stringsOffset = originalItem._stringsOffset;
// LOG.info("UpdateItemState Comparing OnDisk Item to New Item for URL:" + newData.getOriginalURL() + " List:" + getListId());
// compare the two items ...
if (!newItem.equals(originalItem)) {
//LOG.info("UpdateItemState Items Don't Match for URL:" + newData.getOriginalURL() + " List:" + getListId());
// ok items do not match ... figure out if strings are different ...
if (newItem._stringsCRC != originalItem._stringsCRC) {
RandomAccessFile stringsFile = new RandomAccessFile(_variableDataFile, "rw");
try {
// seek to end
stringsFile.seek(stringsFile.length());
// update offset info
newItem._stringsOffset = stringsFile.length();
// write out string data length
WritableUtils.writeVInt(stringsFile,_stringBuffer1.getLength());
// write strings to log file
stringsFile.write(_stringBuffer1.getData(),0,_stringBuffer1.getLength());
}
finally {
stringsFile.close();
}
}
// otherwise take the offset from old item
else {
newItem._stringsOffset = originalItem._stringsOffset;
}
//LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + newItem._urlFingerprint);
// ok, different paths depending on whether this is an in memory update or not ...
boolean wroteToMemory = false;
synchronized (this) {
if (_tempFixedDataBuffer != null) {
wroteToMemory = true;
// reset output buffer
_tempOutputBuffer.reset();
// serialize to output buffer
newItem.serialize(_tempOutputBuffer);
// copy to appropriate location
System.arraycopy(_tempOutputBuffer.getData(), 0, _tempFixedDataBuffer,(int) originalItem._fileOffset, OnDiskCrawlHistoryItem.ON_DISK_SIZE);
}
}
if (!wroteToMemory){
// write to disk
RandomAccessFile file = new RandomAccessFile(_fixedDataFile,"rw");
try {
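// retry loop: tryLock throws OverlappingFileLockException if another thread
// in this JVM already holds a lock over an overlapping region of the file -
// when that happens we log the conflict and retry until we win the
// byte-range lock covering this record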
while (true) {
try {
//LOG.info("*** TRYING UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset);
FileLock lock = file.getChannel().tryLock(originalItem._fileOffset, OnDiskCrawlHistoryItem.ON_DISK_SIZE, false);
try {
//LOG.info("*** GOT UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset);
file.seek(originalItem._fileOffset);
newItem.serialize(file);
//LOG.info("Updated Data File for OnDiskItem for Fingerprint:" + originalItem._urlFingerprint);
break;
}
finally {
//LOG.info("*** RELEASED UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset);
lock.release();
}
}
catch (OverlappingFileLockException e) {
LOG.error("###LockConflict(RETRY):" + CCStringUtils.stringifyException(e));
}
}
}
finally {
file.close();
}
}
// ok now update metadata ...
synchronized (_metadata) {
int updateFlags = calculateUpdateFlags(originalItem, newItem);
if (updateFlags != 0) {
int metadataDirtyFlags = updateMetadata(newItem, _metadata, 0);
// only write metadata to disk if temp data buffer is null
if (metadataDirtyFlags != 0 && !wroteToMemory) {
if ((metadataDirtyFlags & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) {
_metadata.setQueuedItemCount(_metadata.getQueuedItemCount() - 1);
}
writeMetadataToDisk();
}
// if not writing to memory then update subdomain metadata
if (!wroteToMemory) {
synchronized (_subDomainMetadataFile) {
CrawlListMetadata subDomainMetadata = getSubDomainMetadataByURL(newData.getOriginalURL());
int subDomainMetadataDirtyFlags = updateMetadata(newItem, subDomainMetadata, processFileOffsets);
if (subDomainMetadataDirtyFlags != 0 && !wroteToMemory) {
if ((subDomainMetadataDirtyFlags & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) {
subDomainMetadata.setQueuedItemCount(subDomainMetadata.getQueuedItemCount() - 1);
}
writeSubDomainMetadataToDisk(subDomainMetadata);
}
}
}
}
}
synchronized (this) {
if (_eventListener != null) {
_eventListener.itemUpdated(fingerprint);
}
}
}
}
}
}
}
private static final int processOriginalStatus = 1 << 0;
private static final int processOriginalResult = 1 << 1;
private static final int processRedirectStatus = 1 << 2;
private static final int processRedirectResult = 1 << 3;
private static final int processFileOffsets = 1 << 4;
private static final int processAllItems = Integer.MAX_VALUE;
private static int calculateUpdateFlags(OnDiskCrawlHistoryItem originalItem,OnDiskCrawlHistoryItem newItem) {
int updateFlags = 0;
if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
updateFlags |= processOriginalStatus;
}
if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE)) {
updateFlags |= processOriginalResult;
}
if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
updateFlags |= processRedirectStatus;
}
if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE)) {
updateFlags |= processRedirectResult;
}
return updateFlags;
}
private static final int MetadataUpdateFlag_ModifiedCrawlStatus = 1 << 0;
private static final int MetadataUpdateFlag_ModifiedRedirectStatus = 1 << 1;
private static final int MetadataUpdateFlag_ModifiedOffsets = 1 << 2; // distinct bit - must not collide with ModifiedRedirectStatus
private static int updateMetadata(OnDiskCrawlHistoryItem newItem,CrawlListMetadata metadata, int updateFlags) {
int metadataDirtyFlags = 0;
if (!newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
//if ((updateFlags & processOrignalStatus) != 0) {
// LOG.info("### Updating OriginalCrawlStatus for Item:" + newData.getOriginalURL());
// status changed ...
if (newItem._crawlStatus != 0) {
switch (newItem._crawlStatus) {
case CrawlURL.FailureReason.RobotsExcluded: metadata.setRobotsExcludedCount(metadata.getRobotsExcludedCount() + 1);break;
case CrawlURL.FailureReason.Timeout: metadata.setTimeoutErrorCount(metadata.getTimeoutErrorCount() + 1);break;
case CrawlURL.FailureReason.IOException: metadata.setIOExceptionCount(metadata.getIOExceptionCount() + 1);break;
case CrawlURL.FailureReason.DNSFailure: metadata.setDNSErrorCount(metadata.getDNSErrorCount() + 1);break;
default: metadata.setOtherErrorCount(metadata.getOtherErrorCount() + 1);
}
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus;
}
//}
//if ((updateFlags & processOriginalResult) != 0) {
// LOG.info("### Updating OriginalResultCode for Item:" + newData.getOriginalURL());
if (newItem._crawlStatus == 0) {
if (newItem._httpResultCode == 200) metadata.setHttp200Count( metadata.getHttp200Count() + 1);
else if (newItem._httpResultCode == 301) metadata.setHttp301Count( metadata.getHttp301Count() + 1);
else if (newItem._httpResultCode == 403) metadata.setHttp403Count( metadata.getHttp403Count() + 1);
else if (newItem._httpResultCode == 404) metadata.setHttp404Count( metadata.getHttp404Count() + 1);
else if (newItem._httpResultCode >= 500 && newItem._httpResultCode < 600 ) metadata.setHttp500Count( metadata.getHttp500Count() + 1);
else if (newItem._httpResultCode >= 600 ) metadata.setHttpOtherCount( metadata.getHttpOtherCount() + 1);
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus;
}
//}
}
else {
//if ((updateFlags & processRedirectStatus) != 0) {
// status changed ...
if (newItem._redirectStatus != 0) {
switch (newItem._redirectStatus) {
case CrawlURL.FailureReason.RobotsExcluded: metadata.setRobotsExcludedCount(metadata.getRobotsExcludedCount() + 1);break;
case CrawlURL.FailureReason.Timeout: metadata.setTimeoutErrorCount(metadata.getTimeoutErrorCount() + 1);break;
case CrawlURL.FailureReason.IOException: metadata.setIOExceptionCount(metadata.getIOExceptionCount() + 1);break;
case CrawlURL.FailureReason.DNSFailure: metadata.setDNSErrorCount(metadata.getDNSErrorCount() + 1);break;
default: metadata.setOtherErrorCount(metadata.getOtherErrorCount() + 1);
}
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus;
}
//}
//if ((updateFlags & processRedirectResult) != 0) {
if (newItem._redirectStatus == 0) {
if (newItem._redirectHttpResult == 200) metadata.setHttp200Count( metadata.getHttp200Count() + 1);
else if (newItem._redirectHttpResult == 301) metadata.setHttp301Count( metadata.getHttp301Count() + 1);
else if (newItem._redirectHttpResult == 403) metadata.setHttp403Count( metadata.getHttp403Count() + 1);
else if (newItem._redirectHttpResult == 404) metadata.setHttp404Count( metadata.getHttp404Count() + 1);
else if (newItem._redirectHttpResult >= 500 && newItem._redirectHttpResult < 600 ) metadata.setHttp500Count( metadata.getHttp500Count() + 1);
else if (newItem._redirectHttpResult >= 600 ) metadata.setHttpOtherCount( metadata.getHttpOtherCount() + 1);
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus;
}
//}
}
if ((updateFlags & processFileOffsets) != 0) {
if (!metadata.isFieldDirty(CrawlListMetadata.Field_FIRSTRECORDOFFSET) || metadata.getFirstRecordOffset() > newItem._fileOffset) {
metadata.setFirstRecordOffset(newItem._fileOffset);
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedOffsets;
}
if (!metadata.isFieldDirty(CrawlListMetadata.Field_LASTRECORDOFFSET) || metadata.getLastRecordOffset() < newItem._fileOffset) {
metadata.setLastRecordOffset(newItem._fileOffset);
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedOffsets;
}
}
return metadataDirtyFlags;
}
/**
*
* @return the queue state of this list (whether or not all of its urls have been queued for crawling)
*/
public QueueState getQueuedState() {
return _queueState;
}
private int lastDomainHash = -1;
private String lastRootDomainName = null;
private CrawlListMetadata lastRootDomainMetadata = null;
private int domainQueuedCount = 0;
private void updateSubDomainMetadataForItemDuringLoad(OnDiskCrawlHistoryItem item,String itemURL,URLFP itemFP,boolean isQueued) throws IOException {
// ok unfortunately, we need to update stats for the subdomain here
if (item._domainHash != lastDomainHash) {
// update last domain hash ...
lastDomainHash = item._domainHash;
// extract root domain name
GoogleURL urlObject = new GoogleURL(itemURL);
String rootDomainName = URLUtils.extractRootDomainName(urlObject.getHost());
// if root domain name differs from the last root domain name ...
// (compare via equals - reference equality treats equal strings as different)
if (lastRootDomainName == null || !lastRootDomainName.equals(rootDomainName)) {
// flush last entry
flushCachedSubDomainMetadata();
// load new entry
if (rootDomainName != null) {
lastRootDomainName = rootDomainName;
lastRootDomainMetadata = new CrawlListMetadata();
}
}
}
// accumulate stats for every item - not just the first item of each domain ...
if (lastRootDomainMetadata != null) {
if (isQueued){
lastRootDomainMetadata.setQueuedItemCount(lastRootDomainMetadata.getQueuedItemCount() + 1);
}
else {
updateMetadata(item, lastRootDomainMetadata, 0);
}
}
}
private void flushCachedSubDomainMetadata() throws IOException {
if (lastRootDomainMetadata != null) {
// ok get the latest version of the metadata from disk
synchronized (_subDomainMetadataFile) {
// get from disk
CrawlListMetadata metadataOnDisk = getSubDomainMetadataByRootDomain(lastRootDomainName);
// update on disk version ...
metadataOnDisk.setHttp200Count(metadataOnDisk.getHttp200Count() + lastRootDomainMetadata.getHttp200Count());
metadataOnDisk.setHttp301Count(metadataOnDisk.getHttp301Count() + lastRootDomainMetadata.getHttp301Count());
metadataOnDisk.setHttp403Count(metadataOnDisk.getHttp403Count() + lastRootDomainMetadata.getHttp403Count());
metadataOnDisk.setHttp404Count(metadataOnDisk.getHttp404Count() + lastRootDomainMetadata.getHttp404Count());
metadataOnDisk.setHttp500Count(metadataOnDisk.getHttp500Count() + lastRootDomainMetadata.getHttp500Count());
metadataOnDisk.setHttpOtherCount(metadataOnDisk.getHttpOtherCount() + lastRootDomainMetadata.getHttpOtherCount());
metadataOnDisk.setRobotsExcludedCount(metadataOnDisk.getRobotsExcludedCount() + lastRootDomainMetadata.getRobotsExcludedCount());
metadataOnDisk.setTimeoutErrorCount(metadataOnDisk.getTimeoutErrorCount() + lastRootDomainMetadata.getTimeoutErrorCount());
metadataOnDisk.setIOExceptionCount(metadataOnDisk.getIOExceptionCount() + lastRootDomainMetadata.getIOExceptionCount());
metadataOnDisk.setDNSErrorCount(metadataOnDisk.getDNSErrorCount() + lastRootDomainMetadata.getDNSErrorCount());
metadataOnDisk.setOtherErrorCount(metadataOnDisk.getOtherErrorCount() + lastRootDomainMetadata.getOtherErrorCount());
metadataOnDisk.setQueuedItemCount(metadataOnDisk.getQueuedItemCount() + lastRootDomainMetadata.getQueuedItemCount());
// ok write it back to disk
writeSubDomainMetadataToDisk(metadataOnDisk);
}
lastRootDomainMetadata = null;
lastRootDomainName = null;
lastDomainHash = -1;
}
}
/** queue uncrawled urls via the CrawlQueueLoader
*
* @param loader
*/
public void queueUnCrawledItems(CrawlQueueLoader loader) throws IOException {
_queueState = QueueState.QUEUEING;
int metadataVersion = getMetadata().getVersion();
synchronized (_metadata) {
// reset metadata PERIOD
int urlCount = _metadata.getUrlCount();
_metadata.clear();
_metadata.setUrlCount(urlCount);
}
RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
try {
OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
URLFP fingerprint = new URLFP();
while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
long position = fixedDataReader.getFilePointer();
//LOG.info("*** TRYING READ LOCK FOR OFFSET:" + position);
while (true) {
// get read lock on position ...
try {
FileLock lock = fixedDataReader.getChannel().tryLock(position, OnDiskCrawlHistoryItem.ON_DISK_SIZE, false);
try {
//LOG.info("*** GOT READ LOCK FOR OFFSET:" + position);
item.deserialize(fixedDataReader);
break;
}
finally {
lock.release();
//LOG.info("*** RELEASED READ LOCK FOR OFFSET:" + position);
}
}
catch (OverlappingFileLockException e) {
LOG.error("*** LOCK CONTENTION AT:" + position + " Exception:" + CCStringUtils.stringifyException(e));
}
}
// seek to string data
stringDataReader.seek(item._stringsOffset);
// and skip buffer length
WritableUtils.readVInt(stringDataReader);
// and read primary string
String url = stringDataReader.readUTF();
// setup fingerprint
fingerprint.setDomainHash(item._domainHash);
fingerprint.setUrlHash(item._urlFingerprint);
// first, if it has never been crawled, crawl it no matter what ...
boolean crawlItem = !item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);
// if it has been crawled ... check list metadata version ...
if (!crawlItem && metadataVersion >= 1) {
// ok this is newer version of the list ...
// check refresh time if specified ...
int refreshIntervalInSeconds = DEFAULT_REFRESH_INTERVAL_IN_SECS;
if (getMetadata().getRefreshInterval() != 0) {
refreshIntervalInSeconds = getMetadata().getRefreshInterval();
}
if (item._updateTimestamp > 0) {
long lastCrawlTime = item._updateTimestamp;
// compute the interval in millis as a long - an int multiply overflows for large refresh intervals
if (System.currentTimeMillis() - lastCrawlTime >= (refreshIntervalInSeconds * 1000L)) {
crawlItem = true;
}
}
}
if (crawlItem) {
loader.queueURL(fingerprint, url);
synchronized (_metadata) {
// update queued item count
_metadata.setQueuedItemCount(_metadata.getQueuedItemCount() + 1);
}
}
else {
updateMetadata(item, _metadata,0);
}
// ok update subdomain stats
updateSubDomainMetadataForItemDuringLoad(item,url,fingerprint,crawlItem);
}
flushCachedSubDomainMetadata();
loader.flush();
_queueState = QueueState.QUEUED;
}
catch (IOException e) {
LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e));
_queueState = QueueState.ERROR;
}
finally {
fixedDataReader.close();
stringDataReader.close();
}
}
/** resubmit failed items
*
* @param loader
*/
public void requeueFailedItems(CrawlQueueLoader loader) throws IOException {
synchronized (this) {
_queueState = QueueState.QUEUEING;
}
RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
try {
OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
URLFP fingerprint = new URLFP();
while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
item.deserialize(fixedDataReader);
boolean queueItem = false;
if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
queueItem = (item._redirectStatus != 0);
if (!queueItem) {
if (item._redirectHttpResult != 200 && item._redirectHttpResult != 404) {
queueItem = true;
}
}
}
else {
queueItem = (item._crawlStatus != 0);
if (!queueItem) {
if (item._httpResultCode != 200 && item._httpResultCode != 404) {
queueItem = true;
}
}
}
if (queueItem) {
// seek to string data
stringDataReader.seek(item._stringsOffset);
// and skip buffer length
WritableUtils.readVInt(stringDataReader);
// and read primary string
String url = stringDataReader.readUTF();
// and spill
fingerprint.setDomainHash(item._domainHash);
fingerprint.setUrlHash(item._urlFingerprint);
loader.queueURL(fingerprint, url);
}
}
}
loader.flush();
_queueState = QueueState.QUEUED;
}
catch (IOException e) {
LOG.error("Encountered Exception Requeueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e));
_queueState = QueueState.ERROR;
}
finally {
fixedDataReader.close();
stringDataReader.close();
}
}
/**
*
* @param localLogFileDir
* @param listId
* @return
*/
public static boolean allFilesPresent(File localLogFileDir,long listId) {
//establish file names
File urlDataFile = new File(localLogFileDir,LIST_URL_DATA_PREFIX + Long.toString(listId));
File fixedDataFile = new File(localLogFileDir,LIST_VALUE_MAP_PREFIX + Long.toString(listId));
File variableDataFile = new File(localLogFileDir,LIST_STRING_MAP_PREFIX + Long.toString(listId));
File bloomFilterFile = new File(localLogFileDir,LIST_BLOOM_DATA_PREFIX + Long.toString(listId));
if (urlDataFile.exists() &&
fixedDataFile.exists() &&
variableDataFile.exists() &&
bloomFilterFile.exists()) {
return true;
}
return false;
}
public static final String LIST_URL_DATA_PREFIX = "listURLS-";
public static final String LIST_VALUE_MAP_PREFIX = "listValueMap-";
public static final String LIST_STRING_MAP_PREFIX = "listStringMap-";
public static final String LIST_BLOOM_DATA_PREFIX = "listBloomFilter-";
public static final String LIST_METADATA_PREFIX = "listMetadata-";
public static final String LIST_SUBDOMAIN_METADATA_PREFIX = "listSubDomainMetadata-";
private void initializeListFileNames() {
//establish file names
_listURLDataFile = new File(_manager.getLocalDataDir(),LIST_URL_DATA_PREFIX + Long.toString(_listId));
_fixedDataFile = new File(_manager.getLocalDataDir(),LIST_VALUE_MAP_PREFIX + Long.toString(_listId));
_variableDataFile = new File(_manager.getLocalDataDir(),LIST_STRING_MAP_PREFIX + Long.toString(_listId));
_bloomFilterData = new File(_manager.getLocalDataDir(),LIST_BLOOM_DATA_PREFIX + Long.toString(_listId));
_listMetadataFile = new File(_manager.getLocalDataDir(),LIST_METADATA_PREFIX + Long.toString(_listId));
_subDomainMetadataFile = new File(_manager.getLocalDataDir(),LIST_SUBDOMAIN_METADATA_PREFIX + Long.toString(_listId));
}
private static class OnDiskCrawlHistoryItem {
public long _fileOffset = -1;
int _domainHash = -1; // 4
long _urlFingerprint = -1; // 8
int _stringsCRC = -1; // 4
long _stringsOffset = -1; // 8
byte _flags = 0; // 1
byte _crawlStatus = -1; // 1
short _httpResultCode = -1; // 2
byte _redirectStatus = -1; // 1
short _redirectHttpResult = -1; // 2
long _updateTimestamp = -1; // 8
//__
// 39 bytes
public static final int ON_DISK_SIZE = 39;
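// record layout (matches serialize/deserialize below):
// int domainHash (4) + long urlFingerprint (8) + int stringsCRC (4) +
// long stringsOffset (8) + byte flags (1) + byte crawlStatus (1) +
// short httpResultCode (2) + byte redirectStatus (1) +
// short redirectHttpResult (2) + long updateTimestamp (8) = 39 bytes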
public static final int FLAG_HAS_CRAWL_STATUS = 1;
public static final int FLAG_HAS_ORIGINAL_RESULT_CODE = 2;
public static final int FLAG_HAS_REDIRECT_URL = 4;
public static final int FLAG_HAS_REDIRECT_STATUS = 8;
public static final int FLAG_HAS_REDIRECT_RESULT_CODE = 16;
public static final int FLAG_HAS_LASTMODIFIED_TIME = 32;
public int compareFingerprints(URLFP fp) {
int result = ((Integer)_domainHash).compareTo(fp.getDomainHash());
if (result == 0) {
result = ((Long)_urlFingerprint).compareTo(fp.getUrlHash());
}
return result;
}
@Override
public boolean equals(Object obj) {
if (obj instanceof OnDiskCrawlHistoryItem) {
OnDiskCrawlHistoryItem other = (OnDiskCrawlHistoryItem)obj;
if (_domainHash == other._domainHash &&
_urlFingerprint == other._urlFingerprint &&
_stringsCRC == other._stringsCRC &&
_flags == other._flags &&
_crawlStatus == other._crawlStatus &&
_httpResultCode == other._httpResultCode &&
_redirectStatus == other._redirectStatus &&
_redirectHttpResult == other._redirectHttpResult) {
return true;
}
}
return false;
}
public void setFlag(int flag) {
_flags |= flag;
}
public boolean isFlagSet(int flag) {
return ((_flags & flag) != 0);
}
public void serialize(DataOutput out) throws IOException {
out.writeInt(_domainHash);
out.writeLong(_urlFingerprint);
out.writeInt(_stringsCRC);
out.writeLong(_stringsOffset);
out.write(_flags);
out.writeByte(_crawlStatus);
out.writeShort(_httpResultCode);
out.writeByte(_redirectStatus);
out.writeShort(_redirectHttpResult);
out.writeLong(_updateTimestamp);
}
public void deserialize(DataInput in) throws IOException {
_domainHash = in.readInt();
_urlFingerprint = in.readLong();
_stringsCRC = in.readInt();
_stringsOffset = in.readLong();
_flags = in.readByte();
_crawlStatus = in.readByte();
_httpResultCode = in.readShort();
_redirectStatus = in.readByte();
_redirectHttpResult = in.readShort();
_updateTimestamp = in.readLong();
}
}
DataOutputBuffer _stringBuffer1 = new DataOutputBuffer();
DataOutputBuffer _stringBuffer2 = new DataOutputBuffer();
CRC16 _stringCRC = new CRC16();
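// the strings CRC gives updateItemState a cheap way to detect whether the
// variable length string data (original url plus optional redirect url)
// changed - when the CRC is unchanged the item keeps its existing strings
// offset instead of appending a fresh copy to the variable data file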
private OnDiskCrawlHistoryItem onDiskItemFromHistoryItem(URLFP fingerprint, ProxyCrawlHistoryItem item) throws IOException {
OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();
itemOut._domainHash = fingerprint.getDomainHash();
itemOut._urlFingerprint = fingerprint.getUrlHash();
itemOut._stringsCRC = calculateStringCRC(item,_stringBuffer1);
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_CRAWLSTATUS)) {
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);
itemOut._crawlStatus = (byte) item.getCrawlStatus();
}
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_HTTPRESULTCODE)) {
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE);
itemOut._httpResultCode = (short) item.getHttpResultCode();
}
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)) {
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_URL);
}
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTSTATUS)) {
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS);
itemOut._redirectStatus = (byte)item.getRedirectStatus();
}
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTHTTPRESULT)) {
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE);
itemOut._redirectHttpResult = (short)item.getRedirectHttpResult();
}
// update last modified time if present ....
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_LASTMODIFIEDTIME) &&
item.getLastModifiedTime() > 0) {
itemOut._updateTimestamp = Math.max(itemOut._updateTimestamp, item.getLastModifiedTime());
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_LASTMODIFIED_TIME);
}
return itemOut;
}
private int calculateStringCRC(ProxyCrawlHistoryItem item,DataOutputBuffer stringBuffer)throws IOException {
stringBuffer.reset();
stringBuffer.writeUTF(item.getOriginalURL());
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)) {
stringBuffer.writeUTF(item.getRedirectURL());
}
_stringCRC.reset();
_stringCRC.update(stringBuffer.getData(), 0, stringBuffer.getLength());
return (int)_stringCRC.getValue();
}
private void writeInitialOnDiskItem(URLFP fp,ProxyCrawlHistoryItem historyItem,DataOutputStream valueStreamOut,RandomAccessFile stringStream) throws IOException {
OnDiskCrawlHistoryItem itemOut = onDiskItemFromHistoryItem(fp, historyItem);
// update string offset ...
itemOut._stringsOffset = stringStream.length();
// write out string data length
WritableUtils.writeVInt(stringStream,_stringBuffer1.getLength());
// write strings to log file
stringStream.write(_stringBuffer1.getData(),0,_stringBuffer1.getLength());
// update timestamp ...
itemOut._updateTimestamp = -1;
// and write to disk
itemOut.serialize(valueStreamOut);
}
private void dumpFixedDataFile() {
try {
RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile,"rw");
try {
OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
int index =0;
while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
item.deserialize(fixedDataReader);
LOG.info("Item at Index:" + index++ + " Domain:" + item._domainHash + " URLFP:" + item._urlFingerprint);
}
}
finally {
fixedDataReader.close();
}
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
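/**
* locate the on-disk record for the given fingerprint.
*
* records in the fixed data file (or its in-memory copy during a load) are
* sorted by (domainHash, urlFingerprint) - the ordering established by the
* merge sort at list creation time - so lookup is a binary search over the
* fixed ON_DISK_SIZE byte slots: a list of a million urls resolves in at
* most ~20 probes.
*
* @return the matching item with _fileOffset populated, or null if no
* record matches
*/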
private OnDiskCrawlHistoryItem loadOnDiskItemForURLFP(URLFP fingerprint) throws IOException {
// see if state is cached in memory ...
boolean loadedFromMemory = false;
synchronized (this) {
if (_tempFixedDataBuffer != null) {
loadedFromMemory = true;
int low = 0;
int high = (int)(_tempFixedDataBufferSize / OnDiskCrawlHistoryItem.ON_DISK_SIZE) -1;
OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();
DataInputBuffer inputBuffer = new DataInputBuffer();
int iterationNumber = 0;
while (low <= high) {
++iterationNumber;
int mid = low + ((high - low) / 2);
inputBuffer.reset(_tempFixedDataBuffer,0,_tempFixedDataBufferSize);
inputBuffer.skip(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
// deserialize
itemOut.deserialize(inputBuffer);
// now compare it against desired hash value ...
int comparisonResult = itemOut.compareFingerprints(fingerprint);
if (comparisonResult > 0)
high = mid - 1;
else if (comparisonResult < 0)
low = mid + 1;
else {
// cache offset
itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;
// LOG.info("Found Match. Took:"+ iterationNumber + " iterations");
// and return item
return itemOut;
}
}
//LOG.error("Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations");
}
}
if (!loadedFromMemory) {
//load from disk
//LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + fingerprint.getUrlHash());
RandomAccessFile file = new RandomAccessFile(_fixedDataFile,"rw");
// allocate buffer upfront
byte[] onDiskItemBuffer = new byte[OnDiskCrawlHistoryItem.ON_DISK_SIZE];
DataInputBuffer inputStream = new DataInputBuffer();
//LOG.info("Opened Data File. Searching for match");
try {
int low = 0;
int high = (int)(file.length() / OnDiskCrawlHistoryItem.ON_DISK_SIZE) -1;
OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();
int iterationNumber = 0;
while (low <= high) {
++iterationNumber;
int mid = low + ((high - low) / 2);
// seek to proper location
file.seek(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
// read the data structure
file.readFully(onDiskItemBuffer, 0, onDiskItemBuffer.length);
// map location in file
//MappedByteBuffer memoryBuffer = file.getChannel().map(MapMode.READ_ONLY,mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE,OnDiskCrawlHistoryItem.ON_DISK_SIZE);
//DataInputStream inputStream = new DataInputStream(new ByteBufferInputStream(memoryBuffer));
inputStream.reset(onDiskItemBuffer,0,OnDiskCrawlHistoryItem.ON_DISK_SIZE);
// deserialize
itemOut.deserialize(inputStream);
// memoryBuffer = null;
//inputStream = null;
// now compare it against desired hash value ...
int comparisonResult = itemOut.compareFingerprints(fingerprint);
if (comparisonResult > 0)
high = mid - 1;
else if (comparisonResult < 0)
low = mid + 1;
else {
// cache offset
itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;
// LOG.info("Found Match. Took:"+ iterationNumber + " iterations");
// and return item
return itemOut;
}
}
//LOG.error("******Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations");
//DEBUG ONLY !
// dumpFixedDataFile();
}
finally {
file.close();
}
}
return null;
}
private ProxyCrawlHistoryItem getHistoryItemFromURLFP(URLFP fingerprint) throws IOException {
OnDiskCrawlHistoryItem item = loadOnDiskItemForURLFP(fingerprint);
if (item != null) {
return getHistoryItemFromOnDiskItem(item);
}
return null;
}
private ProxyCrawlHistoryItem getHistoryItemFromOnDiskItem(OnDiskCrawlHistoryItem item) throws IOException {
ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS) != 0)
itemOut.setCrawlStatus(item._crawlStatus);
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE) != 0)
itemOut.setHttpResultCode(item._httpResultCode);
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS) != 0)
itemOut.setRedirectStatus(item._redirectStatus);
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE) != 0)
itemOut.setRedirectHttpResult(item._redirectHttpResult);
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_LASTMODIFIED_TIME) != 0)
itemOut.setLastModifiedTime(item._updateTimestamp);
// now attempt to get the string offset
RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
try {
// seek to string data
stringDataReader.seek(item._stringsOffset);
// and skip buffer length
WritableUtils.readVInt(stringDataReader);
// now populate original url ...
itemOut.setOriginalURL(stringDataReader.readUTF());
// now if redirect url is present
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_URL) != 0) {
itemOut.setRedirectURL(stringDataReader.readUTF());
}
}
finally {
stringDataReader.close();
}
return itemOut;
}
/**
* deserialize metadata from disk
*
* @throws IOException
*/
void loadMetadataFromDisk()throws IOException {
// skip metadata load if sub-domain metadata file is missing...
// in this case, metadata will be rebuilt during subdomain metadata rescan ...
if (_subDomainMetadataFile.exists()) {
RandomAccessFile file = new RandomAccessFile(_listMetadataFile,"rw");
try {
_metadata.deserialize(file, new BinaryProtocol());
int urlCount = _metadata.getUrlCount();
_metadata.clear();
_metadata.setUrlCount(urlCount);
}
finally {
file.close();
}
}
}
/**
* serialize metadata to disk
* @throws IOException
*/
void writeMetadataToDisk()throws IOException {
synchronized(_metadata) {
RandomAccessFile file = new RandomAccessFile(_listMetadataFile,"rw");
try {
file.seek(0);
_metadata.serialize(file, new BinaryProtocol());
}
finally {
file.close();
}
}
}
public static void generateTestURLFile(File outputFile,String... urlList)throws IOException {
PrintWriter writer = new PrintWriter(outputFile,"UTF-8");
for (String url : urlList) {
writer.println(url);
}
writer.flush();
writer.close();
}
private static void validateListCode(final File dataDirectory,long listId) throws IOException {
final String urlList[] = new String[] {
"http://www.yahoo.com/1",
"http://www.google.com/1",
"http://www.cnn.com/1",
"http://www.yahoo.com/2",
"http://www.google.com/2",
"http://www.cnn.com/2"
};
File tempFile = File.createTempFile("CrawlList", "validateListInit");
File localTempFile = new File(dataDirectory,tempFile.getName());
generateTestURLFile(localTempFile,urlList);
final TreeMap<String,URLFP> urlToFPMap = new TreeMap<String,URLFP>();
final TreeMap<URLFP,String> urlFPToString = new TreeMap<URLFP,String>();
for (String url : urlList) {
URLFP fp = URLUtils.getURLFPFromURL(url, true);
urlToFPMap.put(url, fp);
urlFPToString.put(fp, url);
}
final TreeMap<URLFP,ProxyCrawlHistoryItem> itemsToMarkComplete = new TreeMap<URLFP,ProxyCrawlHistoryItem>();
ProxyCrawlHistoryItem item1 = new ProxyCrawlHistoryItem();
item1.setCrawlStatus(CrawlURL.FailureReason.RobotsExcluded);
item1.setOriginalURL(urlList[1]);
ProxyCrawlHistoryItem item2 = new ProxyCrawlHistoryItem();
item2.setCrawlStatus(0);
item2.setOriginalURL(urlList[3]);
item2.setHttpResultCode(301);
item2.setRedirectURL("http://www.yahoo.com/3");
item2.setRedirectStatus(0);
item2.setRedirectHttpResult(200);
ProxyCrawlHistoryItem item3 = new ProxyCrawlHistoryItem();
item3.setCrawlStatus(0);
item3.setOriginalURL(urlList[4]);
item3.setHttpResultCode(301);
item3.setRedirectURL("http://www.google.com/3");
item3.setRedirectStatus(CrawlURL.FailureReason.IOException);
itemsToMarkComplete.put(urlToFPMap.get(item1.getOriginalURL()), item1);
itemsToMarkComplete.put(urlToFPMap.get(item2.getOriginalURL()), item2);
itemsToMarkComplete.put(urlToFPMap.get(item3.getOriginalURL()), item3);
final Set<URLFP> itemsToMarkCompleteFPSet= itemsToMarkComplete.keySet();
final Set<URLFP> itemsNotMarked = new TreeSet<URLFP>(urlToFPMap.values());
itemsNotMarked.removeAll(itemsToMarkCompleteFPSet);
CrawlHistoryStorage storage = new CrawlHistoryStorage() {
@Override
public void syncList(long listId,TreeSet<URLFP> matchCriteria, ItemUpdater targetList) throws IOException {
for (URLFP matchItem : matchCriteria) {
if (itemsToMarkCompleteFPSet.contains(matchItem)) {
targetList.updateItemState(matchItem, itemsToMarkComplete.get(matchItem));
}
}
}
@Override
public File getLocalDataDir() {
return dataDirectory;
}
};
CrawlList list1 = new CrawlList(storage,listId,localTempFile,0);
for (int pass=0;pass<2;++pass) {
CrawlList list = null;
if (pass == 0) {
System.out.println("Pass 0 - Initialize from URLList");
list = list1;
}
else {
System.out.println("Pass 1 - Initialize from OnDisk Data");
list = new CrawlList(storage, listId);
}
// iterate fingerprints
for (URLFP fingerprint : urlToFPMap.values()) {
ProxyCrawlHistoryItem itemRetrieved = list.getHistoryItemFromURLFP(fingerprint);
if (itemsToMarkCompleteFPSet.contains(fingerprint)) {
ProxyCrawlHistoryItem itemExpected = itemsToMarkComplete.get(fingerprint);
Assert.assertTrue(itemExpected.equals(itemRetrieved));
}
else {
Assert.assertTrue(itemRetrieved.getOriginalURL().equals(urlFPToString.get(fingerprint)) &&
!itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_CRAWLSTATUS) &&
!itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_HTTPRESULTCODE) &&
!itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTHTTPRESULT) &&
!itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTSTATUS) &&
!itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL));
}
}
}
// validate string code does not update when strings have not changed
item3.setRedirectStatus(0);
item3.setRedirectHttpResult(200);
long variableDataLength = list1._variableDataFile.length();
long fixedDataLength = list1._fixedDataFile.length();
list1.updateItemState(urlToFPMap.get(item3.getOriginalURL()), item3);
Assert.assertTrue(fixedDataLength == list1._fixedDataFile.length());
Assert.assertTrue(variableDataLength == list1._variableDataFile.length());
list1.queueUnCrawledItems(new CrawlQueueLoader() {
@Override
public void queueURL(URLFP urlfp, String url) {
Assert.assertTrue(itemsNotMarked.contains(urlfp));
Assert.assertTrue(urlFPToString.get(urlfp).equals(url));
}
@Override
public void flush() {
// no-op - nothing to flush in this test loader
}
});
}
public static void testmain(String[] args) {
// initialize ...
Configuration conf = new Configuration();
conf.addResource("nutch-default.xml");
conf.addResource("nutch-site.xml");
conf.addResource("core-site.xml");
conf.addResource("hdfs-site.xml");
conf.addResource("mapred-site.xml");
BasicConfigurator.configure();
conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
conf.set("mapred.map.output.compression.codec","org.apache.hadoop.io.compress.GzipCodec");
CrawlEnvironment.setHadoopConfig(conf);
CrawlEnvironment.setDefaultHadoopFSURI("file:///");
File testDirectory = new File("/tmp/CrawlListTests");
FileUtils.recursivelyDeleteFile(testDirectory);
testDirectory.mkdir();
try {
validateListCode(testDirectory,System.currentTimeMillis());
} catch (IOException e) {
e.printStackTrace();
}
}
private static final int OFFSET_TABLE_ENTRY_SIZE = 12;
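// each offset table entry is a long domainHash (8) plus an int file offset (4).
// the sub-domain metadata file itself (see writeInitialSubDomainMetadataToDisk)
// is a one byte version, a four byte record count, then one fixed-size
// (CrawlListMetadata.Constants.FixedDataSize) record per root domain; the
// in-memory _offsetLookupTable is sorted by hash, so getOffsetForSubDomainData
// can binary search it to find a domain's record offset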
private final int getOffsetForSubDomainData(long domainHash) throws IOException {
DataInputBuffer inputBuffer = new DataInputBuffer();
int low = 0;
int high = (int)(_offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE) -1;
while (low <= high) {
int mid = low + ((high - low) / 2);
inputBuffer.reset(_offsetLookupTable.getData(),_offsetLookupTable.getLength());
inputBuffer.skip(mid * OFFSET_TABLE_ENTRY_SIZE);
// deserialize
long hash = inputBuffer.readLong();
// now compare it against desired hash value ...
int comparisonResult = ((Long)hash).compareTo(domainHash);
if (comparisonResult > 0)
high = mid - 1;
else if (comparisonResult < 0)
low = mid + 1;
else {
return inputBuffer.readInt();
}
}
throw new IOException("NOT-FOUND!");
}
void updateSubDomainQueueStatus(String rootDomainName,int deltaQueuedCount)throws IOException {
long domainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName);
synchronized (_subDomainMetadataFile) {
CrawlListMetadata metadata = new CrawlListMetadata();
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
try {
int dataOffset = getOffsetForSubDomainData(domainHash);
if (dataOffset == 0) {
throw new IOException("Data Offset Zero for host:" + rootDomainName);
}
file.seek(dataOffset);
metadata.readFields(file);
// update the queued item count by the given delta
metadata.setQueuedItemCount(metadata.getQueuedItemCount() + deltaQueuedCount);
// ok reseek to data offset
file.seek(dataOffset);
// rewrite the data structure
metadata.write(file);
}
finally {
file.close();
}
}
}
public CrawlListMetadata getSubDomainMetadataByURL(String originalURL) throws IOException {
GoogleURL urlObject = new GoogleURL(originalURL);
return getSubDomainMetadataByDomain(urlObject.getHost());
}
public CrawlListMetadata getSubDomainMetadataByDomain(String hostName) throws IOException {
String rootDomainName = URLUtils.extractRootDomainName(hostName);
if (rootDomainName != null) {
return getSubDomainMetadataByRootDomain(rootDomainName);
}
throw new IOException("Unable to Extract RootDomainName for host:" + hostName);
}
public CrawlListMetadata getSubDomainMetadataByRootDomain(String rootDomainName) throws IOException {
long domainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName);
CrawlListMetadata metadata = new CrawlListMetadata();
synchronized (_subDomainMetadataFile) {
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
try {
int dataOffset = getOffsetForSubDomainData(domainHash);
if (dataOffset == 0) {
throw new IOException("Data Offset Zero for host:" + rootDomainName);
}
file.seek(dataOffset);
metadata.readFields(file);
// set the data offset on the way out so that updates write to the proper location
metadata.setSubDomainDataOffset(dataOffset);
}
finally {
file.close();
}
}
return metadata;
}
// get subdomain metadata
CrawlListMetadata getTransientSubDomainMetadata(String originalURL)throws IOException {
GoogleURL urlObject = new GoogleURL(originalURL);
String rootDomainName = URLUtils.extractRootDomainName(urlObject.getHost());
if (rootDomainName != null) {
long domainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName);
CrawlListMetadata metadata = _transientSubDomainStats.get(domainHash);
if (metadata == null) {
metadata = new CrawlListMetadata();
_transientSubDomainStats.put(domainHash, metadata);
metadata.setDomainName(rootDomainName);
metadata.setDomainHash(domainHash);
}
return metadata;
}
throw new IOException("Unable to Extract RootDomainName for url:" + originalURL);
}
/**
* serialize metadata to disk
* @throws IOException
*/
void writeSubDomainMetadataToDisk(CrawlListMetadata subDomainData)throws IOException {
DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);
subDomainData.serialize(outputBuffer, new BinaryProtocol());
if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) {
LOG.error("ListMetadata Serialize for List:" + subDomainData.getDomainName() + " > FixedDataSize!!!");
outputBuffer.reset();
subDomainData.setDomainName("<<CORRUPT>>");
subDomainData.serialize(outputBuffer, new BinaryProtocol());
}
synchronized (_subDomainMetadataFile) {
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
try {
if (subDomainData.getSubDomainDataOffset() == 0) {
throw new IOException("Data Offset Zero during write!");
}
file.seek(subDomainData.getSubDomainDataOffset());
file.write(outputBuffer.getData(),0,outputBuffer.getLength());
}
finally {
file.close();
}
}
}
void writeInitialSubDomainMetadataToDisk() throws IOException {
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
try {
file.writeByte(0); // version
file.writeInt(_transientSubDomainStats.size());
ArrayList<CrawlListMetadata> sortedMetadata = new ArrayList<CrawlListMetadata>();
sortedMetadata.addAll(_transientSubDomainStats.values());
_transientSubDomainStats = null;
CrawlListMetadata[] metadataArray = sortedMetadata.toArray(new CrawlListMetadata[0]);
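// sort by descending url count, breaking ties by ascending domain name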
Arrays.sort(metadataArray,new Comparator<CrawlListMetadata>() {
@Override
public int compare(CrawlListMetadata o1, CrawlListMetadata o2) {
int result = ((Integer)o2.getUrlCount()).compareTo(o1.getUrlCount());
if (result == 0) {
result = o1.getDomainName().compareTo(o2.getDomainName());
}
return result;
}
});
DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);
TreeMap<Long,Integer> idToOffsetMap = new TreeMap<Long,Integer>();
for (CrawlListMetadata entry : metadataArray) {
// reset output buffer
outputBuffer.reset();
// write item to disk
entry.serialize(outputBuffer, new BinaryProtocol() );
if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) {
LOG.fatal("Metadata Serialization for List:" + getListId() + " SubDomain:" + entry.getDomainName() + " exceeded FixedDataSize!");
// fall back to a sentinel record (mirroring writeSubDomainMetadataToDisk) so the
// fixed-size slot stays parseable instead of writing a truncated record
outputBuffer.reset();
entry.setDomainName("<<CORRUPT>>");
entry.serialize(outputBuffer, new BinaryProtocol());
}
// save offset
idToOffsetMap.put(entry.getDomainHash(), (int)file.getFilePointer());
// write out fixed data size
file.write(outputBuffer.getData(),0,CrawlListMetadata.Constants.FixedDataSize);
}
// build the in-memory (hash -> offset) lookup table
_offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);
for (Map.Entry<Long,Integer> entry : idToOffsetMap.entrySet()) {
_offsetLookupTable.writeLong(entry.getKey());
_offsetLookupTable.writeInt(entry.getValue());
}
}
finally {
file.close();
}
}
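/**
 * rewrite every subdomain metadata record in place, clearing all counters
 * except url count, first/last record offsets, domain name and domain hash -
 * used to reset queued state without rebuilding the file from scratch
 */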
void resetSubDomainCounts() throws IOException {
LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts.");
if (_subDomainMetadataFile.exists()) {
LOG.info("*** LIST:" + getListId() + " FILE EXISTS .");
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
DataInputBuffer inputBuffer = new DataInputBuffer();
DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);
try {
// skip version
file.read();
// read item count
int itemCount = file.readInt();
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:"+ itemCount);
CrawlListMetadata newMetadata = new CrawlListMetadata();
for (int i=0;i<itemCount;++i) {
long originalPos = file.getFilePointer();
file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
inputBuffer.reset(outputBuffer.getData(),CrawlListMetadata.Constants.FixedDataSize);
try {
newMetadata.deserialize(inputBuffer, new BinaryProtocol());
}
catch (Exception e) {
LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:" + CCStringUtils.stringifyException(e));
}
// ok reset everything except hashes and first/last url pointers
int urlCount = newMetadata.getUrlCount();
long firstRecordOffset = newMetadata.getFirstRecordOffset();
long lastRecordOffset = newMetadata.getLastRecordOffset();
String domainName = newMetadata.getDomainName();
long domainHash = newMetadata.getDomainHash();
// reset
newMetadata.clear();
// restore
newMetadata.setUrlCount(urlCount);
newMetadata.setFirstRecordOffset(firstRecordOffset);
newMetadata.setLastRecordOffset(lastRecordOffset);
newMetadata.setDomainName(domainName);
newMetadata.setDomainHash(domainHash);
// serialize it ...
outputBuffer.reset();
newMetadata.serialize(outputBuffer, new BinaryProtocol());
// write it back to disk
file.seek(originalPos);
// and rewrite it ...
file.write(outputBuffer.getData(),0,CrawlListMetadata.Constants.FixedDataSize);
}
}
finally {
file.close();
}
LOG.info("*** LIST:" + getListId() + " DONE RESETTIGN SUBDOMAIN METADATA QUEUE COUNTS");
}
}
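/**
 * load subdomain metadata. fast path: if the metadata file exists, scan its
 * fixed-size records once to rebuild the in-memory hash -> offset lookup
 * table. slow path: if the file is missing, replay the list's fixed and
 * variable data files to reconstruct per-subdomain stats, then persist them
 * via writeInitialSubDomainMetadataToDisk.
 */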
void loadSubDomainMetadataFromDisk()throws IOException {
LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ... ");
if (_subDomainMetadataFile.exists()) {
LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
DataInputBuffer inputBuffer = new DataInputBuffer();
byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];
try {
// skip version
file.read();
// read item count
int itemCount = file.readInt();
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:"+ itemCount);
CrawlListMetadata newMetadata = new CrawlListMetadata();
TreeMap<Long,Integer> idToOffsetMap = new TreeMap<Long,Integer>();
for (int i=0;i<itemCount;++i) {
long originalPos = file.getFilePointer();
file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
inputBuffer.reset(fixedDataBlock,fixedDataBlock.length);
try {
newMetadata.deserialize(inputBuffer, new BinaryProtocol());
}
catch (Exception e) {
LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:" + CCStringUtils.stringifyException(e));
}
idToOffsetMap.put(newMetadata.getDomainHash(), (int)originalPos);
}
// build the in-memory (hash -> offset) lookup table
_offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);
for (Map.Entry<Long,Integer> entry : idToOffsetMap.entrySet()) {
_offsetLookupTable.writeLong(entry.getKey());
_offsetLookupTable.writeInt(entry.getValue());
}
}
finally {
file.close();
}
LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
}
else {
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");
RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
try {
//ok rebuild top level metadata as well
_metadata.clear();
OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
int processedCount = 0;
while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
long position = fixedDataReader.getFilePointer();
// store offset in item
item._fileOffset = position;
// load from disk
item.deserialize(fixedDataReader);
try {
// seek to string data
stringDataReader.seek(item._stringsOffset);
// and skip buffer length
WritableUtils.readVInt(stringDataReader);
// and read primary string
String url = stringDataReader.readUTF();
// get metadata object for subdomain
CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);
// increment url count
subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);
// increment top level metadata count
_metadata.setUrlCount(_metadata.getUrlCount() + 1);
// update top level metadata ..
updateMetadata(item, _metadata, 0);
// update sub-domain metadata object from item data
updateMetadata(item, subDomainMetadata, 0);
++processedCount;
}
catch (IOException e) {
LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
LOG.error("Exception:" + CCStringUtils.stringifyException(e));
LOG.error("File Position:"+ fixedDataReader.getFilePointer() + " StringsPointer:" + stringDataReader.getFilePointer());
}
if (processedCount % 10000 == 0) {
LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
}
}
// ok commit top level metadata to disk as well
writeMetadataToDisk();
}
catch (IOException e) {
LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e));
LOG.error("File Position:"+ fixedDataReader.getFilePointer() + " StringsPointer:" + stringDataReader.getFilePointer());
_queueState = QueueState.QUEUED;
}
finally {
fixedDataReader.close();
stringDataReader.close();
}
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");
// write metadat to disk
writeInitialSubDomainMetadataToDisk();
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
}
}
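/** number of subdomain records, derived from the size of the offset lookup table */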
public int getSubDomainItemCount() {
synchronized (_metadata) {
return _offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE;
}
}
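/**
 * return a page of subdomain summaries - offset/count select a window into
 * the records in file order (the descending-url-count order in which they
 * were originally written). for example, getSubDomainList(0, 50) returns
 * the first page of up to 50 items.
 */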
public ArrayList<CrawlListDomainItem> getSubDomainList(int offset,int count) {
synchronized (_metadata) {
ArrayList<CrawlListDomainItem> itemsOut = new ArrayList<CrawlListDomainItem>();
try {
synchronized (_subDomainMetadataFile) {
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
DataInputBuffer inputBuffer = new DataInputBuffer();
byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];
try {
// skip version
file.read();
// read item count
int itemCount = file.readInt();
int i = offset;
int end = Math.min(i+count,itemCount);
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:"+ itemCount);
if (i<itemCount) {
file.seek(5 + (CrawlListMetadata.Constants.FixedDataSize * offset));
CrawlListMetadata newMetadata = new CrawlListMetadata();
for (;i<end;++i) {
file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
inputBuffer.reset(fixedDataBlock,fixedDataBlock.length);
newMetadata.deserialize(inputBuffer, new BinaryProtocol());
itemsOut.add(buildSubDomainSummary(newMetadata.getDomainName(),newMetadata));
}
}
}
finally {
file.close();
}
}
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
return itemsOut;
}
}
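/** collapse a CrawlListMetadata record into the summary item exposed to list consumers */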
private static CrawlListDomainItem buildSubDomainSummary(String domainName,CrawlListMetadata metadata) {
CrawlListDomainItem domainItem = new CrawlListDomainItem();
domainItem.setDomainName(domainName);
int robotsExcludedItemsCount = metadata.getRobotsExcludedCount();
int errorItemsCount = 0;
int otherHTTPResultsCount = 0;
errorItemsCount += metadata.getTimeoutErrorCount();
errorItemsCount += metadata.getIOExceptionCount();
errorItemsCount += metadata.getDNSErrorCount();
errorItemsCount += metadata.getOtherErrorCount();
otherHTTPResultsCount += metadata.getHttp403Count();
otherHTTPResultsCount += metadata.getHttp404Count();
otherHTTPResultsCount += metadata.getHttp500Count();
otherHTTPResultsCount += metadata.getHttpOtherCount();
domainItem.setUrlCount(metadata.getUrlCount());
domainItem.setUrlsCrawled(metadata.getHttp200Count()+otherHTTPResultsCount);
domainItem.setHttp200Count(metadata.getHttp200Count());
domainItem.setInCacheItemsCount(0);
domainItem.setRobotsExcludedCount(robotsExcludedItemsCount);
domainItem.setErrorCount(errorItemsCount);
domainItem.setFirstItemOffset(metadata.getFirstRecordOffset());
domainItem.setLastItemOffset(metadata.getLastRecordOffset());
domainItem.setHashCode((int)metadata.getDomainHash());
domainItem.setQueuedCount(metadata.getQueuedItemCount());
return domainItem;
}
/**********************************************************************/
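/**
 * command line entry point. usage:
 *
 *   CrawlList dump <dataDir> <listId> <outputFile>
 */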
public static void main(String[] args) throws IOException {
if (args.length >= 4 && args[0].equalsIgnoreCase("dump")) {
File dataDir = new File(args[1]);
long listId = Long.parseLong(args[2]);
File outputPath = new File(args[3]);
dumpUnCrawledItems(dataDir,listId,outputPath,true);
}
else {
System.out.println("Usage: CrawlList dump <dataDir> <listId> <outputFile>");
}
}
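/**
 * dump every item in the list that still needs crawling - uncrawled items,
 * failed items, and items whose final (or post-redirect) http result was
 * neither 200 nor 404 - as json. illustrative output shape (not a schema):
 *
 * <pre>
 * { "urls" : [
 *     { "url": "...", "redirected": false, "lastStatus": "HTTP-500", "updateTime": 1234567890 }
 * ] }
 * </pre>
 *
 * note: the includeRobotsExcludedItems flag is accepted but not currently
 * consulted by the implementation.
 */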
public static void dumpUnCrawledItems(File dataDir,long listId,File outputFilePath, boolean includeRobotsExcludedItems) throws IOException {
File fixedDataFile = new File(dataDir,LIST_VALUE_MAP_PREFIX + Long.toString(listId));
File variableDataFile = new File(dataDir,LIST_STRING_MAP_PREFIX + Long.toString(listId));
LOG.info("FixedDataFile is:" + fixedDataFile);
LOG.info("VariableDataFile is:" + variableDataFile);
RandomAccessFile fixedDataReader = new RandomAccessFile(fixedDataFile, "r");
RandomAccessFile stringDataReader = new RandomAccessFile(variableDataFile, "r");
JsonWriter writer = new JsonWriter(new BufferedWriter(new FileWriter(outputFilePath),1024*1024*10));
writer.setIndent(" ");
try {
writer.beginObject();
writer.name("urls");
writer.beginArray();
try {
OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
URLFP fingerprint = new URLFP();
while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
item.deserialize(fixedDataReader);
// seek to string data
stringDataReader.seek(item._stringsOffset);
// and skip buffer length
WritableUtils.readVInt(stringDataReader);
// and read primary string
String url = stringDataReader.readUTF();
// setup fingerprint
fingerprint.setDomainHash(item._domainHash);
fingerprint.setUrlHash(item._urlFingerprint);
// any item that has not been crawled needs to be queued
boolean queueItem = !item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);
// if item is not queued, check to see if we need to retry the item
if (!queueItem && item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
queueItem = (item._redirectStatus != 0);
if (!queueItem) {
if (item._redirectHttpResult != 200 && item._redirectHttpResult != 404) {
queueItem = true;
}
}
}
else {
queueItem = (item._crawlStatus != 0);
if (!queueItem) {
if (item._httpResultCode != 200 && item._httpResultCode != 404) {
queueItem = true;
}
}
}
}
if (queueItem) {
// ok if queue item is set ...
writer.beginObject();
writer.name("url");
writer.value(url);
writer.name("redirected");
writer.value(item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS));
writer.name("lastStatus");
if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
if (item._redirectStatus == 0) {
writer.value("HTTP-" + item._redirectHttpResult);
}
else {
writer.value(CrawlURL.FailureReason.toString(item._redirectHttpResult));
}
}
else {
if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
if (item._crawlStatus == 0) {
writer.value("HTTP-" + item._httpResultCode);
}
else {
writer.value(CrawlURL.FailureReason.toString(item._crawlStatus));
}
}
else {
writer.value("UNCRAWLED");
}
}
writer.name("updateTime");
writer.value(item._updateTimestamp);
writer.endObject();
}
}
}
catch (IOException e) {
LOG.error("Encountered Exception Queueing Items for List:" + listId + " Exception:" + CCStringUtils.stringifyException(e));
}
finally {
fixedDataReader.close();
stringDataReader.close();
}
writer.endArray();
writer.endObject();
}
catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
throw new IOException(e);
}
finally {
writer.flush();
writer.close();
}
}
}