/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.commoncrawl.service.crawler;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.record.Buffer;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.BulkItemHistoryQuery;
import org.commoncrawl.protocol.BulkItemHistoryQueryResponse;
import org.commoncrawl.protocol.CrawlSegmentURLFP;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.rpc.base.internal.AsyncRequest;
import org.commoncrawl.rpc.base.internal.AsyncRequest.Callback;
import org.commoncrawl.rpc.base.internal.AsyncRequest.Status;
import org.commoncrawl.util.BloomCalculations;
import org.commoncrawl.util.ImmutableBuffer;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.BitUtils.BitStream;
import org.commoncrawl.util.BitUtils.BitStreamReader;
/**
* A transaction log that tracks crawl progress within a single crawl segment
*
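* <p>A minimal usage sketch (hypothetical paths and ids, assuming the caller drives the
* sync / complete / checkpoint cycle from the crawler's loader thread):</p>
*
* <pre>{@code
* CrawlSegmentLog log = new CrawlSegmentLog(new File("/data/crawler"), 1, 42, "node01");
* log.syncToLog(fpMap, cancelCheck);   // reconcile against local log + history server
* log.completeItem(crawlURL);          // record each URL as it completes
* log.checkpointLocalLog();            // periodically roll the local log
* }</pre>
*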
* @author rana
*
*/
public final class CrawlSegmentLog {
private static final int DEFAULT_LOGITEM_LIST_SIZE = 100;
public static final Log LOG = LogFactory.getLog(CrawlSegmentLog.class);
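/**
* In-memory fingerprint map for a crawl segment: the raw VLong-encoded
* (domainHash, urlHash) pairs plus two bloom filters, one marking fingerprints that
* belong to the segment and one marking fingerprints that have already been crawled.
*
* <p>A rough sketch of how it is populated and queried (the byte buffer is assumed to
* hold WritableUtils.writeVLong pairs, as consumed by setURLFPBuffer below):</p>
*
* <pre>{@code
* CrawlSegmentFPMap map = new CrawlSegmentFPMap();
* map.setURLFPBuffer(urlCount, encodedPairs, encodedLength);
* if (map.isValidSegmentURL(urlfp) && !map.wasCrawled(urlfp)) {
*   map.setCrawled(urlfp);
*   map._urlsComplete++;
* }
* }</pre>
*/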
public static class CrawlSegmentFPMap {
public int _urlCount = 0;
public int _urlsComplete = 0;
private byte[] _urlfpBuffer = null;
private int _urlfpBufferSize = 0;
private URLFPBloomFilter _validFingerprintsBloomFilter = null;
private URLFPBloomFilter _crawledItemsBloomFilter = null;
public void setURLFPBuffer(int segmentURLCount,byte[] data,int length)throws IOException {
_urlCount = segmentURLCount;
_urlfpBuffer = data;
_urlfpBufferSize = length;
// initialize the bloom filters
_validFingerprintsBloomFilter = new URLFPBloomFilter(segmentURLCount*2, BloomCalculations.computeBestK(11), 11);
_crawledItemsBloomFilter = new URLFPBloomFilter(segmentURLCount*2, BloomCalculations.computeBestK(11), 11);
// populate valid items filter ...
DataInputBuffer inputBuffer = getURLFPAsStream();
URLFPV2 urlfp = new URLFPV2();
while (inputBuffer.available() != 0) {
urlfp.setDomainHash(WritableUtils.readVLong(inputBuffer));
urlfp.setUrlHash(WritableUtils.readVLong(inputBuffer));
_validFingerprintsBloomFilter.add(urlfp);
}
}
public DataInputBuffer getURLFPAsStream()throws IOException {
if (_urlfpBuffer != null && _urlfpBufferSize != 0) {
DataInputBuffer dataInputBuffer = new DataInputBuffer();
dataInputBuffer.reset(_urlfpBuffer, _urlfpBufferSize);
return dataInputBuffer;
}
else {
throw new IOException("URLFPBuffer Not Initialized!");
}
}
public Buffer getURLFPAsBuffer()throws IOException {
if (_urlfpBuffer != null && _urlfpBufferSize != 0) {
return new Buffer(_urlfpBuffer,0, _urlfpBufferSize);
}
else {
throw new IOException("URLFPBuffer Not Initialized!");
}
}
public boolean wasCrawled(URLFPV2 urlfp) {
return _crawledItemsBloomFilter.isPresent(urlfp);
}
public void setCrawled(URLFPV2 urlfp) {
_crawledItemsBloomFilter.add(urlfp);
}
public boolean isValidSegmentURL(URLFPV2 urlfp) {
return _validFingerprintsBloomFilter.isPresent(urlfp);
}
}
File _rootDataDir;
int _listId;
int _segmentId;
int _localLogItemCount;
int _checkpointItemCount;
int _remainingURLS;
String _nodeName;
boolean _segmentComplete;
boolean _urlCountValid;
LinkedList<LogItemBuffer> _buffers = new LinkedList<LogItemBuffer>();
public CrawlSegmentLog(File rootDataDirectory,int listId,int segmentId,String nodeName) {
_rootDataDir = rootDataDirectory;
_listId = listId;
_segmentId = segmentId;
_remainingURLS = 0;
_localLogItemCount = 0;
_checkpointItemCount = 0;
_nodeName = nodeName;
_segmentComplete = false;
_urlCountValid = false;
}
/** get the node name **/
public String getNodeName() {
return _nodeName;
}
/** get the list this segment log is associated with **/
public int getListId() {
return _listId;
}
/** check and see if this segment is complete **/
public synchronized boolean isSegmentComplete() {
return _segmentComplete;
}
public synchronized boolean isURLCountValid() {
return _urlCountValid;
}
public static void insetFPIntoArray(ArrayList<CrawlSegmentURLFP> vector,CrawlSegmentURLFP targetfp) {
int insertionPos = findInsertionPosForFP(vector,targetfp.getUrlFP());
if (insertionPos == -1){
vector.add(0, targetfp);
}
else {
if (vector.get(insertionPos).getUrlFP() != targetfp.getUrlFP()) {
vector.add(insertionPos+1,targetfp);
}
}
}
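/**
* Binary search over a fingerprint-sorted list. Returns the index of a matching entry,
* or the index of the largest entry less than the target, or -1 when the target is
* smaller than every entry (the caller then inserts at position 0).
*
* <p>For example, over fingerprints [3, 7, 9] a target of 8 returns index 1 (the entry
* holding 7), so the caller inserts the new entry at index 2.</p>
*/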
public static int findInsertionPosForFP(ArrayList<CrawlSegmentURLFP> vector,long targetfp) {
int low = 0;
int high = vector.size() - 1;
while (low <= high) {
int mid = low + ((high - low) / 2);
CrawlSegmentURLFP urlfp = vector.get(mid);
int compareResult = (urlfp.getUrlFP() < targetfp) ? -1 : (urlfp.getUrlFP() > targetfp) ? 1 : 0;
if (compareResult > 0) {
high = mid - 1;
}
else if (compareResult < 0) {
low = mid + 1;
}
else {
return mid;
}
}
return high;
}
private static void updateFPMapFromBulkQueryResponse(CrawlSegmentFPMap segmentDetail,BulkItemHistoryQueryResponse queryResponse) throws IOException {
BitStream bitStream = new BitStream(queryResponse.getResponseList().getReadOnlyBytes(),queryResponse.getResponseList().getCount()*8);
BitStreamReader reader = new BitStreamReader(bitStream);
int updatedItemCount = 0;
int processedItemCount = 0;
// ok walk entire urlfp stream (prepopulated from crawl segment)
DataInputBuffer inputBuffer = segmentDetail.getURLFPAsStream();
URLFPV2 urlfp = new URLFPV2();
while (inputBuffer.available() != 0) {
urlfp.setDomainHash(WritableUtils.readVLong(inputBuffer));
urlfp.setUrlHash(WritableUtils.readVLong(inputBuffer));
processedItemCount++;
// check to see what history server says about the item ...
if (reader.getbit() == 1) {
// if it indicates this item was crawled, update the bloom filter ...
segmentDetail.setCrawled(urlfp);
updatedItemCount++;
// and update urls complete ...
segmentDetail._urlsComplete++;
}
else {
// otherwise, check the local bloom filter to see if it was crawled prior to the last checkpoint with the history server
if (segmentDetail.wasCrawled(urlfp)) {
// if so, update urls complete
segmentDetail._urlsComplete++;
}
}
}
// if (Environment.detailLogEnabled())
LOG.info("###SYNC: Reconciled FPMap with Query Response. "
+ " URLCount:" + segmentDetail._urlCount
+ " Complete:" + segmentDetail._urlsComplete
+ " Items Changed:" + updatedItemCount);
}
private static BulkItemHistoryQuery buildHistoryQueryBufferFromMap(CrawlSegmentFPMap segmentDetail) throws IOException {
// create a bulk item query message ...
BulkItemHistoryQuery query = new BulkItemHistoryQuery();
// get the entire urlfp stream from segmentFPMap and set it in the message
query.setFingerprintList(segmentDetail.getURLFPAsBuffer());
return query;
}
/** sync the incoming segment against the local crawl log and then reconcile it against the history server **/
public int syncToLog(CrawlSegmentFPMap segmentDetail,SegmentLoader.CancelOperationCallback cancelCheck) throws IOException {
if (Environment.detailLogEnabled())
LOG.info("### SYNC: List:"+ _listId + " Segment:" + _segmentId +" Syncing Progress Log");
int itemsProcessed = 0;
// and construct a path to the local crawl segment directory ...
File activeLogPath = buildActivePath(_rootDataDir,_listId,_segmentId);
File checkpointLogPath = buildCheckpointPath(_rootDataDir,_listId, _segmentId);
// check if it exists ...
if (checkpointLogPath.exists()){
// log it ...
if (Environment.detailLogEnabled())
LOG.info("### SYNC: List:"+ _listId + " Segment:" + _segmentId +" Checkpoint Log Found");
// rename it as the active log ...
checkpointLogPath.renameTo(activeLogPath);
}
if (activeLogPath.exists()) {
// reconcile against active log (if it exists) ...
_localLogItemCount = reconcileLogFile(FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),new Path(activeLogPath.getAbsolutePath()),_listId,_segmentId,segmentDetail,null);
if (Environment.detailLogEnabled())
LOG.info("### SYNC: List:"+ _listId + " Segment:" + _segmentId +" Reconciled Local Log File with ProcessedItemCount:" + _localLogItemCount);
itemsProcessed += _localLogItemCount;
}
FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();
// first things first ... check to see if special completion log file exists in hdfs
Path hdfsSegmentCompletionLogPath =
new Path(CrawlEnvironment.getCrawlSegmentLogsDirectory() + "/" + getListId() + "/"
+ getSegmentId() + "/"
+ CrawlEnvironment.buildCrawlSegmentCompletionLogFileName(getNodeName()));
if (hdfs.exists(hdfsSegmentCompletionLogPath)) {
if (Environment.detailLogEnabled())
LOG.info("### SYNC: List:"+ _listId + " Segment:" + _segmentId +" Completion File Found. Marking Segment Complete");
// if the file exists then this segment has been crawled and uploaded already ...
// if active log file exists ... delete it ...
if (activeLogPath.exists())
activeLogPath.delete();
//reset local log item count ...
_localLogItemCount = 0;
itemsProcessed = -1;
// mark all urls in the segment as complete
segmentDetail._urlsComplete = segmentDetail._urlCount;
}
else {
if (segmentDetail != null) {
int retryCount = 0;
final AtomicBoolean done = new AtomicBoolean();
while (!done.get() && !cancelCheck.cancelOperation()) {
retryCount++;
if (Environment.detailLogEnabled())
LOG.info("### SYNC: Building BulkItem History Query for List:"+ _listId + " Segment:" + _segmentId + " Attempt#:" + retryCount);
BulkItemHistoryQuery query = buildHistoryQueryBufferFromMap(segmentDetail);
if (query != null) {
// create blocking semaphore ...
final Semaphore semaphore = new Semaphore(0);
if (Environment.detailLogEnabled())
LOG.info("### SYNC: Dispatching query to history server");
//create an outer response object we can pass the async response to ...
final BulkItemHistoryQueryResponse outerResponse = new BulkItemHistoryQueryResponse();
CrawlerServer.getServer().getHistoryServiceStub().bulkItemQuery(query, new Callback<BulkItemHistoryQuery, BulkItemHistoryQueryResponse>() {
@Override
public void requestComplete(final AsyncRequest<BulkItemHistoryQuery, BulkItemHistoryQueryResponse> request) {
try {
// response returns in async thread context ...
if (request.getStatus() == Status.Success) {
if (Environment.detailLogEnabled())
LOG.info("###SYNC: bulk Query to history server succeeded. setting out resposne");
ImmutableBuffer buffer = request.getOutput().getResponseList();
outerResponse.setResponseList(new Buffer(buffer.getReadOnlyBytes(),0,buffer.getCount()));
done.set(true);
}
else {
LOG.error("###SYNC: bulk Query to history server failed. Sleeping for 10 seconds and then will retry");
try {
Thread.sleep(10000);
} catch (InterruptedException e) {
}
}
}
finally {
// release semaphore
semaphore.release();
}
}
});
LOG.info("###SYNC: Loader thread blocked waiting for bulk query response");
semaphore.acquireUninterruptibly();
LOG.info("###SYNC: Loader thread received response from history server");
if (outerResponse.getResponseList().getCount() == 0) {
LOG.error("###SYNC: History Server Bulk Query Returned NULL!!! for List:" + _listId + " Segment:" + _segmentId);
}
else {
// ok time to process the response and integrate the results into the fp list
updateFPMapFromBulkQueryResponse(segmentDetail,outerResponse);
}
done.set(true);
}
else {
if (Environment.detailLogEnabled())
LOG.warn("### SYNC: No fingerprints found when processing segment detail for List:"+ _listId + " Segment:" + _segmentId);
segmentDetail._urlsComplete = segmentDetail._urlCount;
done.set(true);
}
}
}
}
if (segmentDetail != null) {
_remainingURLS += (segmentDetail._urlCount - segmentDetail._urlsComplete);
// mark url count as valid now ...
_urlCountValid = true;
// now if remaining url count is zero ... then mark the segment as complete ...
if(_remainingURLS == 0 && _localLogItemCount == 0) {
_segmentComplete = true;
}
}
if (Environment.detailLogEnabled() && segmentDetail != null)
LOG.info("### SYNC: List:"+ _listId + " Segment:" + _segmentId +" Done Syncing Progress Log TotalURLS:" + segmentDetail._urlCount +" RemainingURLS:" + _remainingURLS + " LocalLogItemCount:" + _localLogItemCount);
return itemsProcessed;
}
/** append a CrawlURL item to the log **/
public void completeItem(CrawlURL urlItem) {
LogItem item = new LogItem();
item._hostFP = urlItem.getHostFP();
item._itemFP = urlItem.getFingerprint();
item._urlData = urlItem;
getAvailableBuffer().appendItem(item);
if ((item._urlData.getFlags() & CrawlURL.Flags.IsRobotsURL) == 0) {
// now check to see if item was redirected ...
if ((item._urlData.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
// if so, check last attempt reason
if (item._urlData.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS && item._urlData.isFieldDirty(CrawlURL.Field_REDIRECTURL)) {
String redirectURL = item._urlData.getRedirectURL();
// attempt to generate a fingerprint for the redirected url ...
URLFPV2 fingerprint = URLUtils.getURLFPV2FromURL(redirectURL);
if (fingerprint != null) {
// append a redirect item
item = new LogItem();
item._hostFP = fingerprint.getDomainHash();
item._itemFP = fingerprint.getUrlHash();
item._urlData = urlItem;
item._writeToCrawLog = false;
getAvailableBuffer().appendItem(item);
}
}
}
}
// reduce remaining url count
--_remainingURLS;
// and increment local log item count ...
++_localLogItemCount;
}
public void purgeLocalFiles() throws IOException {
File activePath = buildActivePath(_rootDataDir,_listId,getSegmentId());
File checkpointPath = buildCheckpointPath(_rootDataDir,_listId,getSegmentId());
if (activePath.exists())
activePath.delete();
if (checkpointPath.exists())
checkpointPath.delete();
}
/** checkpoint log file **/
public void checkpointLocalLog() throws IOException {
File activePath = buildActivePath(_rootDataDir,_listId,getSegmentId());
File checkpointPath = buildCheckpointPath(_rootDataDir,_listId,getSegmentId());
// capture local log item count ...
_checkpointItemCount = _localLogItemCount;
checkpointPath.delete();
// rename active path to check point path ...
activePath.renameTo(checkpointPath);
// and recreate log ..
initializeLogFile(activePath);
}
void finalizeCheckpoint() {
File checkpointLogFile= buildCheckpointPath(_rootDataDir,_listId, _segmentId);
// delete local checkpoint log file ...
checkpointLogFile.delete();
// and reduce local log item count by checkpoint amount ...
_localLogItemCount -= _checkpointItemCount;
//reset checkpoint item count ...
_checkpointItemCount = 0;
if (isURLCountValid()) {
LOG.info("finalizeCheckpoint for Segment:" + _segmentId + " List: " + _listId + " Remaining:" + _remainingURLS + " LocalLogItemCount:" + _localLogItemCount);
// now finally ... if remaining url count is zero and local log item count is zero as well...
if (_remainingURLS == 0 && _localLogItemCount == 0) {
LOG.info("CrawlSegment ListId:" + _listId + " Segment:" + _segmentId + " Marked as Complete During CrawlSegmentLog Checkpoint");
// then mark the segment as complete ...
_segmentComplete = true;
}
}
}
void abortCheckpoint() {
File activeLogFile = buildActivePath(_rootDataDir,_listId, _segmentId);
File checkpointLogFile = buildCheckpointPath(_rootDataDir,_listId, _segmentId);
checkpointLogFile.renameTo(activeLogFile);
//reset checkpoint item count ...
_checkpointItemCount = 0;
}
/** ensure paths **/
private static void ensurePaths(File rootDirectory) {
File crawlDataDir = new File(rootDirectory,CrawlEnvironment.getCrawlerLocalOutputPath());
if (!crawlDataDir.exists()) {
crawlDataDir.mkdir();
}
}
public static void initializeLogFile(File activeLogFilePath) throws IOException {
if (!activeLogFilePath.exists()) {
writeHeader(activeLogFilePath,0);
}
}
public void purgeActiveLog()throws IOException {
File activeLogFilePath = buildActivePath(_rootDataDir,_listId, _segmentId);
if (activeLogFilePath.exists())
activeLogFilePath.delete();
initializeLogFile(activeLogFilePath);
}
/** get active log file path given segment id **/
public static File buildActivePath(File rootDirectory,int listId,int segmentId) {
// and construct a path to the local crawl segment directory ...
File crawlDataDir = new File(rootDirectory,CrawlEnvironment.getCrawlerLocalOutputPath());
// list directory ...
File listDir = new File(crawlDataDir,Integer.toString(listId));
if (!listDir.exists()) {
listDir.mkdirs();
}
// append the segment id to the path ...
return new File(listDir,Integer.toString(segmentId) + "_" + CrawlEnvironment.ActiveSegmentLog);
}
/** get checkpoint log file path given segment id **/
public static File buildCheckpointPath(File rootDirectory,int listId,int segmentId) {
// and construct a path to the local crawl segment directory ...
File crawlDataDir = new File(rootDirectory,CrawlEnvironment.getCrawlerLocalOutputPath());
// list directory ...
File listDir = new File(crawlDataDir,Integer.toString(listId));
if (!listDir.exists()) {
listDir.mkdirs();
}
// append the segment id to the path ...
return new File(listDir,Integer.toString(segmentId) + "_" + CrawlEnvironment.CheckpointSegmentLog);
}
/** get segment id of associated segment **/
public int getSegmentId() {
return _segmentId;
}
/** flush and add all pending buffers into the passed in list **/
public void flushLog(LinkedList<LogItemBuffer> collector) {
for (LogItemBuffer buffer : _buffers) {
if (buffer.getItemCount() != 0 ) {
collector.addLast(buffer);
}
}
_buffers.clear();
_buffers.addFirst(new LogItemBuffer(getListId(),getSegmentId()));
}
private LogItemBuffer getAvailableBuffer() {
if (_buffers.isEmpty() || !_buffers.getFirst().spaceAvailable()) {
_buffers.addFirst(new LogItemBuffer(getListId(),getSegmentId()));
}
return _buffers.getFirst();
}
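/**
* A single progress-log entry. On disk each entry occupies a fixed 20 bytes, written
* big-endian by LogItemBuffer.flushToDisk and decoded by LogItemBuffer.loadFromStream:
*
* <pre>
*   bytes  0-7   hostFP   (long)  domain hash of the URL
*   bytes  8-15  itemFP   (long)  url hash of the URL
*   bytes 16-19  position (int)   ordinal position hint within the flush
* </pre>
*/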
static class LogItem implements Comparable<LogItem> {
public static final int ItemSize_Bytes = 20; // hostFP(long) + itemFP(long) + position(int)
// Comparable Implementation
public int compareTo(LogItem otherItem) {
if (_hostFP < otherItem._hostFP )
return -1;
else if (_hostFP > otherItem._hostFP)
return 1;
else {
if (_itemFP < otherItem._itemFP)
return -1;
else if (_itemFP > otherItem._itemFP)
return 1;
else
return 0;
}
}
public boolean _writeToCrawLog = true;
public long _hostFP;
public long _itemFP;
public CrawlURL _urlData;
}
static class LogItemBuffer {
private int _listId;
private int _segmentId;
private LogItem[] _itemsArray = null;
private int _itemCount;
public LogItemBuffer(int listId,int segmentId) {
_listId = listId;
_segmentId = segmentId;
_itemCount = 0;
_itemsArray = new LogItem[DEFAULT_LOGITEM_LIST_SIZE];
}
public int getListId() { return _listId; }
public int getSegmentId() { return _segmentId; }
public LogItem[] getItems() { return _itemsArray; }
public int getItemCount() { return _itemCount; }
public void appendItem(LogItem item) {
if (_itemsArray ==null || _itemCount == _itemsArray.length) {
throw new RuntimeException("Invalid call to append item");
}
_itemsArray[_itemCount++] = item;
}
public boolean spaceAvailable() {
return (_itemsArray != null && _itemCount < _itemsArray.length);
}
public static interface CrawlURLWriter {
void writeItemCount(int entryCount) throws IOException ;
void writeItem(CrawlURL url) throws IOException ;
}
public int flushToDisk(int startingItemPosition,CrawlURLWriter urlWriter,DataOutputStream segmentLogStream,DataOutputStream historyLog) throws IOException {
// write out entry count first ...
urlWriter.writeItemCount(_itemCount);
for (int i=0;i<_itemCount;++i) {
if (_itemsArray[i]._writeToCrawLog) {
// write url data ...
urlWriter.writeItem(_itemsArray[i]._urlData);
}
CrawlURL urlObject = _itemsArray[i]._urlData;
// if not in the parse queue ...
if ((urlObject.getFlags() & CrawlURL.Flags.InParseQueue) == 0) {
if (segmentLogStream != null) {
// and write out segment log info ...
segmentLogStream.writeLong(_itemsArray[i]._hostFP);
segmentLogStream.writeLong(_itemsArray[i]._itemFP);
segmentLogStream.writeInt(startingItemPosition + i);
}
if (historyLog != null) {
URLFPV2 fp = URLUtils.getURLFPV2FromURL(urlObject.getUrl());
if (fp != null) {
// write original url to history log ...
fp.write(historyLog);
}
// if redirected ...
if ((_itemsArray[i]._urlData.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
// calc fingerprint for url ...
fp = URLUtils.getURLFPV2FromURL(urlObject.getRedirectURL());
if (fp != null) {
// write redirect fingerprint to history log ...
fp.write(historyLog);
}
}
}
}
_itemsArray[i]._urlData.clear();
_itemsArray[i]._urlData = null;
_itemsArray[i] = null;
}
return _itemCount;
}
public void loadFromStream(byte[] readBuffer, int itemCount) {
_itemCount = itemCount;
if (_itemsArray == null || _itemsArray.length < itemCount) {
// reallocate array ...
_itemsArray = new LogItem[_itemCount];
}
int bytePosition = 0;
ByteArrayInputStream inputStream = new ByteArrayInputStream(readBuffer);
DataInputStream dataInputStream = new DataInputStream(inputStream);
for (int i=0;i<_itemCount;++i) {
LogItem item = new LogItem();
item._hostFP = (((long)readBuffer[bytePosition++] << 56) +
((long)(readBuffer[bytePosition++] & 255) << 48) +
((long)(readBuffer[bytePosition++] & 255) << 40) +
((long)(readBuffer[bytePosition++] & 255) << 32) +
((long)(readBuffer[bytePosition++] & 255) << 24) +
((readBuffer[bytePosition++] & 255) << 16) +
((readBuffer[bytePosition++] & 255) << 8) +
((readBuffer[bytePosition++] & 255) << 0));
item._itemFP = (((long)readBuffer[bytePosition++] << 56) +
((long)(readBuffer[bytePosition++] & 255) << 48) +
((long)(readBuffer[bytePosition++] & 255) << 40) +
((long)(readBuffer[bytePosition++] & 255) << 32) +
((long)(readBuffer[bytePosition++] & 255) << 24) +
((readBuffer[bytePosition++] & 255) << 16) +
((readBuffer[bytePosition++] & 255) << 8) +
((readBuffer[bytePosition++] & 255) << 0));
// skip position hint...
bytePosition += 4;
_itemsArray[i] = item;
}
}
}
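/**
* Progress-log file layout: an 8-byte header (the 4-byte magic LogFileHeaderBytes
* followed by a 4-byte record count) and then a sequence of fixed-size LogItem records
* of LogItem.ItemSize_Bytes each, as written by writeHeader and flushToDisk.
*/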
public static int getHeaderSize() {
return 8;
}
public static int readerHeader(File logFilePath) throws IOException {
int recordCount = 0;
FileInputStream stream = new FileInputStream(logFilePath);
try {
DataInputStream reader = new DataInputStream(stream);
recordCount = readHeader(reader);
}
finally {
stream.close();
}
return recordCount;
}
public static int readHeader(DataInputStream reader) throws IOException {
reader.skipBytes(4);
return reader.readInt();
}
public static final int LogFileHeaderBytes = 0xCC00CC00;
public static void writeHeader(File logFilePath,int recordCount) throws IOException {
RandomAccessFile stream = new RandomAccessFile(logFilePath,"rw");
try {
stream.seek(0);
stream.writeInt(LogFileHeaderBytes);
stream.writeInt(recordCount);
}
finally {
// stream.getFD().sync();
stream.close();
}
}
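/**
* Replays a local progress log against a CrawlSegmentFPMap, marking each logged
* fingerprint as crawled, and optionally appends the raw records to a consolidation
* file. Returns the number of log entries that matched the segment; passing a null
* segment makes this a dry run that only consolidates.
*
* <p>A rough invocation sketch, assuming a local file system and an existing active
* log (the path shown is hypothetical):</p>
*
* <pre>{@code
* FileSystem localFs = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());
* int crawled = CrawlSegmentLog.reconcileLogFile(
*     localFs, new Path("/data/crawler/1/42_activeLog"), 1, 42, fpMap, null);
* }</pre>
*/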
public static int reconcileLogFile(FileSystem fs,Path logFilePath,int listId,int segmentId,CrawlSegmentFPMap segment,File consolidationFile)throws IOException {
RandomAccessFile consolidationStream = null;
int consolidationFileItemCount = 0;
if (consolidationFile != null) {
consolidationStream = new RandomAccessFile(consolidationFile,"rw");
consolidationFileItemCount = readerHeader(consolidationFile);
consolidationStream.seek(consolidationStream.length());
}
int processedItemCount = 0;
FSDataInputStream hdfsInputStream = null;
try {
// get the file size on disk
long fileSize = fs.getFileStatus(logFilePath).getLen();
// allocate an array that can hold up to the list size of items ...
byte[] buffer = new byte[DEFAULT_LOGITEM_LIST_SIZE * LogItem.ItemSize_Bytes];
// calculate item count
int totalItemCount = (int)( (fileSize - getHeaderSize()) / LogItem.ItemSize_Bytes);
// get a reader ...
hdfsInputStream = fs.open(logFilePath);
int headerItemCount = readHeader(hdfsInputStream);
if (headerItemCount != totalItemCount) {
LOG.warn("CrawlSegmentLog - header item count for log file:" + logFilePath.toString() + " is:"+ headerItemCount + " file size indicates:" + totalItemCount);
totalItemCount = headerItemCount;
}
int remainingItemCount = totalItemCount;
LogItemBuffer itemList = new LogItemBuffer(listId,segmentId);
while (remainingItemCount != 0) {
int blockItemCount = Math.min(remainingItemCount,DEFAULT_LOGITEM_LIST_SIZE);
// and read the data (readFully, since a plain read may return fewer bytes than requested)
hdfsInputStream.readFully(buffer,0,blockItemCount * LogItem.ItemSize_Bytes);
// and if consolidation stream is valid ...
if (consolidationStream != null) {
// add entries to that stream ...
consolidationStream.write(buffer,0,(int)blockItemCount * LogItem.ItemSize_Bytes);
}
// if not a dry run...
if (segment != null) {
// populate the item list
itemList.loadFromStream(buffer,blockItemCount);
// reconcile the list against the segment
processedItemCount += reconcileItemList(itemList,segment);
}
// reduce item count
remainingItemCount -= blockItemCount;
}
// finally if consolidation stream is valid ...
if (consolidationStream != null) {
// update the file's header ..
writeHeader(consolidationFile, consolidationFileItemCount +totalItemCount );
}
}
finally {
if (consolidationStream != null) {
consolidationStream.close();
}
if (hdfsInputStream != null) {
hdfsInputStream.close();
}
}
return processedItemCount;
}
public static interface LogFileItemCallback {
public void processItem(long domainHash,long urlFingerprint);
}
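/**
* Streams every (domainHash, urlFingerprint) pair out of a progress log without
* materializing the URL data. A small sketch (the log path is hypothetical):
*
* <pre>{@code
* walkFingerprintsInLogFile(fs, new Path("/data/crawler/1/42_activeLog"),
*     new LogFileItemCallback() {
*       public void processItem(long domainHash, long urlFingerprint) {
*         LOG.info("fp:" + domainHash + ":" + urlFingerprint);
*       }
*     });
* }</pre>
*/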
public static void walkFingerprintsInLogFile(FileSystem fs,Path logFilePath,LogFileItemCallback callback)throws IOException {
FSDataInputStream hdfsInputStream = null;
try {
// get the file size on disk
long fileSize = fs.getFileStatus(logFilePath).getLen();
// allocate an array that can hold up to the list size of items ...
byte[] buffer = new byte[DEFAULT_LOGITEM_LIST_SIZE * LogItem.ItemSize_Bytes];
// calculate item count
int totalItemCount = (int)( (fileSize - getHeaderSize()) / LogItem.ItemSize_Bytes);
// get a reader ...
hdfsInputStream = fs.open(logFilePath);
int headerItemCount = readHeader(hdfsInputStream);
if (headerItemCount != totalItemCount) {
LOG.warn("CrawlSegmentLog - header item count for log file:" + logFilePath.toString() + " is:"+ headerItemCount + " file size indicates:" + totalItemCount);
totalItemCount = headerItemCount;
}
int remainingItemCount = totalItemCount;
LogItemBuffer itemList = new LogItemBuffer(0,0);
while (remainingItemCount != 0) {
int blockItemCount = Math.min(remainingItemCount,DEFAULT_LOGITEM_LIST_SIZE);
// and read the data (readFully, since a plain read may return fewer bytes than requested)
hdfsInputStream.readFully(buffer,0,blockItemCount * LogItem.ItemSize_Bytes);
// populate the item list
itemList.loadFromStream(buffer,blockItemCount);
// walk items in list
for (int i=0;i<itemList.getItemCount();++i) {
LogItem item = itemList.getItems()[i];
callback.processItem(item._hostFP,item._itemFP);
}
// reduce item count
remainingItemCount -= blockItemCount;
}
}
finally {
if (hdfsInputStream != null) {
hdfsInputStream.close();
}
}
}
public static int reconcileItemList(LogItemBuffer itemList,CrawlSegmentFPMap segment) {
int processedItemCount = 0;
URLFPV2 urlfp = new URLFPV2();
// and now walk segment and list consolidating segment as we go along ...
for (int i=0;i<itemList.getItemCount();++i) {
LogItem item = itemList.getItems()[i];
urlfp.setDomainHash(item._hostFP);
urlfp.setUrlHash(item._itemFP);
if (segment.isValidSegmentURL(urlfp)) {
//update local bloom filter ...
segment.setCrawled(urlfp);
// increment processed item count
processedItemCount++;
}
}
return processedItemCount;
}
}