/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.commoncrawl.service.crawler;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.record.Buffer;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.BulkItemHistoryQuery;
import org.commoncrawl.protocol.BulkItemHistoryQueryResponse;
import org.commoncrawl.protocol.CrawlSegmentURLFP;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.rpc.base.internal.AsyncRequest;
import org.commoncrawl.rpc.base.internal.AsyncRequest.Callback;
import org.commoncrawl.rpc.base.internal.AsyncRequest.Status;
import org.commoncrawl.util.BloomCalculations;
import org.commoncrawl.util.ImmutableBuffer;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.BitUtils.BitStream;
import org.commoncrawl.util.BitUtils.BitStreamReader;

/**
 * A transaction log that tracks crawl progress within a single crawl segment
 *
 * @author rana
 *
 */
public final class CrawlSegmentLog {

  private static final int DEFAULT_LOGITEM_LIST_SIZE = 100;

  public static final Log LOG = LogFactory.getLog(CrawlSegmentLog.class);

  public static class CrawlSegmentFPMap {

    public int _urlCount = 0;
    public int _urlsComplete = 0;

    private byte[] _urlfpBuffer = null;
    private int _urlfpBufferSize = 0;

    private URLFPBloomFilter _validFingerprintsBloomFilter = null;
    private URLFPBloomFilter _crawledItemsBloomFilter = null;

    public void setURLFPBuffer(int segmentURLCount, byte[] data, int length) throws IOException {
      _urlCount = segmentURLCount;
      _urlfpBuffer = data;
      _urlfpBufferSize = length;

      // initialize the bloom filters
      _validFingerprintsBloomFilter = new URLFPBloomFilter(segmentURLCount * 2, BloomCalculations.computeBestK(11), 11);
      _crawledItemsBloomFilter = new URLFPBloomFilter(segmentURLCount * 2, BloomCalculations.computeBestK(11), 11);

      // populate valid items filter ...
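      // (the fingerprint buffer is a packed stream of (domainHash, urlHash) pairs,
      // each value encoded via WritableUtils VLong encoding; both bloom filters
      // are provisioned for twice the segment's URL count)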
      DataInputBuffer inputBuffer = getURLFPAsStream();
      URLFPV2 urlfp = new URLFPV2();

      while (inputBuffer.available() != 0) {
        urlfp.setDomainHash(WritableUtils.readVLong(inputBuffer));
        urlfp.setUrlHash(WritableUtils.readVLong(inputBuffer));
        _validFingerprintsBloomFilter.add(urlfp);
      }
    }

    public DataInputBuffer getURLFPAsStream() throws IOException {
      if (_urlfpBuffer != null && _urlfpBufferSize != 0) {
        DataInputBuffer dataInputBuffer = new DataInputBuffer();
        dataInputBuffer.reset(_urlfpBuffer, _urlfpBufferSize);
        return dataInputBuffer;
      } else {
        throw new IOException("URLFPBuffer Not Initialized!");
      }
    }

    public Buffer getURLFPAsBuffer() throws IOException {
      if (_urlfpBuffer != null && _urlfpBufferSize != 0) {
        return new Buffer(_urlfpBuffer, 0, _urlfpBufferSize);
      } else {
        throw new IOException("URLFPBuffer Not Initialized!");
      }
    }

    public boolean wasCrawled(URLFPV2 urlfp) {
      return _crawledItemsBloomFilter.isPresent(urlfp);
    }

    public void setCrawled(URLFPV2 urlfp) {
      _crawledItemsBloomFilter.add(urlfp);
    }

    public boolean isValidSegmentURL(URLFPV2 urlfp) {
      return _validFingerprintsBloomFilter.isPresent(urlfp);
    }
  }

  File _rootDataDir;
  int _listId;
  int _segmentId;
  int _localLogItemCount;
  int _checkpointItemCount;
  int _remainingURLS;
  String _nodeName;
  boolean _segmentComplete;
  boolean _urlCountValid;

  LinkedList<LogItemBuffer> _buffers = new LinkedList<LogItemBuffer>();

  public CrawlSegmentLog(File rootDataDirectory, int listId, int segmentId, String nodeName) {
    _rootDataDir = rootDataDirectory;
    _listId = listId;
    _segmentId = segmentId;
    _remainingURLS = 0;
    _localLogItemCount = 0;
    _checkpointItemCount = 0;
    _nodeName = nodeName;
    _segmentComplete = false;
    _urlCountValid = false;
  }

  /** get the host name **/
  public String getNodeName() {
    return _nodeName;
  }

  /** get the list this segment log is associated with **/
  public int getListId() {
    return _listId;
  }

  /** check and see if this segment is complete **/
  public synchronized boolean isSegmentComplete() {
    return _segmentComplete;
  }

  public synchronized boolean isURLCountValid() {
    return _urlCountValid;
  }

  public static void insetFPIntoArray(ArrayList<CrawlSegmentURLFP> vector, CrawlSegmentURLFP targetfp) {
    int insertionPos = findInsertionPosForFP(vector, targetfp.getUrlFP());
    if (insertionPos == -1) {
      vector.add(0, targetfp);
    } else {
      if (vector.get(insertionPos).getUrlFP() != targetfp.getUrlFP()) {
        vector.add(insertionPos + 1, targetfp);
      }
    }
  }

  public static int findInsertionPosForFP(ArrayList<CrawlSegmentURLFP> vector, long targetfp) {
    int low = 0;
    int high = vector.size() - 1;

    while (low <= high) {
      int mid = low + ((high - low) / 2);

      CrawlSegmentURLFP urlfp = vector.get(mid);

      int compareResult = (urlfp.getUrlFP() < targetfp) ? -1 : (urlfp.getUrlFP() > targetfp) ?
          1 : 0;

      if (compareResult > 0) {
        high = mid - 1;
      } else if (compareResult < 0) {
        low = mid + 1;
      } else {
        return mid;
      }
    }
    return high;
  }

  private static void updateFPMapFromBulkQueryResponse(CrawlSegmentFPMap segmentDetail, BulkItemHistoryQueryResponse queryResponse) throws IOException {

    BitStream bitStream = new BitStream(queryResponse.getResponseList().getReadOnlyBytes(), queryResponse.getResponseList().getCount() * 8);
    BitStreamReader reader = new BitStreamReader(bitStream);

    int updatedItemCount = 0;
    int processedItemCount = 0;

    // ok walk entire urlfp stream (prepopulated from crawl segment)
    DataInputBuffer inputBuffer = segmentDetail.getURLFPAsStream();
    URLFPV2 urlfp = new URLFPV2();

    while (inputBuffer.available() != 0) {
      urlfp.setDomainHash(WritableUtils.readVLong(inputBuffer));
      urlfp.setUrlHash(WritableUtils.readVLong(inputBuffer));

      processedItemCount++;

      // check to see what the history server says about the item ...
      if (reader.getbit() == 1) {
        // if it indicates this item was crawled, update the bloom filter ...
        segmentDetail.setCrawled(urlfp);
        updatedItemCount++;
        // and update urls complete ...
        segmentDetail._urlsComplete++;
      } else {
        // otherwise, tricky, but check the local bloom filter to see if it was crawled prior to the checkpoint with the history server
        if (segmentDetail.wasCrawled(urlfp)) {
          // if so, update urls complete
          segmentDetail._urlsComplete++;
        }
      }
    }

    // if (Environment.detailLogEnabled())
    LOG.info("###SYNC: Reconciled FPMap with Query Response. " + " URLCount:" + segmentDetail._urlCount + " Complete:" + segmentDetail._urlsComplete + " Items Changed:" + updatedItemCount);
  }

  private static BulkItemHistoryQuery buildHistoryQueryBufferFromMap(CrawlSegmentFPMap segmentDetail) throws IOException {
    // create a bulk item query message ...
    BulkItemHistoryQuery query = new BulkItemHistoryQuery();
    // get the entire urlfp stream from segmentFPMap and set it in the message
    query.setFingerprintList(segmentDetail.getURLFPAsBuffer());
    return query;
  }

  /** sync the incoming segment against the local crawl log and then send it up to the history server **/
  public int syncToLog(CrawlSegmentFPMap segmentDetail, SegmentLoader.CancelOperationCallback cancelCheck) throws IOException {
    if (Environment.detailLogEnabled())
      LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId + " Syncing Progress Log");

    int itemsProcessed = 0;

    // and construct a path to the local crawl segment directory ...
    File activeLogPath = buildActivePath(_rootDataDir, _listId, _segmentId);
    File checkpointLogPath = buildCheckpointPath(_rootDataDir, _listId, _segmentId);

    // check if it exists ...
    if (checkpointLogPath.exists()) {
      // log it ...
      if (Environment.detailLogEnabled())
        LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId + " Checkpoint Log Found");
      // rename it as the active log ...
      checkpointLogPath.renameTo(activeLogPath);
    }

    if (activeLogPath.exists()) {
      // reconcile against active log (if it exists) ...
      _localLogItemCount = reconcileLogFile(FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), new Path(activeLogPath.getAbsolutePath()), _listId, _segmentId, segmentDetail, null);

      if (Environment.detailLogEnabled())
        LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId + " Reconciled Local Log File with ProcessedItemCount:" + _localLogItemCount);

      itemsProcessed += _localLogItemCount;
    }

    FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();

    // first things first ...
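    // At this point any local (checkpoint/active) log has been replayed into the
    // FP map. What follows reconciles against the authoritative state: a per-node
    // completion marker in HDFS short-circuits the whole segment, otherwise a bulk
    // query is issued to the history server and its bit-vector response is folded
    // into the map (see updateFPMapFromBulkQueryResponse).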
    // check to see if the special completion log file exists in HDFS
    Path hdfsSegmentCompletionLogPath = new Path(CrawlEnvironment.getCrawlSegmentLogsDirectory() + "/" + getListId() + "/" + getSegmentId() + "/" + CrawlEnvironment.buildCrawlSegmentCompletionLogFileName(getNodeName()));

    if (hdfs.exists(hdfsSegmentCompletionLogPath)) {
      if (Environment.detailLogEnabled())
        LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId + " Completion File Found. Marking Segment Complete");

      // if the file exists then this segment has been crawled and uploaded already ...
      // if active log file exists ... delete it ...
      if (activeLogPath.exists())
        activeLogPath.delete();

      // reset local log item count ...
      _localLogItemCount = 0;
      itemsProcessed = -1;

      // remove all hosts from segment
      segmentDetail._urlsComplete = segmentDetail._urlCount;
    } else {
      if (segmentDetail != null) {
        int retryCount = 0;

        final AtomicBoolean done = new AtomicBoolean();

        while (!done.get() && !cancelCheck.cancelOperation()) {
          retryCount++;

          if (Environment.detailLogEnabled())
            LOG.info("### SYNC: Building BulkItem History Query for List:" + _listId + " Segment:" + _segmentId + " Attempt#:" + retryCount);

          BulkItemHistoryQuery query = buildHistoryQueryBufferFromMap(segmentDetail);

          if (query != null) {
            // create blocking semaphore ...
            final Semaphore semaphore = new Semaphore(0);

            if (Environment.detailLogEnabled())
              LOG.info("### SYNC: Dispatching query to history server");

            // create an outer response object we can pass the async response to ...
            final BulkItemHistoryQueryResponse outerResponse = new BulkItemHistoryQueryResponse();

            CrawlerServer.getServer().getHistoryServiceStub().bulkItemQuery(query, new Callback<BulkItemHistoryQuery, BulkItemHistoryQueryResponse>() {

              @Override
              public void requestComplete(final AsyncRequest<BulkItemHistoryQuery, BulkItemHistoryQueryResponse> request) {
                try {
                  // response returns in async thread context ...
                  if (request.getStatus() == Status.Success) {
                    if (Environment.detailLogEnabled())
                      LOG.info("###SYNC: bulk Query to history server succeeded. setting outer response");
                    ImmutableBuffer buffer = request.getOutput().getResponseList();
                    outerResponse.setResponseList(new Buffer(buffer.getReadOnlyBytes(), 0, buffer.getCount()));
                    done.set(true);
                  } else {
                    LOG.error("###SYNC: bulk Query to history server failed. Sleeping for 10 seconds and then will retry");
                    try {
                      Thread.sleep(10000);
                    } catch (InterruptedException e) {
                    }
                  }
                } finally {
                  // release semaphore
                  semaphore.release();
                }
              }
            });

            LOG.info("###SYNC: Loader thread blocked waiting for bulk query response");
            semaphore.acquireUninterruptibly();
            LOG.info("###SYNC: Loader thread received response from history server");

            if (outerResponse.getResponseList().getCount() == 0) {
              LOG.error("###SYNC: History Server Bulk Query Returned NULL!!! for List:" + _listId + " Segment:" + _segmentId);
            } else {
              // ok time to process the response and integrate the results into the fp list
              updateFPMapFromBulkQueryResponse(segmentDetail, outerResponse);
            }
            done.set(true);
          } else {
            if (Environment.detailLogEnabled())
              LOG.warn("### SYNC: No fingerprints found when processing segment detail for List:" + _listId + " Segment:" + _segmentId);
            segmentDetail._urlsComplete = segmentDetail._urlCount;
            done.set(true);
          }
        }
      }
    }

    if (segmentDetail != null) {
      _remainingURLS += (segmentDetail._urlCount - segmentDetail._urlsComplete);
      // mark url count as valid now ...
      _urlCountValid = true;

      // now if remaining url count is zero ... then mark the segment as complete ...
      if (_remainingURLS == 0 && _localLogItemCount == 0) {
        _segmentComplete = true;
      }
    }

    if (Environment.detailLogEnabled())
      LOG.info("### SYNC: List:" + _listId + " Segment:" + _segmentId + " Done Syncing Progress Log TotalURLS:" + segmentDetail._urlCount + " RemainingURLS:" + _remainingURLS + " LocalLogItemCount:" + _localLogItemCount);

    return itemsProcessed;
  }

  /** append a CrawlURL item to the log **/
  public void completeItem(CrawlURL urlItem) {
    LogItem item = new LogItem();

    item._hostFP = urlItem.getHostFP();
    item._itemFP = urlItem.getFingerprint();
    item._urlData = urlItem;

    getAvailableBuffer().appendItem(item);

    if ((item._urlData.getFlags() & CrawlURL.Flags.IsRobotsURL) == 0) {
      // now check to see if item was redirected ...
      if ((item._urlData.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
        // if so, check last attempt reason
        if (item._urlData.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS && item._urlData.isFieldDirty(CrawlURL.Field_REDIRECTURL)) {
          String redirectURL = item._urlData.getRedirectURL();
          // attempt to generate a fingerprint for the redirected url ...
          URLFPV2 fingerprint = URLUtils.getURLFPV2FromURL(redirectURL);
          if (fingerprint != null) {
            // append a redirect item
            item = new LogItem();

            item._hostFP = fingerprint.getDomainHash();
            item._itemFP = fingerprint.getUrlHash();
            item._urlData = urlItem;
            item._writeToCrawLog = false;

            getAvailableBuffer().appendItem(item);
          }
        }
      }
    }

    // reduce remaining url count
    --_remainingURLS;
    // and increment local log item count ...
    ++_localLogItemCount;
  }

  public void purgeLocalFiles() throws IOException {
    File activePath = buildActivePath(_rootDataDir, _listId, getSegmentId());
    File checkpointPath = buildCheckpointPath(_rootDataDir, _listId, getSegmentId());

    if (activePath.exists())
      activePath.delete();
    if (checkpointPath.exists())
      checkpointPath.delete();
  }

  /** checkpoint log file **/
  public void checkpointLocalLog() throws IOException {
    File activePath = buildActivePath(_rootDataDir, _listId, getSegmentId());
    File checkpointPath = buildCheckpointPath(_rootDataDir, _listId, getSegmentId());

    // capture local log item count ...
    _checkpointItemCount = _localLogItemCount;

    checkpointPath.delete();
    // rename active path to checkpoint path ...
    activePath.renameTo(checkpointPath);
    // and recreate the log ...
    initializeLogFile(activePath);
  }

  void finalizeCheckpoint() {
    File checkpointLogFile = buildCheckpointPath(_rootDataDir, _listId, _segmentId);
    // delete local checkpoint log file ...
    checkpointLogFile.delete();
    // and reduce local log item count by checkpoint amount ...
    _localLogItemCount -= _checkpointItemCount;
    // reset checkpoint item count ...
    _checkpointItemCount = 0;

    if (isURLCountValid()) {
      LOG.info("finalizeCheckpoint for Segment:" + _segmentId + " List: " + _listId + " Remaining:" + _remainingURLS + " LocalLogItemCount:" + _localLogItemCount);
      // now finally ... if remaining url count is zero and local log item count is zero as well ...
      if (_remainingURLS == 0 && _localLogItemCount == 0) {
        LOG.info("CrawlSegment ListId:" + _listId + " Segment:" + _segmentId + " Marked as Complete During CrawlSegmentLog Checkpoint");
        // then mark the segment as complete ...
        _segmentComplete = true;
      }
    }
  }

  void abortCheckpoint() {
    File activeLogFile = buildActivePath(_rootDataDir, _listId, _segmentId);
    File checkpointLogFile = buildCheckpointPath(_rootDataDir, _listId, _segmentId);
    checkpointLogFile.renameTo(activeLogFile);
    // reset checkpoint item count ...
    _checkpointItemCount = 0;
  }

  /** ensure paths **/
  private static void ensurePaths(File rootDirectory) {
    File crawlDataDir = new File(rootDirectory, CrawlEnvironment.getCrawlerLocalOutputPath());
    if (!crawlDataDir.exists()) {
      crawlDataDir.mkdir();
    }
  }

  public static void initializeLogFile(File activeLogFilePath) throws IOException {
    if (!activeLogFilePath.exists()) {
      writeHeader(activeLogFilePath, 0);
    }
  }

  public void purgeActiveLog() throws IOException {
    File activeLogFilePath = buildActivePath(_rootDataDir, _listId, _segmentId);

    if (activeLogFilePath.exists())
      activeLogFilePath.delete();

    initializeLogFile(activeLogFilePath);
  }

  /** get active log file path given segment id **/
  public static File buildActivePath(File rootDirectory, int listId, int segmentId) {
    // and construct a path to the local crawl segment directory ...
    File crawlDataDir = new File(rootDirectory, CrawlEnvironment.getCrawlerLocalOutputPath());
    // list directory ...
    File listDir = new File(crawlDataDir, Integer.toString(listId));
    if (!listDir.exists()) {
      listDir.mkdirs();
    }
    // append the segment id to the path ...
    return new File(listDir, ((Integer) segmentId).toString() + "_" + CrawlEnvironment.ActiveSegmentLog);
  }

  /** get checkpoint log file path given segment id **/
  public static File buildCheckpointPath(File rootDirectory, int listId, int segmentId) {
    // and construct a path to the local crawl segment directory ...
    File crawlDataDir = new File(rootDirectory, CrawlEnvironment.getCrawlerLocalOutputPath());
    // list directory ...
    File listDir = new File(crawlDataDir, Integer.toString(listId));
    if (!listDir.exists()) {
      listDir.mkdirs();
    }
    // append the segment id to the path ...
    return new File(listDir, ((Integer) segmentId).toString() + "_" + CrawlEnvironment.CheckpointSegmentLog);
  }

  /** get segment id of associated segment **/
  public int getSegmentId() {
    return _segmentId;
  }

  /** flush and add all pending buffers into the passed in list **/
  public void flushLog(LinkedList<LogItemBuffer> collector) {
    for (LogItemBuffer buffer : _buffers) {
      if (buffer.getItemCount() != 0) {
        collector.addLast(buffer);
      }
    }
    _buffers.clear();
    _buffers.addFirst(new LogItemBuffer(getListId(), getSegmentId()));
  }

  private LogItemBuffer getAvailableBuffer() {
    if (_buffers.isEmpty() || !_buffers.getFirst().spaceAvailable()) {
      _buffers.addFirst(new LogItemBuffer(getListId(), getSegmentId()));
    }
    return _buffers.getFirst();
  }

  static class LogItem implements Comparable<LogItem> {

    public static final int ItemSize_Bytes = 20; // hostFP(long) + itemFP(long) + position(int)

    // Comparable Implementation
    public int compareTo(LogItem otherItem) {
      if (_hostFP < otherItem._hostFP)
        return -1;
      else if (_hostFP > otherItem._hostFP)
        return 1;
      else {
        if (_itemFP < otherItem._itemFP)
          return -1;
        else if (_itemFP > otherItem._itemFP)
          return 1;
        else
          return 0;
      }
    }

    public boolean _writeToCrawLog = true;
    public long _hostFP;
    public long _itemFP;
    public CrawlURL _urlData;
  }

  static class LogItemBuffer {

    private int _listId;
    private int _segmentId;
    private LogItem[] _itemsArray = null;
    private int _itemCount;

    public LogItemBuffer(int listId, int segmentId) {
      _listId = listId;
      _segmentId = segmentId;
      _itemCount = 0;
      _itemsArray = new LogItem[DEFAULT_LOGITEM_LIST_SIZE];
    }

    public int getListId() {
      return _listId;
    }

    public int getSegmentId() {
      return _segmentId;
    }

    public LogItem[] getItems() {
      return _itemsArray;
    }

    public int getItemCount() {
      return _itemCount;
    }

    public void appendItem(LogItem item) {
      if (_itemsArray == null || _itemCount == _itemsArray.length) {
        throw new RuntimeException("Invalid call to append item");
      }
      _itemsArray[_itemCount++] = item;
    }

    public boolean spaceAvailable() {
      return (_itemsArray != null && _itemCount < _itemsArray.length);
    }

    public static interface CrawlURLWriter {
      void writeItemCount(int entryCount) throws IOException;
      void writeItem(CrawlURL url) throws IOException;
    }

    public int flushToDisk(int startingItemPosition, CrawlURLWriter urlWriter, DataOutputStream segmentLogStream, DataOutputStream historyLog) throws IOException {

      // write out entry count first ...
      urlWriter.writeItemCount(_itemCount);

      for (int i = 0; i < _itemCount; ++i) {

        if (_itemsArray[i]._writeToCrawLog) {
          // write url data ...
          urlWriter.writeItem(_itemsArray[i]._urlData);
        }

        CrawlURL urlObject = _itemsArray[i]._urlData;

        // if not a crawl directive (the InParseQueue flag is not set) ...
        if ((urlObject.getFlags() & CrawlURL.Flags.InParseQueue) == 0) {

          if (segmentLogStream != null) {
            // and write out segment log info ...
            segmentLogStream.writeLong(_itemsArray[i]._hostFP);
            segmentLogStream.writeLong(_itemsArray[i]._itemFP);
            segmentLogStream.writeInt(startingItemPosition + i);
          }

          if (historyLog != null) {
            URLFPV2 fp = URLUtils.getURLFPV2FromURL(urlObject.getUrl());
            if (fp != null) {
              // write original url to history log ...
              fp.write(historyLog);
            }
            // if redirected ...
            if ((_itemsArray[i]._urlData.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
              // calc fingerprint for url ...
              fp = URLUtils.getURLFPV2FromURL(urlObject.getRedirectURL());
              if (fp != null) {
                // write redirect fingerprint to history log ...
                fp.write(historyLog);
              }
            }
          }
        }

        _itemsArray[i]._urlData.clear();
        _itemsArray[i]._urlData = null;
        _itemsArray[i] = null;
      }
      return _itemCount;
    }

    public void loadFromStream(byte[] readBuffer, int itemCount) {

      _itemCount = itemCount;

      if (_itemsArray == null || _itemsArray.length < itemCount) {
        // reallocate array ...
        _itemsArray = new LogItem[_itemCount];
      }

      int bytePosition = 0;

      ByteArrayInputStream inputStream = new ByteArrayInputStream(readBuffer);
      DataInputStream dataInputStream = new DataInputStream(inputStream);

      for (int i = 0; i < _itemCount; ++i) {

        LogItem item = new LogItem();

        item._hostFP = (((long) readBuffer[bytePosition++] << 56) +
            ((long) (readBuffer[bytePosition++] & 255) << 48) +
            ((long) (readBuffer[bytePosition++] & 255) << 40) +
            ((long) (readBuffer[bytePosition++] & 255) << 32) +
            ((long) (readBuffer[bytePosition++] & 255) << 24) +
            ((readBuffer[bytePosition++] & 255) << 16) +
            ((readBuffer[bytePosition++] & 255) << 8) +
            ((readBuffer[bytePosition++] & 255) << 0));

        item._itemFP = (((long) readBuffer[bytePosition++] << 56) +
            ((long) (readBuffer[bytePosition++] & 255) << 48) +
            ((long) (readBuffer[bytePosition++] & 255) << 40) +
            ((long) (readBuffer[bytePosition++] & 255) << 32) +
            ((long) (readBuffer[bytePosition++] & 255) << 24) +
            ((readBuffer[bytePosition++] & 255) << 16) +
            ((readBuffer[bytePosition++] & 255) << 8) +
            ((readBuffer[bytePosition++] & 255) << 0));

        // skip position hint...
        bytePosition += 4;

        _itemsArray[i] = item;
      }
    }
  }

  public static int getHeaderSize() {
    return 8;
  }

  public static int readerHeader(File logFilePath) throws IOException {

    int recordCount = 0;

    FileInputStream stream = new FileInputStream(logFilePath);
    try {
      DataInputStream reader = new DataInputStream(stream);
      recordCount = readHeader(reader);
    } finally {
      stream.close();
    }
    return recordCount;
  }

  public static int readHeader(DataInputStream reader) throws IOException {
    reader.skipBytes(4);
    return reader.readInt();
  }

  public static final int LogFileHeaderBytes = 0xCC00CC00;

  public static void writeHeader(File logFilePath, int recordCount) throws IOException {

    RandomAccessFile stream = new RandomAccessFile(logFilePath, "rw");

    try {
      stream.seek(0);
      stream.writeInt(LogFileHeaderBytes);
      stream.writeInt(recordCount);
    } finally {
      // stream.getFD().sync();
      stream.close();
    }
  }

  public static int reconcileLogFile(FileSystem fs, Path logFilePath, int listId, int segmentId, CrawlSegmentFPMap segment, File consolidationFile) throws IOException {

    RandomAccessFile consolidationStream = null;
    int consolidationFileItemCount = 0;

    if (consolidationFile != null) {
      consolidationStream = new RandomAccessFile(consolidationFile, "rw");
      consolidationFileItemCount = readerHeader(consolidationFile);
      consolidationStream.seek(consolidationStream.length());
    }

    int processedItemCount = 0;

    FSDataInputStream hdfsInputStream = null;

    try {
      // get the file size on disk
      long fileSize = fs.getFileStatus(logFilePath).getLen();

      // allocate an array that can hold up to the list size of items ...
      byte[] buffer = new byte[DEFAULT_LOGITEM_LIST_SIZE * LogItem.ItemSize_Bytes];

      // calculate item count
      int totalItemCount = (int) ((fileSize - getHeaderSize()) / LogItem.ItemSize_Bytes);

      // get a reader ...
      hdfsInputStream = fs.open(logFilePath);

      int headerItemCount = readHeader(hdfsInputStream);

      if (headerItemCount != totalItemCount) {
        LOG.warn("CrawlSegmentLog - header item count for log file:" + logFilePath.toString() + " is:" + headerItemCount + " file size indicates:" + totalItemCount);
        totalItemCount = headerItemCount;
      }

      int remainingItemCount = totalItemCount;

      LogItemBuffer itemList = new LogItemBuffer(listId, segmentId);

      while (remainingItemCount != 0) {

        int blockItemCount = Math.min(remainingItemCount, DEFAULT_LOGITEM_LIST_SIZE);

        // and read the data
        hdfsInputStream.read(buffer, 0, (int) blockItemCount * LogItem.ItemSize_Bytes);

        // and if consolidation stream is valid ...
        if (consolidationStream != null) {
          // add entries to that stream ...
          consolidationStream.write(buffer, 0, (int) blockItemCount * LogItem.ItemSize_Bytes);
        }

        // if not a dry run ...
        if (segment != null) {
          // populate the item list
          itemList.loadFromStream(buffer, blockItemCount);
          // reconcile the list against the segment
          processedItemCount += reconcileItemList(itemList, segment);
        }

        // reduce item count
        remainingItemCount -= blockItemCount;
      }

      // finally if consolidation stream is valid ...
      if (consolidationStream != null) {
        // update the file's header ..
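        // (the 8-byte header is the LogFileHeaderBytes magic followed by a 4-byte
        // record count; the count is bumped by the number of items just appended)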
        writeHeader(consolidationFile, consolidationFileItemCount + totalItemCount);
      }
    } finally {
      if (consolidationStream != null) {
        consolidationStream.close();
      }
      if (hdfsInputStream != null) {
        hdfsInputStream.close();
      }
    }

    return processedItemCount;
  }

  public static interface LogFileItemCallback {
    public void processItem(long domainHash, long urlFingerprint);
  }

  public static void walkFingerprintsInLogFile(FileSystem fs, Path logFilePath, LogFileItemCallback callback) throws IOException {

    FSDataInputStream hdfsInputStream = null;

    try {
      // get the file size on disk
      long fileSize = fs.getFileStatus(logFilePath).getLen();

      // allocate an array that can hold up to the list size of items ...
      byte[] buffer = new byte[DEFAULT_LOGITEM_LIST_SIZE * LogItem.ItemSize_Bytes];

      // calculate item count
      int totalItemCount = (int) ((fileSize - getHeaderSize()) / LogItem.ItemSize_Bytes);

      // get a reader ...
      hdfsInputStream = fs.open(logFilePath);

      int headerItemCount = readHeader(hdfsInputStream);

      if (headerItemCount != totalItemCount) {
        LOG.warn("CrawlSegmentLog - header item count for log file:" + logFilePath.toString() + " is:" + headerItemCount + " file size indicates:" + totalItemCount);
        totalItemCount = headerItemCount;
      }

      int remainingItemCount = totalItemCount;

      LogItemBuffer itemList = new LogItemBuffer(0, 0);

      while (remainingItemCount != 0) {

        int blockItemCount = Math.min(remainingItemCount, DEFAULT_LOGITEM_LIST_SIZE);

        // and read the data
        hdfsInputStream.read(buffer, 0, (int) blockItemCount * LogItem.ItemSize_Bytes);

        // populate the item list
        itemList.loadFromStream(buffer, blockItemCount);

        // walk items in the list
        for (int i = 0; i < itemList.getItemCount(); ++i) {
          LogItem item = itemList.getItems()[i];
          callback.processItem(item._hostFP, item._itemFP);
        }

        // reduce item count
        remainingItemCount -= blockItemCount;
      }
    } finally {
      if (hdfsInputStream != null) {
        hdfsInputStream.close();
      }
    }
  }

  public static int reconcileItemList(LogItemBuffer itemList, CrawlSegmentFPMap segment) {

    int processedItemCount = 0;

    URLFPV2 urlfp = new URLFPV2();

    // and now walk segment and list consolidating segment as we go along ...
    for (int i = 0; i < itemList.getItemCount(); ++i) {

      LogItem item = itemList.getItems()[i];

      urlfp.setDomainHash(item._hostFP);
      urlfp.setUrlHash(item._itemFP);

      if (segment.isValidSegmentURL(urlfp)) {
        // update local bloom filter ...
        segment.setCrawled(urlfp);
        // increment processed item count
        processedItemCount++;
      }
    }

    return processedItemCount;
  }
}
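/**
 * Minimal usage sketch (illustrative only): replays the fingerprints recorded in a
 * local active segment log via walkFingerprintsInLogFile. The root data directory
 * ("/tmp/crawlerData"), list id (1) and segment id (42) are hypothetical
 * placeholders, not values used by the crawler itself.
 */
class CrawlSegmentLogUsageSketch {

  public static void main(String[] args) throws IOException {
    // a local (non-HDFS) FileSystem is sufficient to read a log file on local disk
    FileSystem localFS = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

    // resolve the active log path for a hypothetical list 1 / segment 42
    File activeLog = CrawlSegmentLog.buildActivePath(new File("/tmp/crawlerData"), 1, 42);

    // walk every (domainHash, urlFingerprint) pair recorded in the log
    final int[] itemCount = { 0 };
    CrawlSegmentLog.walkFingerprintsInLogFile(localFS, new Path(activeLog.getAbsolutePath()),
        new CrawlSegmentLog.LogFileItemCallback() {
          @Override
          public void processItem(long domainHash, long urlFingerprint) {
            itemCount[0]++;
          }
        });

    CrawlSegmentLog.LOG.info("active log for segment 42 contains " + itemCount[0] + " items");
  }
}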