/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.listcrawler;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.WritableUtils;
import org.apache.log4j.BasicConfigurator;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawKeyValueComparator;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.mapred.ProxyCrawlHistoryItem;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.rpc.base.shared.BinaryProtocol;
import org.commoncrawl.service.crawler.util.URLFPBloomFilter;
import org.commoncrawl.service.listcrawler.CrawlListDomainItem;
import org.commoncrawl.service.listcrawler.CrawlListMetadata;
import org.commoncrawl.service.listcrawler.CrawlHistoryManager.ItemUpdater;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.URLFingerprint;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CRC16;
import org.commoncrawl.util.FileUtils;
import org.junit.Assert;
import com.google.gson.stream.JsonWriter;
/**
* A list of urls that need to be crawled
* @author rana
*
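* <p>A minimal usage sketch - assuming some concrete CrawlHistoryStorage
* implementation and CrawlQueueLoader are available (the storage, urlFile and
* queueLoader names here are illustrative, not prescribed by this class):</p>
* <pre>
*   CrawlHistoryStorage storage = ...; // some concrete implementation
*   // build a new list from a file of urls (one url per line, '#' starts a comment)
*   CrawlList list = new CrawlList(storage, System.currentTimeMillis(), urlFile, 0);
*   // later, queue any uncrawled (or stale) items via a CrawlQueueLoader
*   list.queueUnCrawledItems(queueLoader);
* </pre>
*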
*/
public final class CrawlList implements ItemUpdater {
// default refresh interval is 60 days ...
public static final int DEFAULT_REFRESH_INTERVAL_IN_SECS = 86400 * 60;
/**
* events generated by the CrawlList
*
* @author rana
*
*/
public static interface CrawlListEvents {
public void itemUpdated(URLFP itemFingerprint);
}
public static final Log LOG = LogFactory.getLog(CrawlList.class);
public static final int ValueFlag_HasRedirect = 1 << 0;
File _listURLDataFile = null;
File _fixedDataFile = null;
File _variableDataFile = null;
File _bloomFilterData = null;
File _listMetadataFile = null;
File _subDomainMetadataFile = null;
URLFPBloomFilter _bloomFilter = null;
long _listId;
CrawlHistoryStorage _manager;
CrawlListMetadata _metadata = new CrawlListMetadata();
CrawlListEvents _eventListener;
byte[] _tempFixedDataBuffer = null;
int _tempFixedDataBufferSize = 0;
DataOutputBuffer _tempOutputBuffer = new DataOutputBuffer(OnDiskCrawlHistoryItem.ON_DISK_SIZE);
TreeMap<Long,CrawlListMetadata> _transientSubDomainStats = new TreeMap<Long,CrawlListMetadata>();
DataOutputBuffer _offsetLookupTable = null;
Exception _exception;
public enum LoadState {
UNINITIALIZED,
QUEUED_FOR_LOADING,
REALLY_LOADING,
LOADED,
ERROR
}
LoadState _listState = LoadState.UNINITIALIZED;
public enum QueueState {
WAITING,
QUEUEING,
QUEUED,
ERROR
}
QueueState _queueState = QueueState.WAITING;
/**
* internal factory constructor
*/
private CrawlList(CrawlHistoryStorage manager,long listId, LoadState state ) {
_manager = manager;
//establish file names
initializeListFileNames();
_listId = listId;
_listState = state;
}
/**
* internal factory constructor
*/
private CrawlList(CrawlHistoryStorage manager,long listId, Exception e) {
_manager = manager;
//establish file names
initializeListFileNames();
_listId = listId;
_listState = LoadState.ERROR;
_exception = e;
}
/** is list loaded
*
*/
public boolean isListLoaded() {
return _listState == LoadState.LOADED;
}
/** mark list as loading
*
*
*/
public void markListAsReallyLoading() {
_listState = LoadState.REALLY_LOADING;
}
// get the list's load state
public LoadState getLoadState() {
return _listState;
}
// get the last caught exception (if list is in error state)
public Exception getLastException() {
return _exception;
}
/** get the list id
*
*/
public long getListId() {
return _listId;
}
/** set the event listener hook
*
* @param eventListener
*/
public synchronized void setEventListener(CrawlListEvents eventListener) {
_eventListener = eventListener;
}
public synchronized CrawlListEvents getEventListener() {
return _eventListener;
}
/** get a snapshot (clone) of the current list metadata
*
*/
public CrawlListMetadata getMetadata() {
CrawlListMetadata metadataOut = null;
synchronized (_metadata) {
try {
metadataOut = (CrawlListMetadata) _metadata.clone();
} catch (CloneNotSupportedException e) {
}
}
return metadataOut;
}
/**
*
* @return the path to the url data file (source for the urls in this list)
*/
public File getListURLDataFile() {
return _listURLDataFile;
}
/**
* Initialize a CrawlList in an error state ..
*/
public static CrawlList createListWithLoadErrorState(CrawlHistoryStorage manager,long listId,Exception e) {
return new CrawlList(manager,listId,e);
}
/**
* Initialize a CrawlList in a loading state ..
*/
public static CrawlList createListLoadingInLoadingState(CrawlHistoryStorage manager,long listId,File dataFile,int refreshInterval) {
CrawlList listOut = new CrawlList(manager,listId,LoadState.QUEUED_FOR_LOADING);
// set the interval on the list's own metadata - getMetadata() returns a clone, so mutating the clone would be lost
synchronized (listOut._metadata) {
listOut._metadata.setRefreshInterval(refreshInterval);
}
listOut._listURLDataFile = dataFile;
return listOut;
}
/**
* Load a CrawlList from previously stored disk state
*
* @param storage - reference to the crawl list history storage
* @param listId - the list id (the timestamp) for the given list to load from disk state
*/
public CrawlList(CrawlHistoryStorage storage, long listId) throws IOException {
_listId = listId;
_manager = storage;
//establish file names
initializeListFileNames();
LOG.info("Initilaizing pre-existing List with Id:" + listId);
LOG.info("Loading BloomFilterData for List:" + listId);
FileInputStream bloomFilterData = new FileInputStream(_bloomFilterData);
try {
// load bloom filter
_bloomFilter = URLFPBloomFilter.load(bloomFilterData);
}
finally {
bloomFilterData.close();
}
// load list metadata from disk
loadMetadataFromDisk();
// reset queued counts ...
_metadata.setQueuedItemCount(0);
// write it back
writeMetadataToDisk();
// load sub domain metadata from disk ...
loadSubDomainMetadataFromDisk();
// reset queued count ...
resetSubDomainCounts();
_listState = LoadState.LOADED;
}
/**
* Initialize a new CrawlList object from a given file of urls
*
* @param manager - reference to the crawl history log manager
* @param listId - the id to assign to the new list
* @param sourceURLFile - the file containing the list of urls that we should add to this list ...
* @param refreshInterval - refresh interval (in seconds) for items in this list
* @throws IOException
*/
public CrawlList(CrawlHistoryStorage manager,long listId,File sourceURLFile,int refreshInterval) throws IOException {
_manager = manager;
_listState = LoadState.REALLY_LOADING;
// initialize a new list id
_listId = listId;
LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath());
//establish file names
initializeListFileNames();
sourceURLFile.renameTo(_listURLDataFile);
FileInputStream urlInputStream = new FileInputStream(_listURLDataFile);
try {
// set we will use to hold all fingerprints generated
TreeSet<URLFP> urlSet = new TreeSet<URLFP>();
// create temp files ...
File spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId));
// create mergesortspillwriter
SequenceFileSpillWriter<URLFP,ProxyCrawlHistoryItem> spillwriter
= new SequenceFileSpillWriter<URLFP,ProxyCrawlHistoryItem>(
FileSystem.getLocal(
CrawlEnvironment.getHadoopConfig()),
CrawlEnvironment.getHadoopConfig(),
new Path(spillOutputFile.getAbsolutePath()),
URLFP.class,
ProxyCrawlHistoryItem.class,
null,false);
try {
MergeSortSpillWriter<URLFP,ProxyCrawlHistoryItem> merger
= new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>(
CrawlEnvironment.getHadoopConfig(),
spillwriter,
FileSystem.getLocal(
CrawlEnvironment.getHadoopConfig()),
new Path(manager.getLocalDataDir().getAbsolutePath()),
null,
new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() {
DataInputBuffer _key1Buffer = new DataInputBuffer();
DataInputBuffer _key2Buffer = new DataInputBuffer();
@Override
public int compareRaw(byte[] key1Data, int key1Offset,
int key1Length, byte[] key2Data, int key2Offset,
int key2Length, byte[] value1Data, int value1Offset,
int value1Length, byte[] value2Data, int value2Offset,
int value2Length) throws IOException {
_key1Buffer.reset(key1Data,key1Offset,key1Length);
_key2Buffer.reset(key2Data,key2Offset,key2Length);
_key1Buffer.skip(2); // skip version byte and 1 byte field id
_key2Buffer.skip(2); // skip version byte and 1 byte field id
int domainHash1 = WritableUtils.readVInt(_key1Buffer);
int domainHash2 = WritableUtils.readVInt(_key2Buffer);
_key1Buffer.skip(1); // skip 1 byte id
_key2Buffer.skip(1); // skip 1 byte id
long fingerprint1= WritableUtils.readVLong(_key1Buffer);
long fingerprint2= WritableUtils.readVLong(_key2Buffer);
int result = ((Integer)domainHash1).compareTo(domainHash2);
if (result == 0) {
result = ((Long)fingerprint1).compareTo(fingerprint2);
}
return result;
}
@Override
public int compare(URLFP key1, ProxyCrawlHistoryItem value1,URLFP key2, ProxyCrawlHistoryItem value2) {
return key1.compareTo(key2);
}
},
URLFP.class,
ProxyCrawlHistoryItem.class,false,null);
try {
LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List");
BufferedReader reader= new BufferedReader(new InputStreamReader(urlInputStream,Charset.forName("UTF-8")));
String line = null;
int lineNumber = 0;
ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
while ((line = reader.readLine()) != null) {
++lineNumber;
if (line.length() != 0 && !line.startsWith("#")) {
URLFP fingerprint = URLUtils.getURLFPFromURL(line, true);
if (fingerprint != null) {
if (!urlSet.contains(fingerprint)) {
// and add fingerprint to set
urlSet.add(fingerprint);
// initialize item
item.clear();
item.setOriginalURL(line);
// and spill to merger / sorter ..
merger.spillRecord(fingerprint, item);
}
}
else {
LOG.error("*** LIST:" + getListId() + " Invalid URL Encounered at Line:" + lineNumber + " URL" + line);
}
}
}
LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS");
}
finally {
merger.close();
}
}
finally {
if (spillwriter != null)
spillwriter.close();
}
LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys");
// generate bloom filter ...
_bloomFilter = new URLFPBloomFilter(urlSet.size(),7,10);
for (URLFP fingerprint : urlSet) {
_bloomFilter.add(fingerprint);
}
LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter");
// serialize it
FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData);
try {
_bloomFilter.serialize(bloomFilterStream);
}
finally {
bloomFilterStream.flush();
bloomFilterStream.close();
}
LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile);
// now initialize value map and string maps based on output sequence file ...
SequenceFile.Reader reader = new SequenceFile.Reader(
FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());
LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
// OK, Allocate room for fixed data file upfront
DataOutputBuffer valueStream = new DataOutputBuffer(urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED");
try {
//DataOutputStream valueStream = new DataOutputStream(new FileOutputStream(_fixedDataFile));
RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw");
try {
URLFP urlFP = new URLFP();
ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
// read fingerprints ...
while (reader.next(urlFP, item)) {
// write out fixed data structure and strings
writeInitialOnDiskItem(urlFP,item,valueStream,stringsStream);
}
}
finally {
//valueStream.flush();
//valueStream.close();
stringsStream.close();
}
}
finally {
reader.close();
}
LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk");
LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength() + " EXCEPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) {
throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength() + " != URLSetSize:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
}
// initialize temp data buffer variables
_tempFixedDataBuffer = valueStream.getData();
_tempFixedDataBufferSize = valueStream.getLength();
// update metadata
_metadata.setRefreshInterval(refreshInterval);
_metadata.setUrlCount(urlSet.size());
// setup version
_metadata.setVersion(1);
// and write to disk
writeMetadataToDisk();
// mark state as loaded ...
_listState = LoadState.LOADED;
LOG.info("*** LIST:" + getListId() + " SYNCING");
// reconcile with history log
_manager.syncList(this.getListId(),urlSet,this);
LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE");
// write metadata to disk again
writeMetadataToDisk();
LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA");
// and finally flush fixed data to disk
FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile);
try {
synchronized (this) {
int blockSize = 1 << 20;
long bytesCopied = 0;
for (int offset=0;offset<_tempFixedDataBufferSize;offset += blockSize) {
int bytesToCopy = Math.min(blockSize,_tempFixedDataBufferSize - offset);
finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy);
bytesCopied += bytesToCopy;
}
// validate bytes copied
if (bytesCopied != _tempFixedDataBufferSize) {
throw new IOException("Buffer Size:" + _tempFixedDataBufferSize + " Does not Match BytesCopied:" + bytesCopied);
}
// ok release the buffer
_tempFixedDataBuffer = null;
_tempFixedDataBufferSize = 0;
LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE");
}
}
finally {
finalDataStream.flush();
finalDataStream.close();
}
// load sub domain metadata from disk ...
loadSubDomainMetadataFromDisk();
}
catch (IOException e) {
LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:" + CCStringUtils.stringifyException(e));
_fixedDataFile.delete();
_variableDataFile.delete();
_bloomFilterData.delete();
_listState = LoadState.ERROR;
throw e;
}
finally {
urlInputStream.close();
}
}
/**
* update list state of a recently crawled item
*
* @param fingerprint - the fingerprint of the updated item
* @param newData - the updated crawl history data for the given item
* @throws IOException
*/
@Override
public void updateItemState(URLFP fingerprint,ProxyCrawlHistoryItem newData)throws IOException {
if (_listState == LoadState.LOADED) {
// check for membership ...
if (_bloomFilter.isPresent(fingerprint)) {
//LOG.info("UpdateItemState Called for URL:" + newData.getOriginalURL() + " List:" + getListId());
//LOG.info("UpdateItemState Loading OnDisk Item for URL:" + newData.getOriginalURL() + " List:" + getListId());
// extract existing item from disk
OnDiskCrawlHistoryItem originalItem = loadOnDiskItemForURLFP(fingerprint);
// if present (null indicates a false-positive hit in the bloom filter)
if (originalItem != null) {
// build an on disk item data structure for any potential changes ...
OnDiskCrawlHistoryItem newItem = onDiskItemFromHistoryItem(fingerprint,newData);
// set initial offset information
newItem._fileOffset = originalItem._fileOffset;
newItem._stringsOffset = originalItem._stringsOffset;
// LOG.info("UpdateItemState Comparing OnDisk Item to New Item for URL:" + newData.getOriginalURL() + " List:" + getListId());
// compare the two items ...
if (!newItem.equals(originalItem)) {
//LOG.info("UpdateItemState Items Don't Match for URL:" + newData.getOriginalURL() + " List:" + getListId());
// ok items do not match ... figure out if strings are different ...
if (newItem._stringsCRC != originalItem._stringsCRC) {
RandomAccessFile stringsFile = new RandomAccessFile(_variableDataFile, "rw");
try {
// seek to end
stringsFile.seek(stringsFile.length());
// update offset info
newItem._stringsOffset = stringsFile.length();
// write out string data length
WritableUtils.writeVInt(stringsFile,_stringBuffer1.getLength());
// write strings to log file
stringsFile.write(_stringBuffer1.getData(),0,_stringBuffer1.getLength());
}
finally {
stringsFile.close();
}
}
// otherwise take the offset from old item
else {
newItem._stringsOffset = originalItem._stringsOffset;
}
//LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + newItem._urlFingerprint);
// ok, different paths depending on whether this is an in memory update or not ...
boolean wroteToMemory = false;
synchronized (this) {
if (_tempFixedDataBuffer != null) {
wroteToMemory = true;
// reset output buffer
_tempOutputBuffer.reset();
// serialize to output buffer
newItem.serialize(_tempOutputBuffer);
// copy to appropriate location
System.arraycopy(_tempOutputBuffer.getData(), 0, _tempFixedDataBuffer,(int) originalItem._fileOffset, OnDiskCrawlHistoryItem.ON_DISK_SIZE);
}
}
if (!wroteToMemory){
// write to disk
RandomAccessFile file = new RandomAccessFile(_fixedDataFile,"rw");
try {
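// retry loop: tryLock throws OverlappingFileLockException if another thread
// in this JVM already holds a lock over an overlapping region of the file -
// when that happens we log the conflict and retry until we win the
// byte-range lock covering this record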
while (true) {
try {
//LOG.info("*** TRYING UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset);
FileLock lock = file.getChannel().tryLock(originalItem._fileOffset, OnDiskCrawlHistoryItem.ON_DISK_SIZE, false);
try {
//LOG.info("*** GOT UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset);
file.seek(originalItem._fileOffset);
newItem.serialize(file);
//LOG.info("Updated Data File for OnDiskItem for Fingerprint:" + originalItem._urlFingerprint);
break;
}
finally {
//LOG.info("*** RELEASED UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset);
lock.release();
}
}
catch (OverlappingFileLockException e) {
LOG.error("###LockConflict(RETRY):" + CCStringUtils.stringifyException(e));
}
}
}
finally {
file.close();
}
}
// ok now update metadata ...
synchronized (_metadata) {
int updateFlags = calculateUpdateFlags(originalItem, newItem);
if (updateFlags != 0) {
int metadataDirtyFlags = updateMetadata(newItem, _metadata, 0);
// only write metadata to disk if temp data buffer is null
if (metadataDirtyFlags != 0 && !wroteToMemory) {
if ((metadataDirtyFlags & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) {
_metadata.setQueuedItemCount(_metadata.getQueuedItemCount() - 1);
}
writeMetadataToDisk();
}
// if not writing to memory then update subdomain metadata
if (!wroteToMemory) {
synchronized (_subDomainMetadataFile) {
CrawlListMetadata subDomainMetadata = getSubDomainMetadataByURL(newData.getOriginalURL());
int subDomainMetadataDirtyFlags = updateMetadata(newItem, subDomainMetadata, processFileOffsets);
if (subDomainMetadataDirtyFlags != 0 && !wroteToMemory) {
if ((subDomainMetadataDirtyFlags & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) {
subDomainMetadata.setQueuedItemCount(subDomainMetadata.getQueuedItemCount() - 1);
}
writeSubDomainMetadataToDisk(subDomainMetadata);
}
}
}
}
}
synchronized (this) {
if (_eventListener != null) {
_eventListener.itemUpdated(fingerprint);
}
}
}
}
}
}
}
private static final int processOriginalStatus = 1 << 0;
private static final int processOriginalResult = 1 << 1;
private static final int processRedirectStatus = 1 << 2;
private static final int processRedirectResult = 1 << 3;
private static final int processFileOffsets = 1 << 4;
private static final int processAllItems = Integer.MAX_VALUE;
private static int calculateUpdateFlags(OnDiskCrawlHistoryItem originalItem,OnDiskCrawlHistoryItem newItem) {
int updateFlags = 0;
if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
updateFlags |= processOriginalStatus;
}
if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE)) {
updateFlags |= processOriginalResult;
}
if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
updateFlags |= processRedirectStatus;
}
if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE)) {
updateFlags |= processRedirectResult;
}
return updateFlags;
}
private static final int MetadataUpdateFlag_ModifiedCrawlStatus = 1 << 0;
private static final int MetadataUpdateFlag_ModifiedRedirectStatus = 1 << 1;
private static final int MetadataUpdateFlag_ModifiedOffsets = 1 << 2; // distinct bit - must not collide with ModifiedRedirectStatus
private static int updateMetadata(OnDiskCrawlHistoryItem newItem,CrawlListMetadata metadata, int updateFlags) {
int metadataDirtyFlags = 0;
if (!newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
//if ((updateFlags & processOrignalStatus) != 0) {
// LOG.info("### Updating OriginalCrawlStatus for Item:" + newData.getOriginalURL());
// status changed ...
if (newItem._crawlStatus != 0) {
switch (newItem._crawlStatus) {
case CrawlURL.FailureReason.RobotsExcluded: metadata.setRobotsExcludedCount(metadata.getRobotsExcludedCount() + 1);break;
case CrawlURL.FailureReason.Timeout: metadata.setTimeoutErrorCount(metadata.getTimeoutErrorCount() + 1);break;
case CrawlURL.FailureReason.IOException: metadata.setIOExceptionCount(metadata.getIOExceptionCount() + 1);break;
case CrawlURL.FailureReason.DNSFailure: metadata.setDNSErrorCount(metadata.getDNSErrorCount() + 1);break;
default: metadata.setOtherErrorCount(metadata.getOtherErrorCount() + 1);
}
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus;
}
//}
//if ((updateFlags & processOriginalResult) != 0) {
// LOG.info("### Updating OriginalResultCode for Item:" + newData.getOriginalURL());
if (newItem._crawlStatus == 0) {
if (newItem._httpResultCode == 200) metadata.setHttp200Count( metadata.getHttp200Count() + 1);
else if (newItem._httpResultCode == 301) metadata.setHttp301Count( metadata.getHttp301Count() + 1);
else if (newItem._httpResultCode == 403) metadata.setHttp403Count( metadata.getHttp403Count() + 1);
else if (newItem._httpResultCode == 404) metadata.setHttp404Count( metadata.getHttp404Count() + 1);
else if (newItem._httpResultCode >= 500 && newItem._httpResultCode < 600 ) metadata.setHttp500Count( metadata.getHttp500Count() + 1);
else if (newItem._httpResultCode >= 600 ) metadata.setHttpOtherCount( metadata.getHttpOtherCount() + 1);
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus;
}
//}
}
else {
//if ((updateFlags & processRedirectStatus) != 0) {
// status changed ...
if (newItem._redirectStatus != 0) {
switch (newItem._redirectStatus) {
case CrawlURL.FailureReason.RobotsExcluded: metadata.setRobotsExcludedCount(metadata.getRobotsExcludedCount() + 1);break;
case CrawlURL.FailureReason.Timeout: metadata.setTimeoutErrorCount(metadata.getTimeoutErrorCount() + 1);break;
case CrawlURL.FailureReason.IOException: metadata.setIOExceptionCount(metadata.getIOExceptionCount() + 1);break;
case CrawlURL.FailureReason.DNSFailure: metadata.setDNSErrorCount(metadata.getDNSErrorCount() + 1);break;
default: metadata.setOtherErrorCount(metadata.getOtherErrorCount() + 1);
}
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus;
}
//}
//if ((updateFlags & processRedirectResult) != 0) {
if (newItem._redirectStatus == 0) {
if (newItem._redirectHttpResult == 200) metadata.setHttp200Count( metadata.getHttp200Count() + 1);
else if (newItem._redirectHttpResult == 301) metadata.setHttp301Count( metadata.getHttp301Count() + 1);
else if (newItem._redirectHttpResult == 403) metadata.setHttp403Count( metadata.getHttp403Count() + 1);
else if (newItem._redirectHttpResult == 404) metadata.setHttp404Count( metadata.getHttp404Count() + 1);
else if (newItem._redirectHttpResult >= 500 && newItem._redirectHttpResult < 600 ) metadata.setHttp500Count( metadata.getHttp500Count() + 1);
else if (newItem._redirectHttpResult >= 600 ) metadata.setHttpOtherCount( metadata.getHttpOtherCount() + 1);
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus;
}
//}
}
if ((updateFlags & processFileOffsets) != 0) {
if (!metadata.isFieldDirty(CrawlListMetadata.Field_FIRSTRECORDOFFSET) || metadata.getFirstRecordOffset() > newItem._fileOffset) {
metadata.setFirstRecordOffset(newItem._fileOffset);
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedOffsets;
}
if (!metadata.isFieldDirty(CrawlListMetadata.Field_LASTRECORDOFFSET) || metadata.getLastRecordOffset() < newItem._fileOffset) {
metadata.setLastRecordOffset(newItem._fileOffset);
metadataDirtyFlags |= MetadataUpdateFlag_ModifiedOffsets;
}
}
return metadataDirtyFlags;
}
/**
*
* @return the queue state of this list (whether or not all of its urls have been queued for crawling)
*/
public QueueState getQueuedState() {
return _queueState;
}
private int lastDomainHash = -1;
private String lastRootDomainName = null;
private CrawlListMetadata lastRootDomainMetadata = null;
private int domainQueuedCount = 0;
private void updateSubDomainMetadataForItemDuringLoad(OnDiskCrawlHistoryItem item,String itemURL,URLFP itemFP,boolean isQueued) throws IOException {
// ok unfortunately, we need to update stats for the subdomain here
if (item._domainHash != lastDomainHash) {
// update last domain hash ...
lastDomainHash = item._domainHash;
// extract root domain name
GoogleURL urlObject = new GoogleURL(itemURL);
String rootDomainName = URLUtils.extractRootDomainName(urlObject.getHost());
// if root domain name differs from the last root domain name ...
// (compare via equals - reference equality treats equal strings as different)
if (lastRootDomainName == null || !lastRootDomainName.equals(rootDomainName)) {
// flush last entry
flushCachedSubDomainMetadata();
// load new entry
if (rootDomainName != null) {
lastRootDomainName = rootDomainName;
lastRootDomainMetadata = new CrawlListMetadata();
}
}
}
// accumulate stats for every item - not just the first item of each domain ...
if (lastRootDomainMetadata != null) {
if (isQueued){
lastRootDomainMetadata.setQueuedItemCount(lastRootDomainMetadata.getQueuedItemCount() + 1);
}
else {
updateMetadata(item, lastRootDomainMetadata, 0);
}
}
}
private void flushCachedSubDomainMetadata() throws IOException {
if (lastRootDomainMetadata != null) {
// ok get the latest version of the metadata from disk
synchronized (_subDomainMetadataFile) {
// get from disk
CrawlListMetadata metadataOnDisk = getSubDomainMetadataByRootDomain(lastRootDomainName);
// update on disk version ...
metadataOnDisk.setHttp200Count(metadataOnDisk.getHttp200Count() + lastRootDomainMetadata.getHttp200Count());
metadataOnDisk.setHttp301Count(metadataOnDisk.getHttp301Count() + lastRootDomainMetadata.getHttp301Count());
metadataOnDisk.setHttp403Count(metadataOnDisk.getHttp403Count() + lastRootDomainMetadata.getHttp403Count());
metadataOnDisk.setHttp404Count(metadataOnDisk.getHttp404Count() + lastRootDomainMetadata.getHttp404Count());
metadataOnDisk.setHttp500Count(metadataOnDisk.getHttp500Count() + lastRootDomainMetadata.getHttp500Count());
metadataOnDisk.setHttpOtherCount(metadataOnDisk.getHttpOtherCount() + lastRootDomainMetadata.getHttpOtherCount());
metadataOnDisk.setRobotsExcludedCount(metadataOnDisk.getRobotsExcludedCount() + lastRootDomainMetadata.getRobotsExcludedCount());
metadataOnDisk.setTimeoutErrorCount(metadataOnDisk.getTimeoutErrorCount() + lastRootDomainMetadata.getTimeoutErrorCount());
metadataOnDisk.setIOExceptionCount(metadataOnDisk.getIOExceptionCount() + lastRootDomainMetadata.getIOExceptionCount());
metadataOnDisk.setDNSErrorCount(metadataOnDisk.getDNSErrorCount() + lastRootDomainMetadata.getDNSErrorCount());
metadataOnDisk.setOtherErrorCount(metadataOnDisk.getOtherErrorCount() + lastRootDomainMetadata.getOtherErrorCount());
metadataOnDisk.setQueuedItemCount(metadataOnDisk.getQueuedItemCount() + lastRootDomainMetadata.getQueuedItemCount());
// ok write it back to disk
writeSubDomainMetadataToDisk(metadataOnDisk);
}
lastRootDomainMetadata = null;
lastRootDomainName = null;
lastDomainHash = -1;
}
}
/** queue uncrawled urls via the CrawlQueueLoader
*
* @param loader
*/
public void queueUnCrawledItems(CrawlQueueLoader loader) throws IOException {
_queueState = QueueState.QUEUEING;
int metadataVersion = getMetadata().getVersion();
synchronized (_metadata) {
// reset metadata PERIOD
int urlCount = _metadata.getUrlCount();
_metadata.clear();
_metadata.setUrlCount(urlCount);
}
RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
try {
OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
URLFP fingerprint = new URLFP();
while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
long position = fixedDataReader.getFilePointer();
//LOG.info("*** TRYING READ LOCK FOR OFFSET:" + position);
while (true) {
// get read lock on position ...
try {
FileLock lock = fixedDataReader.getChannel().tryLock(position, OnDiskCrawlHistoryItem.ON_DISK_SIZE, false);
try {
//LOG.info("*** GOT READ LOCK FOR OFFSET:" + position);
item.deserialize(fixedDataReader);
break;
}
finally {
lock.release();
//LOG.info("*** RELEASED READ LOCK FOR OFFSET:" + position);
}
}
catch (OverlappingFileLockException e) {
LOG.error("*** LOCK CONTENTION AT:" + position + " Exception:" + CCStringUtils.stringifyException(e));
}
}
// seek to string data
stringDataReader.seek(item._stringsOffset);
// and skip buffer length
WritableUtils.readVInt(stringDataReader);
// and read primary string
String url = stringDataReader.readUTF();
// setup fingerprint
fingerprint.setDomainHash(item._domainHash);
fingerprint.setUrlHash(item._urlFingerprint);
// first, if it has never been crawled, crawl it no matter what ...
boolean crawlItem = !item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);
// if it has been crawled ... check list metadata version ...
if (!crawlItem && metadataVersion >= 1) {
// ok this is newer version of the list ...
// check refresh time if specified ...
int refreshIntervalInSeconds = DEFAULT_REFRESH_INTERVAL_IN_SECS;
if (getMetadata().getRefreshInterval() != 0) {
refreshIntervalInSeconds = getMetadata().getRefreshInterval();
}
if (item._updateTimestamp > 0) {
long lastCrawlTime = item._updateTimestamp;
// compute the interval in millis as a long - an int multiply overflows for large refresh intervals
if (System.currentTimeMillis() - lastCrawlTime >= (refreshIntervalInSeconds * 1000L)) {
crawlItem = true;
}
}
}
if (crawlItem) {
loader.queueURL(fingerprint, url);
synchronized (_metadata) {
// update queued item count
_metadata.setQueuedItemCount(_metadata.getQueuedItemCount() + 1);
}
}
else {
updateMetadata(item, _metadata,0);
}
// ok update subdomain stats
updateSubDomainMetadataForItemDuringLoad(item,url,fingerprint,crawlItem);
}
flushCachedSubDomainMetadata();
loader.flush();
_queueState = QueueState.QUEUED;
}
catch (IOException e) {
LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e));
_queueState = QueueState.ERROR;
}
finally {
fixedDataReader.close();
stringDataReader.close();
}
}
/** resubmit failed items
*
* @param loader
*/
public void requeueFailedItems(CrawlQueueLoader loader) throws IOException {
synchronized (this) {
_queueState = QueueState.QUEUEING;
}
RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
try {
OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
URLFP fingerprint = new URLFP();
while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
item.deserialize(fixedDataReader);
boolean queueItem = false;
if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
queueItem = (item._redirectStatus != 0);
if (!queueItem) {
if (item._redirectHttpResult != 200 && item._redirectHttpResult != 404) {
queueItem = true;
}
}
}
else {
queueItem = (item._crawlStatus != 0);
if (!queueItem) {
if (item._httpResultCode != 200 && item._httpResultCode != 404) {
queueItem = true;
}
}
}
if (queueItem) {
// seek to string data
stringDataReader.seek(item._stringsOffset);
// and skip buffer length
WritableUtils.readVInt(stringDataReader);
// and read primary string
String url = stringDataReader.readUTF();
// and spill
fingerprint.setDomainHash(item._domainHash);
fingerprint.setUrlHash(item._urlFingerprint);
loader.queueURL(fingerprint, url);
}
}
}
loader.flush();
_queueState = QueueState.QUEUED;
}
catch (IOException e) {
LOG.error("Encountered Exception Requeueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e));
_queueState = QueueState.ERROR;
}
finally {
fixedDataReader.close();
stringDataReader.close();
}
}
/**
*
* @param localLogFileDir
* @param listId
* @return
*/
public static boolean allFilesPresent(File localLogFileDir,long listId) {
//establish file names
File urlDataFile = new File(localLogFileDir,LIST_URL_DATA_PREFIX + Long.toString(listId));
File fixedDataFile = new File(localLogFileDir,LIST_VALUE_MAP_PREFIX + Long.toString(listId));
File variableDataFile = new File(localLogFileDir,LIST_STRING_MAP_PREFIX + Long.toString(listId));
File bloomFilterFile = new File(localLogFileDir,LIST_BLOOM_DATA_PREFIX + Long.toString(listId));
if (urlDataFile.exists() &&
fixedDataFile.exists() &&
variableDataFile.exists() &&
bloomFilterFile.exists()) {
return true;
}
return false;
}
public static final String LIST_URL_DATA_PREFIX = "listURLS-";
public static final String LIST_VALUE_MAP_PREFIX = "listValueMap-";
public static final String LIST_STRING_MAP_PREFIX = "listStringMap-";
public static final String LIST_BLOOM_DATA_PREFIX = "listBloomFilter-";
public static final String LIST_METADATA_PREFIX = "listMetadata-";
public static final String LIST_SUBDOMAIN_METADATA_PREFIX = "listSubDomainMetadata-";
private void initializeListFileNames() {
//establish file names
_listURLDataFile = new File(_manager.getLocalDataDir(),LIST_URL_DATA_PREFIX + Long.toString(_listId));
_fixedDataFile = new File(_manager.getLocalDataDir(),LIST_VALUE_MAP_PREFIX + Long.toString(_listId));
_variableDataFile = new File(_manager.getLocalDataDir(),LIST_STRING_MAP_PREFIX + Long.toString(_listId));
_bloomFilterData = new File(_manager.getLocalDataDir(),LIST_BLOOM_DATA_PREFIX + Long.toString(_listId));
_listMetadataFile = new File(_manager.getLocalDataDir(),LIST_METADATA_PREFIX + Long.toString(_listId));
_subDomainMetadataFile = new File(_manager.getLocalDataDir(),LIST_SUBDOMAIN_METADATA_PREFIX + Long.toString(_listId));
}
private static class OnDiskCrawlHistoryItem {
public long _fileOffset = -1;
int _domainHash = -1; // 4
long _urlFingerprint = -1; // 8
int _stringsCRC = -1; // 4
long _stringsOffset = -1; // 8
byte _flags = 0; // 1
byte _crawlStatus = -1; // 1
short _httpResultCode = -1; // 2
byte _redirectStatus = -1; // 1
short _redirectHttpResult = -1; // 2
long _updateTimestamp = -1; // 8
//__
// 39 bytes
public static final int ON_DISK_SIZE = 39;
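// record layout (matches serialize/deserialize below):
// int domainHash (4) + long urlFingerprint (8) + int stringsCRC (4) +
// long stringsOffset (8) + byte flags (1) + byte crawlStatus (1) +
// short httpResultCode (2) + byte redirectStatus (1) +
// short redirectHttpResult (2) + long updateTimestamp (8) = 39 bytes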
public static final int FLAG_HAS_CRAWL_STATUS = 1;
public static final int FLAG_HAS_ORIGINAL_RESULT_CODE = 2;
public static final int FLAG_HAS_REDIRECT_URL = 4;
public static final int FLAG_HAS_REDIRECT_STATUS = 8;
public static final int FLAG_HAS_REDIRECT_RESULT_CODE = 16;
public static final int FLAG_HAS_LASTMODIFIED_TIME = 32;
public int compareFingerprints(URLFP fp) {
int result = ((Integer)_domainHash).compareTo(fp.getDomainHash());
if (result == 0) {
result = ((Long)_urlFingerprint).compareTo(fp.getUrlHash());
}
return result;
}
@Override
public boolean equals(Object obj) {
if (obj instanceof OnDiskCrawlHistoryItem) {
OnDiskCrawlHistoryItem other = (OnDiskCrawlHistoryItem)obj;
if (_domainHash == other._domainHash &&
_urlFingerprint == other._urlFingerprint &&
_stringsCRC == other._stringsCRC &&
_flags == other._flags &&
_crawlStatus == other._crawlStatus &&
_httpResultCode == other._httpResultCode &&
_redirectStatus == other._redirectStatus &&
_redirectHttpResult == other._redirectHttpResult) {
return true;
}
}
return false;
}
public void setFlag(int flag) {
_flags |= flag;
}
public boolean isFlagSet(int flag) {
return ((_flags & flag) != 0);
}
public void serialize(DataOutput out) throws IOException {
out.writeInt(_domainHash);
out.writeLong(_urlFingerprint);
out.writeInt(_stringsCRC);
out.writeLong(_stringsOffset);
out.write(_flags);
out.writeByte(_crawlStatus);
out.writeShort(_httpResultCode);
out.writeByte(_redirectStatus);
out.writeShort(_redirectHttpResult);
out.writeLong(_updateTimestamp);
}
public void deserialize(DataInput in) throws IOException {
_domainHash = in.readInt();
_urlFingerprint = in.readLong();
_stringsCRC = in.readInt();
_stringsOffset = in.readLong();
_flags = in.readByte();
_crawlStatus = in.readByte();
_httpResultCode = in.readShort();
_redirectStatus = in.readByte();
_redirectHttpResult = in.readShort();
_updateTimestamp = in.readLong();
}
}
DataOutputBuffer _stringBuffer1 = new DataOutputBuffer();
DataOutputBuffer _stringBuffer2 = new DataOutputBuffer();
CRC16 _stringCRC = new CRC16();
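// the strings CRC gives updateItemState a cheap way to detect whether the
// variable length string data (original url plus optional redirect url)
// changed - when the CRC is unchanged the item keeps its existing strings
// offset instead of appending a fresh copy to the variable data file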
private OnDiskCrawlHistoryItem onDiskItemFromHistoryItem(URLFP fingerprint, ProxyCrawlHistoryItem item) throws IOException {
OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();
itemOut._domainHash = fingerprint.getDomainHash();
itemOut._urlFingerprint = fingerprint.getUrlHash();
itemOut._stringsCRC = calculateStringCRC(item,_stringBuffer1);
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_CRAWLSTATUS)) {
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);
itemOut._crawlStatus = (byte) item.getCrawlStatus();
}
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_HTTPRESULTCODE)) {
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE);
itemOut._httpResultCode = (short) item.getHttpResultCode();
}
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)) {
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_URL);
}
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTSTATUS)) {
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS);
itemOut._redirectStatus = (byte)item.getRedirectStatus();
}
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTHTTPRESULT)) {
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE);
itemOut._redirectHttpResult = (short)item.getRedirectHttpResult();
}
// update last modified time if present ....
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_LASTMODIFIEDTIME) &&
item.getLastModifiedTime() > 0) {
itemOut._updateTimestamp = Math.max(itemOut._updateTimestamp, item.getLastModifiedTime());
itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_LASTMODIFIED_TIME);
}
return itemOut;
}
private int calculateStringCRC(ProxyCrawlHistoryItem item,DataOutputBuffer stringBuffer)throws IOException {
stringBuffer.reset();
stringBuffer.writeUTF(item.getOriginalURL());
if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)) {
stringBuffer.writeUTF(item.getRedirectURL());
}
_stringCRC.reset();
_stringCRC.update(stringBuffer.getData(), 0, stringBuffer.getLength());
return (int)_stringCRC.getValue();
}
private void writeInitialOnDiskItem(URLFP fp,ProxyCrawlHistoryItem historyItem,DataOutputStream valueStreamOut,RandomAccessFile stringStream) throws IOException {
OnDiskCrawlHistoryItem itemOut = onDiskItemFromHistoryItem(fp, historyItem);
// update string offset ...
itemOut._stringsOffset = stringStream.length();
// write out string data length
WritableUtils.writeVInt(stringStream,_stringBuffer1.getLength());
// write strings to log file
stringStream.write(_stringBuffer1.getData(),0,_stringBuffer1.getLength());
// update timestamp ...
itemOut._updateTimestamp = -1;
// and write to disk
itemOut.serialize(valueStreamOut);
}
private void dumpFixedDataFile() {
try {
RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile,"rw");
try {
OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
int index =0;
while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
item.deserialize(fixedDataReader);
LOG.info("Item at Index:" + index++ + " Domain:" + item._domainHash + " URLFP:" + item._urlFingerprint);
}
}
finally {
fixedDataReader.close();
}
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
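/**
* locate the on-disk record for the given fingerprint.
*
* records in the fixed data file (or its in-memory copy during a load) are
* sorted by (domainHash, urlFingerprint) - the ordering established by the
* merge sort at list creation time - so lookup is a binary search over the
* fixed ON_DISK_SIZE byte slots: a list of a million urls resolves in at
* most ~20 probes.
*
* @return the matching item with _fileOffset populated, or null if no
* record matches
*/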
private OnDiskCrawlHistoryItem loadOnDiskItemForURLFP(URLFP fingerprint) throws IOException {
// see if state is cached in memory ...
boolean loadedFromMemory = false;
synchronized (this) {
if (_tempFixedDataBuffer != null) {
loadedFromMemory = true;
int low = 0;
int high = (int)(_tempFixedDataBufferSize / OnDiskCrawlHistoryItem.ON_DISK_SIZE) -1;
OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();
DataInputBuffer inputBuffer = new DataInputBuffer();
int iterationNumber = 0;
while (low <= high) {
++iterationNumber;
int mid = low + ((high - low) / 2);
inputBuffer.reset(_tempFixedDataBuffer,0,_tempFixedDataBufferSize);
inputBuffer.skip(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
// deserialize
itemOut.deserialize(inputBuffer);
// now compare it against desired hash value ...
int comparisonResult = itemOut.compareFingerprints(fingerprint);
if (comparisonResult > 0)
high = mid - 1;
else if (comparisonResult < 0)
low = mid + 1;
else {
// cache offset
itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;
// LOG.info("Found Match. Took:"+ iterationNumber + " iterations");
// and return item
return itemOut;
}
}
//LOG.error("Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations");
}
}
if (!loadedFromMemory) {
//load from disk
//LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + fingerprint.getUrlHash());
RandomAccessFile file = new RandomAccessFile(_fixedDataFile,"rw");
// allocate buffer upfront
byte[] onDiskItemBuffer = new byte[OnDiskCrawlHistoryItem.ON_DISK_SIZE];
DataInputBuffer inputStream = new DataInputBuffer();
//LOG.info("Opened Data File. Searching for match");
try {
int low = 0;
int high = (int)(file.length() / OnDiskCrawlHistoryItem.ON_DISK_SIZE) -1;
OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();
int iterationNumber = 0;
while (low <= high) {
++iterationNumber;
int mid = low + ((high - low) / 2);
// seek to proper location
file.seek(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
// read the data structure
file.readFully(onDiskItemBuffer, 0, onDiskItemBuffer.length);
// map location in file
//MappedByteBuffer memoryBuffer = file.getChannel().map(MapMode.READ_ONLY,mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE,OnDiskCrawlHistoryItem.ON_DISK_SIZE);
//DataInputStream inputStream = new DataInputStream(new ByteBufferInputStream(memoryBuffer));
inputStream.reset(onDiskItemBuffer,0,OnDiskCrawlHistoryItem.ON_DISK_SIZE);
// deserialize
itemOut.deserialize(inputStream);
// memoryBuffer = null;
//inputStream = null;
// now compare it against desired hash value ...
int comparisonResult = itemOut.compareFingerprints(fingerprint);
if (comparisonResult > 0)
high = mid - 1;
else if (comparisonResult < 0)
low = mid + 1;
else {
// cache offset
itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;
// LOG.info("Found Match. Took:"+ iterationNumber + " iterations");
// and return item
return itemOut;
}
}
//LOG.error("******Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations");
//DEBUG ONLY !
// dumpFixedDataFile();
}
finally {
file.close();
}
}
return null;
}
private ProxyCrawlHistoryItem getHistoryItemFromURLFP(URLFP fingerprint) throws IOException {
OnDiskCrawlHistoryItem item = loadOnDiskItemForURLFP(fingerprint);
if (item != null) {
return getHistoryItemFromOnDiskItem(item);
}
return null;
}
private ProxyCrawlHistoryItem getHistoryItemFromOnDiskItem(OnDiskCrawlHistoryItem item) throws IOException {
ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS) != 0)
itemOut.setCrawlStatus(item._crawlStatus);
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE) != 0)
itemOut.setHttpResultCode(item._httpResultCode);
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS) != 0)
itemOut.setRedirectStatus(item._redirectStatus);
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE) != 0)
itemOut.setRedirectHttpResult(item._redirectHttpResult);
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_LASTMODIFIED_TIME) != 0)
itemOut.setLastModifiedTime(item._updateTimestamp);
// now attempt to get the string offset
RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
try {
// seek to string data
stringDataReader.seek(item._stringsOffset);
// and skip buffer length
WritableUtils.readVInt(stringDataReader);
// now populate original url ...
itemOut.setOriginalURL(stringDataReader.readUTF());
// now if redirect url is present
if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_URL) != 0) {
itemOut.setRedirectURL(stringDataReader.readUTF());
}
}
finally {
stringDataReader.close();
}
return itemOut;
}
/**
* deserialize metadata from disk
*
* @throws IOException
*/
void loadMetadataFromDisk()throws IOException {
// skip metadata load if sub-domain metadata file is missing...
// in this case, metadata will be rebuilt during subdomain metadata rescan ...
if (_subDomainMetadataFile.exists()) {
RandomAccessFile file = new RandomAccessFile(_listMetadataFile,"rw");
try {
_metadata.deserialize(file, new BinaryProtocol());
int urlCount = _metadata.getUrlCount();
_metadata.clear();
_metadata.setUrlCount(urlCount);
}
finally {
file.close();
}
}
}
/**
* serialize metadata to disk
* @throws IOException
*/
void writeMetadataToDisk()throws IOException {
synchronized(_metadata) {
RandomAccessFile file = new RandomAccessFile(_listMetadataFile,"rw");
try {
file.seek(0);
_metadata.serialize(file, new BinaryProtocol());
}
finally {
file.close();
}
}
}
public static void generateTestURLFile(File outputFile,String... urlList)throws IOException {
PrintWriter writer = new PrintWriter(outputFile,"UTF-8");
for (String url : urlList) {
writer.println(url);
}
writer.flush();
writer.close();
}
private static void validateListCode(final File dataDirectory,long listId) throws IOException {
final String urlList[] = new String[] {
"http://www.yahoo.com/1",
"http://www.google.com/1",
"http://www.cnn.com/1",
"http://www.yahoo.com/2",
"http://www.google.com/2",
"http://www.cnn.com/2"
};
File tempFile = File.createTempFile("CrawlList", "validateListInit");
File localTempFile = new File(dataDirectory,tempFile.getName());
generateTestURLFile(localTempFile,urlList);
final TreeMap<String,URLFP> urlToFPMap = new TreeMap<String,URLFP>();
final TreeMap<URLFP,String> urlFPToString = new TreeMap<URLFP,String>();
for (String url : urlList) {
URLFP fp = URLUtils.getURLFPFromURL(url, true);
urlToFPMap.put(url, fp);
urlFPToString.put(fp, url);
}
final TreeMap<URLFP,ProxyCrawlHistoryItem> itemsToMarkComplete = new TreeMap<URLFP,ProxyCrawlHistoryItem>();
ProxyCrawlHistoryItem item1 = new ProxyCrawlHistoryItem();
item1.setCrawlStatus(CrawlURL.FailureReason.RobotsExcluded);
item1.setOriginalURL(urlList[1]);
ProxyCrawlHistoryItem item2 = new ProxyCrawlHistoryItem();
item2.setCrawlStatus(0);
item2.setOriginalURL(urlList[3]);
item2.setHttpResultCode(301);
item2.setRedirectURL("http://www.yahoo.com/3");
item2.setRedirectStatus(0);
item2.setRedirectHttpResult(200);
ProxyCrawlHistoryItem item3 = new ProxyCrawlHistoryItem();
item3.setCrawlStatus(0);
item3.setOriginalURL(urlList[4]);
item3.setHttpResultCode(301);
item3.setRedirectURL("http://www.google.com/3");
item3.setRedirectStatus(CrawlURL.FailureReason.IOException);
itemsToMarkComplete.put(urlToFPMap.get(item1.getOriginalURL()), item1);
itemsToMarkComplete.put(urlToFPMap.get(item2.getOriginalURL()), item2);
itemsToMarkComplete.put(urlToFPMap.get(item3.getOriginalURL()), item3);
final Set<URLFP> itemsToMarkCompleteFPSet= itemsToMarkComplete.keySet();
final Set<URLFP> itemsNotMarked = new TreeSet<URLFP>(urlToFPMap.values());
itemsNotMarked.removeAll(itemsToMarkCompleteFPSet);
CrawlHistoryStorage storage = new CrawlHistoryStorage() {
@Override
public void syncList(long listId,TreeSet<URLFP> matchCriteria, ItemUpdater targetList) throws IOException {
for (URLFP matchItem : matchCriteria) {
if (itemsToMarkCompleteFPSet.contains(matchItem)) {
targetList.updateItemState(matchItem, itemsToMarkComplete.get(matchItem));
}
}
}
@Override
public File getLocalDataDir() {
return dataDirectory;
}
};
CrawlList list1 = new CrawlList(storage,listId,localTempFile,0);
for (int pass=0;pass<2;++pass) {
CrawlList list = null;
if (pass == 0) {
System.out.println("Pass 0 - Initialize from URLList");
list = list1;
}
else {
System.out.println("Pass 1 - Initialize from OnDisk Data");
list = new CrawlList(storage, listId);
}
// iterate fingerprints
for (URLFP fingerprint : urlToFPMap.values()) {
ProxyCrawlHistoryItem itemRetrieved = list.getHistoryItemFromURLFP(fingerprint);
if (itemsToMarkCompleteFPSet.contains(fingerprint)) {
ProxyCrawlHistoryItem itemExpected = itemsToMarkComplete.get(fingerprint);
Assert.assertTrue(itemExpected.equals(itemRetrieved));
}
else {
Assert.assertTrue(itemRetrieved.getOriginalURL().equals(urlFPToString.get(fingerprint)) &&
!itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_CRAWLSTATUS) &&
!itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_HTTPRESULTCODE) &&
!itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTHTTPRESULT) &&
!itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTSTATUS) &&
!itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL));
}
}
}
// validate string code does not update when strings have not changed
item3.setRedirectStatus(0);
item3.setRedirectHttpResult(200);
long variableDataLength = list1._variableDataFile.length();
long fixedDataLength = list1._fixedDataFile.length();
list1.updateItemState(urlToFPMap.get(item3.getOriginalURL()), item3);
Assert.assertTrue(fixedDataLength == list1._fixedDataFile.length());
Assert.assertTrue(variableDataLength == list1._variableDataFile.length());
list1.queueUnCrawledItems(new CrawlQueueLoader() {
@Override
public void queueURL(URLFP urlfp, String url) {
Assert.assertTrue(itemsNotMarked.contains(urlfp));
Assert.assertTrue(urlFPToString.get(urlfp).equals(url));
}
@Override
public void flush() {
// no-op - nothing to flush in this test loader
}
});
}
public static void testmain(String[] args) {
// initialize ...
Configuration conf = new Configuration();
conf.addResource("nutch-default.xml");
conf.addResource("nutch-site.xml");
conf.addResource("core-site.xml");
conf.addResource("hdfs-site.xml");
conf.addResource("mapred-site.xml");
BasicConfigurator.configure();
conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
conf.set("mapred.map.output.compression.codec","org.apache.hadoop.io.compress.GzipCodec");
CrawlEnvironment.setHadoopConfig(conf);
CrawlEnvironment.setDefaultHadoopFSURI("file:///");
File testDirectory = new File("/tmp/CrawlListTests");
FileUtils.recursivelyDeleteFile(testDirectory);
testDirectory.mkdir();
try {
validateListCode(testDirectory,System.currentTimeMillis());
} catch (IOException e) {
e.printStackTrace();
}
}
private static final int OFFSET_TABLE_ENTRY_SIZE = 12;
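// each offset table entry is a long domainHash (8) plus an int file offset (4).
// the sub-domain metadata file itself (see writeInitialSubDomainMetadataToDisk)
// is a one byte version, a four byte record count, then one fixed-size
// (CrawlListMetadata.Constants.FixedDataSize) record per root domain; the
// in-memory _offsetLookupTable is sorted by hash, so getOffsetForSubDomainData
// can binary search it to find a domain's record offset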
private final int getOffsetForSubDomainData(long domainHash) throws IOException {
DataInputBuffer inputBuffer = new DataInputBuffer();
int low = 0;
int high = (int)(_offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE) -1;
while (low <= high) {
int mid = low + ((high - low) / 2);
inputBuffer.reset(_offsetLookupTable.getData(),_offsetLookupTable.getLength());
inputBuffer.skip(mid * OFFSET_TABLE_ENTRY_SIZE);
// deserialize
long hash = inputBuffer.readLong();
// now compare it against desired hash value ...
int comparisonResult = ((Long)hash).compareTo(domainHash);
if (comparisonResult > 0)
high = mid - 1;
else if (comparisonResult < 0)
low = mid + 1;
else {
return inputBuffer.readInt();
}
}
throw new IOException("NOT-FOUND!");
}
void updateSubDomainQueueStatus(String rootDomainName,int deltaQueuedCount)throws IOException {
long domainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName);
synchronized (_subDomainMetadataFile) {
CrawlListMetadata metadata = new CrawlListMetadata();
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
try {
int dataOffset = getOffsetForSubDomainData(domainHash);
if (dataOffset == 0) {
throw new IOException("Data Offset Zero for host:" + rootDomainName);
}
file.seek(dataOffset);
metadata.readFields(file);
// update the queued item count by the given delta
metadata.setQueuedItemCount(metadata.getQueuedItemCount() + deltaQueuedCount);
// ok reseek to data offset
file.seek(dataOffset);
// rewrite the data structure
metadata.write(file);
}
finally {
file.close();
}
}
}
public CrawlListMetadata getSubDomainMetadataByURL(String originalURL) throws IOException {
GoogleURL urlObject = new GoogleURL(originalURL);
return getSubDomainMetadataByDomain(urlObject.getHost());
}
public CrawlListMetadata getSubDomainMetadataByDomain(String hostName) throws IOException {
String rootDomainName = URLUtils.extractRootDomainName(hostName);
if (rootDomainName != null) {
return getSubDomainMetadataByRootDomain(rootDomainName);
}
throw new IOException("Unable to Extract RootDomainName for host:" + hostName);
}
public CrawlListMetadata getSubDomainMetadataByRootDomain(String rootDomainName) throws IOException {
long domainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName);
CrawlListMetadata metadata = new CrawlListMetadata();
synchronized (_subDomainMetadataFile) {
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
try {
int dataOffset = getOffsetForSubDomainData(domainHash);
if (dataOffset == 0) {
throw new IOException("Data Offset Zero for host:" + rootDomainName);
}
file.seek(dataOffset);
metadata.readFields(file);
// set the data offset on the way out so that updates write to the proper location
metadata.setSubDomainDataOffset(dataOffset);
}
finally {
file.close();
}
}
return metadata;
}
// get subdomain metadata
CrawlListMetadata getTransientSubDomainMetadata(String originalURL)throws IOException {
GoogleURL urlObject = new GoogleURL(originalURL);
String rootDomainName = URLUtils.extractRootDomainName(urlObject.getHost());
if (rootDomainName != null) {
long domainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName);
CrawlListMetadata metadata = _transientSubDomainStats.get(domainHash);
if (metadata == null) {
metadata = new CrawlListMetadata();
_transientSubDomainStats.put(domainHash, metadata);
metadata.setDomainName(rootDomainName);
metadata.setDomainHash(domainHash);
}
return metadata;
}
throw new IOException("Unable to Extract RootDomainName for url:" + originalURL);
}
/**
* serialize metadata to disk
* @throws IOException
*/
void writeSubDomainMetadataToDisk(CrawlListMetadata subDomainData)throws IOException {
DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);
subDomainData.serialize(outputBuffer, new BinaryProtocol());
if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) {
LOG.error("ListMetadata Serialize for List:" + subDomainData.getDomainName() + " > FixedDataSize!!!");
outputBuffer.reset();
subDomainData.setDomainName("<<CORRUPT>>");
subDomainData.serialize(outputBuffer, new BinaryProtocol());
}
synchronized (_subDomainMetadataFile) {
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
try {
if (subDomainData.getSubDomainDataOffset() == 0) {
throw new IOException("Data Offset Zero during write!");
}
file.seek(subDomainData.getSubDomainDataOffset());
file.write(outputBuffer.getData(),0,outputBuffer.getLength());
}
finally {
file.close();
}
}
}
void writeInitialSubDomainMetadataToDisk() throws IOException {
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
try {
file.writeByte(0); // version
file.writeInt(_transientSubDomainStats.size());
ArrayList<CrawlListMetadata> sortedMetadata = new ArrayList<CrawlListMetadata>();
sortedMetadata.addAll(_transientSubDomainStats.values());
_transientSubDomainStats = null;
CrawlListMetadata[] metadataArray = sortedMetadata.toArray(new CrawlListMetadata[0]);
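// sort by descending url count, breaking ties by ascending domain name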
Arrays.sort(metadataArray,new Comparator<CrawlListMetadata>() {
@Override
public int compare(CrawlListMetadata o1, CrawlListMetadata o2) {
int result = ((Integer)o2.getUrlCount()).compareTo(o1.getUrlCount());
if (result == 0) {
result = o1.getDomainName().compareTo(o2.getDomainName());
}
return result;
}
});
DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);
TreeMap<Long,Integer> idToOffsetMap = new TreeMap<Long,Integer>();
for (CrawlListMetadata entry : metadataArray) {
// reset output buffer
outputBuffer.reset();
// write item to disk
entry.serialize(outputBuffer, new BinaryProtocol() );
if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) {
LOG.fatal("Metadata Serialization for List:" + getListId() + " SubDomain:" + entry.getDomainName() + " exceeded FixedDataSize!");
// fall back to a sentinel record (mirroring writeSubDomainMetadataToDisk) so the
// fixed-size slot stays parseable instead of writing a truncated record
outputBuffer.reset();
entry.setDomainName("<<CORRUPT>>");
entry.serialize(outputBuffer, new BinaryProtocol());
}
// save offset
idToOffsetMap.put(entry.getDomainHash(), (int)file.getFilePointer());
// write out fixed data size
file.write(outputBuffer.getData(),0,CrawlListMetadata.Constants.FixedDataSize);
}
// build the in-memory (hash -> offset) lookup table
_offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);
for (Map.Entry<Long,Integer> entry : idToOffsetMap.entrySet()) {
_offsetLookupTable.writeLong(entry.getKey());
_offsetLookupTable.writeInt(entry.getValue());
}
}
finally {
file.close();
}
}
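/**
 * rewrite every subdomain metadata record in place, clearing all counters
 * except url count, first/last record offsets, domain name and domain hash -
 * used to reset queued state without rebuilding the file from scratch
 */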
void resetSubDomainCounts() throws IOException {
LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts.");
if (_subDomainMetadataFile.exists()) {
LOG.info("*** LIST:" + getListId() + " FILE EXISTS .");
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
DataInputBuffer inputBuffer = new DataInputBuffer();
DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);
try {
// skip version
file.read();
// read item count
int itemCount = file.readInt();
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:"+ itemCount);
CrawlListMetadata newMetadata = new CrawlListMetadata();
for (int i=0;i<itemCount;++i) {
long originalPos = file.getFilePointer();
file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
inputBuffer.reset(outputBuffer.getData(),CrawlListMetadata.Constants.FixedDataSize);
try {
newMetadata.deserialize(inputBuffer, new BinaryProtocol());
}
catch (Exception e) {
LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:" + CCStringUtils.stringifyException(e));
}
// ok reset everything except hashes and first/last url pointers
int urlCount = newMetadata.getUrlCount();
long firstRecordOffset = newMetadata.getFirstRecordOffset();
long lastRecordOffset = newMetadata.getLastRecordOffset();
String domainName = newMetadata.getDomainName();
long domainHash = newMetadata.getDomainHash();
// reset
newMetadata.clear();
// restore
newMetadata.setUrlCount(urlCount);
newMetadata.setFirstRecordOffset(firstRecordOffset);
newMetadata.setLastRecordOffset(lastRecordOffset);
newMetadata.setDomainName(domainName);
newMetadata.setDomainHash(domainHash);
// serialize it ...
outputBuffer.reset();
newMetadata.serialize(outputBuffer, new BinaryProtocol());
// write it back to disk
file.seek(originalPos);
// and rewrite it ...
file.write(outputBuffer.getData(),0,CrawlListMetadata.Constants.FixedDataSize);
}
}
finally {
file.close();
}
LOG.info("*** LIST:" + getListId() + " DONE RESETTIGN SUBDOMAIN METADATA QUEUE COUNTS");
}
}
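/**
 * load subdomain metadata. fast path: if the metadata file exists, scan its
 * fixed-size records once to rebuild the in-memory hash -> offset lookup
 * table. slow path: if the file is missing, replay the list's fixed and
 * variable data files to reconstruct per-subdomain stats, then persist them
 * via writeInitialSubDomainMetadataToDisk.
 */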
void loadSubDomainMetadataFromDisk()throws IOException {
LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ... ");
if (_subDomainMetadataFile.exists()) {
LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
DataInputBuffer inputBuffer = new DataInputBuffer();
byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];
try {
// skip version
file.read();
// read item count
int itemCount = file.readInt();
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:"+ itemCount);
CrawlListMetadata newMetadata = new CrawlListMetadata();
TreeMap<Long,Integer> idToOffsetMap = new TreeMap<Long,Integer>();
for (int i=0;i<itemCount;++i) {
long originalPos = file.getFilePointer();
file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
inputBuffer.reset(fixedDataBlock,fixedDataBlock.length);
try {
newMetadata.deserialize(inputBuffer, new BinaryProtocol());
}
catch (Exception e) {
LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:" + CCStringUtils.stringifyException(e));
}
idToOffsetMap.put(newMetadata.getDomainHash(), (int)originalPos);
}
// build the in-memory (hash -> offset) lookup table
_offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);
for (Map.Entry<Long,Integer> entry : idToOffsetMap.entrySet()) {
_offsetLookupTable.writeLong(entry.getKey());
_offsetLookupTable.writeInt(entry.getValue());
}
}
finally {
file.close();
}
LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
}
else {
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");
RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
try {
//ok rebuild top level metadata as well
_metadata.clear();
OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
int processedCount = 0;
while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
long position = fixedDataReader.getFilePointer();
// store offset in item
item._fileOffset = position;
// load from disk
item.deserialize(fixedDataReader);
try {
// seek to string data
stringDataReader.seek(item._stringsOffset);
// and skip buffer length
WritableUtils.readVInt(stringDataReader);
// and read primary string
String url = stringDataReader.readUTF();
// get metadata object for subdomain
CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);
// increment url count
subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);
// increment top level metadata count
_metadata.setUrlCount(_metadata.getUrlCount() + 1);
// update top level metadata ..
updateMetadata(item, _metadata, 0);
// update sub-domain metadata object from item data
updateMetadata(item, subDomainMetadata, 0);
++processedCount;
}
catch (IOException e) {
LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
LOG.error("Exception:" + CCStringUtils.stringifyException(e));
LOG.error("File Position:"+ fixedDataReader.getFilePointer() + " StringsPointer:" + stringDataReader.getFilePointer());
}
if (processedCount % 10000 == 0) {
LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
}
}
// ok commit top level metadata to disk as well
writeMetadataToDisk();
}
catch (IOException e) {
LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e));
LOG.error("File Position:"+ fixedDataReader.getFilePointer() + " StringsPointer:" + stringDataReader.getFilePointer());
_queueState = QueueState.QUEUED;
}
finally {
fixedDataReader.close();
stringDataReader.close();
}
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");
// write metadat to disk
writeInitialSubDomainMetadataToDisk();
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
}
}
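/** number of subdomain records, derived from the size of the offset lookup table */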
public int getSubDomainItemCount() {
synchronized (_metadata) {
return _offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE;
}
}
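/**
 * return a page of subdomain summaries - offset/count select a window into
 * the records in file order (the descending-url-count order in which they
 * were originally written). for example, getSubDomainList(0, 50) returns
 * the first page of up to 50 items.
 */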
public ArrayList<CrawlListDomainItem> getSubDomainList(int offset,int count) {
synchronized (_metadata) {
ArrayList<CrawlListDomainItem> itemsOut = new ArrayList<CrawlListDomainItem>();
try {
synchronized (_subDomainMetadataFile) {
RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile,"rw");
DataInputBuffer inputBuffer = new DataInputBuffer();
byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];
try {
// skip version
file.read();
// read item count
int itemCount = file.readInt();
int i = offset;
int end = Math.min(i+count,itemCount);
LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:"+ itemCount);
if (i<itemCount) {
file.seek(5 + (CrawlListMetadata.Constants.FixedDataSize * offset));
CrawlListMetadata newMetadata = new CrawlListMetadata();
for (;i<end;++i) {
file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
inputBuffer.reset(fixedDataBlock,fixedDataBlock.length);
newMetadata.deserialize(inputBuffer, new BinaryProtocol());
itemsOut.add(buildSubDomainSummary(newMetadata.getDomainName(),newMetadata));
}
}
}
finally {
file.close();
}
}
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
return itemsOut;
}
}
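/** collapse a CrawlListMetadata record into the summary item exposed to list consumers */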
private static CrawlListDomainItem buildSubDomainSummary(String domainName,CrawlListMetadata metadata) {
CrawlListDomainItem domainItem = new CrawlListDomainItem();
domainItem.setDomainName(domainName);
int robotsExcludedItemsCount = metadata.getRobotsExcludedCount();
int errorItemsCount = 0;
int otherHTTPResultsCount = 0;
errorItemsCount += metadata.getTimeoutErrorCount();
errorItemsCount += metadata.getIOExceptionCount();
errorItemsCount += metadata.getDNSErrorCount();
errorItemsCount += metadata.getOtherErrorCount();
otherHTTPResultsCount += metadata.getHttp403Count();
otherHTTPResultsCount += metadata.getHttp404Count();
otherHTTPResultsCount += metadata.getHttp500Count();
otherHTTPResultsCount += metadata.getHttpOtherCount();
domainItem.setUrlCount(metadata.getUrlCount());
domainItem.setUrlsCrawled(metadata.getHttp200Count()+otherHTTPResultsCount);
domainItem.setHttp200Count(metadata.getHttp200Count());
domainItem.setInCacheItemsCount(0);
domainItem.setRobotsExcludedCount(robotsExcludedItemsCount);
domainItem.setErrorCount(errorItemsCount);
domainItem.setFirstItemOffset(metadata.getFirstRecordOffset());
domainItem.setLastItemOffset(metadata.getLastRecordOffset());
domainItem.setHashCode((int)metadata.getDomainHash());
domainItem.setQueuedCount(metadata.getQueuedItemCount());
return domainItem;
}
/**********************************************************************/
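/**
 * command line entry point. usage:
 *
 *   CrawlList dump <dataDir> <listId> <outputFile>
 */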
public static void main(String[] args) throws IOException {
if (args.length >= 4 && args[0].equalsIgnoreCase("dump")) {
File dataDir = new File(args[1]);
long listId = Long.parseLong(args[2]);
File outputPath = new File(args[3]);
dumpUnCrawledItems(dataDir,listId,outputPath,true);
}
else {
System.out.println("Usage: CrawlList dump <dataDir> <listId> <outputFile>");
}
}
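/**
 * dump every item in the list that still needs crawling - uncrawled items,
 * failed items, and items whose final (or post-redirect) http result was
 * neither 200 nor 404 - as json. illustrative output shape (not a schema):
 *
 * <pre>
 * { "urls" : [
 *     { "url": "...", "redirected": false, "lastStatus": "HTTP-500", "updateTime": 1234567890 }
 * ] }
 * </pre>
 *
 * note: the includeRobotsExcludedItems flag is accepted but not currently
 * consulted by the implementation.
 */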
public static void dumpUnCrawledItems(File dataDir,long listId,File outputFilePath, boolean includeRobotsExcludedItems) throws IOException {
File fixedDataFile = new File(dataDir,LIST_VALUE_MAP_PREFIX + Long.toString(listId));
File variableDataFile = new File(dataDir,LIST_STRING_MAP_PREFIX + Long.toString(listId));
LOG.info("FixedDataFile is:" + fixedDataFile);
LOG.info("VariableDataFile is:" + variableDataFile);
RandomAccessFile fixedDataReader = new RandomAccessFile(fixedDataFile, "r");
RandomAccessFile stringDataReader = new RandomAccessFile(variableDataFile, "r");
JsonWriter writer = new JsonWriter(new BufferedWriter(new FileWriter(outputFilePath),1024*1024*10));
writer.setIndent(" ");
try {
writer.beginObject();
writer.name("urls");
writer.beginArray();
try {
OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
URLFP fingerprint = new URLFP();
while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
item.deserialize(fixedDataReader);
// seek to string data
stringDataReader.seek(item._stringsOffset);
// and skip buffer length
WritableUtils.readVInt(stringDataReader);
// and read primary string
String url = stringDataReader.readUTF();
// setup fingerprint
fingerprint.setDomainHash(item._domainHash);
fingerprint.setUrlHash(item._urlFingerprint);
// any item that has not been crawled needs to be queued
boolean queueItem = !item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);
// if item is not queued, check to see if we need to retry the item
if (!queueItem && item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
queueItem = (item._redirectStatus != 0);
if (!queueItem) {
if (item._redirectHttpResult != 200 && item._redirectHttpResult != 404) {
queueItem = true;
}
}
}
else {
queueItem = (item._crawlStatus != 0);
if (!queueItem) {
if (item._httpResultCode != 200 && item._httpResultCode != 404) {
queueItem = true;
}
}
}
}
if (queueItem) {
// ok if queue item is set ...
writer.beginObject();
writer.name("url");
writer.value(url);
writer.name("redirected");
writer.value(item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS));
writer.name("lastStatus");
if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
if (item._redirectStatus == 0) {
writer.value("HTTP-" + item._redirectHttpResult);
}
else {
writer.value(CrawlURL.FailureReason.toString(item._redirectHttpResult));
}
}
else {
if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
if (item._crawlStatus == 0) {
writer.value("HTTP-" + item._httpResultCode);
}
else {
writer.value(CrawlURL.FailureReason.toString(item._crawlStatus));
}
}
else {
writer.value("UNCRAWLED");
}
}
writer.name("updateTime");
writer.value(item._updateTimestamp);
writer.endObject();
}
}
}
catch (IOException e) {
LOG.error("Encountered Exception Queueing Items for List:" + listId + " Exception:" + CCStringUtils.stringifyException(e));
}
finally {
fixedDataReader.close();
stringDataReader.close();
}
writer.endArray();
writer.endObject();
}
catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
throw new IOException(e);
}
finally {
writer.flush();
writer.close();
}
}
}