/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.commoncrawl.service.crawler;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.zip.CRC32;

import junit.framework.Assert;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.commoncrawl.async.ConcurrentTask;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.async.Timer;
import org.commoncrawl.async.ConcurrentTask.CompletionCallback;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.CrawlSegmentDetail;
import org.commoncrawl.protocol.CrawlSegmentHost;
import org.commoncrawl.protocol.CrawlSegmentURL;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FPGenerator;
import org.commoncrawl.util.FileUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.MovingAverage;
import org.commoncrawl.util.RuntimeStatsCollector;
import org.commoncrawl.util.SmoothedAverage;
import org.mortbay.jetty.security.Credential.MD5;

import com.google.common.collect.Iterators;

public final class CrawlLog {

  public static final Log LOG = LogFactory.getLog(CrawlLog.class);

  public static final int DEFAULT_LOG_FLUSH_INTERVAL = 30000;
  public static final int DEFAULT_LOG_CHECKPOINT_INTERVAL = 60000 * 5;
  public static final int DEFAULT_LOG_FILE_CHECKPOINT_ITEM_COUNT_THRESHOLD = 100000;
  // use a long literal so the 4GB threshold does not overflow 32-bit int arithmetic
  public static final long DEFAULT_LOG_FILE_SIZE_CHECKPOINT_THRESHOLD = 1073741824L * 4;

  /** log file header **/
LogFileHeader _header = new LogFileHeader(); /** node name **/ String _nodeName; /** data directory **/ File _rootDirectory; /** event loop **/ EventLoop _eventLoop; /** thread pool **/ ExecutorService _threadPool; /** crawler engine pointer **/ CrawlerEngine _engine; /** robots segment logger **/ CrawlSegmentLog _robotsSegment = new CrawlSegmentLog(null, -1, -1, null); /** individual crawl segment loggers **/ Map<Long, CrawlSegmentLog> _loggers = new HashMap<Long, CrawlSegmentLog>(); /** checkpoint completion callback **/ CheckpointCompletionCallback _checkpointCompletionCallback = null; /** checkpoint id - analogous to parse segment id **/ long _checkpointId; /** flush in progress flag **/ boolean _flushInProgress = false; /** a shutdown operation is in progress **/ boolean _shutdownInProgress = false; /** log flusher timer **/ Timer _logFlusherTimer = null; /** last checkpoint time **/ long _lastCheckpointTime = -1; MovingAverage _flushTimeAVG = new MovingAverage(10); SmoothedAverage _flushTimeSmoothed = new SmoothedAverage(.8); long _lastFlushTime = 0; /** get active log file path **/ public static File getActivePath(File directoryRoot) { // and construct a path to the local crawl segment directory ... File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath()); // append the segment id to the path ... return new File(crawlDataDir, CrawlEnvironment.ActiveCrawlLog); } /** get active log file path **/ public static File getCheckpointPath(File directoryRoot) { // and construct a path to the local crawl segment directory ... File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath()); // append the segment id to the path ... return new File(crawlDataDir, CrawlEnvironment.CheckpointCrawlLog); } public static void ensureDataDirectory(File directoryRoot) { // and construct a path to the local crawl segment directory ... File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath()); if (!crawlDataDir.exists()) { crawlDataDir.mkdir(); } } /** purge local data directory **/ public static void purgeDataDirectory(File directoryRoot) { // get crawl output path ... File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath()); // delete entire directory and all contents underneath it FileUtils.recursivelyDeleteFile(crawlDataDir); // recreate directory crawlDataDir.mkdirs(); } /** unit test constructor **/ public CrawlLog() throws IOException { _rootDirectory = new File(CrawlEnvironment.DEFAULT_DATA_DIR, "crawlLog_unittest"); if (!_rootDirectory.exists()) _rootDirectory.mkdir(); _eventLoop = new EventLoop(); _nodeName = "test"; _eventLoop.start(); _threadPool = Executors.newFixedThreadPool(1); initialize(); } public CrawlLog(CrawlerEngine engine) throws IOException { _engine = engine; _rootDirectory = engine.getServer().getDataDirectory(); _nodeName = engine.getServer().getHostName(); _eventLoop = engine.getEventLoop(); _threadPool = engine.getServer().getDefaultThreadPool(); initialize(); } private void initialize() throws IOException { // create data directory if necessary ... ensureDataDirectory(_rootDirectory); File checkpointLogPath = getCheckpointPath(_rootDirectory); File activeLogPath = getActivePath(_rootDirectory); // check if it exists ... if (checkpointLogPath.exists()) { // log it ... LOG.warn("####Checkpoint Crawl Log Found - Possible Crash Recovery"); // rename it as the active log ... 
      checkpointLogPath.renameTo(activeLogPath);
    }

    LOG.info("Crawl Log Initializing Active Log");
    // either way call initialize active log ...
    _header = initializeActiveLog(_rootDirectory);
    LOG.info("Crawl Log Initialize returned " + _header._itemCount + " Entries in Active Log");
  }

  /** initialize log (file) **/
  private static LogFileHeader initializeActiveLog(File rootDirectory) throws IOException {
    File activeLogPath = getActivePath(rootDirectory);
    return initializeLogFileHeaderFromLogFile(activeLogPath);
  }

  private static LogFileHeader initializeLogFileHeaderFromLogFile(File logFilePath) throws IOException {
    LogFileHeader headerOut = null;
    if (!logFilePath.exists()) {
      DataOutputStream outputStream = new DataOutputStream(new FileOutputStream(logFilePath));
      try {
        headerOut = initializeEmptyLogFile(outputStream);
      } finally {
        outputStream.close();
      }
    } else {
      headerOut = new LogFileHeader();
      DataInputStream inputStream = new DataInputStream(new FileInputStream(logFilePath));
      try {
        headerOut.readHeader(inputStream);
      } finally {
        inputStream.close();
      }
    }
    return headerOut;
  }

  /** get the host name **/
  public String getNodeName() {
    return _nodeName;
  }

  /** make packed log id from list id and segment log id **/
  public static long makeSegmentLogId(int listId, int segmentId) {
    return (((long) listId) << 32) | (long) segmentId;
  }

  /** get segment log id from packed id **/
  public static int getSegmentIdFromLogId(long logId) {
    return (int) (logId & 0xFFFFFFFF);
  }

  /** get list id from packed id **/
  public static int getListIdFromLogId(long logId) {
    return (int) ((logId >> 32) & 0xFFFFFFFF);
  }

  /** add a segment log given segment id **/
  public void addSegmentLog(CrawlSegmentLog log) {
    // the logger map is keyed by the packed (listId, segmentId) value
    if (_loggers.get(makeSegmentLogId(log.getListId(), log.getSegmentId())) != null) {
      LOG.error("Attempt to Activate an Already Active Segment Log. Segment Id:" + log.getSegmentId());
      throw new RuntimeException("Attempt to Activate an Already Active Segment Log. Segment Id:" + log.getSegmentId());
    }
    _loggers.put(makeSegmentLogId(log.getListId(), log.getSegmentId()), log);
  }

  /** get the special robots crawl segment **/
  public CrawlSegmentLog getRobotsSegment() {
    return _robotsSegment;
  }

  /** get a segment log given segment id **/
  public CrawlSegmentLog getLogForSegment(int listId, int segmentId) {
    return _loggers.get(makeSegmentLogId(listId, segmentId));
  }

  /** remove segment log **/
  public CrawlSegmentLog removeSegmentLog(int listId, int segmentId) {
    return _loggers.remove(makeSegmentLogId(listId, segmentId));
  }

  private static class LogFileHeader {

    public static final int LogFileHeaderBytes = 0xCC00CC00;
    public static final int LogFileVersion = 1;

    public LogFileHeader() {
      _fileSize = 0;
      _itemCount = 0;
    }

    public long _fileSize;
    public long _itemCount;

    public void writeHeader(DataOutput stream) throws IOException {
      stream.writeInt(LogFileHeaderBytes);
      stream.writeInt(LogFileVersion);
      stream.writeLong(_fileSize);
      stream.writeLong(_itemCount);
    }

    public void readHeader(DataInput stream) throws IOException {
      int headerBytes = stream.readInt();
      int version = stream.readInt();
      // reject the file if either the magic bytes or the version mismatch
      if (headerBytes != LogFileHeaderBytes || version != LogFileVersion) {
        throw new IOException("Invalid CrawlLog File Header Detected!");
      }
      _fileSize = stream.readLong();
      _itemCount = stream.readLong();
    }
  }

  private static void updateLogFileHeader(File logFileName, LogFileHeader header, long addedRecordCount) throws IOException {
    RandomAccessFile file = new RandomAccessFile(logFileName, "rw");
    try {
      // update cached header ...
      header._fileSize = file.getChannel().size();
      header._itemCount += addedRecordCount;
      // set the position at zero ..
      file.seek(0);
      // and write header to disk ...
      header.writeHeader(file);
    } finally {
      // major bottle neck..
      // file.getFD().sync();
      file.close();
    }
  }

  private static LogFileHeader initializeEmptyLogFile(DataOutput stream) throws IOException {
    LogFileHeader header = new LogFileHeader();
    header.writeHeader(stream);
    return header;
  }

  public static LogFileHeader readLogFileHeader(File logFileName) throws IOException {
    LogFileHeader headerOut = new LogFileHeader();
    RandomAccessFile file = new RandomAccessFile(logFileName, "r");
    try {
      headerOut = readLogFileHeader(file);
    } finally {
      file.close();
    }
    return headerOut;
  }

  private static LogFileHeader readLogFileHeader(DataInput reader) throws IOException {
    LogFileHeader headerOut = new LogFileHeader();
    headerOut.readHeader(reader);
    return headerOut;
  }

  private boolean isCheckpointInProgress() {
    return _checkpointCompletionCallback != null;
  }

  private boolean isFlushInProgress() {
    return _flushInProgress;
  }

  private void setFlushInProgress(boolean value) {
    _flushInProgress = value;
    if (value == false) {
      // since we are in the async thread at this point, check to see if a
      // checkpoint is in progress
      if (isCheckpointInProgress()) {
        // if so, it was deferred, because of the flush in progress... so we
        // need to actually kick off the checkpoint progress
        // now that the flush is complete
        doCheckpoint();
      }
    }
  }

  public static interface CheckpointCompletionCallback {
    public void checkpointComplete(long checkpointId, Vector<Long> completedSegmentList);
    public void checkpointFailed(long checkpointId, Exception e);
  }

  public static interface FlushCompletionCallback {
    public void flushComplete();
    public void flushFailed(Exception e);
  }

  /** essentially swap crawl logs **/
  private void checkpointLocalCrawlLog() throws IOException {
    File activeCrawlLog = getActivePath(_rootDirectory);
    File checkpointCrawlLog = getCheckpointPath(_rootDirectory);
    LOG.info("MOVING ACTIVE:" + activeCrawlLog + " TO:" + checkpointCrawlLog);
    // delete any existing checkpoint log ...
    checkpointCrawlLog.delete();
    // rename active log to check point log
    activeCrawlLog.renameTo(checkpointCrawlLog);
    // and create a new active crawlLog ...
    _header = initializeActiveLog(_rootDirectory);
  }

  public void checkpoint(long checkpointStartTime, CheckpointCompletionCallback callback, long checkpointId) {
    // first check to see if checkpoint is already in progress ...
    if (isCheckpointInProgress()) {
      // immediately fail call and bail, leaving the in-flight checkpoint state untouched
      callback.checkpointFailed(checkpointId, new Exception("Invalid State. Checkpoint already in progress!"));
      return;
    }

    _lastCheckpointTime = checkpointStartTime;

    // otherwise transition to a checkpoint in progress state
    _checkpointCompletionCallback = callback;
    _checkpointId = checkpointId;

    // now check to see if we are not in the middle of a flush ...
    if (!isFlushInProgress()) {
      // if not we can directly start the actual checkpoint process ...
      doCheckpoint();
    }
    // otherwise wait for the flush to finish (and thus trigger the checkpoint
    // process)
  }

  public void finalizeCheckpoint() {
    File checkpointLogFile = getCheckpointPath(_rootDirectory);
    checkpointLogFile.delete();
  }

  public void abortCheckpoint() {
    File activeLogFile = getActivePath(_rootDirectory);
    File checkpointLogFile = getCheckpointPath(_rootDirectory);
    LOG.info("###ABORTING CHECKPOINT! RENAMING:" + checkpointLogFile + " TO:" + activeLogFile);
    checkpointLogFile.renameTo(activeLogFile);
  }

  public void purgeActiveLog() throws IOException {
    File activeLogFilePath = getActivePath(_rootDirectory);

    if (activeLogFilePath.exists())
      activeLogFilePath.delete();

    _header = initializeActiveLog(_rootDirectory);
  }

  private static class CorruptCrawlLogException extends IOException {
    public CorruptCrawlLogException(String description) {
      super(description);
    }
  }

  /**
   * seek out next instance of sync bytes in the file input stream
   *
   * @param file
   * @throws IOException
   */
  private static boolean seekToNextSyncBytesPos(byte[] syncBytesBuffer, RandomAccessFile file, long maxFileSize) throws IOException {
    while (file.getFilePointer() < maxFileSize) {
      try {
        // read in a sync.length buffer amount
        file.read(syncBytesBuffer);

        int syncLen = SYNC_BYTES_SIZE;

        // start scan for next sync position ...
        for (int i = 0; file.getFilePointer() < maxFileSize; i++) {
          int j = 0;
          for (; j < syncLen; j++) {
            if (_sync[j] != syncBytesBuffer[(i + j) % syncLen])
              break;
          }
          if (j == syncLen) {
            // found matching sync bytes - reset file pos to before sync bytes
            file.seek(file.getFilePointer() - SYNC_BYTES_SIZE); // position before sync
            return true;
          }
          syncBytesBuffer[i % syncLen] = file.readByte();
        }
      } catch (IOException e) {
        LOG.warn("IOException at:" + file.getFilePointer() + " Exception:" + CCStringUtils.stringifyException(e));
        LOG.warn("Skipping to:" + (file.getFilePointer() + 4096));
        file.seek(file.getFilePointer() + 4096);
      }
    }
    return false;
  }

  private static interface HDFSCrawlURLWriter {
    public void writeCrawlURLItem(Text url, CrawlURL urlObject) throws IOException;
    public void close() throws IOException;
    public List<Path> getFilenames();
  }

  private static class SequenceFileCrawlURLWriter implements HDFSCrawlURLWriter {

    private static final long FLUSH_THRESHOLD = 1073741824L;

    FileSystem _fs;
    Configuration _conf;
    Path _stagingDirectory;
    String _nodeName;
    long _currentFileRecordCount = 0;
    ArrayList<Path> _outputPaths = new ArrayList<Path>();
    long _nextFileId = -1L;

    Path currentFilePath = null;
    SequenceFile.Writer writer = null;
    long _prevPos;

    public SequenceFileCrawlURLWriter(Configuration conf, FileSystem fs, Path path, String nodeName, long checkpointId) throws IOException {
      _conf = conf;
      _fs = fs;
      _stagingDirectory = path;
      _nodeName = nodeName;
      _nextFileId = checkpointId;
      flushFile(true);
    }

    private void flushFile(boolean openNew) throws IOException {
      if (writer != null) {
        writer.close();
        if (_currentFileRecordCount != 0) {
          LOG.info("Flushed Temp Checkpoint File:" + currentFilePath);
          _outputPaths.add(currentFilePath);
        } else {
          LOG.info("Deleting Empty Checkpoint File:" + currentFilePath);
          _fs.delete(currentFilePath, false);
        }
        writer = null;
        _currentFileRecordCount = 0;
        currentFilePath = null;
      }

      if (openNew) {
        // allocate a new filename
        currentFilePath = new Path(_stagingDirectory, CrawlEnvironment.buildCrawlLogCheckpointName(_nodeName, _nextFileId++));
        LOG.info("Allocating new Checkpoint File:" + currentFilePath);
        // delete it
        if (_fs.exists(currentFilePath)) {
          LOG.warn("Existing Checkpoint TempFile found at:" + currentFilePath + " - Deleting");
          _fs.delete(currentFilePath, false);
        }
        // open a sequence file writer at the temp file location ...
        writer = SequenceFile.createWriter(_fs, _conf, currentFilePath, Text.class, CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
        // reset record count ...
_currentFileRecordCount = 0; } } @Override public void writeCrawlURLItem(Text url, CrawlURL urlObject) throws IOException { writer.append(url, urlObject); ++_currentFileRecordCount; long pos = writer.getLength(); if (pos != _prevPos) { _prevPos = pos; if (pos >= FLUSH_THRESHOLD) { flushFile(true); } } } public void close() throws IOException { flushFile(false); } public List<Path> getFilenames() { return _outputPaths; } }; private static class URLWriterException extends IOException { public URLWriterException() { } } private static void transferLocalCheckpointLog(File crawlLogPath, HDFSCrawlURLWriter writer, long checkpointId) throws IOException { // and open the crawl log file ... RandomAccessFile inputStream = null; IOException exception = null; CRC32 crc = new CRC32(); CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 17); byte[] syncBytesBuffer = new byte[SYNC_BYTES_SIZE]; // save position for potential debug output. long lastReadPosition = 0; try { inputStream = new RandomAccessFile(crawlLogPath, "rw"); // and a data input stream ... RandomAccessFile reader = inputStream; // seek to zero reader.seek(0L); // read the header ... LogFileHeader header = readLogFileHeader(reader); // read a crawl url from the stream... while (inputStream.getFilePointer() < header._fileSize) { if (seekToNextSyncBytesPos(syncBytesBuffer, reader, header._fileSize)) { try { lastReadPosition = inputStream.getFilePointer(); // skip sync inputStream.skipBytes(SYNC_BYTES_SIZE); // read length ... int urlDataLen = reader.readInt(); long urlDataCRC = reader.readLong(); if (urlDataLen > buffer.getBuffer().length) { buffer = new CustomByteArrayOutputStream(((urlDataLen / 65536) + 1) * 65536); } reader.read(buffer.getBuffer(), 0, urlDataLen); crc.reset(); crc.update(buffer.getBuffer(), 0, urlDataLen); long computedValue = crc.getValue(); // validate crc values ... if (computedValue != urlDataCRC) { LOG.error("CRC Mismatch Detected during HDFS transfer in CrawlLog:" + crawlLogPath.getAbsolutePath() + " Checkpoint Id:" + checkpointId + " FilePosition:" + lastReadPosition); inputStream.seek(lastReadPosition + 1); } else { // allocate a crawl url data structure CrawlURL url = new CrawlURL(); DataInputStream bufferReader = new DataInputStream(new ByteArrayInputStream(buffer.getBuffer(), 0, urlDataLen)); // populate it from the (in memory) data stream url.readFields(bufferReader); try { // and write out appropriate sequence file entries ... writer.writeCrawlURLItem(new Text(url.getUrl()), url); } catch (IOException e) { LOG.error("Failed to write CrawlURL to SequenceFileWriter with Exception:" + CCStringUtils.stringifyException(e)); throw new URLWriterException(); } } } catch (URLWriterException e) { LOG.error("Caught URLRewriter Exception! 
- Throwing to outer layer!"); throw e; } catch (Exception e) { LOG.error("Ignoring Error Processing CrawlLog Entry at Position:" + lastReadPosition + " Exception:" + CCStringUtils.stringifyException(e)); } } else { break; } } } catch (EOFException e) { LOG.error("Caught EOF Exception during read of local CrawlLog:" + crawlLogPath.getAbsolutePath() + " Checkpoint Id:" + checkpointId + " FilePosition:" + lastReadPosition); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); exception = e; throw e; } finally { if (inputStream != null) inputStream.close(); } } private Path transferLocalSegmentLog(FileSystem hdfs, File localSegmentLogFile, long checkpointId, int listId, int segmentId) throws IOException { if (localSegmentLogFile.exists()) { // determine the file's size ... // if > header size (in other words it has data ... ) if (localSegmentLogFile.length() > CrawlSegmentLog.getHeaderSize()) { // construct a target path (where we are going to store the checkpointed // crawl log ) Path remoteLogFileName = CrawlEnvironment.getRemoteCrawlSegmentLogCheckpointPath(new Path(CrawlEnvironment.getCrawlSegmentLogsDirectory()),getNodeName(), checkpointId, listId, segmentId); // replace if existing ... hdfs.delete(remoteLogFileName,false); Path localPath = new Path(localSegmentLogFile.getAbsolutePath()); hdfs.mkdirs(remoteLogFileName.getParent()); LOG.info("Copying CrawlSegmentLog for List:" + listId + " Segment:" + segmentId + " from:" + localPath + " to fs:" + hdfs + " path:" + remoteLogFileName); hdfs.copyFromLocalFile(localPath, remoteLogFileName); return remoteLogFileName; } } return null; } private void purgeHDFSSegmentLogs(FileSystem hdfs, int listId, int segmentId) throws IOException { Path listLogDirectory = new Path(CrawlEnvironment.getCrawlSegmentLogsDirectory(), ((Integer) listId).toString()); Path segmentLogDirectory = new Path(listLogDirectory, ((Integer) segmentId).toString()); Path completionLogFilePath = new Path(segmentLogDirectory, CrawlEnvironment .buildCrawlSegmentCompletionLogFileName(getNodeName())); if (!hdfs.exists(completionLogFilePath)) { // create a zero length completion log file on hdfs ... hdfs.createNewFile(completionLogFilePath); } // skip this step as history servers now manage segment logs /* * // and now ... delete all logs Path segmentLogWildcardPath = new * Path(segmentLogDirectory * ,CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString * (getNodeName())); FileStatus paths[] = * hdfs.globStatus(segmentLogWildcardPath); if (paths != null) { for * (FileStatus path : paths) { // hdfs.delete(path.getPath()); } } */ } /** perform the actual checkpoint work here ... **/ private void doCheckpoint() { // at this point, we should be in the async thread, and all flusher // activities are blocked ... LOG.info("CrawlLog Checkpoint - Starting "); // collect all necessary information from thread-unsafe data structure now // (in async thread context) final Set<Long> activeSegments = new HashSet<Long>(); try { // add all active segment ids to our key set ... activeSegments.addAll(_loggers.keySet()); LOG.info("CrawlLog Checkpoint - Preparing CrawlLog Files"); // checkpoint crawl log ... checkpointLocalCrawlLog(); LOG.info("CrawlLog Checkpoint - Preparing Segment Log Files"); // next checkpoint all active segment logs ... 
for (CrawlSegmentLog segmentLog : _loggers.values()) { segmentLog.checkpointLocalLog(); } LOG.info("CrawlLog Checkpoint - Ready for HDFS Transfer"); } catch (IOException e) { LOG.error("Checkpoint failed with Exception:" + CCStringUtils.stringifyException(e)); } // spawn a thread to do most of the blocking io ... _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop, new Callable<Boolean>() { public Boolean call() throws Exception { // we need to track these in case of failure ... Vector<Path> segmentLogFinalPaths = new Vector<Path>(); // get the log file system final FileSystem crawlLogsFS = CrawlEnvironment.getDefaultFileSystem(); try { LOG.info("CrawlLog Checkpoint - Transferring CrawlLog to HDFS"); // construct a target path (where we are going to store the // checkpointed crawl log ) //Path stagingDirectory = new Path(CrawlEnvironment.getCheckpointStagingDirectory()); Path checkpointDirectory = CrawlerServer.getServer().getCrawlContentPath(); LOG.info("***Checkpoint Dir is:" + checkpointDirectory); if (checkpointDirectory == null) { throw new IOException("Checkpoint Failed. Null Checkpoint Directory!"); } FileSystem crawlDataFS = FileSystem.get(checkpointDirectory.toUri(),CrawlEnvironment.getHadoopConfig()); LOG.info("***Checkpoint Content FS is:" + crawlDataFS); SequenceFileCrawlURLWriter hdfsWriter = new SequenceFileCrawlURLWriter(CrawlEnvironment.getHadoopConfig(), crawlDataFS, checkpointDirectory, getNodeName(), _checkpointId); try { // write out crawl log to hdfs ... transferLocalCheckpointLog(getCheckpointPath(_rootDirectory), hdfsWriter, _checkpointId); } catch (Exception e) { LOG.error("HDFS Write of CrawlLog failed. Deleting tempFiles:" + hdfsWriter.getFilenames() + " Exception:" + CCStringUtils.stringifyException(e)); // close writer hdfsWriter.close(); // delete any hdfs output ... for (Path path : hdfsWriter.getFilenames()) { LOG.info("Deleting temp (HDFS) checkpoint file:" + path); crawlDataFS.delete(path, false); } throw e; } finally { hdfsWriter.close(); } LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Logs"); // and next for every segment for (long packedLogId : activeSegments) { File segmentLogPath = CrawlSegmentLog.buildCheckpointPath(_rootDirectory, getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId)); LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Log for List:" + getListIdFromLogId(packedLogId) + " Segment:"+ getSegmentIdFromLogId(packedLogId)); // copy the segment log ... Path remoteLogFilePath = transferLocalSegmentLog(crawlLogsFS, segmentLogPath, _checkpointId,getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId)); // if path is not null (data was copied) ... if (remoteLogFilePath != null) { // add it to vector ... segmentLogFinalPaths.add(remoteLogFilePath); } } LOG.info("CrawlLog Checkpoint - Finished Transferring CrawlSegment Logs"); // now if we got here ... all hdfs transfers succeeded ... // go ahead and move checkpoint log from staging to final data // directory ... /* Path checkpointDirectory = new Path(CrawlEnvironment.getCheckpointDataDirectory()); // if no checkpoint data directory ... create one ... 
if (!hdfs.exists(checkpointDirectory)) hdfs.mkdirs(checkpointDirectory); for (Path checkpointTempFilePath : hdfsWriter.getFilenames()) { Path checkpointFinalPath = new Path(checkpointDirectory, checkpointTempFilePath.getName()); LOG.info("Promoting Checking File From:" + checkpointTempFilePath + " to:" + checkpointFinalPath); // and essentially move the crawl log file from staging to data // directory .. boolean success = hdfs.rename(checkpointTempFilePath, checkpointFinalPath); if (!success) { throw new IOException("Failed to Rename Checkpoint Temp:" + checkpointTempFilePath + " to:" + checkpointFinalPath); } } */ // if we got here checkpoint was successfull... return true; } catch (Exception e) { LOG.error("Checkpoint:" + _checkpointId + " FAILED with exception:" + CCStringUtils.stringifyException(e)); for (Path segmentPath : segmentLogFinalPaths) { crawlLogsFS.delete(segmentPath,false); } throw e; } } }, new CompletionCallback<Boolean>() { public void taskComplete(Boolean updateResult) { Vector<Long> completedSegmentList = new Vector<Long>(); LOG.info("CrawlLog Checkpoint - Finalizing CrawlLog Checkpoint"); // delete the local checkpoint log ... finalizeCheckpoint(); LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLogs"); for (CrawlSegmentLog segmentLog : _loggers.values()) { // LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLog for Segment:" // + segmentLog.getSegmentId()); // finalize the checkpoint on the segment log ... segmentLog.finalizeCheckpoint(); // and check to see if the segment has been completed ... if (segmentLog.isSegmentComplete()) { // if so, add it our completed segments list ... completedSegmentList.add(makeSegmentLogId(segmentLog.getListId(), segmentLog.getSegmentId())); } } // now for all completed segments ... purge hdfs logs ... for (long packedSegmentId : completedSegmentList) { try { LOG.info("CrawlLog Checkpoint - Purging HDFS CrawlSegmentLogs from Completed Segment. List:" + getListIdFromLogId(packedSegmentId) + " Segment:" + getSegmentIdFromLogId(packedSegmentId)); // purge hdfs files (and create a completion log file) //purgeHDFSSegmentLogs(CrawlEnvironment.getDefaultFileSystem(), getListIdFromLogId(packedSegmentId),getSegmentIdFromLogId(packedSegmentId)); LOG.info("CrawlLog Checkpoint - Purging Local CrawlSegmentLogs from Completed Segment. List:" + getListIdFromLogId(packedSegmentId) + " Segment:" + getSegmentIdFromLogId(packedSegmentId)); // and purge local files as well ... _loggers.get(packedSegmentId).purgeLocalFiles(); } catch (IOException e) { LOG.error("Purge SegmentLog for Segment List:" + getListIdFromLogId(packedSegmentId) + " Segment:" + getSegmentIdFromLogId(packedSegmentId) + " threw IOException:" + CCStringUtils.stringifyException(e)); } LOG.info("CrawlLog Checkpoint - DeRegistering Segment List:" + getListIdFromLogId(packedSegmentId) + " Segment:" + getSegmentIdFromLogId(packedSegmentId) + " From CrawlLog"); // no matter what ... unload the segment ... _loggers.remove(packedSegmentId); } CheckpointCompletionCallback callback = _checkpointCompletionCallback; long checkpointId = _checkpointId; // otherwise transition to a checkpoint in progress state _checkpointCompletionCallback = null; _checkpointId = -1; LOG.info("CrawlLog Checkpoint - Checkpoint Complete - Initiating Callback"); // and complete transaction ... callback.checkpointComplete(checkpointId, completedSegmentList); } public void taskFailed(Exception e) { // all failures are critical in this particular task ... 
LOG.error("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e)); // revert checkpoint logs ... abortCheckpoint(); for (CrawlSegmentLog segmentLog : _loggers.values()) { segmentLog.abortCheckpoint(); } CheckpointCompletionCallback callback = _checkpointCompletionCallback; long checkpointId = _checkpointId; // otherwise transition to a checkpoint in progress state _checkpointCompletionCallback = null; _checkpointId = -1; // now check to see if this was corrupt crawl log exception if (e.getCause() instanceof CorruptCrawlLogException) { // ACK!!! LOG.fatal("Corrupt CrawlLog detected with Exception:" + CCStringUtils.stringifyException(e)); try { // this is a serious error ... time to purge the crawl log directory // altogether ... purgeActiveLog(); // and all active segment logs as well... for (CrawlSegmentLog segmentLog : _loggers.values()) { segmentLog.purgeActiveLog(); } } catch (IOException e2) { LOG.error("IOException during Segment Log PURGE:" + CCStringUtils.stringifyException(e2)); } // time to die hard ... throw new RuntimeException(e); } // and complete transaction ... callback.checkpointFailed(checkpointId, e); } })); } private static final class CustomByteArrayOutputStream extends ByteArrayOutputStream { public CustomByteArrayOutputStream(int initialSize) { super(initialSize); } public byte[] getBuffer() { return buf; } } private void logCrawlLogWrite(CrawlURL url, int bufferSizeOut) { StringBuffer sb = new StringBuffer(); sb.append(String.format("%1$20.20s ", CCStringUtils.dateStringFromTimeValue(System.currentTimeMillis()))); sb.append(String.format("%1$4.4s ", url.getResultCode())); sb.append(String.format("%1$10.10s ", url.getContentRaw().getCount())); if ((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) { sb.append(url.getRedirectURL()); sb.append(" "); } sb.append(url.getUrl()); _engine.getCrawlLogLog().info(sb.toString()); } static byte[] _sync; // 16 random bytes static final int SYNC_BYTES_SIZE = 16; static { try { MessageDigest digester = MessageDigest.getInstance("MD5"); digester.update("SOME RANDOM BYTES".getBytes()); _sync = digester.digest(); } catch (Exception e) { throw new RuntimeException(e); } } private static class SyncedCrawlURLLogWriter { boolean _injectErrors = false; boolean _corruptThisEntry = false; public SyncedCrawlURLLogWriter(boolean injectErrors) { _injectErrors = injectErrors; } public SyncedCrawlURLLogWriter() { } private CustomByteArrayOutputStream bufferOutputStream = new CustomByteArrayOutputStream(1 << 17); private DataOutputStream dataOutputStream = new DataOutputStream(bufferOutputStream); private CRC32 crc = new CRC32(); public void writeItem(DataOutputStream crawlLogStream, CrawlURL url) throws IOException { bufferOutputStream.reset(); // write to intermediate stream ... url.write(dataOutputStream); // and crc the data ... 
crc.reset(); crc.update(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size()); // write out sync bytes first crawlLogStream.write(_sync); // write out length crawlLogStream.writeInt(bufferOutputStream.size()); // crc next long computedValue = crc.getValue(); if (_injectErrors) { _corruptThisEntry = !_corruptThisEntry; if (_corruptThisEntry) { LOG.info("Intentionally Corrupting URL:" + url.getUrl()); computedValue += 12; } } crawlLogStream.writeLong(computedValue); // and then the data crawlLogStream.write(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size()); } } private void flushLog(final FlushCompletionCallback completionCallback) { if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH:Collecting Entries...."); // set flush in progress indicator ... setFlushInProgress(true); // and collect buffers in async thread context (thus not requiring // synchronization) final LinkedList<CrawlSegmentLog.LogItemBuffer> collector = new LinkedList<CrawlSegmentLog.LogItemBuffer>(); // flush robots log _robotsSegment.flushLog(collector); // walk segments collecting log items .... for (CrawlSegmentLog logger : _loggers.values()) { // flush any log items into the collector logger.flushLog(collector); } if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH:Collection Returned " + collector.size() + " Buffers"); // walk collector list identifying the list of unique segment ids final Set<Long> packedSegmentIdSet = new HashSet<Long>(); int urlItemCount = 0; for (CrawlSegmentLog.LogItemBuffer buffer : collector) { if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) { packedSegmentIdSet.add(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId())); } urlItemCount += buffer.getItemCount(); } if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH:There are " + urlItemCount + " Items in Flush Buffer Associated With " + packedSegmentIdSet.size() + " Segments"); final File crawlLogFile = getActivePath(_rootDirectory); // now check to see if there is anything to do ... if (collector.size() != 0) { if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH: Collector Size is NOT Zero... Starting Log Flusher Thread"); // ok ... time to spawn a thread to do the blocking flush io _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop, new Callable<Boolean>() { public Boolean call() throws Exception { if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH: Log Flusher Thread Started"); long startTime = System.currentTimeMillis(); Map<Long, DataOutputStream> streamsMapByPackedId = new HashMap<Long, DataOutputStream>(); Map<Long, Integer> recordCountsByPackedId = new HashMap<Long, Integer>(); long crawlLogRecordCount = 0; // open the actual crawler log file ... final DataOutputStream crawlLogStream = new DataOutputStream(new FileOutputStream(crawlLogFile, true)); try { if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH: Log Flusher Thread Opening Streams for Segments in Buffer"); // now open a set of file descriptors related to the identified // segments for (long packedSegmentId : packedSegmentIdSet) { // construct the unique filename for the given log file... File activeSegmentLog = CrawlSegmentLog.buildActivePath(_rootDirectory, getListIdFromLogId(packedSegmentId), getSegmentIdFromLogId(packedSegmentId)); // initialize the segment log ... CrawlSegmentLog.initializeLogFile(activeSegmentLog); // initialize record counts per stream ... 
              recordCountsByPackedId.put(packedSegmentId, CrawlSegmentLog.readerHeader(activeSegmentLog));
              // and open an output stream for the specified log file ...
              streamsMapByPackedId.put(packedSegmentId, new DataOutputStream(new FileOutputStream(activeSegmentLog, true)));
            }

            if (Environment.detailLogEnabled())
              LOG.info("LOG_FLUSH: Log Flusher Thread Walking Items in Buffer");

            // initialize a total item count variable
            int totalItemCount = 0;

            // crawl history stream
            DataOutputBuffer historyStream = new DataOutputBuffer();

            // and now walk log buffers ...
            for (CrawlSegmentLog.LogItemBuffer buffer : collector) {
              if (Environment.detailLogEnabled())
                LOG.info("LOG_FLUSH: Log Flusher Thread Writing " + buffer.getItemCount() + " Entries for Segment:" + buffer.getSegmentId());

              // output stream
              DataOutputStream segmentLogStream = null;

              if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) {
                // update segment count first ...
                recordCountsByPackedId.put(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()),
                    recordCountsByPackedId.get(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId())) + buffer.getItemCount());
                // get output stream associated with segment id
                segmentLogStream = streamsMapByPackedId.get(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()));
              }

              // and our local record counter ...
              crawlLogRecordCount += buffer.getItemCount();

              // and next do the actual disk flush ...
              totalItemCount += buffer.flushToDisk(totalItemCount, new CrawlSegmentLog.LogItemBuffer.CrawlURLWriter() {

                SyncedCrawlURLLogWriter syncedLogWriter = new SyncedCrawlURLLogWriter();

                public void writeItem(CrawlURL url) throws IOException {
                  // log it
                  logCrawlLogWrite(url, url.getContentSize());
                  // write it
                  syncedLogWriter.writeItem(crawlLogStream, url);
                }

                public void writeItemCount(int entryCount) throws IOException {
                }

              }, segmentLogStream, historyStream);
            }

            if (Environment.detailLogEnabled())
              LOG.info("LOG_FLUSH: Log Flusher Finished Writing Entries To Disk");

            collector.clear();

          } catch (IOException e) {
            LOG.error("Critical Exception during Crawl Log Flush:" + CCStringUtils.stringifyException(e));
            throw e;
          } finally {
            if (crawlLogStream != null) {
              crawlLogStream.flush();
              crawlLogStream.close();
            }
            for (DataOutputStream stream : streamsMapByPackedId.values()) {
              // keep close inside the null check so a missing stream does not NPE here
              if (stream != null) {
                stream.flush();
                stream.close();
              }
            }
          }

          // at this point... update the crawl log header ...
          try {
            if (Environment.detailLogEnabled())
              LOG.info("LOG_FLUSH: Updating Log File Headers");

            // update the log file header
            updateLogFileHeader(crawlLogFile, _header, crawlLogRecordCount);

            // and update each completion log header ...
            for (long packedSegmentId : recordCountsByPackedId.keySet()) {
              File activeSegmentLogPath = CrawlSegmentLog.buildActivePath(_rootDirectory,
                  getListIdFromLogId(packedSegmentId), getSegmentIdFromLogId(packedSegmentId));
              CrawlSegmentLog.writeHeader(activeSegmentLogPath, recordCountsByPackedId.get(packedSegmentId));
            }
          } catch (IOException e) {
            LOG.error("Critical Exception during Crawl Log Flush:" + CCStringUtils.stringifyException(e));
            throw e;
          } finally {
          }

          long endTime = System.currentTimeMillis();

          _flushTimeAVG.addSample((double) endTime - startTime);
          _flushTimeSmoothed.addSample((double) endTime - startTime);
          _lastFlushTime = endTime - startTime;

          LOG.info("LOG_FLUSH: Log Flusher Flushed Successfully");

          return true;
        }
      }, new CompletionCallback<Boolean>() {

        public void taskComplete(Boolean updateResult) {
          setFlushInProgress(false);
          if (completionCallback != null) {
            completionCallback.flushComplete();
          }
        }

        public void taskFailed(Exception e) {
          setFlushInProgress(false);
          if (completionCallback != null) {
            completionCallback.flushFailed(e);
          }
          // all failures are critical in this particular task ...
          LOG.fatal("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));
          // no matter ... it is time to CORE the server ...
          throw new RuntimeException("CRITICAL FAILURE: Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));
        }
      }));
    } else {
      setFlushInProgress(false);
      if (completionCallback != null) {
        completionCallback.flushComplete();
      }
    }
  }

  public boolean isForcedCheckpointPossible() {
    // now one more check to see if we have enough items to do a checkpoint ...
    if (_header._itemCount != 0) {
      return true;
    }
    return false;
  }

  public boolean isCheckpointPossible(long currentTime) {
    if (_lastCheckpointTime == -1
        || currentTime - _lastCheckpointTime >= CrawlerServer.getServer().getCrawlLogCheckpointInterval()) {
      // now one more check to see if we have enough items to do a checkpoint ...
      if (_header._itemCount >= CrawlerServer.getServer().getCrawlLogCheckpointItemThreshold()
          || _header._fileSize >= CrawlerServer.getServer().getCrawlLogCheckpointLogSizeThreshold()) {
        return true;
      }
    }
    return false;
  }

  public void forceFlushAndCheckpointLog(final CheckpointCompletionCallback outerCallback) {
    if (isCheckpointInProgress() || isFlushInProgress()) {
      throw new RuntimeException("forceFlush called while active Checkpoint or Flush In Progress!!");
    }

    flushLog(new FlushCompletionCallback() {

      @Override
      public void flushComplete() {
        long currentTime = System.currentTimeMillis();
        LOG.info("LOG_FLUSH Flush Complete... Checking to see if Checkpoint Possible");
        if (isForcedCheckpointPossible()) {
          // yes .. go ahead and checkpoint log
          LOG.info("Checkpointing Logs to HDFS");
          // start the checkpoint ...
          checkpoint(currentTime, new CheckpointCompletionCallback() {

            public void checkpointComplete(long checkpointId, Vector<Long> completedSegmentList) {
              LOG.info("CrawlLog Checkpoint:" + checkpointId + " completed");
              if (completedSegmentList != null) {
                // walk completed segments ... updating their crawl state ...
                if (_engine != null) {
                  for (long packedSegmentId : completedSegmentList) {
                    // notify crawler engine of status change ...
_engine.crawlSegmentComplete(packedSegmentId); } } } // ok initiate outer callback outerCallback.checkpointComplete(checkpointId, null); } public void checkpointFailed(long checkpointId, Exception e) { LOG.error("Checkpoint Failed for Checkpoint:" + checkpointId + " With Exception:" + CCStringUtils.stringifyException(e)); outerCallback.checkpointFailed(checkpointId, e); } }, currentTime); } else { if (Environment.detailLogEnabled()) LOG.info("Checkpoint Skipped. Nothing to checkpoint"); outerCallback.checkpointComplete(0, null); } } @Override public void flushFailed(Exception e) { // log error and bail ... LOG.error(CCStringUtils.stringifyException(e)); // initiate callback outerCallback.checkpointFailed(0, e); } }); } public void startLogFlusher() { _logFlusherTimer = new Timer(CrawlerServer.getServer().getCrawlLogFlushInterval(), true, new Timer.Callback() { public void timerFired(Timer timer) { // if checkpoint is NOT in progress ... if (!isCheckpointInProgress() && !isFlushInProgress()) { LOG.info("LOG_FLUSH Starting ..."); flushLog( new FlushCompletionCallback() { public void flushComplete() { // flush is complete ... check to see if we want to do a // checkpoint ... long currentTime = System.currentTimeMillis(); LOG.info("LOG_FLUSH Flush Complete... Checking to see if Checkpoint Possilbe"); if (isCheckpointPossible(currentTime)) { LOG.info("Checkpointing Logs to HDFS"); // pause fetcher to prevent race condition where log flush takes // a long time and causes the fetcher to consume all avaliable // memory with content buffers _engine.pauseFetch(); // start the checkpoint ... checkpoint(currentTime, new CheckpointCompletionCallback() { public void checkpointComplete(long checkpointId, Vector<Long> completedSegmentList) { LOG.info("CrawlLog Checkpoint:" + checkpointId + " completed"); _engine.resumeFetch(); if (completedSegmentList != null) { // walk completed segments ... updating their crawl state // ... if (_engine != null) { for (long packedSegmentId : completedSegmentList) { // notify crawler engine of status change ... _engine.crawlSegmentComplete(packedSegmentId); } } } } public void checkpointFailed(long checkpointId, Exception e) { _engine.resumeFetch(); LOG.error("Checkpoint Failed for Checkpoint:" + checkpointId + " With Exception:" + CCStringUtils.stringifyException(e)); } }, currentTime); } } public void flushFailed(Exception e) { LOG.error("Flush Failed with Exception:" + CCStringUtils.stringifyException(e)); } } ); _engine.resumeFetch(); } // now } }); _eventLoop.setTimer(_logFlusherTimer); } public interface LogFlusherStopActionCallback { public void stopComplete(); } public void stopLogFlusher(final LogFlusherStopActionCallback completionCallback) { // indicate that a shutdown is in progress ... _shutdownInProgress = true; // stop the log flusher timer ... if (_logFlusherTimer != null) { _eventLoop.cancelTimer(_logFlusherTimer); } // create a polling timer ... final Timer waitTimer = new Timer(1000, true, new Timer.Callback() { public void timerFired(Timer timer) { // check to see if we are done flushing or checkpointing ... if (!isFlushInProgress() && !isCheckpointInProgress()) { LOG.info("CrawlLog - stopLog Timer - No Flush or Checkpoint in Progress... Initiating CrawlLog Shutdown"); // good to go ... cancel timer first ... _eventLoop.cancelTimer(timer); // and cleanup ... _logFlusherTimer = null; _shutdownInProgress = false; // initiate callback ... completionCallback.stopComplete(); } else { LOG.info("CrawlLog - stopLog Timer - Flush or Checkpoint in Progress... 
Waiting ... "); } } }); // and start the timer ... _eventLoop.setTimer(waitTimer); } public void collectStats(RuntimeStatsCollector collector) { collector.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlLog_FlushTimeAVG, _flushTimeAVG .getAverage()); collector.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlLog_FlushTimeSmoothed, _flushTimeSmoothed.getAverage()); collector.setLongValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlLog_FlushTimeLast, _lastFlushTime); } private static CrawlSegmentHost createHost(String hostName) { CrawlSegmentHost host = new CrawlSegmentHost(); host.setHostName(hostName); byte[] hostNameAsBytes = host.getHostName().getBytes(); host.setHostFP(FPGenerator.std64.fp(hostNameAsBytes, 0, hostNameAsBytes.length)); return host; } private static CrawlSegmentURL createSegmentURL(URL url) { CrawlSegmentURL segmentURL = new CrawlSegmentURL(); segmentURL.setUrl(url.toString()); byte[] urlAsBytes = segmentURL.getUrl().getBytes(); segmentURL.setUrlFP(FPGenerator.std64.fp(urlAsBytes, 0, urlAsBytes.length)); return segmentURL; } private static CrawlSegmentDetail loadCrawlSegment(String fileName) throws IOException { TreeMap<String, CrawlSegmentHost> hosts = new TreeMap<String, CrawlSegmentHost>(); URL resourceURL = CrawlEnvironment.getHadoopConfig().getResource(fileName); if (resourceURL == null) { throw new FileNotFoundException(); } InputStream stream = resourceURL.openStream(); BufferedReader reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(stream))); String line = null; do { line = reader.readLine(); if (line != null) { if (Environment.detailLogEnabled()) LOG.info(line); try { URL theURL = new URL(line); CrawlSegmentHost host = hosts.get(theURL.getHost()); if (host == null) { host = createHost(theURL.getHost()); hosts.put(theURL.getHost(), host); } CrawlSegmentURL segmentURL = createSegmentURL(theURL); host.getUrlTargets().add(segmentURL); } catch (MalformedURLException e) { LOG.error("SKIPPING Malformed URL::" + line); } } } while (line != null); CrawlSegmentDetail crawlSegmentDetail = new CrawlSegmentDetail(); int urlCount = 0; crawlSegmentDetail.setSegmentId(1); for (CrawlSegmentHost host : hosts.values()) { crawlSegmentDetail.getHosts().add(host); urlCount += host.getUrlTargets().size(); } crawlSegmentDetail.setUrlCount(urlCount); // finally, sort by host (as will be the case in a proper map reduce // produced segment ... 
Collections.sort(crawlSegmentDetail.getHosts()); return crawlSegmentDetail; } public Vector<Long> getActiveSegmentIdList() { Vector<Long> segmentIdList = new Vector<Long>(); segmentIdList.addAll(_loggers.keySet()); return segmentIdList; } static void validateInputOutputCrawlURLArrays(ArrayList<CrawlURL> input, ArrayList<CrawlURL> output) throws IOException { Assert.assertTrue(input.size() == output.size()); for (int i = 0; i < input.size(); ++i) { CrawlURL left = input.get(i); CrawlURL right = input.get(i); Assert.assertTrue(left.getUrl().equals(right.getUrl())); Assert.assertTrue(left.getContentRaw().getReadOnlyBytes().equals(right.getContentRaw().getReadOnlyBytes())); } } static void validateLogFlusherCode(final File localDirPath, final Path remotePath, boolean injectErrors) throws IOException { final Configuration conf = new Configuration(); final FileSystem fs = FileSystem.get(conf); fs.mkdirs(remotePath); // ok create a crawlLog test file File localFile = File.createTempFile("crawlLog", "test", localDirPath); localFile.delete(); LOG.info("Initializing Temp File:" + localFile); // initialize LogFileHeader fileHeader = initializeLogFileHeaderFromLogFile(localFile); LOG.info("Creating SyncedCrawl URL Writer"); // create synced url writer ... SyncedCrawlURLLogWriter crawlURLWriter = new SyncedCrawlURLLogWriter(injectErrors); ArrayList<CrawlURL> urlObjects = new ArrayList<CrawlURL>(); // write a couple of url objects for (int i = 0; i < 100; ++i) { CrawlURL url = new CrawlURL(); url.setUrl("http://someurl.com/" + i); byte bytes[] = MD5.digest("Some Random:" + Math.random() + " Number").getBytes(); url.setContentRaw(new FlexBuffer(bytes)); final DataOutputStream crawlLogStream = new DataOutputStream(new FileOutputStream(localFile, true)); try { LOG.info("Appending object to log"); crawlURLWriter.writeItem(crawlLogStream, url); } finally { LOG.info("Flushing Log"); crawlLogStream.flush(); crawlLogStream.close(); } LOG.info("Updating Header"); updateLogFileHeader(localFile, fileHeader, 1); if (!injectErrors || i % 2 == 0) { urlObjects.add(url); } else { // drop odd entry LOG.info("Dropping Odd Entry:" + url.getUrl()); } } final ArrayList<CrawlURL> urlObjectsOut = new ArrayList<CrawlURL>(); HDFSCrawlURLWriter stubWriter = new HDFSCrawlURLWriter() { SequenceFileCrawlURLWriter innerWriter = new SequenceFileCrawlURLWriter(conf, fs, remotePath, "testNode", 1L); @Override public void writeCrawlURLItem(Text url, CrawlURL urlObject) throws IOException { LOG.info("Got URL:" + url.toString()); urlObjectsOut.add(urlObject); innerWriter.writeCrawlURLItem(url, urlObject); } @Override public void close() throws IOException { innerWriter.close(); } public List<Path> getFilenames() { return innerWriter.getFilenames(); } }; try { LOG.info("Transferring from Local to Remote"); transferLocalCheckpointLog(localFile, stubWriter, 1L); } finally { stubWriter.close(); } LOG.info("Validating Input/Output"); validateInputOutputCrawlURLArrays(urlObjects, urlObjectsOut); // read via sequenceFile urlObjectsOut.clear(); Path firstFile = Iterators.getNext(stubWriter.getFilenames().iterator(), null); SequenceFile.Reader reader = new SequenceFile.Reader(fs, firstFile, conf); Text key = new Text(); CrawlURL value = new CrawlURL(); while (reader.next(key, value)) { LOG.info("Got:" + key.toString()); urlObjectsOut.add(value); value = new CrawlURL(); } reader.close(); LOG.info("Validating Input/Output"); validateInputOutputCrawlURLArrays(urlObjects, urlObjectsOut); LOG.info("Done!"); } public static void walkCrawlLogFile(File 
crawlLogPath, long startOffset) throws IOException { // and open the crawl log file ... RandomAccessFile inputStream = null; IOException exception = null; CRC32 crc = new CRC32(); CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 17); byte[] syncBytesBuffer = new byte[SYNC_BYTES_SIZE]; // save position for potential debug output. long lastReadPosition = 0; try { inputStream = new RandomAccessFile(crawlLogPath, "rw"); // and a data input stream ... RandomAccessFile reader = inputStream; // seek to zero reader.seek(0L); // read the header ... LogFileHeader header = readLogFileHeader(reader); System.out.println("Header ItemCount:" + header._itemCount + " FileSize:" + header._fileSize); if (startOffset != 0L) { System.out.println("Preseeking to:" + startOffset); reader.seek(startOffset); } Configuration conf = new Configuration(); // read a crawl url from the stream... long recordCount = 0; while (inputStream.getFilePointer() < header._fileSize) { // System.out.println("PRE-SYNC SeekPos:"+ // inputStream.getFilePointer()); if (seekToNextSyncBytesPos(syncBytesBuffer, reader, header._fileSize)) { // System.out.println("POST-SYNC SeekPos:"+ // inputStream.getFilePointer()); lastReadPosition = inputStream.getFilePointer(); // skip sync inputStream.skipBytes(SYNC_BYTES_SIZE); // read length ... int urlDataLen = reader.readInt(); long urlDataCRC = reader.readLong(); if (urlDataLen > buffer.getBuffer().length) { buffer = new CustomByteArrayOutputStream(((urlDataLen / 65536) + 1) * 65536); } reader.read(buffer.getBuffer(), 0, urlDataLen); crc.reset(); crc.update(buffer.getBuffer(), 0, urlDataLen); long computedValue = crc.getValue(); // validate crc values ... if (computedValue != urlDataCRC) { LOG.error("CRC Mismatch Detected during HDFS transfer in CrawlLog:" + crawlLogPath.getAbsolutePath() + " FilePosition:" + lastReadPosition); inputStream.seek(lastReadPosition + 1); } else { if (recordCount++ % 10000 == 0) { // allocate a crawl url data structure CrawlURL url = new CrawlURL(); DataInputStream bufferReader = new DataInputStream(new ByteArrayInputStream(buffer.getBuffer(), 0, urlDataLen)); // populate it from the (in memory) data stream url.readFields(bufferReader); System.out.println("Record:" + recordCount + " At:" + lastReadPosition + " URL:" + url.getUrl() + " BuffSize:" + urlDataLen + " ContentLen:" + url.getContentRaw().getCount() + " LastModified:" + new Date(url.getLastAttemptTime()).toString()); } } } else { break; } } } catch (EOFException e) { LOG.error("Caught EOF Exception during read of local CrawlLog:" + crawlLogPath.getAbsolutePath() + " FilePosition:" + lastReadPosition); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); exception = e; throw e; } finally { if (inputStream != null) inputStream.close(); } } }