/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.commoncrawl.service.crawler;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.zip.CRC32;

import junit.framework.Assert;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.commoncrawl.async.ConcurrentTask;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.async.Timer;
import org.commoncrawl.async.ConcurrentTask.CompletionCallback;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.CrawlSegmentDetail;
import org.commoncrawl.protocol.CrawlSegmentHost;
import org.commoncrawl.protocol.CrawlSegmentURL;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FPGenerator;
import org.commoncrawl.util.FileUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.MovingAverage;
import org.commoncrawl.util.RuntimeStatsCollector;
import org.commoncrawl.util.SmoothedAverage;
import org.mortbay.jetty.security.Credential.MD5;

import com.google.common.collect.Iterators;

public final class CrawlLog {

  public static final Log LOG = LogFactory.getLog(CrawlLog.class);

  public static final int DEFAULT_LOG_FLUSH_INTERVAL = 30000;
  public static final int DEFAULT_LOG_CHECKPOINT_INTERVAL = 60000 * 5;
  public static final int DEFAULT_LOG_FILE_CHECKPOINT_ITEM_COUNT_THRESHOLD = 100000;
  // use a long literal so the 4GB threshold does not overflow 32-bit int arithmetic
  public static final long DEFAULT_LOG_FILE_SIZE_CHECKPOINT_THRESHOLD = 1073741824L * 4;

  /** log file header **/
LogFileHeader _header = new LogFileHeader(); /** node name **/ String _nodeName; /** data directory **/ File _rootDirectory; /** event loop **/ EventLoop _eventLoop; /** thread pool **/ ExecutorService _threadPool; /** crawler engine pointer **/ CrawlerEngine _engine; /** robots segment logger **/ CrawlSegmentLog _robotsSegment = new CrawlSegmentLog(null, -1, -1, null); /** individual crawl segment loggers **/ Map<Long, CrawlSegmentLog> _loggers = new HashMap<Long, CrawlSegmentLog>(); /** checkpoint completion callback **/ CheckpointCompletionCallback _checkpointCompletionCallback = null; /** checkpoint id - analogous to parse segment id **/ long _checkpointId; /** flush in progress flag **/ boolean _flushInProgress = false; /** a shutdown operation is in progress **/ boolean _shutdownInProgress = false; /** log flusher timer **/ Timer _logFlusherTimer = null; /** last checkpoint time **/ long _lastCheckpointTime = -1; MovingAverage _flushTimeAVG = new MovingAverage(10); SmoothedAverage _flushTimeSmoothed = new SmoothedAverage(.8); long _lastFlushTime = 0; /** get active log file path **/ public static File getActivePath(File directoryRoot) { // and construct a path to the local crawl segment directory ... File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath()); // append the segment id to the path ... return new File(crawlDataDir, CrawlEnvironment.ActiveCrawlLog); } /** get active log file path **/ public static File getCheckpointPath(File directoryRoot) { // and construct a path to the local crawl segment directory ... File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath()); // append the segment id to the path ... return new File(crawlDataDir, CrawlEnvironment.CheckpointCrawlLog); } public static void ensureDataDirectory(File directoryRoot) { // and construct a path to the local crawl segment directory ... File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath()); if (!crawlDataDir.exists()) { crawlDataDir.mkdir(); } } /** purge local data directory **/ public static void purgeDataDirectory(File directoryRoot) { // get crawl output path ... File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath()); // delete entire directory and all contents underneath it FileUtils.recursivelyDeleteFile(crawlDataDir); // recreate directory crawlDataDir.mkdirs(); } /** unit test constructor **/ public CrawlLog() throws IOException { _rootDirectory = new File(CrawlEnvironment.DEFAULT_DATA_DIR, "crawlLog_unittest"); if (!_rootDirectory.exists()) _rootDirectory.mkdir(); _eventLoop = new EventLoop(); _nodeName = "test"; _eventLoop.start(); _threadPool = Executors.newFixedThreadPool(1); initialize(); } public CrawlLog(CrawlerEngine engine) throws IOException { _engine = engine; _rootDirectory = engine.getServer().getDataDirectory(); _nodeName = engine.getServer().getHostName(); _eventLoop = engine.getEventLoop(); _threadPool = engine.getServer().getDefaultThreadPool(); initialize(); } private void initialize() throws IOException { // create data directory if necessary ... ensureDataDirectory(_rootDirectory); File checkpointLogPath = getCheckpointPath(_rootDirectory); File activeLogPath = getActivePath(_rootDirectory); // check if it exists ... if (checkpointLogPath.exists()) { // log it ... LOG.warn("####Checkpoint Crawl Log Found - Possible Crash Recovery"); // rename it as the active log ... 
      checkpointLogPath.renameTo(activeLogPath);
    }

    LOG.info("Crawl Log Initializing Active Log");
    // either way call initialize active log ...
    _header = initializeActiveLog(_rootDirectory);
    LOG.info("Crawl Log Initialize returned " + _header._itemCount + " Entries in Active Log");
  }

  /** initialize log (file) **/
  private static LogFileHeader initializeActiveLog(File rootDirectory) throws IOException {
    File activeLogPath = getActivePath(rootDirectory);
    return initializeLogFileHeaderFromLogFile(activeLogPath);
  }

  private static LogFileHeader initializeLogFileHeaderFromLogFile(File logFilePath) throws IOException {
    LogFileHeader headerOut = null;
    if (!logFilePath.exists()) {
      DataOutputStream outputStream = new DataOutputStream(new FileOutputStream(logFilePath));
      try {
        headerOut = initializeEmptyLogFile(outputStream);
      } finally {
        outputStream.close();
      }
    } else {
      headerOut = new LogFileHeader();
      DataInputStream inputStream = new DataInputStream(new FileInputStream(logFilePath));
      try {
        headerOut.readHeader(inputStream);
      } finally {
        inputStream.close();
      }
    }
    return headerOut;
  }

  /** get the host name **/
  public String getNodeName() {
    return _nodeName;
  }

  /** make packed log id from list id and segment log id **/
  public static long makeSegmentLogId(int listId, int segmentId) {
    return (((long) listId) << 32) | (long) segmentId;
  }

  /** get segment log id from packed id **/
  public static int getSegmentIdFromLogId(long logId) {
    return (int) (logId & 0xFFFFFFFF);
  }

  /** get list id from packed id **/
  public static int getListIdFromLogId(long logId) {
    return (int) ((logId >> 32) & 0xFFFFFFFF);
  }

  /** add a segment log given segment id **/
  public void addSegmentLog(CrawlSegmentLog log) {
    // the logger map is keyed by the packed (listId, segmentId) value
    if (_loggers.get(makeSegmentLogId(log.getListId(), log.getSegmentId())) != null) {
      LOG.error("Attempt to Activate an Already Active Segment Log. Segment Id:" + log.getSegmentId());
      throw new RuntimeException("Attempt to Activate an Already Active Segment Log. Segment Id:" + log.getSegmentId());
    }
    _loggers.put(makeSegmentLogId(log.getListId(), log.getSegmentId()), log);
  }

  /** get the special robots crawl segment **/
  public CrawlSegmentLog getRobotsSegment() {
    return _robotsSegment;
  }

  /** get a segment log given segment id **/
  public CrawlSegmentLog getLogForSegment(int listId, int segmentId) {
    return _loggers.get(makeSegmentLogId(listId, segmentId));
  }

  /** remove segment log **/
  public CrawlSegmentLog removeSegmentLog(int listId, int segmentId) {
    return _loggers.remove(makeSegmentLogId(listId, segmentId));
  }

  private static class LogFileHeader {

    public static final int LogFileHeaderBytes = 0xCC00CC00;
    public static final int LogFileVersion = 1;

    public LogFileHeader() {
      _fileSize = 0;
      _itemCount = 0;
    }

    public long _fileSize;
    public long _itemCount;

    public void writeHeader(DataOutput stream) throws IOException {
      stream.writeInt(LogFileHeaderBytes);
      stream.writeInt(LogFileVersion);
      stream.writeLong(_fileSize);
      stream.writeLong(_itemCount);
    }

    public void readHeader(DataInput stream) throws IOException {
      int headerBytes = stream.readInt();
      int version = stream.readInt();
      // reject the file if either the magic bytes or the version mismatch
      if (headerBytes != LogFileHeaderBytes || version != LogFileVersion) {
        throw new IOException("Invalid CrawlLog File Header Detected!");
      }
      _fileSize = stream.readLong();
      _itemCount = stream.readLong();
    }
  }

  private static void updateLogFileHeader(File logFileName, LogFileHeader header, long addedRecordCount) throws IOException {
    RandomAccessFile file = new RandomAccessFile(logFileName, "rw");
    try {
      // update cached header ...
      header._fileSize = file.getChannel().size();
      header._itemCount += addedRecordCount;
      // set the position at zero ..
      file.seek(0);
      // and write header to disk ...
      header.writeHeader(file);
    } finally {
      // major bottle neck..
      // file.getFD().sync();
      file.close();
    }
  }

  private static LogFileHeader initializeEmptyLogFile(DataOutput stream) throws IOException {
    LogFileHeader header = new LogFileHeader();
    header.writeHeader(stream);
    return header;
  }

  public static LogFileHeader readLogFileHeader(File logFileName) throws IOException {
    LogFileHeader headerOut = new LogFileHeader();
    RandomAccessFile file = new RandomAccessFile(logFileName, "r");
    try {
      headerOut = readLogFileHeader(file);
    } finally {
      file.close();
    }
    return headerOut;
  }

  private static LogFileHeader readLogFileHeader(DataInput reader) throws IOException {
    LogFileHeader headerOut = new LogFileHeader();
    headerOut.readHeader(reader);
    return headerOut;
  }

  private boolean isCheckpointInProgress() {
    return _checkpointCompletionCallback != null;
  }

  private boolean isFlushInProgress() {
    return _flushInProgress;
  }

  private void setFlushInProgress(boolean value) {
    _flushInProgress = value;
    if (value == false) {
      // since we are in the async thread at this point, check to see if a
      // checkpoint is in progress
      if (isCheckpointInProgress()) {
        // if so, it was deferred, because of the flush in progress... so we
        // need to actually kick off the checkpoint progress
        // now that the flush is complete
        doCheckpoint();
      }
    }
  }

  public static interface CheckpointCompletionCallback {
    public void checkpointComplete(long checkpointId, Vector<Long> completedSegmentList);
    public void checkpointFailed(long checkpointId, Exception e);
  }

  public static interface FlushCompletionCallback {
    public void flushComplete();
    public void flushFailed(Exception e);
  }

  /** essentially swap crawl logs **/
  private void checkpointLocalCrawlLog() throws IOException {
    File activeCrawlLog = getActivePath(_rootDirectory);
    File checkpointCrawlLog = getCheckpointPath(_rootDirectory);
    LOG.info("MOVING ACTIVE:" + activeCrawlLog + " TO:" + checkpointCrawlLog);
    // delete any existing checkpoint log ...
    checkpointCrawlLog.delete();
    // rename active log to check point log
    activeCrawlLog.renameTo(checkpointCrawlLog);
    // and create a new active crawlLog ...
    _header = initializeActiveLog(_rootDirectory);
  }

  public void checkpoint(long checkpointStartTime, CheckpointCompletionCallback callback, long checkpointId) {
    // first check to see if checkpoint is already in progress ...
    if (isCheckpointInProgress()) {
      // immediately fail call and bail, leaving the in-flight checkpoint state untouched
      callback.checkpointFailed(checkpointId, new Exception("Invalid State. Checkpoint already in progress!"));
      return;
    }

    _lastCheckpointTime = checkpointStartTime;

    // otherwise transition to a checkpoint in progress state
    _checkpointCompletionCallback = callback;
    _checkpointId = checkpointId;

    // now check to see if we are not in the middle of a flush ...
    if (!isFlushInProgress()) {
      // if not we can directly start the actual checkpoint process ...
      doCheckpoint();
    }
    // otherwise wait for the flush to finish (and thus trigger the checkpoint
    // process)
  }

  public void finalizeCheckpoint() {
    File checkpointLogFile = getCheckpointPath(_rootDirectory);
    checkpointLogFile.delete();
  }

  public void abortCheckpoint() {
    File activeLogFile = getActivePath(_rootDirectory);
    File checkpointLogFile = getCheckpointPath(_rootDirectory);
    LOG.info("###ABORTING CHECKPOINT! RENAMING:" + checkpointLogFile + " TO:" + activeLogFile);
    checkpointLogFile.renameTo(activeLogFile);
  }

  public void purgeActiveLog() throws IOException {
    File activeLogFilePath = getActivePath(_rootDirectory);

    if (activeLogFilePath.exists())
      activeLogFilePath.delete();

    _header = initializeActiveLog(_rootDirectory);
  }

  private static class CorruptCrawlLogException extends IOException {
    public CorruptCrawlLogException(String description) {
      super(description);
    }
  }

  /**
   * seek out next instance of sync bytes in the file input stream
   *
   * @param file
   * @throws IOException
   */
  private static boolean seekToNextSyncBytesPos(byte[] syncBytesBuffer, RandomAccessFile file, long maxFileSize) throws IOException {
    while (file.getFilePointer() < maxFileSize) {
      try {
        // read in a sync.length buffer amount
        file.read(syncBytesBuffer);

        int syncLen = SYNC_BYTES_SIZE;

        // start scan for next sync position ...
        for (int i = 0; file.getFilePointer() < maxFileSize; i++) {
          int j = 0;
          for (; j < syncLen; j++) {
            if (_sync[j] != syncBytesBuffer[(i + j) % syncLen])
              break;
          }
          if (j == syncLen) {
            // found matching sync bytes - reset file pos to before sync bytes
            file.seek(file.getFilePointer() - SYNC_BYTES_SIZE); // position before sync
            return true;
          }
          syncBytesBuffer[i % syncLen] = file.readByte();
        }
      } catch (IOException e) {
        LOG.warn("IOException at:" + file.getFilePointer() + " Exception:" + CCStringUtils.stringifyException(e));
        LOG.warn("Skipping to:" + (file.getFilePointer() + 4096));
        file.seek(file.getFilePointer() + 4096);
      }
    }
    return false;
  }

  private static interface HDFSCrawlURLWriter {
    public void writeCrawlURLItem(Text url, CrawlURL urlObject) throws IOException;
    public void close() throws IOException;
    public List<Path> getFilenames();
  }

  private static class SequenceFileCrawlURLWriter implements HDFSCrawlURLWriter {

    private static final long FLUSH_THRESHOLD = 1073741824L;

    FileSystem _fs;
    Configuration _conf;
    Path _stagingDirectory;
    String _nodeName;
    long _currentFileRecordCount = 0;
    ArrayList<Path> _outputPaths = new ArrayList<Path>();
    long _nextFileId = -1L;

    Path currentFilePath = null;
    SequenceFile.Writer writer = null;
    long _prevPos;

    public SequenceFileCrawlURLWriter(Configuration conf, FileSystem fs, Path path, String nodeName, long checkpointId) throws IOException {
      _conf = conf;
      _fs = fs;
      _stagingDirectory = path;
      _nodeName = nodeName;
      _nextFileId = checkpointId;
      flushFile(true);
    }

    private void flushFile(boolean openNew) throws IOException {
      if (writer != null) {
        writer.close();
        if (_currentFileRecordCount != 0) {
          LOG.info("Flushed Temp Checkpoint File:" + currentFilePath);
          _outputPaths.add(currentFilePath);
        } else {
          LOG.info("Deleting Empty Checkpoint File:" + currentFilePath);
          _fs.delete(currentFilePath, false);
        }
        writer = null;
        _currentFileRecordCount = 0;
        currentFilePath = null;
      }

      if (openNew) {
        // allocate a new filename
        currentFilePath = new Path(_stagingDirectory, CrawlEnvironment.buildCrawlLogCheckpointName(_nodeName, _nextFileId++));
        LOG.info("Allocating new Checkpoint File:" + currentFilePath);
        // delete it
        if (_fs.exists(currentFilePath)) {
          LOG.warn("Existing Checkpoint TempFile found at:" + currentFilePath + " - Deleting");
          _fs.delete(currentFilePath, false);
        }
        // open a sequence file writer at the temp file location ...
        writer = SequenceFile.createWriter(_fs, _conf, currentFilePath, Text.class, CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
        // reset record count ...
_currentFileRecordCount = 0; } } @Override public void writeCrawlURLItem(Text url, CrawlURL urlObject) throws IOException { writer.append(url, urlObject); ++_currentFileRecordCount; long pos = writer.getLength(); if (pos != _prevPos) { _prevPos = pos; if (pos >= FLUSH_THRESHOLD) { flushFile(true); } } } public void close() throws IOException { flushFile(false); } public List<Path> getFilenames() { return _outputPaths; } }; private static class URLWriterException extends IOException { public URLWriterException() { } } private static void transferLocalCheckpointLog(File crawlLogPath, HDFSCrawlURLWriter writer, long checkpointId) throws IOException { // and open the crawl log file ... RandomAccessFile inputStream = null; IOException exception = null; CRC32 crc = new CRC32(); CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 17); byte[] syncBytesBuffer = new byte[SYNC_BYTES_SIZE]; // save position for potential debug output. long lastReadPosition = 0; try { inputStream = new RandomAccessFile(crawlLogPath, "rw"); // and a data input stream ... RandomAccessFile reader = inputStream; // seek to zero reader.seek(0L); // read the header ... LogFileHeader header = readLogFileHeader(reader); // read a crawl url from the stream... while (inputStream.getFilePointer() < header._fileSize) { if (seekToNextSyncBytesPos(syncBytesBuffer, reader, header._fileSize)) { try { lastReadPosition = inputStream.getFilePointer(); // skip sync inputStream.skipBytes(SYNC_BYTES_SIZE); // read length ... int urlDataLen = reader.readInt(); long urlDataCRC = reader.readLong(); if (urlDataLen > buffer.getBuffer().length) { buffer = new CustomByteArrayOutputStream(((urlDataLen / 65536) + 1) * 65536); } reader.read(buffer.getBuffer(), 0, urlDataLen); crc.reset(); crc.update(buffer.getBuffer(), 0, urlDataLen); long computedValue = crc.getValue(); // validate crc values ... if (computedValue != urlDataCRC) { LOG.error("CRC Mismatch Detected during HDFS transfer in CrawlLog:" + crawlLogPath.getAbsolutePath() + " Checkpoint Id:" + checkpointId + " FilePosition:" + lastReadPosition); inputStream.seek(lastReadPosition + 1); } else { // allocate a crawl url data structure CrawlURL url = new CrawlURL(); DataInputStream bufferReader = new DataInputStream(new ByteArrayInputStream(buffer.getBuffer(), 0, urlDataLen)); // populate it from the (in memory) data stream url.readFields(bufferReader); try { // and write out appropriate sequence file entries ... writer.writeCrawlURLItem(new Text(url.getUrl()), url); } catch (IOException e) { LOG.error("Failed to write CrawlURL to SequenceFileWriter with Exception:" + CCStringUtils.stringifyException(e)); throw new URLWriterException(); } } } catch (URLWriterException e) { LOG.error("Caught URLRewriter Exception! 
- Throwing to outer layer!"); throw e; } catch (Exception e) { LOG.error("Ignoring Error Processing CrawlLog Entry at Position:" + lastReadPosition + " Exception:" + CCStringUtils.stringifyException(e)); } } else { break; } } } catch (EOFException e) { LOG.error("Caught EOF Exception during read of local CrawlLog:" + crawlLogPath.getAbsolutePath() + " Checkpoint Id:" + checkpointId + " FilePosition:" + lastReadPosition); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); exception = e; throw e; } finally { if (inputStream != null) inputStream.close(); } } private Path transferLocalSegmentLog(FileSystem hdfs, File localSegmentLogFile, long checkpointId, int listId, int segmentId) throws IOException { if (localSegmentLogFile.exists()) { // determine the file's size ... // if > header size (in other words it has data ... ) if (localSegmentLogFile.length() > CrawlSegmentLog.getHeaderSize()) { // construct a target path (where we are going to store the checkpointed // crawl log ) Path remoteLogFileName = CrawlEnvironment.getRemoteCrawlSegmentLogCheckpointPath(new Path(CrawlEnvironment.getCrawlSegmentLogsDirectory()),getNodeName(), checkpointId, listId, segmentId); // replace if existing ... hdfs.delete(remoteLogFileName,false); Path localPath = new Path(localSegmentLogFile.getAbsolutePath()); hdfs.mkdirs(remoteLogFileName.getParent()); LOG.info("Copying CrawlSegmentLog for List:" + listId + " Segment:" + segmentId + " from:" + localPath + " to fs:" + hdfs + " path:" + remoteLogFileName); hdfs.copyFromLocalFile(localPath, remoteLogFileName); return remoteLogFileName; } } return null; } private void purgeHDFSSegmentLogs(FileSystem hdfs, int listId, int segmentId) throws IOException { Path listLogDirectory = new Path(CrawlEnvironment.getCrawlSegmentLogsDirectory(), ((Integer) listId).toString()); Path segmentLogDirectory = new Path(listLogDirectory, ((Integer) segmentId).toString()); Path completionLogFilePath = new Path(segmentLogDirectory, CrawlEnvironment .buildCrawlSegmentCompletionLogFileName(getNodeName())); if (!hdfs.exists(completionLogFilePath)) { // create a zero length completion log file on hdfs ... hdfs.createNewFile(completionLogFilePath); } // skip this step as history servers now manage segment logs /* * // and now ... delete all logs Path segmentLogWildcardPath = new * Path(segmentLogDirectory * ,CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString * (getNodeName())); FileStatus paths[] = * hdfs.globStatus(segmentLogWildcardPath); if (paths != null) { for * (FileStatus path : paths) { // hdfs.delete(path.getPath()); } } */ } /** perform the actual checkpoint work here ... **/ private void doCheckpoint() { // at this point, we should be in the async thread, and all flusher // activities are blocked ... LOG.info("CrawlLog Checkpoint - Starting "); // collect all necessary information from thread-unsafe data structure now // (in async thread context) final Set<Long> activeSegments = new HashSet<Long>(); try { // add all active segment ids to our key set ... activeSegments.addAll(_loggers.keySet()); LOG.info("CrawlLog Checkpoint - Preparing CrawlLog Files"); // checkpoint crawl log ... checkpointLocalCrawlLog(); LOG.info("CrawlLog Checkpoint - Preparing Segment Log Files"); // next checkpoint all active segment logs ... 
for (CrawlSegmentLog segmentLog : _loggers.values()) { segmentLog.checkpointLocalLog(); } LOG.info("CrawlLog Checkpoint - Ready for HDFS Transfer"); } catch (IOException e) { LOG.error("Checkpoint failed with Exception:" + CCStringUtils.stringifyException(e)); } // spawn a thread to do most of the blocking io ... _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop, new Callable<Boolean>() { public Boolean call() throws Exception { // we need to track these in case of failure ... Vector<Path> segmentLogFinalPaths = new Vector<Path>(); // get the log file system final FileSystem crawlLogsFS = CrawlEnvironment.getDefaultFileSystem(); try { LOG.info("CrawlLog Checkpoint - Transferring CrawlLog to HDFS"); // construct a target path (where we are going to store the // checkpointed crawl log ) //Path stagingDirectory = new Path(CrawlEnvironment.getCheckpointStagingDirectory()); Path checkpointDirectory = CrawlerServer.getServer().getCrawlContentPath(); LOG.info("***Checkpoint Dir is:" + checkpointDirectory); if (checkpointDirectory == null) { throw new IOException("Checkpoint Failed. Null Checkpoint Directory!"); } FileSystem crawlDataFS = FileSystem.get(checkpointDirectory.toUri(),CrawlEnvironment.getHadoopConfig()); LOG.info("***Checkpoint Content FS is:" + crawlDataFS); SequenceFileCrawlURLWriter hdfsWriter = new SequenceFileCrawlURLWriter(CrawlEnvironment.getHadoopConfig(), crawlDataFS, checkpointDirectory, getNodeName(), _checkpointId); try { // write out crawl log to hdfs ... transferLocalCheckpointLog(getCheckpointPath(_rootDirectory), hdfsWriter, _checkpointId); } catch (Exception e) { LOG.error("HDFS Write of CrawlLog failed. Deleting tempFiles:" + hdfsWriter.getFilenames() + " Exception:" + CCStringUtils.stringifyException(e)); // close writer hdfsWriter.close(); // delete any hdfs output ... for (Path path : hdfsWriter.getFilenames()) { LOG.info("Deleting temp (HDFS) checkpoint file:" + path); crawlDataFS.delete(path, false); } throw e; } finally { hdfsWriter.close(); } LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Logs"); // and next for every segment for (long packedLogId : activeSegments) { File segmentLogPath = CrawlSegmentLog.buildCheckpointPath(_rootDirectory, getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId)); LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Log for List:" + getListIdFromLogId(packedLogId) + " Segment:"+ getSegmentIdFromLogId(packedLogId)); // copy the segment log ... Path remoteLogFilePath = transferLocalSegmentLog(crawlLogsFS, segmentLogPath, _checkpointId,getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId)); // if path is not null (data was copied) ... if (remoteLogFilePath != null) { // add it to vector ... segmentLogFinalPaths.add(remoteLogFilePath); } } LOG.info("CrawlLog Checkpoint - Finished Transferring CrawlSegment Logs"); // now if we got here ... all hdfs transfers succeeded ... // go ahead and move checkpoint log from staging to final data // directory ... /* Path checkpointDirectory = new Path(CrawlEnvironment.getCheckpointDataDirectory()); // if no checkpoint data directory ... create one ... 
if (!hdfs.exists(checkpointDirectory)) hdfs.mkdirs(checkpointDirectory); for (Path checkpointTempFilePath : hdfsWriter.getFilenames()) { Path checkpointFinalPath = new Path(checkpointDirectory, checkpointTempFilePath.getName()); LOG.info("Promoting Checking File From:" + checkpointTempFilePath + " to:" + checkpointFinalPath); // and essentially move the crawl log file from staging to data // directory .. boolean success = hdfs.rename(checkpointTempFilePath, checkpointFinalPath); if (!success) { throw new IOException("Failed to Rename Checkpoint Temp:" + checkpointTempFilePath + " to:" + checkpointFinalPath); } } */ // if we got here checkpoint was successfull... return true; } catch (Exception e) { LOG.error("Checkpoint:" + _checkpointId + " FAILED with exception:" + CCStringUtils.stringifyException(e)); for (Path segmentPath : segmentLogFinalPaths) { crawlLogsFS.delete(segmentPath,false); } throw e; } } }, new CompletionCallback<Boolean>() { public void taskComplete(Boolean updateResult) { Vector<Long> completedSegmentList = new Vector<Long>(); LOG.info("CrawlLog Checkpoint - Finalizing CrawlLog Checkpoint"); // delete the local checkpoint log ... finalizeCheckpoint(); LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLogs"); for (CrawlSegmentLog segmentLog : _loggers.values()) { // LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLog for Segment:" // + segmentLog.getSegmentId()); // finalize the checkpoint on the segment log ... segmentLog.finalizeCheckpoint(); // and check to see if the segment has been completed ... if (segmentLog.isSegmentComplete()) { // if so, add it our completed segments list ... completedSegmentList.add(makeSegmentLogId(segmentLog.getListId(), segmentLog.getSegmentId())); } } // now for all completed segments ... purge hdfs logs ... for (long packedSegmentId : completedSegmentList) { try { LOG.info("CrawlLog Checkpoint - Purging HDFS CrawlSegmentLogs from Completed Segment. List:" + getListIdFromLogId(packedSegmentId) + " Segment:" + getSegmentIdFromLogId(packedSegmentId)); // purge hdfs files (and create a completion log file) //purgeHDFSSegmentLogs(CrawlEnvironment.getDefaultFileSystem(), getListIdFromLogId(packedSegmentId),getSegmentIdFromLogId(packedSegmentId)); LOG.info("CrawlLog Checkpoint - Purging Local CrawlSegmentLogs from Completed Segment. List:" + getListIdFromLogId(packedSegmentId) + " Segment:" + getSegmentIdFromLogId(packedSegmentId)); // and purge local files as well ... _loggers.get(packedSegmentId).purgeLocalFiles(); } catch (IOException e) { LOG.error("Purge SegmentLog for Segment List:" + getListIdFromLogId(packedSegmentId) + " Segment:" + getSegmentIdFromLogId(packedSegmentId) + " threw IOException:" + CCStringUtils.stringifyException(e)); } LOG.info("CrawlLog Checkpoint - DeRegistering Segment List:" + getListIdFromLogId(packedSegmentId) + " Segment:" + getSegmentIdFromLogId(packedSegmentId) + " From CrawlLog"); // no matter what ... unload the segment ... _loggers.remove(packedSegmentId); } CheckpointCompletionCallback callback = _checkpointCompletionCallback; long checkpointId = _checkpointId; // otherwise transition to a checkpoint in progress state _checkpointCompletionCallback = null; _checkpointId = -1; LOG.info("CrawlLog Checkpoint - Checkpoint Complete - Initiating Callback"); // and complete transaction ... callback.checkpointComplete(checkpointId, completedSegmentList); } public void taskFailed(Exception e) { // all failures are critical in this particular task ... 
LOG.error("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e)); // revert checkpoint logs ... abortCheckpoint(); for (CrawlSegmentLog segmentLog : _loggers.values()) { segmentLog.abortCheckpoint(); } CheckpointCompletionCallback callback = _checkpointCompletionCallback; long checkpointId = _checkpointId; // otherwise transition to a checkpoint in progress state _checkpointCompletionCallback = null; _checkpointId = -1; // now check to see if this was corrupt crawl log exception if (e.getCause() instanceof CorruptCrawlLogException) { // ACK!!! LOG.fatal("Corrupt CrawlLog detected with Exception:" + CCStringUtils.stringifyException(e)); try { // this is a serious error ... time to purge the crawl log directory // altogether ... purgeActiveLog(); // and all active segment logs as well... for (CrawlSegmentLog segmentLog : _loggers.values()) { segmentLog.purgeActiveLog(); } } catch (IOException e2) { LOG.error("IOException during Segment Log PURGE:" + CCStringUtils.stringifyException(e2)); } // time to die hard ... throw new RuntimeException(e); } // and complete transaction ... callback.checkpointFailed(checkpointId, e); } })); } private static final class CustomByteArrayOutputStream extends ByteArrayOutputStream { public CustomByteArrayOutputStream(int initialSize) { super(initialSize); } public byte[] getBuffer() { return buf; } } private void logCrawlLogWrite(CrawlURL url, int bufferSizeOut) { StringBuffer sb = new StringBuffer(); sb.append(String.format("%1$20.20s ", CCStringUtils.dateStringFromTimeValue(System.currentTimeMillis()))); sb.append(String.format("%1$4.4s ", url.getResultCode())); sb.append(String.format("%1$10.10s ", url.getContentRaw().getCount())); if ((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) { sb.append(url.getRedirectURL()); sb.append(" "); } sb.append(url.getUrl()); _engine.getCrawlLogLog().info(sb.toString()); } static byte[] _sync; // 16 random bytes static final int SYNC_BYTES_SIZE = 16; static { try { MessageDigest digester = MessageDigest.getInstance("MD5"); digester.update("SOME RANDOM BYTES".getBytes()); _sync = digester.digest(); } catch (Exception e) { throw new RuntimeException(e); } } private static class SyncedCrawlURLLogWriter { boolean _injectErrors = false; boolean _corruptThisEntry = false; public SyncedCrawlURLLogWriter(boolean injectErrors) { _injectErrors = injectErrors; } public SyncedCrawlURLLogWriter() { } private CustomByteArrayOutputStream bufferOutputStream = new CustomByteArrayOutputStream(1 << 17); private DataOutputStream dataOutputStream = new DataOutputStream(bufferOutputStream); private CRC32 crc = new CRC32(); public void writeItem(DataOutputStream crawlLogStream, CrawlURL url) throws IOException { bufferOutputStream.reset(); // write to intermediate stream ... url.write(dataOutputStream); // and crc the data ... 
crc.reset(); crc.update(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size()); // write out sync bytes first crawlLogStream.write(_sync); // write out length crawlLogStream.writeInt(bufferOutputStream.size()); // crc next long computedValue = crc.getValue(); if (_injectErrors) { _corruptThisEntry = !_corruptThisEntry; if (_corruptThisEntry) { LOG.info("Intentionally Corrupting URL:" + url.getUrl()); computedValue += 12; } } crawlLogStream.writeLong(computedValue); // and then the data crawlLogStream.write(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size()); } } private void flushLog(final FlushCompletionCallback completionCallback) { if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH:Collecting Entries...."); // set flush in progress indicator ... setFlushInProgress(true); // and collect buffers in async thread context (thus not requiring // synchronization) final LinkedList<CrawlSegmentLog.LogItemBuffer> collector = new LinkedList<CrawlSegmentLog.LogItemBuffer>(); // flush robots log _robotsSegment.flushLog(collector); // walk segments collecting log items .... for (CrawlSegmentLog logger : _loggers.values()) { // flush any log items into the collector logger.flushLog(collector); } if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH:Collection Returned " + collector.size() + " Buffers"); // walk collector list identifying the list of unique segment ids final Set<Long> packedSegmentIdSet = new HashSet<Long>(); int urlItemCount = 0; for (CrawlSegmentLog.LogItemBuffer buffer : collector) { if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) { packedSegmentIdSet.add(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId())); } urlItemCount += buffer.getItemCount(); } if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH:There are " + urlItemCount + " Items in Flush Buffer Associated With " + packedSegmentIdSet.size() + " Segments"); final File crawlLogFile = getActivePath(_rootDirectory); // now check to see if there is anything to do ... if (collector.size() != 0) { if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH: Collector Size is NOT Zero... Starting Log Flusher Thread"); // ok ... time to spawn a thread to do the blocking flush io _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop, new Callable<Boolean>() { public Boolean call() throws Exception { if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH: Log Flusher Thread Started"); long startTime = System.currentTimeMillis(); Map<Long, DataOutputStream> streamsMapByPackedId = new HashMap<Long, DataOutputStream>(); Map<Long, Integer> recordCountsByPackedId = new HashMap<Long, Integer>(); long crawlLogRecordCount = 0; // open the actual crawler log file ... final DataOutputStream crawlLogStream = new DataOutputStream(new FileOutputStream(crawlLogFile, true)); try { if (Environment.detailLogEnabled()) LOG.info("LOG_FLUSH: Log Flusher Thread Opening Streams for Segments in Buffer"); // now open a set of file descriptors related to the identified // segments for (long packedSegmentId : packedSegmentIdSet) { // construct the unique filename for the given log file... File activeSegmentLog = CrawlSegmentLog.buildActivePath(_rootDirectory, getListIdFromLogId(packedSegmentId), getSegmentIdFromLogId(packedSegmentId)); // initialize the segment log ... CrawlSegmentLog.initializeLogFile(activeSegmentLog); // initialize record counts per stream ... 
              recordCountsByPackedId.put(packedSegmentId, CrawlSegmentLog.readerHeader(activeSegmentLog));
              // and open an output stream for the specified log file ...
              streamsMapByPackedId.put(packedSegmentId, new DataOutputStream(new FileOutputStream(activeSegmentLog, true)));
            }

            if (Environment.detailLogEnabled())
              LOG.info("LOG_FLUSH: Log Flusher Thread Walking Items in Buffer");

            // initialize a total item count variable
            int totalItemCount = 0;

            // crawl history stream
            DataOutputBuffer historyStream = new DataOutputBuffer();

            // and now walk log buffers ...
            for (CrawlSegmentLog.LogItemBuffer buffer : collector) {
              if (Environment.detailLogEnabled())
                LOG.info("LOG_FLUSH: Log Flusher Thread Writing " + buffer.getItemCount() + " Entries for Segment:" + buffer.getSegmentId());

              // output stream
              DataOutputStream segmentLogStream = null;

              if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) {
                // update segment count first ...
                recordCountsByPackedId.put(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()),
                    recordCountsByPackedId.get(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId())) + buffer.getItemCount());
                // get output stream associated with segment id
                segmentLogStream = streamsMapByPackedId.get(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()));
              }

              // and our local record counter ...
              crawlLogRecordCount += buffer.getItemCount();

              // and next do the actual disk flush ...
              totalItemCount += buffer.flushToDisk(totalItemCount, new CrawlSegmentLog.LogItemBuffer.CrawlURLWriter() {

                SyncedCrawlURLLogWriter syncedLogWriter = new SyncedCrawlURLLogWriter();

                public void writeItem(CrawlURL url) throws IOException {
                  // log it
                  logCrawlLogWrite(url, url.getContentSize());
                  // write it
                  syncedLogWriter.writeItem(crawlLogStream, url);
                }

                public void writeItemCount(int entryCount) throws IOException {
                }

              }, segmentLogStream, historyStream);
            }

            if (Environment.detailLogEnabled())
              LOG.info("LOG_FLUSH: Log Flusher Finished Writing Entries To Disk");

            collector.clear();

          } catch (IOException e) {
            LOG.error("Critical Exception during Crawl Log Flush:" + CCStringUtils.stringifyException(e));
            throw e;
          } finally {
            if (crawlLogStream != null) {
              crawlLogStream.flush();
              crawlLogStream.close();
            }
            for (DataOutputStream stream : streamsMapByPackedId.values()) {
              // keep close inside the null check so a missing stream does not NPE here
              if (stream != null) {
                stream.flush();
                stream.close();
              }
            }
          }

          // at this point... update the crawl log header ...
          try {
            if (Environment.detailLogEnabled())
              LOG.info("LOG_FLUSH: Updating Log File Headers");

            // update the log file header
            updateLogFileHeader(crawlLogFile, _header, crawlLogRecordCount);

            // and update each completion log header ...
            for (long packedSegmentId : recordCountsByPackedId.keySet()) {
              File activeSegmentLogPath = CrawlSegmentLog.buildActivePath(_rootDirectory,
                  getListIdFromLogId(packedSegmentId), getSegmentIdFromLogId(packedSegmentId));
              CrawlSegmentLog.writeHeader(activeSegmentLogPath, recordCountsByPackedId.get(packedSegmentId));
            }
          } catch (IOException e) {
            LOG.error("Critical Exception during Crawl Log Flush:" + CCStringUtils.stringifyException(e));
            throw e;
          } finally {
          }

          long endTime = System.currentTimeMillis();

          _flushTimeAVG.addSample((double) endTime - startTime);
          _flushTimeSmoothed.addSample((double) endTime - startTime);
          _lastFlushTime = endTime - startTime;

          LOG.info("LOG_FLUSH: Log Flusher Flushed Successfully");

          return true;
        }
      }, new CompletionCallback<Boolean>() {

        public void taskComplete(Boolean updateResult) {
          setFlushInProgress(false);
          if (completionCallback != null) {
            completionCallback.flushComplete();
          }
        }

        public void taskFailed(Exception e) {
          setFlushInProgress(false);
          if (completionCallback != null) {
            completionCallback.flushFailed(e);
          }
          // all failures are critical in this particular task ...
          LOG.fatal("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));
          // no matter ... it is time to CORE the server ...
          throw new RuntimeException("CRITICAL FAILURE: Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));
        }
      }));
    } else {
      setFlushInProgress(false);
      if (completionCallback != null) {
        completionCallback.flushComplete();
      }
    }
  }

  public boolean isForcedCheckpointPossible() {
    // now one more check to see if we have enough items to do a checkpoint ...
    if (_header._itemCount != 0) {
      return true;
    }
    return false;
  }

  public boolean isCheckpointPossible(long currentTime) {
    if (_lastCheckpointTime == -1
        || currentTime - _lastCheckpointTime >= CrawlerServer.getServer().getCrawlLogCheckpointInterval()) {
      // now one more check to see if we have enough items to do a checkpoint ...
      if (_header._itemCount >= CrawlerServer.getServer().getCrawlLogCheckpointItemThreshold()
          || _header._fileSize >= CrawlerServer.getServer().getCrawlLogCheckpointLogSizeThreshold()) {
        return true;
      }
    }
    return false;
  }

  public void forceFlushAndCheckpointLog(final CheckpointCompletionCallback outerCallback) {
    if (isCheckpointInProgress() || isFlushInProgress()) {
      throw new RuntimeException("forceFlush called while active Checkpoint or Flush In Progress!!");
    }

    flushLog(new FlushCompletionCallback() {

      @Override
      public void flushComplete() {
        long currentTime = System.currentTimeMillis();
        LOG.info("LOG_FLUSH Flush Complete... Checking to see if Checkpoint Possible");
        if (isForcedCheckpointPossible()) {
          // yes .. go ahead and checkpoint log
          LOG.info("Checkpointing Logs to HDFS");
          // start the checkpoint ...
          checkpoint(currentTime, new CheckpointCompletionCallback() {

            public void checkpointComplete(long checkpointId, Vector<Long> completedSegmentList) {
              LOG.info("CrawlLog Checkpoint:" + checkpointId + " completed");
              if (completedSegmentList != null) {
                // walk completed segments ... updating their crawl state ...
                if (_engine != null) {
                  for (long packedSegmentId : completedSegmentList) {
                    // notify crawler engine of status change ...
_engine.crawlSegmentComplete(packedSegmentId); } } } // ok initiate outer callback outerCallback.checkpointComplete(checkpointId, null); } public void checkpointFailed(long checkpointId, Exception e) { LOG.error("Checkpoint Failed for Checkpoint:" + checkpointId + " With Exception:" + CCStringUtils.stringifyException(e)); outerCallback.checkpointFailed(checkpointId, e); } }, currentTime); } else { if (Environment.detailLogEnabled()) LOG.info("Checkpoint Skipped. Nothing to checkpoint"); outerCallback.checkpointComplete(0, null); } } @Override public void flushFailed(Exception e) { // log error and bail ... LOG.error(CCStringUtils.stringifyException(e)); // initiate callback outerCallback.checkpointFailed(0, e); } }); } public void startLogFlusher() { _logFlusherTimer = new Timer(CrawlerServer.getServer().getCrawlLogFlushInterval(), true, new Timer.Callback() { public void timerFired(Timer timer) { // if checkpoint is NOT in progress ... if (!isCheckpointInProgress() && !isFlushInProgress()) { LOG.info("LOG_FLUSH Starting ..."); flushLog( new FlushCompletionCallback() { public void flushComplete() { // flush is complete ... check to see if we want to do a // checkpoint ... long currentTime = System.currentTimeMillis(); LOG.info("LOG_FLUSH Flush Complete... Checking to see if Checkpoint Possilbe"); if (isCheckpointPossible(currentTime)) { LOG.info("Checkpointing Logs to HDFS"); // pause fetcher to prevent race condition where log flush takes // a long time and causes the fetcher to consume all avaliable // memory with content buffers _engine.pauseFetch(); // start the checkpoint ... checkpoint(currentTime, new CheckpointCompletionCallback() { public void checkpointComplete(long checkpointId, Vector<Long> completedSegmentList) { LOG.info("CrawlLog Checkpoint:" + checkpointId + " completed"); _engine.resumeFetch(); if (completedSegmentList != null) { // walk completed segments ... updating their crawl state // ... if (_engine != null) { for (long packedSegmentId : completedSegmentList) { // notify crawler engine of status change ... _engine.crawlSegmentComplete(packedSegmentId); } } } } public void checkpointFailed(long checkpointId, Exception e) { _engine.resumeFetch(); LOG.error("Checkpoint Failed for Checkpoint:" + checkpointId + " With Exception:" + CCStringUtils.stringifyException(e)); } }, currentTime); } } public void flushFailed(Exception e) { LOG.error("Flush Failed with Exception:" + CCStringUtils.stringifyException(e)); } } ); _engine.resumeFetch(); } // now } }); _eventLoop.setTimer(_logFlusherTimer); } public interface LogFlusherStopActionCallback { public void stopComplete(); } public void stopLogFlusher(final LogFlusherStopActionCallback completionCallback) { // indicate that a shutdown is in progress ... _shutdownInProgress = true; // stop the log flusher timer ... if (_logFlusherTimer != null) { _eventLoop.cancelTimer(_logFlusherTimer); } // create a polling timer ... final Timer waitTimer = new Timer(1000, true, new Timer.Callback() { public void timerFired(Timer timer) { // check to see if we are done flushing or checkpointing ... if (!isFlushInProgress() && !isCheckpointInProgress()) { LOG.info("CrawlLog - stopLog Timer - No Flush or Checkpoint in Progress... Initiating CrawlLog Shutdown"); // good to go ... cancel timer first ... _eventLoop.cancelTimer(timer); // and cleanup ... _logFlusherTimer = null; _shutdownInProgress = false; // initiate callback ... completionCallback.stopComplete(); } else { LOG.info("CrawlLog - stopLog Timer - Flush or Checkpoint in Progress... 
Waiting ... "); } } }); // and start the timer ... _eventLoop.setTimer(waitTimer); } public void collectStats(RuntimeStatsCollector collector) { collector.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlLog_FlushTimeAVG, _flushTimeAVG .getAverage()); collector.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlLog_FlushTimeSmoothed, _flushTimeSmoothed.getAverage()); collector.setLongValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlLog_FlushTimeLast, _lastFlushTime); } private static CrawlSegmentHost createHost(String hostName) { CrawlSegmentHost host = new CrawlSegmentHost(); host.setHostName(hostName); byte[] hostNameAsBytes = host.getHostName().getBytes(); host.setHostFP(FPGenerator.std64.fp(hostNameAsBytes, 0, hostNameAsBytes.length)); return host; } private static CrawlSegmentURL createSegmentURL(URL url) { CrawlSegmentURL segmentURL = new CrawlSegmentURL(); segmentURL.setUrl(url.toString()); byte[] urlAsBytes = segmentURL.getUrl().getBytes(); segmentURL.setUrlFP(FPGenerator.std64.fp(urlAsBytes, 0, urlAsBytes.length)); return segmentURL; } private static CrawlSegmentDetail loadCrawlSegment(String fileName) throws IOException { TreeMap<String, CrawlSegmentHost> hosts = new TreeMap<String, CrawlSegmentHost>(); URL resourceURL = CrawlEnvironment.getHadoopConfig().getResource(fileName); if (resourceURL == null) { throw new FileNotFoundException(); } InputStream stream = resourceURL.openStream(); BufferedReader reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(stream))); String line = null; do { line = reader.readLine(); if (line != null) { if (Environment.detailLogEnabled()) LOG.info(line); try { URL theURL = new URL(line); CrawlSegmentHost host = hosts.get(theURL.getHost()); if (host == null) { host = createHost(theURL.getHost()); hosts.put(theURL.getHost(), host); } CrawlSegmentURL segmentURL = createSegmentURL(theURL); host.getUrlTargets().add(segmentURL); } catch (MalformedURLException e) { LOG.error("SKIPPING Malformed URL::" + line); } } } while (line != null); CrawlSegmentDetail crawlSegmentDetail = new CrawlSegmentDetail(); int urlCount = 0; crawlSegmentDetail.setSegmentId(1); for (CrawlSegmentHost host : hosts.values()) { crawlSegmentDetail.getHosts().add(host); urlCount += host.getUrlTargets().size(); } crawlSegmentDetail.setUrlCount(urlCount); // finally, sort by host (as will be the case in a proper map reduce // produced segment ... 
Collections.sort(crawlSegmentDetail.getHosts()); return crawlSegmentDetail; } public Vector<Long> getActiveSegmentIdList() { Vector<Long> segmentIdList = new Vector<Long>(); segmentIdList.addAll(_loggers.keySet()); return segmentIdList; } static void validateInputOutputCrawlURLArrays(ArrayList<CrawlURL> input, ArrayList<CrawlURL> output) throws IOException { Assert.assertTrue(input.size() == output.size()); for (int i = 0; i < input.size(); ++i) { CrawlURL left = input.get(i); CrawlURL right = input.get(i); Assert.assertTrue(left.getUrl().equals(right.getUrl())); Assert.assertTrue(left.getContentRaw().getReadOnlyBytes().equals(right.getContentRaw().getReadOnlyBytes())); } } static void validateLogFlusherCode(final File localDirPath, final Path remotePath, boolean injectErrors) throws IOException { final Configuration conf = new Configuration(); final FileSystem fs = FileSystem.get(conf); fs.mkdirs(remotePath); // ok create a crawlLog test file File localFile = File.createTempFile("crawlLog", "test", localDirPath); localFile.delete(); LOG.info("Initializing Temp File:" + localFile); // initialize LogFileHeader fileHeader = initializeLogFileHeaderFromLogFile(localFile); LOG.info("Creating SyncedCrawl URL Writer"); // create synced url writer ... SyncedCrawlURLLogWriter crawlURLWriter = new SyncedCrawlURLLogWriter(injectErrors); ArrayList<CrawlURL> urlObjects = new ArrayList<CrawlURL>(); // write a couple of url objects for (int i = 0; i < 100; ++i) { CrawlURL url = new CrawlURL(); url.setUrl("http://someurl.com/" + i); byte bytes[] = MD5.digest("Some Random:" + Math.random() + " Number").getBytes(); url.setContentRaw(new FlexBuffer(bytes)); final DataOutputStream crawlLogStream = new DataOutputStream(new FileOutputStream(localFile, true)); try { LOG.info("Appending object to log"); crawlURLWriter.writeItem(crawlLogStream, url); } finally { LOG.info("Flushing Log"); crawlLogStream.flush(); crawlLogStream.close(); } LOG.info("Updating Header"); updateLogFileHeader(localFile, fileHeader, 1); if (!injectErrors || i % 2 == 0) { urlObjects.add(url); } else { // drop odd entry LOG.info("Dropping Odd Entry:" + url.getUrl()); } } final ArrayList<CrawlURL> urlObjectsOut = new ArrayList<CrawlURL>(); HDFSCrawlURLWriter stubWriter = new HDFSCrawlURLWriter() { SequenceFileCrawlURLWriter innerWriter = new SequenceFileCrawlURLWriter(conf, fs, remotePath, "testNode", 1L); @Override public void writeCrawlURLItem(Text url, CrawlURL urlObject) throws IOException { LOG.info("Got URL:" + url.toString()); urlObjectsOut.add(urlObject); innerWriter.writeCrawlURLItem(url, urlObject); } @Override public void close() throws IOException { innerWriter.close(); } public List<Path> getFilenames() { return innerWriter.getFilenames(); } }; try { LOG.info("Transferring from Local to Remote"); transferLocalCheckpointLog(localFile, stubWriter, 1L); } finally { stubWriter.close(); } LOG.info("Validating Input/Output"); validateInputOutputCrawlURLArrays(urlObjects, urlObjectsOut); // read via sequenceFile urlObjectsOut.clear(); Path firstFile = Iterators.getNext(stubWriter.getFilenames().iterator(), null); SequenceFile.Reader reader = new SequenceFile.Reader(fs, firstFile, conf); Text key = new Text(); CrawlURL value = new CrawlURL(); while (reader.next(key, value)) { LOG.info("Got:" + key.toString()); urlObjectsOut.add(value); value = new CrawlURL(); } reader.close(); LOG.info("Validating Input/Output"); validateInputOutputCrawlURLArrays(urlObjects, urlObjectsOut); LOG.info("Done!"); } public static void walkCrawlLogFile(File 
crawlLogPath, long startOffset) throws IOException { // and open the crawl log file ... RandomAccessFile inputStream = null; IOException exception = null; CRC32 crc = new CRC32(); CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 17); byte[] syncBytesBuffer = new byte[SYNC_BYTES_SIZE]; // save position for potential debug output. long lastReadPosition = 0; try { inputStream = new RandomAccessFile(crawlLogPath, "rw"); // and a data input stream ... RandomAccessFile reader = inputStream; // seek to zero reader.seek(0L); // read the header ... LogFileHeader header = readLogFileHeader(reader); System.out.println("Header ItemCount:" + header._itemCount + " FileSize:" + header._fileSize); if (startOffset != 0L) { System.out.println("Preseeking to:" + startOffset); reader.seek(startOffset); } Configuration conf = new Configuration(); // read a crawl url from the stream... long recordCount = 0; while (inputStream.getFilePointer() < header._fileSize) { // System.out.println("PRE-SYNC SeekPos:"+ // inputStream.getFilePointer()); if (seekToNextSyncBytesPos(syncBytesBuffer, reader, header._fileSize)) { // System.out.println("POST-SYNC SeekPos:"+ // inputStream.getFilePointer()); lastReadPosition = inputStream.getFilePointer(); // skip sync inputStream.skipBytes(SYNC_BYTES_SIZE); // read length ... int urlDataLen = reader.readInt(); long urlDataCRC = reader.readLong(); if (urlDataLen > buffer.getBuffer().length) { buffer = new CustomByteArrayOutputStream(((urlDataLen / 65536) + 1) * 65536); } reader.read(buffer.getBuffer(), 0, urlDataLen); crc.reset(); crc.update(buffer.getBuffer(), 0, urlDataLen); long computedValue = crc.getValue(); // validate crc values ... if (computedValue != urlDataCRC) { LOG.error("CRC Mismatch Detected during HDFS transfer in CrawlLog:" + crawlLogPath.getAbsolutePath() + " FilePosition:" + lastReadPosition); inputStream.seek(lastReadPosition + 1); } else { if (recordCount++ % 10000 == 0) { // allocate a crawl url data structure CrawlURL url = new CrawlURL(); DataInputStream bufferReader = new DataInputStream(new ByteArrayInputStream(buffer.getBuffer(), 0, urlDataLen)); // populate it from the (in memory) data stream url.readFields(bufferReader); System.out.println("Record:" + recordCount + " At:" + lastReadPosition + " URL:" + url.getUrl() + " BuffSize:" + urlDataLen + " ContentLen:" + url.getContentRaw().getCount() + " LastModified:" + new Date(url.getLastAttemptTime()).toString()); } } } else { break; } } } catch (EOFException e) { LOG.error("Caught EOF Exception during read of local CrawlLog:" + crawlLogPath.getAbsolutePath() + " FilePosition:" + lastReadPosition); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); exception = e; throw e; } finally { if (inputStream != null) inputStream.close(); } } }