/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.listcrawler;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Vector;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.record.Buffer;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.CacheItem;
import org.commoncrawl.util.CCStringUtils;
/**
* Thread that flushes crawled content to HDFS
*
* @author rana
*
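*
* A minimal usage sketch (assuming the owning CacheManager constructs and
* starts the thread elsewhere; the thread name below is illustrative):
*
* <pre>
*   CacheManager manager = ...;
*   Thread flusher = new Thread(new HDFSFlusherThread(manager), "hdfs-flusher");
*   flusher.start();
*   // later: enqueue an ExitThreadRequest to stop the thread cleanly
* </pre>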
*/
final class HDFSFlusherThread implements Runnable {
public static final Log LOG = LogFactory.getLog(HDFSFlusherThread.class);
CacheManager _manager;
public HDFSFlusherThread(CacheManager manager) {
_manager = manager;
}
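/**
* Transfers records from the local crawl log into a SequenceFile on HDFS and
* writes a fingerprint -> offset index alongside it.
*
* @param itemFlushLimit maximum number of items to write in this pass (may be
* bypassed near the end of the log; see the threshold check below)
* @param sourceLogFile local crawl log to read records from
* @param startPos byte offset at which to start reading
* @param endPos byte offset at which to stop reading
* @param syncBytes sync marker used to detect and recover from corrupt records
* @param writer SequenceFile writer receiving the item data
* @param indexStreamOut stream receiving the generated index
* @param tupleListOut receives the fingerprint/offset tuples that were written
* @return the log position at which this pass stopped; callers loop until this
* reaches endPos
*/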
private long generateSequenceFileAndIndex(int itemFlushLimit,RandomAccessFile sourceLogFile,long startPos,long endPos, byte[] syncBytes,SequenceFile.Writer writer,DataOutput indexStreamOut,ArrayList<FingerprintAndOffsetTuple> tupleListOut) throws IOException {
byte [] syncCheck = new byte[syncBytes.length];
// and create a list to hold fingerprint / offset information
Vector<FingerprintAndOffsetTuple> fpOffsetList = new Vector<FingerprintAndOffsetTuple>();
long currentPos = startPos;
LOG.info("Flushing entries from offset:" + startPos + " up to offset:" + endPos);
CacheItemHeader itemHeader = new CacheItemHeader();
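// Record layout (inferred from the reads below): CacheItemHeader (sync bytes,
// fingerprint, data length), serialized CacheItem fields, a 4-byte content
// length, the optional content bytes, then ITEM_RECORD_TRAILING_BYTES of trailer.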
int itemsProcessed = 0;
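// set when the bytes remaining fall below the checkpoint-size threshold,
// at which point the pass runs to endPos regardless of the item limit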
boolean ignoreFlushLimit = false;
// start read
while (currentPos < endPos) {
if ((endPos - currentPos) < LocalLogFileHeader.SYNC_BYTES_SIZE)
break;
// seek to current position ...
sourceLogFile.seek(currentPos);
boolean headerLoadFailed = false;
try {
// read the item header ... assuming things are good so far ...
itemHeader.readHeader(sourceLogFile);
}
catch (IOException e) {
CacheManager.LOG.error("### Item Header Load At Position:" + currentPos +" Failed With Exception:" + CCStringUtils.stringifyException(e));
headerLoadFailed = true;
}
if (headerLoadFailed) {
// skip past the sync bytes we just failed to read; the corrupt-record
// handler below will scan forward from here for the next sync point
currentPos += LocalLogFileHeader.SYNC_BYTES_SIZE;
}
// if header sync bytes don't match .. then seek to next sync position ...
if (headerLoadFailed || !Arrays.equals(itemHeader._sync, syncBytes)) {
CacheManager.LOG.error("### Item File Corrupt at position:" + currentPos +" Seeking Next Sync Point");
// reseek to current pos
sourceLogFile.seek(currentPos);
// prime the sync comparison buffer with a full sync marker's worth of bytes
sourceLogFile.readFully(syncCheck);
int syncLen = syncBytes.length;
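// The scan below treats syncCheck as a circular buffer: each outer iteration
// compares syncBytes against the window starting at (i % syncLen) and, on a
// mismatch, slides the window forward by reading one more byte. This mirrors
// the rolling-window re-sync technique used by Hadoop's SequenceFile reader.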
// start scan for next sync position ...
for (int i = 0; sourceLogFile.getFilePointer() < endPos; i++) {
int j = 0;
for (; j < syncLen; j++) {
if (syncBytes[j] != syncCheck[(i+j)%syncLen])
break;
}
if (j == syncLen) {
sourceLogFile.seek(sourceLogFile.getFilePointer() - LocalLogFileHeader.SYNC_BYTES_SIZE); // position before sync
break;
}
syncCheck[i%syncLen] = sourceLogFile.readByte();
}
// whatever happened, the file pointer now marks our current position
currentPos = sourceLogFile.getFilePointer();
if (currentPos < endPos) {
CacheManager.LOG.info("### Item Loader Found another sync point at:" + currentPos);
}
else {
CacheManager.LOG.error("### No more sync points found!");
}
}
else {
CacheManager.LOG.info("Writing Item with FP:" + itemHeader._fingerprint + " Pos Is:" + writer.getLength());
// track offset information for index building purposes
fpOffsetList.add(new FingerprintAndOffsetTuple(itemHeader._fingerprint,writer.getLength()));
// read item data ...
CacheItem cacheItem = new CacheItem();
cacheItem.readFields(sourceLogFile);
// now read content length
int contentLength = sourceLogFile.readInt();
// and if content present... allocate buffer
if (contentLength != 0) {
// allocate content buffer
byte[] contentBuffer = new byte[contentLength];
// read it from disk
sourceLogFile.readFully(contentBuffer);
// and set content into cache item
cacheItem.setContent(new Buffer(contentBuffer));
}
CacheManager.LOG.info("Adding to Sequence File Item with URL:" + cacheItem.getUrl());
// write to sequence file ...
writer.append(new Text(cacheItem.getUrl()),cacheItem);
// advance past the full record: header, item data, and trailing bytes
currentPos += CacheItemHeader.SIZE + itemHeader._dataLength + CacheManager.ITEM_RECORD_TRAILING_BYTES;
// increment item count
itemsProcessed++;
}
if (!ignoreFlushLimit && itemsProcessed >= itemFlushLimit) {
// this gets tricky now ...
// figure out how many bytes it took to reach the flush limit
long approxCheckpointSize = currentPos - startPos;
// compute a threshold number
long bytesThreshold = (long) (approxCheckpointSize * .70);
// compute bytes remaining in checkpoint file ...
long bytesRemaining = endPos - currentPos;
// if the bytes remaining are below the threshold, gobble everything up
// in a single pass (to avoid emitting a small trailing data/index pair)
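// Example: if this checkpoint consumed ~100MB to hit the item limit, the
// threshold is 70MB; with only, say, 50MB left in the log it is cheaper to
// finish in this pass than to emit a small trailing data/index pair.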
if (bytesRemaining <= bytesThreshold) {
// ignore the flush limit and keep on rolling to the end ...
ignoreFlushLimit = true;
LOG.warn("*****Bytes Remaining:" + bytesRemaining + " less than 70% of last whole checkpoint size:" + approxCheckpointSize + ". Bypassing Flush Limit");
}
else {
LOG.info("Reached Flush Item Limit:" + itemsProcessed + " Breaking Out");
break;
}
}
}
LOG.info("Writing Index");
// ok now build the index file ...
HDFSFileIndex.writeIndex(fpOffsetList,indexStreamOut);
LOG.info("Done Writing Index. Total Items Written:" + fpOffsetList.size());
// copy offset list into tuple list
tupleListOut.addAll(fpOffsetList);
return currentPos;
}
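/** Groups the HDFS data file, its HDFS index file, and the local copy of that index produced by a single flush pass. */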
static class IndexDataFileTriple {
public Path _dataFilePath = null;
public Path _indexFilePath = null;
public File _localIndexFilePath = null;
}
@Override
public void run() {
boolean shutdown = false;
while(!shutdown) {
try {
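// block until the next request arrives; an ExitThreadRequest doubles as the
// shutdown signal for this thread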
final CacheFlushRequest request = _manager.getHDFSFlushRequestQueue().take();
switch (request._requestType) {
case ExitThreadRequest: {
// shutdown condition ...
CacheManager.LOG.info("Cache Flusher Thread Received Shutdown. Exiting!");
shutdown = true;
}
break;
case FlushRequest: {
LOG.info("Received Flush Request");
ArrayList<IndexDataFileTriple> tempFiles = new ArrayList<IndexDataFileTriple>();
ArrayList<FingerprintAndOffsetTuple> tuplesOut = new ArrayList<FingerprintAndOffsetTuple>();
// flag to track request status at end ..
boolean requestFailed = false;
long logStart = LocalLogFileHeader.SIZE;
long logEnd = logStart + request._bytesToFlush;
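// the flush starts just past the local log file header and covers exactly
// the byte range specified by the request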
// create an hdfs temp directory to hold the data (and index) files
long generateTime = System.currentTimeMillis();
Path tempDir = new Path(CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + "/flusher-temp-"+ generateTime);
// mkdir ...
try {
_manager.getRemoteFileSystem().mkdirs(tempDir);
} catch (IOException e1) {
LOG.error(CCStringUtils.stringifyException(e1));
requestFailed = true;
}
int iterationNumber = 0;
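// Each pass writes at most the cache flush threshold's worth of items into
// its own data/index file pair; generateSequenceFileAndIndex returns the new
// read position, so this loop runs until the requested range is fully
// consumed or a pass fails.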
while (logStart != logEnd && !requestFailed) {
Path tempDataFile = new Path(tempDir,CacheManager.PROXY_CACHE_FILE_DATA_PREFIX + "-" + iterationNumber);
Path tempIndexFile = new Path(tempDir,CacheManager.PROXY_CACHE_FILE_INDEX_PREFIX + "-" + iterationNumber);
LOG.info("FlushRequest Pass#:" + iterationNumber + " DataPath:"+tempDataFile + " IndexPath:" + tempIndexFile);
SequenceFile.Writer writer = null;
FSDataOutputStream indexOutputStream = null;
RandomAccessFile localLogFile = null;
try {
LOG.info("Pass#:" + iterationNumber + " Opening SequenceFile for Output");
// open a temporary hdfs streams ...
writer = SequenceFile.createWriter(_manager.getRemoteFileSystem(),CrawlEnvironment.getHadoopConfig(),tempDataFile,Text.class,CacheItem.class,CompressionType.NONE);
// opening index output stream ...
LOG.info("Pass#:" + iterationNumber + " Opening Index Output Stream");
indexOutputStream = _manager.getRemoteFileSystem().create(tempIndexFile);
LOG.info("Pass#:" + iterationNumber + " Opening Local Log");
localLogFile = new RandomAccessFile(_manager.getActiveLogFilePath(),"rw");
// transfer log entries and generate index
logStart = generateSequenceFileAndIndex(_manager.getCacheFlushThreshold(),localLogFile,logStart,logEnd,_manager.getLocalLogSyncBytes(),writer,indexOutputStream,tuplesOut);
}
catch (IOException e) {
CacheManager.LOG.error(CCStringUtils.stringifyException(e));
requestFailed = true;
}
finally {
if (writer != null) {
try {
writer.close();
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
if (indexOutputStream != null) {
try {
indexOutputStream.close();
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
if (localLogFile != null) {
try {
localLogFile.close();
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
}
if (requestFailed) {
try {
LOG.info("Pass#:" + iterationNumber+ " Failed. Deleting temp files");
_manager.getRemoteFileSystem().delete(tempDataFile, false);
_manager.getRemoteFileSystem().delete(tempIndexFile, false);
}
catch (IOException e){
LOG.error("Delete Failed During Failure! Potential Orphan Files! : " + CCStringUtils.stringifyException(e));
}
break;
}
else {
LOG.info("Pass#:" + iterationNumber+ " Finished. Adding files to tuple list");
// add temp file tuple
IndexDataFileTriple indexDataPair = new IndexDataFileTriple();
indexDataPair._dataFilePath = tempDataFile;
indexDataPair._indexFilePath = tempIndexFile;
tempFiles.add(indexDataPair);
}
iterationNumber++;
}
LOG.info("Pass Loop Complete. Finalizing Commit");
// if every pass succeeded, move the temp files into their final locations
if (!requestFailed) {
int itemIndex = 0;
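// final names are keyed off generateTime plus the pass index, keeping them
// unique and ordered across the passes of a single flush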
for (IndexDataFileTriple indexDataPair : tempFiles) {
// generate final paths ...
Path finalOutputDir = _manager.getRemoteDataDirectory();
Path finalDataFilePath = new Path(finalOutputDir,CacheManager.PROXY_CACHE_FILE_DATA_PREFIX+ "-" + (generateTime + itemIndex));
Path finalIndexFilePath = new Path(finalOutputDir,CacheManager.PROXY_CACHE_FILE_INDEX_PREFIX+ "-" + (generateTime + itemIndex));
try {
LOG.info("Pass#:" + itemIndex + " Renaming Temp Files");
LOG.info("Pass#:" + itemIndex + " Final Data File Name is:" + finalDataFilePath);
LOG.info("Pass#:" + itemIndex + " Final Index File Name is:" + finalIndexFilePath);
// rename files ...
_manager.getRemoteFileSystem().rename(indexDataPair._dataFilePath, finalDataFilePath);
indexDataPair._dataFilePath = finalDataFilePath;
_manager.getRemoteFileSystem().rename(indexDataPair._indexFilePath, finalIndexFilePath);
indexDataPair._indexFilePath = finalIndexFilePath;
}
catch (IOException e) {
LOG.info("Pass#:" + itemIndex + " Rename Failed");
LOG.error(CCStringUtils.stringifyException(e));
requestFailed = true;
break;
}
try {
// copy to local ...
indexDataPair._localIndexFilePath = new File(_manager.getLocalDataDirectory(),finalIndexFilePath.getName());
LOG.info("Pass#:" + itemIndex + " Copying Remote Index File at:" + finalIndexFilePath + " to Local Directory:" + indexDataPair._localIndexFilePath.getAbsolutePath());
_manager.getRemoteFileSystem().copyToLocalFile(finalIndexFilePath, new Path(indexDataPair._localIndexFilePath.getAbsolutePath()));
LOG.info("Pass#:" + itemIndex + " Done Copying Remote Index File to Local");
}
catch (IOException e) {
LOG.info("Pass#:" + itemIndex + " Local File Copy Failed with Exception:" + CCStringUtils.stringifyException(e));
requestFailed = true;
indexDataPair._localIndexFilePath = null;
break;
}
// increment item index
itemIndex++;
}
// ok callback to manager if request succeeded
if (!requestFailed) {
try {
LOG.info("Flush Complete. Calling hdfsFlushComplete");
_manager.hdfsCacheFlushRequestComplete(request,tuplesOut,tempFiles);
LOG.info("Flush Complete. hdfsFlushComplete succeeded");
}
catch (IOException e) {
LOG.error("hdfsFlushComplete returned Exception:" + CCStringUtils.stringifyException(e));
requestFailed = true;
}
}
}
if (requestFailed) {
LOG.info("Cache Manager Log Flush Failed. Deleting files");
try {
// delete temp file directory recursively
_manager.getRemoteFileSystem().delete(tempDir, true);
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
// iterate temp file list
for (IndexDataFileTriple triple : tempFiles) {
try {
LOG.info("Deleting:" + triple._dataFilePath);
_manager.getRemoteFileSystem().delete(triple._dataFilePath,false);
LOG.info("Deleting:" + triple._indexFilePath);
_manager.getRemoteFileSystem().delete(triple._indexFilePath,false);
if (triple._localIndexFilePath != null) {
LOG.info("Deleting LOCAL:" + triple._localIndexFilePath);
triple._localIndexFilePath.delete();
}
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
// callback to manager with the bad news ...
_manager.hdfsCacheFlushRequestFailed(request);
}
}
break;
}
} catch (InterruptedException e) {
LOG.error("Unexpected Exception in HDFSFlusher Thread:" + CCStringUtils.stringifyException(e));
}
}
}
}