/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.listcrawler;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.zip.CRC32;
import java.util.zip.CheckedOutputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.DataOutputBuffer;
import org.commoncrawl.protocol.CacheItem;
import org.commoncrawl.util.ByteStream;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FlexBuffer;
/**
* thread that writes crawled content to disk / cache
* @author rana
*
*/
final class CacheWriterThread implements Runnable {
private class ThriftyGZIPOutputStream extends GZIPOutputStream {
public ThriftyGZIPOutputStream(OutputStream os) throws IOException {
super(os);
}
@Override
public void finish() throws IOException {
super.finish();
def.end();
}
@Override
public void close() throws IOException {
super.finish();
def.end();
}
}
public static final Log LOG = LogFactory.getLog(CacheWriterThread.class);
private CRC32 _crc32Out = new CRC32();
private CacheManager _manager;
private File _dataDirectory;
private LinkedBlockingQueue<CacheWriteRequest> _writeRequestQueue;
public CacheWriterThread(CacheManager cacheManager) {
_manager = cacheManager;
_dataDirectory = cacheManager.getLocalDataDirectory();
_writeRequestQueue = cacheManager.getWriteRequestQueue();
}
@Override
public void run() {
boolean shutdown = false;
while(!shutdown) {
try {
final CacheWriteRequest request = _writeRequestQueue.take();
switch (request._requestType) {
case ExitThreadRequest: {
// shutdown condition ...
CacheManager.LOG.info("Disk Writer Thread Received Shutdown. Exiting!");
shutdown = true;
}
break;
case WriteRequest: {
long timeStart = System.currentTimeMillis();
try {
// reset crc calculator (single thread so no worries on synchronization)
_crc32Out.reset();
// figure out if we need to compress the item ...
if ((request._item.getFlags() & CacheItem.Flags.Flag_IsCompressed) == 0 && request._item.getContent().getCount() != 0) {
LOG.info("Incoming Cache Request Content for:" + request._item.getUrl() + " is not compressed. Compressing...");
ByteStream compressedBytesOut = new ByteStream(request._item.getContent().getCount());
ThriftyGZIPOutputStream gzipOutputStream = new ThriftyGZIPOutputStream(compressedBytesOut);
gzipOutputStream.write(request._item.getContent().getReadOnlyBytes(),0,request._item.getContent().getCount());
gzipOutputStream.finish();
LOG.info("Finished Compressing Incoming Content for:" + request._item.getUrl() +
" BytesIn:" + request._item.getContent().getCount() +
" BytesOut:" + compressedBytesOut.size());
// replace buffer
request._item.setContent(new FlexBuffer(compressedBytesOut.getBuffer(),0,compressedBytesOut.size()));
request._item.setFlags((request._item.getFlags() | CacheItem.Flags.Flag_IsCompressed));
}
// create streams ...
ByteStream bufferOutputStream = new ByteStream(8192);
CheckedOutputStream checkedStream = new CheckedOutputStream(bufferOutputStream,_crc32Out);
DataOutputStream dataOutputStream = new DataOutputStream(checkedStream);
// remember if this item has content ...
boolean hasContent = request._item.isFieldDirty(CacheItem.Field_CONTENT);
// now mark the content field as clean, so that it will not be serialized in our current serialization attempt ...
request._item.setFieldClean(CacheItem.Field_CONTENT);
// and go ahead and write out the data to the intermediate buffer while also computing partial checksum
request._item.write(dataOutputStream);
request._item.setFieldDirty(CacheItem.Field_CONTENT);
// ok, now ... write out file header ...
CacheItemHeader itemHeader = new CacheItemHeader(_manager.getLocalLogSyncBytes());
itemHeader._status = CacheItemHeader.STATUS_ALIVE;
itemHeader._lastAccessTime = System.currentTimeMillis();
itemHeader._fingerprint = request._itemFingerprint;
// compute total length ...
// first the header bytes in the cacheItem
itemHeader._dataLength = bufferOutputStream.size();
// next the content length (encoded - as in size + bytes) ...
itemHeader._dataLength += 4 + request._item.getContent().getCount();
// lastly the crc value iteself ...
itemHeader._dataLength += 8;
// open the log file ...
DataOutputBuffer logStream = new DataOutputBuffer();
// ok, go ahead and write the header
itemHeader.writeHeader(logStream);
// ok now write out the item data minus content...
logStream.write(bufferOutputStream.getBuffer(),0,bufferOutputStream.size());
// now create a checked stream for the content ...
CheckedOutputStream checkedStream2 = new CheckedOutputStream(logStream,checkedStream.getChecksum());
dataOutputStream = new DataOutputStream(checkedStream2);
// content size
dataOutputStream.writeInt(request._item.getContent().getCount());
// now write out the content (via checked stream so that we can calc checksum on content)
dataOutputStream.write(request._item.getContent().getReadOnlyBytes(),0,request._item.getContent().getCount());
// ok ... lastly write out the checksum bytes ...
dataOutputStream.writeLong(checkedStream2.getChecksum().getValue());
// and FINALLY, write out the total item bytes (so that we can seek in reverse to read last request log
logStream.writeInt(CacheItemHeader.SIZE + itemHeader._dataLength);
// ok flush everyting to the memory stream
dataOutputStream.flush();
//ok - time to acquire the log semaphore
//LOG.info("Acquiring Local Log Semaphore");
_manager.getLocalLogAccessSemaphore().acquireUninterruptibly();
try {
// now time to acquire the write semaphore ...
_manager.getLocalLogWriteAccessSemaphore().acquireUninterruptibly();
// get the current file position
long recordOffset = _manager.getLocalLogFilePos();
try {
long ioTimeStart = System.currentTimeMillis();
RandomAccessFile logFile = new RandomAccessFile(_manager.getActiveLogFilePath(),"rw");
try {
// seek to our known record offset
logFile.seek(recordOffset);
// write out the data
logFile.write(logStream.getData(),0,logStream.getLength());
}
finally {
logFile.close();
}
// now we need to update the file header
_manager.updateLogFileHeader(_manager.getActiveLogFilePath(),1,CacheItemHeader.SIZE + itemHeader._dataLength + 4 /*trailing bytes*/);
CacheManager.LOG.info("#### Wrote Cache Item in:" + (System.currentTimeMillis() - timeStart)
+ " iotime:" + (System.currentTimeMillis() - ioTimeStart) + " QueueSize:" + _writeRequestQueue.size());
}
finally {
// release write semaphore quickly
_manager.getLocalLogWriteAccessSemaphore().release();
}
// now inform the manager of the completed request ...
_manager.writeRequestComplete(request,recordOffset);
}
finally {
//LOG.info("Releasing Local Log Semaphore");
_manager.getLocalLogAccessSemaphore().release();
}
}
catch (IOException e) {
CacheManager.LOG.error("### FUC# BATMAN! - GONNA LOSE THIS REQUEST!!!!:" + CCStringUtils.stringifyException(e));
_manager.writeRequestFailed(request,e);
}
}
break;
}
} catch (InterruptedException e) {
}
}
}
}