/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.zip.CRC32;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.record.Buffer;
/**
 * Quick and dirty (for now) way to write {@link Writable} records, keyed by a
 * long, to a local disk file and to read them back starting from the tail.
 *
 * <p>On-disk layout as produced by this class (all fixed-width fields are
 * big-endian):
 * <pre>
 * header : sync(4) version(4) writePos(8) itemCount(4) lastRecordLength(4) lastRecordKey(8)
 * record : sync(4) length(4) crc(8) key(vlong) value(...) trailingLength(4)
 * </pre>
 * For a record occupying T bytes in total, {@code length} is T - 8 and
 * {@code trailingLength} is T - 4. The trailing length is what allows the
 * file to be walked backwards from any record boundary.
 *
 * <p>All public methods are synchronized on this instance and open and close
 * the backing file on every call.
 *
 * @param <ValueType> the Writable type stored as the record value
 *
 * @author rana
 */
public class TimeSeriesDataFile<ValueType extends Writable> {

  /** marker written at the start of the file header and of every record */
  private static final int SyncBytes = 0xCC00CC00;

  /** bytes preceding the key in a record: sync(4) + length(4) + crc(8) */
  private static final int RECORD_HEADER_LENGTH = 4 * 4;

  /** shared crc calculator; only touched from synchronized methods */
  private final CRC32 crc = new CRC32();

  private final File fileName;

  /** class used to instantiate values while reading */
  private final Class<?> valueClass;

  /**
   * A key/value pair read from the file.
   */
  public static class KeyValueTuple<KeyType,ValueType> {

    public KeyValueTuple(KeyType key,ValueType value) {
      this.key = key;
      this.value = value;
    }

    public KeyType key;
    public ValueType value;
    /** file offset at which this record starts (populated by the read methods) */
    public long recordPos;
  }

  /**
   * constructor
   *
   * @param fileName   the output path (file will be created on first append
   *                   if it doesn't exist)
   * @param valueClass the concrete value class; it is instantiated via
   *                   {@code newInstance()} for every record read, so it needs
   *                   an accessible no-arg constructor
   */
  @SuppressWarnings("rawtypes") // raw Class kept so existing callers keep compiling
  public TimeSeriesDataFile(File fileName,Class valueClass) {
    this.fileName = fileName;
    this.valueClass = valueClass;
  }

  /**
   * append a record to the file ...
   *
   * @param key   the record key
   * @param value the record value
   * @return the file offset at which the record was written
   * @throws IOException if the file cannot be read or written, or if an
   *                     existing header is invalid
   */
  public synchronized long appendRecordToLogFile(long key,Writable value)throws IOException {
    // serialize first so that a failing value.write() never touches the file
    DataOutputBuffer record = serializeRecord(key, value);

    LogFileHeader header = new LogFileHeader();
    RandomAccessFile file = new RandomAccessFile(fileName,"rw");
    try {
      final long recordPositionOut;
      // "rw" creates a missing file, so a zero length means there is no header
      // yet (this also covers a pre-existing but empty file)
      if (file.length() > 0) {
        long headerOffset = readLogFileHeader(file, header);
        // a write position of zero means the header exists but no record does
        recordPositionOut = (header._writePos == 0) ? headerOffset : header._writePos;
        // seek to appropriate write position
        file.seek(recordPositionOut);
      }
      else {
        // fresh file: write an initial header, the first record follows it
        recordPositionOut = writeLogFileHeader(file,header);
      }

      // write out the record ...
      file.write(record.getData(),0,record.getLength());

      // now update header ...
      header._itemCount += 1;
      header._writePos = file.getFilePointer();
      // everything in the record except its trailing length int
      header._lastRecordLength = record.getLength() - 4;
      header._lastRecordKey = key;
      // now write out header anew ...
      writeLogFileHeader(file,header);

      return recordPositionOut;
    }
    finally {
      file.close();
    }
  }

  /**
   * serialize a single record (sync, length, crc, key, value, trailing length)
   * into an in-memory buffer
   */
  private DataOutputBuffer serializeRecord(long key,Writable value) throws IOException {
    DataOutputBuffer buffer = new DataOutputBuffer();
    // write out sync bytes ...
    buffer.writeInt(SyncBytes);
    // write out placeholder for record length
    buffer.writeInt(0);
    // write out placeholder for crc
    buffer.writeLong(0);
    // write out key + value to buffer
    WritableUtils.writeVLong(buffer,key);
    value.write(buffer);
    // write out trailing record size: everything written so far, i.e.
    // 4 bytes sync + 4 bytes record length + 8 bytes crc + key/value bytes
    buffer.writeInt(buffer.getLength());

    // the crc covers key, value and the trailing length
    crc.reset();
    crc.update(buffer.getData(),RECORD_HEADER_LENGTH,buffer.getLength()-RECORD_HEADER_LENGTH);

    // ok fix up record ...
    // record length = total length - sync bytes(4) - record length(4), at offset 4
    writeInt(buffer.getLength() - 8,4,buffer.getData());
    // crc at offset 8 (after sync(4) and length(4))
    writeLong(crc.getValue(),8,buffer.getData());
    return buffer;
  }

  /**
   * read the records that precede a given position, walking backwards
   *
   * @param position            file offset of a record boundary; the records
   *                            that END at this offset are read (e.g. pass a
   *                            tuple's recordPos to read the records before it)
   * @param maxNumberOfRecords  maximum number of records to read; a negative
   *                            value reads until the first record (or the min
   *                            key) is reached
   * @param optionalMinKeyValue optional min key value to limit read by or -1;
   *                            reading stops at the first record whose key is
   *                            below this value
   * @return a list of KeyValueTuples in file order (empty if the file is
   *         missing or empty, or position is not past the header)
   * @throws IOException on I/O failure or if a corrupt record is detected
   */
  public synchronized ArrayList <KeyValueTuple<Long,ValueType> > readFromPos(long position,int maxNumberOfRecords,long optionalMinKeyValue) throws IOException {
    ArrayList< KeyValueTuple<Long,ValueType> > valuesOut = new ArrayList< KeyValueTuple<Long,ValueType> >();
    if (!fileName.exists()) {
      return valuesOut;
    }
    RandomAccessFile file = new RandomAccessFile(fileName,"r");
    try {
      // an empty file has no header and therefore no records
      if (file.length() == 0) {
        return valuesOut;
      }
      //read header ...
      LogFileHeader header = new LogFileHeader();
      long headerOffset = readLogFileHeader(file, header);
      if (position > headerOffset) {
        // the 4 bytes before a record boundary hold the previous record's length
        file.seek(position - 4);
        int currentRecordLength = file.readInt();
        // delegate to common read
        doCommonRead(valuesOut,file,headerOffset,position,currentRecordLength,maxNumberOfRecords,optionalMinKeyValue);
      }
    }
    finally {
      file.close();
    }
    return valuesOut;
  }

  /**
   * read from the tail end of the file
   *
   * @param maxNumberOfRecords  the maximum number of records to read from the
   *                            tail; a negative value reads until the first
   *                            record (or the min key) is reached
   * @param optionalMinKeyValue optional min key value to limit read by or -1
   * @return a list of records at the tail end of the file, in file order
   *         (empty if the file is missing, empty or holds no records)
   * @throws IOException on I/O failure or if a corrupt record is detected
   */
  public synchronized ArrayList< KeyValueTuple<Long,ValueType> > readFromTail(int maxNumberOfRecords,long optionalMinKeyValue)throws IOException {
    ArrayList< KeyValueTuple<Long,ValueType> > valuesOut = new ArrayList< KeyValueTuple<Long,ValueType> >();
    if (!fileName.exists()) {
      return valuesOut;
    }
    RandomAccessFile file = new RandomAccessFile(fileName,"r");
    try {
      // an empty file has no header and therefore no records
      if (file.length() == 0) {
        return valuesOut;
      }
      //read header ...
      LogFileHeader header = new LogFileHeader();
      long headerOffset = readLogFileHeader(file, header);
      // a header-only file has a write position of zero; walking backwards
      // from there would seek to a negative offset, so skip the read entirely
      if (maxNumberOfRecords != 0 && header._writePos > headerOffset) {
        doCommonRead(valuesOut,file,headerOffset,header._writePos,header._lastRecordLength,maxNumberOfRecords,optionalMinKeyValue);
      }
    }
    finally {
      file.close();
    }
    return valuesOut;
  }

  /**
   * walk the file backwards from a record boundary, prepending each decoded
   * record to valuesOut so that the result ends up in file order
   *
   * @param valuesOut           receives the decoded tuples
   * @param file                the open data file
   * @param headerOffset        offset of the first byte after the file header
   * @param endOfPrevRecord     offset of the boundary to start walking back from
   * @param currentRecordLength length (excluding the trailing length int) of
   *                            the record that ends at endOfPrevRecord
   * @param recordsToRead       maximum number of records to read
   * @param optionalMinKeyValue min key value to stop at, or -1
   * @throws IOException on I/O failure, corrupt record or crc mismatch
   */
  @SuppressWarnings("unchecked") // valueClass is trusted to produce ValueType instances
  private void doCommonRead(
      ArrayList< KeyValueTuple<Long,ValueType> > valuesOut,
      RandomAccessFile file,
      long headerOffset,
      long endOfPrevRecord,
      int currentRecordLength,
      int recordsToRead,
      long optionalMinKeyValue) throws IOException {

    Buffer recordBuffer = new Buffer();
    DataInputBuffer inputBuffer = new DataInputBuffer();
    final long fileLength = file.length();

    long recordEnd = endOfPrevRecord;
    // ok start walking backwards ...
    while (recordsToRead != 0) {
      // step back over the record body plus its trailing length int
      long recordStart = recordEnd - currentRecordLength - 4;

      // reject lengths that cannot describe a record inside this file before
      // using them to seek or to size a buffer
      if (currentRecordLength < RECORD_HEADER_LENGTH
          || currentRecordLength > Integer.MAX_VALUE - 8
          || recordStart < headerOffset
          || recordEnd > fileLength) {
        throw new IOException("Corrupt Record Detected!");
      }

      // read the 4 bytes before the record (the previous record's trailing
      // length) together with the whole record
      int bytesToRead = currentRecordLength + 8;
      file.seek(recordStart - 4);
      recordBuffer.setCapacity(bytesToRead);
      // readFully: a plain read() may legally return fewer bytes than asked
      file.readFully(recordBuffer.get(),0,bytesToRead);
      // ok initialize input buffer ...
      inputBuffer.reset(recordBuffer.get(), bytesToRead);

      // now read next record length first ...
      int nextRecordLength = inputBuffer.readInt();
      // next read and validate sync bytes ...
      if (inputBuffer.readInt() != SyncBytes) {
        throw new IOException("Corrupt Record Detected!");
      }
      // ok read real record bytes ...
      int realRecordBytes = inputBuffer.readInt();
      // the writer stores (total - 8) here and (total - 4) as trailing length,
      // so the two must differ by exactly 4
      if (realRecordBytes != currentRecordLength - 4) {
        throw new IOException("Corrupt Record Detected!");
      }
      // read crc ...
      long crcValue = inputBuffer.readLong();
      // ok validate crc (covers key, value and trailing length) ...
      crc.reset();
      crc.update(inputBuffer.getData(),inputBuffer.getPosition(),realRecordBytes-8);
      if (crcValue != crc.getValue()) {
        throw new IOException("CRC Mismatch!");
      }

      // ok now read key and value
      KeyValueTuple<Long,ValueType> tuple;
      try {
        long key = WritableUtils.readVLong(inputBuffer);
        if (optionalMinKeyValue != -1 && key < optionalMinKeyValue) {
          break;
        }
        ValueType value = (ValueType) valueClass.newInstance();
        value.readFields(inputBuffer);
        tuple = new KeyValueTuple<Long, ValueType>(key, value);
      } catch (IOException e) {
        // already the right type, no need to wrap it again
        throw e;
      } catch (Exception e) {
        throw new IOException(e);
      }
      tuple.recordPos = recordStart;
      // walking backwards, so prepend to keep file order
      valuesOut.add(0,tuple);

      currentRecordLength = nextRecordLength;
      recordEnd = recordStart;
      recordsToRead--;
      // reached the first record in the file
      if (recordStart == headerOffset) {
        break;
      }
    }
  }

  /**
   * get the key value of the last record in the file
   *
   * @return record key as a long or -1 if zero records in file
   * @throws IOException on I/O failure or invalid header
   */
  public synchronized long getLastRecordKey() throws IOException{
    LogFileHeader header = readHeaderIfPresent();
    return (header != null) ? header._lastRecordKey : -1;
  }

  /**
   * get the number of records in the file
   *
   * @return record count in file
   * @throws IOException on I/O failure or invalid header
   */
  public synchronized int getRecordCount() throws IOException {
    LogFileHeader header = readHeaderIfPresent();
    return (header != null) ? header._itemCount : 0;
  }

  /**
   * read the header of the backing file
   *
   * @return the header, or null if the file does not exist or is empty
   */
  private LogFileHeader readHeaderIfPresent() throws IOException {
    if (!fileName.exists()) {
      return null;
    }
    RandomAccessFile file = new RandomAccessFile(fileName,"r");
    try {
      if (file.length() == 0) {
        return null;
      }
      LogFileHeader header = new LogFileHeader();
      readLogFileHeader(file, header);
      return header;
    }
    finally {
      file.close();
    }
  }

  /**
   * fixed-size header stored at offset zero of the file
   */
  private static class LogFileHeader {

    public static final int LogFileHeaderBytes = SyncBytes;
    public static final int LogFileVersion = 1;

    public LogFileHeader() {
      _writePos = 0;
      _itemCount = 0;
      _lastRecordLength = 0;
      _lastRecordKey = -1;
    }

    /** offset at which the next record will be written (0 = no records yet) */
    public long _writePos;
    /** number of records appended so far */
    public int _itemCount;
    /** length of the last record, excluding its trailing length int */
    public int _lastRecordLength;
    /** key of the last record, -1 if there is none */
    public long _lastRecordKey;

    public void writeHeader(DataOutput stream) throws IOException {
      stream.writeInt(LogFileHeaderBytes);
      stream.writeInt(LogFileVersion);
      stream.writeLong(_writePos);
      stream.writeInt(_itemCount);
      stream.writeInt(_lastRecordLength);
      stream.writeLong(_lastRecordKey);
    }

    public void readHeader(DataInput stream) throws IOException {
      int headerBytes = stream.readInt();
      int version = stream.readInt();
      // either a wrong magic or a wrong version makes the header unusable
      if (headerBytes != LogFileHeaderBytes || version != LogFileVersion) {
        throw new IOException("Invalid CrawlLog File Header Detected!");
      }
      _writePos = stream.readLong();
      _itemCount = stream.readInt();
      _lastRecordLength = stream.readInt();
      _lastRecordKey = stream.readLong();
    }
  }

  /**
   * write the header at offset zero
   *
   * @return the file offset immediately after the header
   */
  private static long writeLogFileHeader(RandomAccessFile file, LogFileHeader header )throws IOException {
    // set the position at zero ..
    file.seek(0);
    // and write header to disk ...
    header.writeHeader(file);
    // took sync out because it was becoming a severe bottleneck
    // file.getFD().sync();
    return file.getFilePointer();
  }

  /**
   * read the header from offset zero
   *
   * @return the file offset immediately after the header
   */
  private static long readLogFileHeader(RandomAccessFile file,LogFileHeader header) throws IOException {
    file.seek(0);
    header.readHeader(file);
    return file.getFilePointer();
  }

  /** store value as 4 big-endian bytes starting at intoBytes[atOffset] */
  static void writeInt(int value,int atOffset, byte[] intoBytes) throws IOException {
    intoBytes[atOffset + 0] = (byte) ((value >>> 24) & 0xFF);
    intoBytes[atOffset + 1] = (byte) ((value >>> 16) & 0xFF);
    intoBytes[atOffset + 2] = (byte) ((value >>> 8) & 0xFF);
    intoBytes[atOffset + 3] = (byte) ((value >>> 0) & 0xFF);
  }

  /** store value as 8 big-endian bytes starting at intoBytes[atOffset] */
  static void writeLong(long value,int atOffset, byte[] intoBytes) throws IOException {
    intoBytes[atOffset + 0] = (byte) ((value >>> 56) & 0xFF);
    intoBytes[atOffset + 1] = (byte) ((value >>> 48) & 0xFF);
    intoBytes[atOffset + 2] = (byte) ((value >>> 40) & 0xFF);
    intoBytes[atOffset + 3] = (byte) ((value >>> 32) & 0xFF);
    intoBytes[atOffset + 4] = (byte) ((value >>> 24) & 0xFF);
    intoBytes[atOffset + 5] = (byte) ((value >>> 16) & 0xFF);
    intoBytes[atOffset + 6] = (byte) ((value >>> 8) & 0xFF);
    intoBytes[atOffset + 7] = (byte) ((value >>> 0) & 0xFF);
  }
}