package org.commoncrawl.db;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.Vector;
import SQLite.*;
import SQLite.Exception;
import java.rmi.dgc.VMID;
import org.commoncrawl.rpc.base.internal.UnitTestStruct1;
import org.commoncrawl.rpc.base.shared.BinaryProtocol;
import org.commoncrawl.rpc.base.shared.RPCStruct;
import org.commoncrawl.rpc.base.shared.RPCStructWithId;
public class RecordStore {
public static final Log LOG = LogFactory.getLog(RecordStore.class);
private static int DATABASE_VERSION = 1;
public static class RecordStoreException extends IOException {
/**
*
*/
private static final long serialVersionUID = 1L;
public RecordStoreException(String reason) {
super(reason);
}
}
public static class ReplicationServer {
public String _serverName;
public int _port;
ReplicationServer(String serverName,int port) {
_serverName = serverName;
_port = port;
}
}
private class TransactionState {
public TransactionState() {
_activeThread = Thread.currentThread();
_txnNumber = ++RecordStore.this._lastTxnNumber;
_refCount = 0;
}
public Thread _activeThread;
public int _refCount;
public long _txnNumber;
}
private enum RecordStatus{ ALIVE,DELETED };
private int _databaseVersion = 0;
private long _lastCheckpointNumber = 0;
private long _lastTxnNumber = 0;
private long _lastRecordId = 0;
private String _databaseId;
private Database _database;
TransactionState _txnState;
private static BinaryProtocol binaryProtocol = new BinaryProtocol();
public synchronized void initialize(File localFilePath,
Vector<ReplicationServer> servers)throws RecordStoreException {
try {
_database = new Database();
_database.open(localFilePath.getAbsolutePath(), 0666);
if (checkTables()) {
queryMetadata();
}
else {
createTables();
}
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
public String getDatabaseId() {
return _databaseId;
}
private RecordStoreException wrapSQLiteException(SQLite.Exception e ) {
return new RecordStoreException(e.getMessage());
}
public synchronized void shutdown() {
if (_database != null) {
try {
_database.close();
} catch (Exception e) {
e.printStackTrace();
}
_database = null;
}
}
/**
*
* @return true if tables actually created
* @throws RecordStoreException
*/
private final void createTables()throws RecordStoreException {
Stmt stmt = null;
try {
// create the tables ...
String createMasterSQL = "Create Table metadata ( database_id TEXT NOT NULL," +
"database_version INTEGER NOT NULL," +
"last_checkpoint_id INTEGER(8) NOT NULL," +
"last_txn_number INTEGER(8) NOT NULL);";
// record_id,record_type,parent_id,record_key,txn_number,record_status,record_data
String createRecordTableSQL = "Create Table records ( record_id INTEGER(8) NOT NULL," +
"record_type TEXT NOT NULL,"+
"parent_id TEXT,"+
"record_key TEXT,"+
"txn_number INTEGER(8) NOT NULL," +
"record_status INTEGER NOT NULL," +
"record_data BLOB," +
"PRIMARY KEY(record_id) );";
String createIndexSQL = "Create Index records_key_idx on records (record_key);";
String createIndexSQL2 = "Create Index records_parent_id_idx on records (parent_id);";
_database.exec(createMasterSQL, null);
_database.exec(createRecordTableSQL, null);
_database.exec(createIndexSQL, null);
_database.exec(createIndexSQL2, null);
_databaseId = new VMID().toString();
_lastCheckpointNumber = 0;
_lastTxnNumber = 0;
String insertMasterSQL = "Insert Into metadata (database_id,database_version,last_checkpoint_id,last_txn_number)" +
" Values (?,?,?,?);";
stmt = _database.prepare(insertMasterSQL);
stmt.bind(1,_databaseId);
stmt.bind(2,DATABASE_VERSION);
stmt.bind(3,_lastCheckpointNumber);
stmt.bind(4,_lastTxnNumber);
stmt.step();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
finally {
if (stmt != null){
try {
stmt.close();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
}
}
private synchronized final void queryMetadata() throws RecordStoreException {
try {
// query the master table ...
String queryMasterSQL = "Select database_id,database_version,last_checkpoint_id,last_txn_number from metadata;";
String recordIdSQL = "Select Max(record_id) from records;";
Stmt stmt = null;
Stmt stmt2 = null;
try {
stmt = _database.prepare(queryMasterSQL);
stmt2 = _database.prepare(recordIdSQL);
if (stmt.step() && stmt2.step()) {
_databaseId = stmt.column_string(0);
//TODO: VALIDATE DATABASE VERSION HERE ...
int databaseVersion = stmt.column_int(1);
_lastCheckpointNumber = stmt.column_long(2);
_lastTxnNumber = stmt.column_long(3);
_lastRecordId = stmt2.column_long(0);
}
}
finally{
stmt.close();
stmt2.close();
}
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
private synchronized final boolean checkTables()throws RecordStoreException {
SQLite.TableResult result = null;
try {
result = _database.get_table("SELECT name FROM sqlite_master where type IN('table','view') AND name IN" +
"('metadata','records');");
if (result != null && result.nrows == 2) {
return true;
}
return false;
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
finally {
if (result != null) {
result.clear();
}
}
}
public synchronized final boolean inTransaction() {
if (_txnState != null && _txnState._activeThread == Thread.currentThread())
return true;
else
return false;
}
public final synchronized void beginTransaction() throws RecordStoreException {
if (_txnState != null && _txnState._activeThread != Thread.currentThread()) {
throw new RecordStoreException("Invalid State. TXN already open in another thread!");
}
if (_txnState == null) {
_txnState = new TransactionState();
}
if (_txnState._refCount++ == 0) {
try {
_database.exec("BEGIN EXCLUSIVE;", null);
}
catch (SQLite.Exception e){
if (--_txnState._refCount == 0) {
_txnState = null;
}
throw wrapSQLiteException(e);
}
}
}
public final synchronized void commitTransaction() throws RecordStoreException {
if (_txnState == null || _txnState._activeThread != Thread.currentThread()) {
throw new RecordStoreException("Invalid State. commit called on non-existen txn or from different thread.");
}
if (--_txnState._refCount == 0) {
_txnState = null;
try {
_database.exec("COMMIT;", null);
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
}
public final synchronized void abortTransaction(){
if (_txnState == null || _txnState._activeThread != Thread.currentThread()) {
LOG.error("Invalid Call to abortTransaction. No Transaction is Active");
}
// abort transaction irrespective of ref count ...
_txnState = null;
try {
_database.exec("ROLLBACK;", null);
}
catch (SQLite.Exception e) {
LOG.error("ABORT TRANSACTION FAILED WITH ERROR:" + e.toString());
throw new RuntimeException(wrapSQLiteException(e));
}
}
public synchronized final <Type> long insertRecord(String parentId,String key,RPCStructWithId struct) throws RecordStoreException{
Stmt stmt = null;
if (struct == null || key == null) {
throw new RecordStoreException("NULL data value not allowed.");
}
try {
// validate txn status
validateTransaction();
// allocate a new record id ...
struct.setRecordId(++_lastRecordId);
// serialize the data
ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
DataOutputStream outputStream = new DataOutputStream (byteStream);
try {
struct.serialize(outputStream, binaryProtocol);
outputStream.flush();
}
catch (IOException e){
throw new RecordStoreException("Serialization Error");
}
String strInsertSQL = "Insert Into records (record_id,record_type,parent_id,record_key,txn_number,record_status,record_data) " +
"Values (?,?,?,?,?,?,?);";
stmt = _database.prepare(strInsertSQL);
stmt.bind(1,struct.getRecordId());
stmt.bind(2,struct.getClass().getName());
stmt.bind(3,parentId);
if (key == null) {
stmt.bind(4);
}
else {
stmt.bind(4,key);
}
stmt.bind(5,_txnState._txnNumber);
stmt.bind(6,RecordStatus.ALIVE.ordinal());
stmt.bind(7,byteStream.toByteArray());
stmt.step();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
finally {
if (stmt != null) {
try {
stmt.close();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
}
return struct.getRecordId();
}
public synchronized final void updateRecordById(long recordId, RPCStructWithId struct) throws RecordStoreException{
Stmt stmt = null;
if (struct== null || recordId == 0) {
throw new RecordStoreException("NULL data value not allowed.");
}
// serialize the data
ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
DataOutputStream outputStream = new DataOutputStream (byteStream);
try {
struct.serialize(outputStream, binaryProtocol);
outputStream.flush();
}
catch (IOException e){
throw new RecordStoreException("Serialization Error");
}
try {
// validate txn status
validateTransaction();
String strInsertSQL = "Update records Set record_type=?,txn_number=?,record_status=?,record_data=? where record_id=?;";
stmt = _database.prepare(strInsertSQL);
stmt.bind(1,struct.getClass().getName());
stmt.bind(2,_txnState._txnNumber);
stmt.bind(3,RecordStatus.ALIVE.ordinal());
stmt.bind(4,byteStream.toByteArray());
stmt.bind(5,struct.getRecordId());
stmt.step();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
finally {
if (stmt != null) {
try {
stmt.close();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
}
}
public synchronized final void updateRecordByKey(String key, RPCStruct struct) throws RecordStoreException {
Stmt stmt = null;
if (struct== null || key == null) {
throw new RecordStoreException("NULL data value not allowed.");
}
// serialize the data
ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
DataOutputStream outputStream = new DataOutputStream (byteStream);
try {
struct.serialize(outputStream, binaryProtocol);
outputStream.flush();
}
catch (IOException e){
throw new RecordStoreException("Serialization Error");
}
try {
// validate txn status
validateTransaction();
String strInsertSQL = "Update records Set record_type=?,txn_number=?,record_status=?,record_data=? where record_key=?;";
stmt = _database.prepare(strInsertSQL);
stmt.bind(1,struct.getClass().getName());
stmt.bind(2,_txnState._txnNumber);
stmt.bind(3,RecordStatus.ALIVE.ordinal());
stmt.bind(4,byteStream.toByteArray());
stmt.bind(5,key);
stmt.step();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
finally {
if (stmt != null) {
try {
stmt.close();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
}
}
public synchronized final void deleteRecordById(long recordId) throws RecordStoreException{
Stmt stmt = null;
try {
// validate txn status
validateTransaction();
/*
String strInsertSQL = "Update records Set txn_number=?,record_status=?,record_data=? where record_id=?;";
stmt = _database.prepare(strInsertSQL);
stmt.bind(1,_txnState._txnNumber);
stmt.bind(2,RecordStatus.DELETED.ordinal());
stmt.bind(3);
stmt.bind(4,recordId);
*/
String strDeleteSQL = "Delete from records where record_id=?;";
stmt = _database.prepare(strDeleteSQL);
stmt.bind(1,recordId);
stmt.step();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
finally {
if (stmt != null) {
try {
stmt.close();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
}
}
public synchronized final void deleteRecordByKey(String recordKey) throws RecordStoreException{
Stmt stmt = null;
try {
// validate txn status
validateTransaction();
/*
String strInsertSQL = "Update records Set txn_number=?,record_status=?,record_data=? where record_key=?;";
stmt = _database.prepare(strInsertSQL);
stmt.bind(1,_txnState._txnNumber);
stmt.bind(2,RecordStatus.DELETED.ordinal());
stmt.bind(3);
stmt.bind(4,recordKey);
*/
String strDeleteSQL = "Delete from records where record_key=?;";
stmt = _database.prepare(strDeleteSQL);
stmt.bind(1,recordKey);
stmt.step();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
finally {
if (stmt != null) {
try {
stmt.close();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
}
}
public synchronized final void deleteChildRecords(String parentId) throws RecordStoreException{
Stmt stmt = null;
try {
// validate txn status
validateTransaction();
/*
String strInsertSQL = "Update records Set txn_number=?,record_status=?,record_data=? where parent_id=?;";
stmt = _database.prepare(strInsertSQL);
stmt.bind(1,_txnState._txnNumber);
stmt.bind(2,RecordStatus.DELETED.ordinal());
stmt.bind(3);
stmt.bind(4,parentId);
*/
String strDeleteSQL = "Delete from records where parent_id=?;";
stmt = _database.prepare(strDeleteSQL);
stmt.bind(1,parentId);
stmt.step();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
finally {
if (stmt != null) {
try {
stmt.close();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
}
}
public synchronized RPCStruct getRecordByKey(String key) throws RecordStoreException {
return getRecordByKeyOrId(key,0);
}
public synchronized RPCStruct getRecordById(long recordId) throws RecordStoreException {
return getRecordByKeyOrId(null,recordId);
}
static String org_crawlcommons = "org.crawlcommons";
private synchronized final RPCStruct getRecordByKeyOrId(String key,long recordId) throws RecordStoreException {
String strSQL;
if (key != null)
strSQL = "Select record_id,record_type,record_data from records where record_key = ? and record_status=?;";
else
strSQL = "Select record_id,record_type,record_data from records where record_id = ? and record_status=?;";
Stmt stmt = null;
RPCStructWithId struct = null;
try {
stmt = _database.prepare(strSQL);
if (key != null)
stmt.bind(1,key);
else
stmt.bind(1,recordId);
stmt.bind(2,RecordStatus.ALIVE.ordinal());
if (stmt.step()) {
recordId = stmt.column_long(0);
String recordType = stmt.column_string(1);
byte[] data = stmt.column_bytes(2);
if (recordType.startsWith(org_crawlcommons)) {
recordType = recordType.replaceFirst(org_crawlcommons, "org.commoncrawl");
}
// allocate an instance of the struct
struct = (RPCStructWithId) Class.forName(recordType).newInstance();
// create the necessary stream
DataInputStream in = new DataInputStream(new ByteArrayInputStream(data));
// and deserialize the struct ...
struct.deserialize(in, binaryProtocol);
// set the record id before returning the struct to the caller
struct.setRecordId(recordId);
}
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
catch (ClassNotFoundException e){
e.printStackTrace();
throw new RuntimeException(e);
} catch (InstantiationException e) {
e.printStackTrace();
throw new RuntimeException(e);
} catch (IllegalAccessException e) {
e.printStackTrace();
throw new RuntimeException(e);
} catch (IOException e) {
e.printStackTrace();
throw new RecordStoreException("DeSerialization Failure");
}
finally{
try {
if (stmt != null)
stmt.close();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
return struct;
}
public synchronized Vector<String> getChildRecordKeysByParentId(String parentId) throws RecordStoreException {
String strSQL = "Select record_key from records where parent_id = ? and record_status=?;";
Stmt stmt = null;
Vector<String> children = new Vector<String>();
try {
stmt = _database.prepare(strSQL);
stmt.bind(1,parentId);
stmt.bind(2,RecordStatus.ALIVE.ordinal());
while (stmt.step()) {
children.add(stmt.column_string(0));
}
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
finally{
try {
stmt.close();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
return children;
}
public synchronized Vector<Long> getChildRecordsByParentId(String parentId) throws RecordStoreException {
String strSQL = "Select record_id from records where parent_id = ? and record_status=?;";
Stmt stmt = null;
Vector<Long> children = new Vector<Long>();
try {
stmt = _database.prepare(strSQL);
stmt.bind(1,parentId);
stmt.bind(2,RecordStatus.ALIVE.ordinal());
while (stmt.step()) {
children.add(stmt.column_long(0));
}
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
finally{
try {
stmt.close();
}
catch (SQLite.Exception e) {
throw wrapSQLiteException(e);
}
}
return children;
}
private final synchronized void validateTransaction() throws RecordStoreException {
if (_txnState == null || _txnState._activeThread != Thread.currentThread()) {
throw new RecordStoreException("Invalid State.No current Transaction or Txn opened in another thread.");
}
}
/** generic helper routine to persist RPCStruct to disk **/
public synchronized void insertUpdatePersistentObject ( RPCStructWithId object,String parentKey,String keyPrefix,boolean update) throws RecordStoreException {
if (update)
updateRecordByKey(keyPrefix+object.getKey(), object);
else
insertRecord(parentKey, keyPrefix+object.getKey(), object);
}
/*
TODO: GARBAGE .. GET RID OF IT ...
static char seedPath[] = new char[24];
static char lastPath[] = new char[24];
static char lastDomain[] = { 'a','a','a','a','a','a','a','a','a','a'-1};
static void generateSeeds() {
for (int i =0;i<seedPath.length;++i) {
seedPath[i] = (char) ('a' + Math.random() * 26);
}
}
static String getNextDomain() {
// reset seed path
System.arraycopy(seedPath, 0, lastPath, 0, seedPath.length);
// increment domain path
for (int i=lastDomain.length-1;i>=0;--i){
if (lastDomain[i] != 'z') {
lastDomain[i] += 1;
break;
}
else {
lastDomain[i]='a';
}
}
return new String(lastDomain);
}
// get next path ...
static String getNextPath() {
for (int i=lastPath.length-1;i>=0;--i){
if (lastPath[i] != 'z') {
lastPath[i] += 1;
break;
}
else {
lastPath[i]='a';
}
}
return new String(lastPath);
}
@Test
public void testDatabaseSize() throws java.lang.Exception {
File db1 = new File("/tmp/test.db");
File db2 = new File("/tmp/test2.db");
File db3 = new File("/tmp/test3.db");
File db4 = new File("/tmp/test4.db");
RecordStore recordStore[] = new RecordStore[1];
for (int j=0;j<recordStore.length;++j)
recordStore[j] = new RecordStore();
recordStore[0].initialize(db1, null);
//recordStore[1].initialize(db2, null);
//recordStore[2].initialize(db3, null);
//recordStore[3].initialize(db4, null);
System.out.println("Database Ready");
CrawlURL data = new CrawlURL();
boolean doInsert = false;
if (doInsert) {
boolean inTxn = false;
long totalTimeStart = System.currentTimeMillis();
long timeStart = 0;
for (int i=0;i<1;){
if (i%10000 == 0){
if (inTxn) {
for (int j=0;j<recordStore.length;++j)
recordStore[j].commitTransaction();
long timeSpent = System.currentTimeMillis() - timeStart;
System.out.println("Inserted "+i+" Records in " +((Long)timeSpent).toString() +" Milliseconds");
}
inTxn = true;
for (int j=0;j<recordStore.length;++j)
recordStore[j].beginTransaction();
timeStart = System.currentTimeMillis();
}
String domain = getNextDomain();
String path = getNextPath();
for (int j=0;j<16;++j) {
data.setUrl("http://"+domain+"/"+path);
data.setFingerprint((long) (Math.random() * Long.MAX_VALUE));
path = getNextPath();
for (int k=0;k<recordStore.length;++k)
recordStore[k].insertRecord(0, ((Long)data.getFingerprint()).toString(),data);
++i;
}
}
if (inTxn){
for (int j=0;j<recordStore.length;++j)
recordStore[j].commitTransaction();
}
System.out.println("Total time expended:"+((Long)(System.currentTimeMillis() - totalTimeStart)));
}
else {
long seekStart = System.currentTimeMillis();
CrawlURL returnedData = (CrawlURL)recordStore[0].getRecordById(1000500);
System.out.println("Seek took "+((Long)System.currentTimeMillis()-seekStart));
System.out.println("Seek returned URL:"+returnedData.getUrl());
}
}
*/
@Test
public void testRecordStore() throws java.lang.Exception {
System.out.println(System.getProperty("java.library.path"));
File tempDBFile = File.createTempFile("sqlite",".db");
RecordStore recordStore = new RecordStore();
recordStore.initialize(tempDBFile, null);
UnitTestStruct1 unitTest = new UnitTestStruct1();
unitTest.setStringType("foo");
long timeStart = System.currentTimeMillis();
recordStore.beginTransaction();
for (int i=0;i<10000;++i) {
recordStore.insertRecord(((Integer)(i/1000)).toString(), Integer.toString(i), unitTest);
}
recordStore.commitTransaction();
long timeEnd = System.currentTimeMillis();
System.out.println("10000 record insertion took:"+Long.toString(timeEnd-timeStart)+" millisecs.");
timeStart = System.currentTimeMillis();
recordStore.beginTransaction();
for (int i=0;i<1000;++i) {
int key = (int) (10000.00 * Math.random());
unitTest.setStringType(Integer.toString(key));
recordStore.updateRecordByKey(Integer.toString(key), unitTest);
}
recordStore.commitTransaction();
timeEnd = System.currentTimeMillis();
System.out.println("random update of 1000 records took:"+Long.toString(timeEnd-timeStart)+" millisecs.");
int randomKey = (int) (1000.00 * Math.random());
timeStart = System.currentTimeMillis();
UnitTestStruct1 unitTest2 = (UnitTestStruct1)recordStore.getRecordByKey(Integer.toString(randomKey));
timeEnd = System.currentTimeMillis();
System.out.println("random seek took:"+Long.toString(timeEnd-timeStart)+" millisecs.");
randomKey = (int) (1000000.00 * Math.random());
timeStart = System.currentTimeMillis();
recordStore.beginTransaction();
recordStore.deleteRecordByKey(Integer.toString(randomKey));
recordStore.commitTransaction();
timeEnd = System.currentTimeMillis();
System.out.println("random delete took:"+Long.toString(timeEnd-timeStart)+" millisecs.");
long randomId = (long) (1000.00 * Math.random());
Vector<Long> children = recordStore.getChildRecordsByParentId("0");
timeStart = System.currentTimeMillis();
for (Long childId : children) {
UnitTestStruct1 unitTest3 = (UnitTestStruct1) recordStore.getRecordById(childId);
}
timeEnd = System.currentTimeMillis();
System.out.println("read of:"+Integer.toString(children.size())+" records took:"+Long.toString(timeEnd-timeStart)+" millisecs.");
}
}