package org.commoncrawl.service.crawlhistoryV2;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.concurrent.Semaphore;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.record.Buffer;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.db.RecordStore;
import org.commoncrawl.protocol.BulkItemHistoryQuery;
import org.commoncrawl.protocol.BulkItemHistoryQueryResponse;
import org.commoncrawl.protocol.BulkUpdateData;
import org.commoncrawl.protocol.CrawlerHistoryServiceV2;
import org.commoncrawl.protocol.SingleItemHistoryQueryResponse;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.rpc.base.internal.AsyncClientChannel;
import org.commoncrawl.rpc.base.internal.AsyncContext;
import org.commoncrawl.rpc.base.internal.AsyncServerChannel;
import org.commoncrawl.rpc.base.internal.NullMessage;
import org.commoncrawl.rpc.base.internal.AsyncRequest.Status;
import org.commoncrawl.rpc.base.shared.RPCException;
import org.commoncrawl.server.CommonCrawlServer;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.ImmutableBuffer;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLFPUtils;
import org.commoncrawl.util.BitUtils.BitStream;
public class CrawlHistoryServer extends CommonCrawlServer
implements CrawlerHistoryServiceV2, AsyncServerChannel.ConnectionCallback{
// ok we are rushed for time, so hard code a bunch of stuff for now
static Path hdfsBasePath = new Path("crawl/historyV2");
static Path checkpointBasePath = new Path(hdfsBasePath,"checkpoints");
static Path checkpointStagingPath = new Path(hdfsBasePath,"staging");
static Path tlogPath = new Path(hdfsBasePath,"tlog");
static final String CHECKPOINT_PREFIX = "CHECKPOINT-";
static final String TLOG_PREFIX = "TLOG-";
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
private static final int NUM_HASH_FUNCTIONS = 10; // hash functions per filter
private static final int NUM_BITS = 11; // bits per element (assumed from the constructor usage in loadBloomFilter)
private static final int NUM_ELEMENTS = 55 * (1 << 20); // 55 * 2^20, i.e. ~55 million elements per filter
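// rough sizing (assuming NUM_BITS is bits per element): each shard's filter
// needs about (55 * 2^20 elements * 11 bits) / 8 = ~75 MB of bit array,
// times CrawlEnvironment.NUM_DB_SHARDS for the whole server.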
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}
// one bloom filter per database shard ...
private URLFPBloomFilter[] _bloomFilters = new URLFPBloomFilter[CrawlEnvironment.NUM_DB_SHARDS];
/** primary crawler database **/
RecordStore _recordStore = new RecordStore();
/** server state record key **/
String HistoryServerStateKey = "HistoryServerState";
private static final Log LOG = LogFactory.getLog(CrawlHistoryServer.class);
@Override
protected String getDefaultDataDir() {
return CrawlEnvironment.DEFAULT_DATA_DIR;
}
@Override
protected String getDefaultHttpInterface() {
return CrawlEnvironment.DEFAULT_HTTP_INTERFACE;
}
@Override
protected int getDefaultHttpPort() {
return CrawlEnvironment.DEFAULT_CRAWLER_HISTORY_HTTP_PORT;
}
@Override
protected String getDefaultLogFileName() {
return "historyserver.log";
}
@Override
protected String getDefaultRPCInterface() {
return CrawlEnvironment.DEFAULT_RPC_INTERFACE;
}
@Override
protected int getDefaultRPCPort() {
return CrawlEnvironment.DEFAULT_CRAWLER_HISTORY_RPC_PORT;
}
@Override
protected String getWebAppName() {
return CrawlEnvironment.CRAWLER_HISTORY_WEBAPP_NAME;
}
@Override
protected boolean initServer() {
File dataPath = getDataDirectory();
File dbPath = new File(dataPath,CrawlEnvironment.CRAWLER_HISTORY_DB);
// now initialize the record store ...
try {
// initialize database
_recordStore.initialize(dbPath, null);
// load state (if any)
loadState();
// load bloom filter from disk if possible
loadBloomFilter();
// create server channel ...
AsyncServerChannel channel = new AsyncServerChannel(this, this.getEventLoop(), this.getServerAddress(),this);
// register RPC services it supports ...
registerService(channel,CrawlerHistoryServiceV2.spec);
// start the checkpoint thread ...
startCheckpointThread(CrawlEnvironment.getDefaultFileSystem());
} catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
// fail server startup if initialization failed ...
return false;
}
return true;
}
private void loadCheckpointState() throws IOException {
// TODO (not yet called from initServer):
// find the latest checkpoint dir
// if found, load bloom filters from the checkpoint dir
// else initialize empty bloom filters
// then, for each log buffer with timestamp > the checkpoint timestamp,
// replay its mutations against the bloom filters
// (see loadCheckpointStateSketch below for one possible shape)
}
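/**
 * a minimal sketch of the recovery flow outlined above. It assumes the
 * CHECKPOINT-<id> directory layout consumed by getLatestCheckpointId() and
 * zero-padded "part-NNNNN" file names (both assumptions, since this file
 * never writes a checkpoint); the tlog replay step is left as a TODO because
 * the log buffer format is not defined here.
 **/
private void loadCheckpointStateSketch(FileSystem fs) throws IOException {
long checkpointId = getLatestCheckpointId(fs);
if (checkpointId != -1) {
// load each shard's filter from the latest checkpoint dir ...
Path checkpointDir = new Path(checkpointBasePath,CHECKPOINT_PREFIX + checkpointId);
for (int i=0;i<CrawlEnvironment.NUM_DB_SHARDS;++i) {
FSDataInputStream inputStream = fs.open(new Path(checkpointDir,"part-" + NUMBER_FORMAT.format(i)));
try {
_bloomFilters[i] = URLFPBloomFilter.load(inputStream);
}
finally {
inputStream.close();
}
}
}
else {
// no checkpoint yet ... start from empty filters
for (int i=0;i<CrawlEnvironment.NUM_DB_SHARDS;++i) {
_bloomFilters[i] = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
}
}
// TODO: replay tlog buffers with timestamp > checkpointId
}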
/** do a clean shutdown (if possible) **/
@Override
public void stop() {
// ok, wait to grab the checkpoint thread semaphore
LOG.info("Server Shutdown Detected. Waiting on checkpoint thread");
_shutdownFlag = true;
_checkpointThreadSemaphore.acquireUninterruptibly();
LOG.info("Checkpoint thread semaphore acquired. Joining checkpoint thread ... ");
if (_checkpointThread != null) {
try {
// interrupt the checkpoint thread in case it is sleeping between passes ...
_checkpointThread.interrupt();
_checkpointThread.join();
} catch (Exception e) {
LOG.error("Exception while waiting for Checkpoint Thread shutdown:" + CCStringUtils.stringifyException(e));
}
}
// ok safe to call super now ...
super.stop();
}
/** load state **/
private void loadState() throws IOException {
// TODO: load the persisted server state record (key HistoryServerStateKey) from _recordStore
}
@Override
protected boolean parseArguements(String[] argv) {
return true;
}
@Override
protected void printUsage() {
}
@Override
protected boolean startDaemons() {
return true;
}
@Override
protected void stopDaemons() {
}
@Override
public void bulkItemQuery(AsyncContext<BulkItemHistoryQuery, BulkItemHistoryQueryResponse> rpcContext)throws RPCException {
LOG.info("Received BulkItemQueryRequest");
ImmutableBuffer inputBuffer = rpcContext.getInput().getFingerprintList();
if (inputBuffer.getCount() != 0) {
try {
if (_bloomFilters == null) {
throw new IOException("BloomFilter Not Initilized. Invalid Server State!");
}
DataInputStream inputStream = new DataInputStream(
new ByteArrayInputStream(inputBuffer.getReadOnlyBytes(),0,inputBuffer.getCount()));
BitStream bitStreamOut = new BitStream();
URLFPV2 fingerprint = new URLFPV2();
int itemsPresent = 0;
while (inputStream.available() != 0) {
fingerprint.setDomainHash(WritableUtils.readVLong(inputStream));
fingerprint.setUrlHash(WritableUtils.readVLong(inputStream));
int partition = URLFPUtils.getPartitionGivenFP(fingerprint);
if (_bloomFilters[partition] == null) {
throw new IOException("BloomFilter not Loaded");
}
if (_bloomFilters[partition].isPresent(fingerprint)) {
bitStreamOut.addbit(1);
++itemsPresent;
}
else {
bitStreamOut.addbit(0);
}
}
LOG.info("Received BulkItemQueryRequest Completed with " + itemsPresent + " items found");
rpcContext.getOutput().setResponseList(new Buffer(bitStreamOut.bits,0,(bitStreamOut.nbits + 7) / 8));
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
}
}
// complete the request in all cases, including an empty fingerprint list ...
rpcContext.completeRequest();
}
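/**
 * for reference, a sketch of the wire format bulkItemQuery expects: the
 * request buffer is a packed sequence of (domainHash, urlHash) VLong pairs
 * (mirroring the read loop above), and the response carries one bit per
 * queried item in submission order. This helper is illustrative only, not
 * part of the service API.
 **/
static Buffer encodeFingerprintList(URLFPV2[] fingerprints) throws IOException {
java.io.ByteArrayOutputStream byteStream = new java.io.ByteArrayOutputStream();
java.io.DataOutputStream out = new java.io.DataOutputStream(byteStream);
for (URLFPV2 fingerprint : fingerprints) {
// the exact encoding the server decodes via WritableUtils.readVLong ...
WritableUtils.writeVLong(out, fingerprint.getDomainHash());
WritableUtils.writeVLong(out, fingerprint.getUrlHash());
}
out.flush();
return new Buffer(byteStream.toByteArray());
}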
@Override
public void singleItemQuery(AsyncContext<URLFPV2, SingleItemHistoryQueryResponse> rpcContext)throws RPCException {
try {
if (_bloomFilters == null) {
throw new IOException("BloomFilter Not Initilized. Invalid Server State!");
}
int partition = URLFPUtils.getPartitionGivenFP(rpcContext.getInput());
if (_bloomFilters[partition] == null) {
throw new IOException("BloomFilter for Part:" + partition + " Not Loaded!");
}
rpcContext.getOutput().setWasCrawled(_bloomFilters[partition].isPresent(rpcContext.getInput()));
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
}
rpcContext.completeRequest();
}
@Override
public void updateHistory(AsyncContext<URLFPV2, NullMessage> rpcContext)throws RPCException {
try {
if (_bloomFilters == null) {
throw new IOException("BloomFilter Not Initilized. Invalid Server State!");
}
int partition = URLFPUtils.getPartitionGivenFP(rpcContext.getInput());
if (_bloomFilters[partition] == null) {
throw new IOException("BloomFilter for Part:" + partition + " Not Loaded!");
}
_bloomFilters[partition].add(rpcContext.getInput());
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
}
rpcContext.completeRequest();
}
private final Path getDataFileBasePath() {
return new Path(CrawlEnvironment.HDFS_HistoryServerBase,getHostName());
}
private final Path getDataFileFinalPath() {
return new Path(CrawlEnvironment.HDFS_HistoryServerBase,getHostName()+".data");
}
private final Path getDataFileCheckpointPath() {
return new Path(CrawlEnvironment.HDFS_HistoryServerBase,getHostName()+".checkpoint");
}
private void loadBloomFilter() throws IOException {
FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
Path dataFilePath = getDataFileFinalPath();
FileStatus fileStatus = fs.getFileStatus(dataFilePath);
if (fileStatus == null || !fileStatus.isDir()) {
IOException e = new IOException("Data File Path:" + dataFilePath + " is not a valid directory!");
LOG.error(e.getMessage());
throw e;
}
// check how many files are present ...
FileStatus files[] = fs.globStatus(new Path(dataFilePath,"part-*"));
// ok files exist... read them in
if (files.length != 0) {
if (files.length != CrawlEnvironment.NUM_DB_SHARDS) {
IOException e = new IOException("Invalid Number of Shards found at:" + dataFilePath);
LOG.error(e.getMessage());
throw e;
}
for (FileStatus file : files) {
// get part file index ...
int part = Integer.parseInt(file.getPath().getName().substring("part-".length()));
if (part < CrawlEnvironment.NUM_DB_SHARDS) {
FSDataInputStream inputStream = fs.open(file.getPath());
try {
LOG.info("Loading Part:" + file.getPath().getName());
_bloomFilters[part] = URLFPBloomFilter.load(inputStream);
}
finally {
inputStream.close();
}
}
else {
throw new IOException("Invalid part file found during load:" + file.getPath());
}
}
}
// no existing files ... initialize empty bloom filters
else {
LOG.info("No Existing Filters Found. Allocating New Filters");
for (int i=0;i<CrawlEnvironment.NUM_DB_SHARDS;++i){
LOG.info("Allocting Part:" + i);
_bloomFilters[i] = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
}
}
}
private Thread _checkpointThread = null;
private Semaphore _checkpointThreadSemaphore = new Semaphore(1);
private volatile boolean _shutdownFlag = false; // written by stop(), read by the checkpoint thread
/** checkpoint flush interval **/
private static final int CHECKPOINT_FLUSH_INTERVAL = 15 * 60 * 1000; // 15 minutes
private void startCheckpointThread(final FileSystem fs) {
_checkpointThread = new Thread(new Runnable() {
@Override
public void run() {
// the checkpoint thread runs until a shutdown event is detected
while (!_shutdownFlag) {
// sleep between passes instead of busy-waiting ...
try {
Thread.sleep(CHECKPOINT_FLUSH_INTERVAL);
}
catch (InterruptedException e) {
// interrupted (likely by shutdown) ... re-check the flag
continue;
}
// hold the semaphore during a pass so stop() can wait on in-flight work;
// if stop() already holds it, skip the pass and re-check the flag ...
if (!_checkpointThreadSemaphore.tryAcquire()) {
continue;
}
try {
// checkpoint pass (steps not yet implemented, see writeCheckpointSketch below):
// get a cas lock on the server, rotate checkpoint log, release lock
// generate timestamp, create hdfs checkpoint temp
// write each partition to disk
// promote checkpoint to checkpoint dir, delete old checkpoint
// delete log buffers with timestamp < checkpoint timestamp
}
finally {
_checkpointThreadSemaphore.release();
}
}
}
});
_checkpointThread.start();
}
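/**
 * a sketch of the write/promote steps the checkpoint loop above still needs.
 * It assumes URLFPBloomFilter exposes a serialize(OutputStream) counterpart
 * to load() (an assumed API, not shown in this file) and relies on an hdfs
 * rename to make the finished checkpoint visible in one step.
 **/
private void writeCheckpointSketch(FileSystem fs) throws IOException {
long checkpointId = System.currentTimeMillis();
Path stagingDir = new Path(checkpointStagingPath,CHECKPOINT_PREFIX + checkpointId);
fs.mkdirs(stagingDir);
// write each partition's filter into the staging dir ...
for (int i=0;i<CrawlEnvironment.NUM_DB_SHARDS;++i) {
org.apache.hadoop.fs.FSDataOutputStream out = fs.create(new Path(stagingDir,"part-" + NUMBER_FORMAT.format(i)));
try {
// serialize() is an assumed API ... see the comment above
_bloomFilters[i].serialize(out);
}
finally {
out.close();
}
}
// promote the staged checkpoint, then retire the previous one ...
long oldCheckpointId = getLatestCheckpointId(fs);
fs.rename(stagingDir,new Path(checkpointBasePath,CHECKPOINT_PREFIX + checkpointId));
if (oldCheckpointId != -1) {
fs.delete(new Path(checkpointBasePath,CHECKPOINT_PREFIX + oldCheckpointId),true);
}
// TODO: delete tlog buffers with timestamp < checkpointId
}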
@Override
public void IncomingClientConnected(AsyncClientChannel channel) {
}
@Override
public void IncomingClientDisconnected(AsyncClientChannel channel) {
}
@Override
public void bulkUpdateHistory(AsyncContext<BulkUpdateData, NullMessage> rpcContext) throws RPCException {
LOG.info("Received BulkUpdate Request");
ImmutableBuffer inputBuffer = rpcContext.getInput().getFingerprintList();
if (inputBuffer.getCount() != 0) {
try {
if (_bloomFilters == null) {
throw new IOException("BloomFilter Not Initilized. Invalid Server State!");
}
DataInputStream inputStream = new DataInputStream(
new ByteArrayInputStream(inputBuffer.getReadOnlyBytes(),0,inputBuffer.getCount()));
URLFPV2 fingerprint = new URLFPV2();
int itemsAdded = 0;
while (inputStream.available() != 0) {
fingerprint.setDomainHash(WritableUtils.readVLong(inputStream));
fingerprint.setUrlHash(WritableUtils.readVLong(inputStream));
// get partition ...
int partition = URLFPUtils.getPartitionGivenFP(fingerprint);
if (_bloomFilters[partition] == null) {
throw new IOException("BloomFilter for part:" + partition + " not initialized!");
}
// write to filter ...
_bloomFilters[partition].add(fingerprint);
++itemsAdded;
}
LOG.info("Finished Processed BulkUpdate Request. " + itemsAdded + " items processed." );
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
rpcContext.setStatus(Status.Error_RequestFailed);
rpcContext.setErrorDesc(CCStringUtils.stringifyException(e));
}
}
// complete the request in all cases, including an empty fingerprint list ...
rpcContext.completeRequest();
}
public static final long getLatestCheckpointId(FileSystem remoteFS)throws IOException {
FileStatus candidates[] = remoteFS.globStatus(new Path(checkpointBasePath,CHECKPOINT_PREFIX+"*"));
long bestCandidateId = -1;
for (FileStatus candidate : candidates) {
String candidateName = candidate.getPath().getName();
try {
long candidateId = Long.parseLong(candidateName.substring(CHECKPOINT_PREFIX.length()));
if (bestCandidateId == -1 || candidateId > bestCandidateId) {
if (candidate.isDir()) {
bestCandidateId = candidateId;
}
else {
LOG.error("Skipping Canidate:" + candidateName);
}
}
}
catch (NumberFormatException e) {
LOG.error(CCStringUtils.stringifyException(e));
}
}
return bestCandidateId;
}
}