package org.archive.cassandra;
import java.io.IOException;
import java.util.LinkedList;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.archive.format.cdx.CDXLine;
import org.archive.format.cdx.StandardCDXLineFactory;
import org.archive.hadoop.mapreduce.CDXMapper;
import com.datastax.driver.core.BatchStatement;
import com.datastax.driver.core.BatchStatement.Type;
import com.datastax.driver.core.BoundStatement;
import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.Host;
import com.datastax.driver.core.Metadata;
import com.datastax.driver.core.PoolingOptions;
import com.datastax.driver.core.PreparedStatement;
import com.datastax.driver.core.ResultSetFuture;
import com.datastax.driver.core.Session;
public class CDXImporter {
protected Cluster cluster;
protected Session session;
protected String cdxQuery =
"INSERT INTO cdxspace.cdx (surt, datetime, originalurl, mimetype, statuscode, digest, offset, length, filename)" +
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)";
protected PreparedStatement insertCdxQuery;
protected BatchStatement batch = null;
protected int batchCount = 0;
protected int numToBatch = 10000;
protected StandardCDXLineFactory cdxLineFactory = new StandardCDXLineFactory("cdx11");
protected PoolingOptions pool = new PoolingOptions();
protected LinkedList<ResultSetFuture> results;
protected int numActive = 8;
protected int minuteTimeout = 3;
protected TaskAttemptContext context;
protected CDXMapper cdxConverter;
public void init(String node, TaskAttemptContext context, boolean canon) {
if (canon) {
cdxConverter = new CDXMapper();
}
this.context = context;
Cluster.Builder builder = Cluster.builder().addContactPoint(node);
//builder.withCompression(Compression.LZ4);
//builder.withPoolingOptions(pool);
cluster = builder.build();
results = new LinkedList<ResultSetFuture>();
Metadata metadata = cluster.getMetadata();
System.out.printf("Connected to cluster: %s\n", metadata.getClusterName());
for (Host host : metadata.getAllHosts()) {
System.out.printf("Datatacenter: %s; Host: %s; Rack: %s\n",
host.getDatacenter(), host.getAddress(), host.getRack());
}
session = cluster.connect();
insertCdxQuery = session.prepare(cdxQuery);
}
public void insertCdxLine(String cdxline)
{
if (cdxConverter != null) {
cdxline = cdxConverter.convertLine(cdxline);
}
CDXLine line = cdxLineFactory.createStandardCDXLine(cdxline);
String surt = line.getUrlKey();
String datetime = line.getTimestamp();
String original = line.getOriginalUrl();
String mimetype = line.getMimeType();
Integer statuscode = NumberUtils.toInt(line.getStatusCode(), -1);
String digest = line.getDigest();
Long offset = NumberUtils.toLong(line.getOffset(), -1);
Integer length = NumberUtils.toInt(line.getLength(), -1);
String filename = line.getFilename();
BoundStatement cdxStmt = new BoundStatement(insertCdxQuery);
cdxStmt.bind(surt, datetime, original, mimetype, statuscode, digest, offset, length, filename);
if (batch == null) {
batch = new BatchStatement(Type.UNLOGGED);
}
batch.add(cdxStmt);
batchCount++;
if (batchCount >= numToBatch) {
sendBatch();
}
}
protected void sendBatch()
{
if (results.size() == numActive) {
try {
msg("Waiting for timeout");
results.pollFirst().getUninterruptibly(minuteTimeout, TimeUnit.MINUTES);
} catch (TimeoutException e) {
msg(e.toString());
}
}
results.addLast(session.executeAsync(batch));
msg("Batch Sent!");
batchCount = 0;
batch = null;
}
private void msg(String string) {
System.err.println(string);
// try {
// context.setStatus(string);
//} catch (IOException e) {
// e.printStackTrace();
// }
}
public void close()
{
if (batch != null) {
sendBatch();
}
msg("Starting Cluster Shutdown...");
if (cluster != null) {
try {
cluster.shutdown().get(minuteTimeout, TimeUnit.MINUTES);
} catch (Exception e) {
msg("Shutdown Interrupted!");
}
}
msg("Cluster Shutdown");
}
public String getCdxQuery() {
return cdxQuery;
}
public void setCdxQuery(String cdxQuery) {
this.cdxQuery = cdxQuery;
}
public StandardCDXLineFactory getCdxLineFactory() {
return cdxLineFactory;
}
public void setCdxLineFactory(StandardCDXLineFactory cdxLineFactory) {
this.cdxLineFactory = cdxLineFactory;
}
public int getNumToBatch() {
return numToBatch;
}
public void setNumToBatch(int numToBatch) {
this.numToBatch = numToBatch;
}
public PoolingOptions getPool() {
return pool;
}
public void setPool(PoolingOptions pool) {
this.pool = pool;
}
public int getMinuteTimeout() {
return minuteTimeout;
}
public void setMinuteTimeout(int minuteTimeout) {
this.minuteTimeout = minuteTimeout;
}
}