package org.archive.cassandra; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.archive.format.cdx.StandardCDXLineFactory; public class CassCDXRecordWriter extends RecordWriter<Text, Text> { protected CDXImporter importer; public CassCDXRecordWriter(TaskAttemptContext context) { Configuration conf = context.getConfiguration(); String nodehost = conf.get("conf.cass.host"); importer = new CDXImporter(); String query = conf.get("conf.cass.query"); if (query != null) { importer.setCdxQuery(query); } String cdxFormat = conf.get("conf.cass.cdxformat"); if (cdxFormat != null) { importer.setCdxLineFactory(new StandardCDXLineFactory(cdxFormat)); } int batchSize = conf.getInt("conf.cass.batchSize", -1); if (batchSize > 0) { importer.setNumToBatch(batchSize); } int minuteTimeout = conf.getInt("conf.cass.minuteTimeout", -1); if (minuteTimeout > 0) { importer.setMinuteTimeout(minuteTimeout); } boolean canon = conf.getBoolean("conf.cass.canon", false); importer.init(nodehost, context, canon); } @Override public void write(Text key, Text value) throws IOException, InterruptedException { String cdxline; if (value.getLength() == 0) { cdxline = key.toString(); } else if (key.getLength() == 0) { cdxline = value.toString(); } else { cdxline = key + " " + value; } importer.insertCdxLine(cdxline); } @Override public void close(TaskAttemptContext context) throws IOException, InterruptedException { importer.close(); } }