package org.archive.cassandra;

import static org.apache.cassandra.utils.ByteBufferUtil.bytes;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.Config;
import org.apache.cassandra.db.ColumnFamilyType;
import org.apache.cassandra.db.marshal.UTF8Type;
import org.apache.cassandra.dht.ByteOrderedPartitioner;
import org.apache.cassandra.io.compress.CompressionParameters;
import org.apache.cassandra.io.compress.DeflateCompressor;
import org.apache.cassandra.io.compress.ICompressor;
import org.apache.cassandra.io.compress.LZ4Compressor;
import org.apache.cassandra.io.compress.SnappyCompressor;
import org.apache.cassandra.io.sstable.SSTableSimpleWriter;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang.math.NumberUtils;
import org.archive.format.cdx.CDXLine;
import org.archive.format.cdx.StandardCDXLineFactory;
import org.archive.hadoop.mapreduce.CDXMapper;
import org.archive.util.zip.OpenJDK7GZIPInputStream;

/**
 * Command-line tool that reads CDX lines and writes them as rows of a
 * Cassandra SSTable through an {@link SSTableSimpleWriter}.
 *
 * <p>Each accepted line becomes one row keyed by {@code "<urlkey> <timestamp>"}
 * with the columns {@code originalurl}, {@code mimetype}, {@code statuscode},
 * {@code digest}, {@code offset}, {@code length} and {@code filename}.
 *
 * <p>Usage: {@code CDXToSSTable [-d|-s|-l] [-k keyspace] [-cf table] [-f format] [<input>] <outdir>}.
 * When {@code <input>} is omitted, empty or {@code -}, lines are read from
 * standard input; an input name ending in {@code .gz} is gunzipped.
 */
public class CDXToSSTable {

    /** Line converter; non-null only when conversion was requested in the constructor. */
    protected CDXMapper cdxConverter;

    ByteOrderedPartitioner partitioner;

    SSTableSimpleWriter writer;

    /** Parses raw CDX text lines according to the configured format. */
    protected StandardCDXLineFactory cdxLineFactory;

    /** Key of the most recently written row, used to drop consecutive duplicates. */
    protected String lastKey;

    final static String DEFAULT_KEYSPACE = "cdxspace";
    final static String DEFAULT_CF = "cdx2";
    final static String DEFAULT_FORMAT = "cdx11";

    /**
     * Creates the output directory (including missing parents) and opens an
     * SSTable writer on it.
     *
     * @param dir        output directory for the generated SSTable files
     * @param format     CDX format name handed to {@link StandardCDXLineFactory}
     * @param keyspace   keyspace name recorded in the column family metadata
     * @param cf         column family (table) name
     * @param compressor compressor used for the SSTable's compression parameters
     * @param conv       when {@code true}, every line is first passed through
     *                   {@link CDXMapper#convertLine} before being parsed
     */
    public CDXToSSTable(String dir, String format, String keyspace, String cf,
            ICompressor compressor, boolean conv) {
        partitioner = new ByteOrderedPartitioner();
        cdxLineFactory = new StandardCDXLineFactory(format);

        if (conv) {
            cdxConverter = new CDXMapper();
        }

        File fileDir = new File(dir);
        // mkdirs() rather than mkdir(): a nested output path whose parents do
        // not exist yet would otherwise silently fail to be created.
        fileDir.mkdirs();

        CFMetaData cfmeta = new CFMetaData(keyspace, cf, ColumnFamilyType.Standard,
                UTF8Type.instance, null);
        cfmeta.compressionParameters(new CompressionParameters(compressor));

        writer = new SSTableSimpleWriter(fileDir, cfmeta, partitioner);
    }

    /**
     * Entry point: parses the command line, then streams every input line into
     * {@link #insertCdxLine(String)} and finally closes the writer and input.
     *
     * @param args command-line arguments, see the class description
     * @throws IOException if reading the input or writing the SSTable fails
     */
    public static void main(String[] args) throws IOException {
        Options options = new Options();
        options.addOption("d", false, "DeflateCompressor");
        options.addOption("l", false, "LZ4Compressor");
        options.addOption("s", false, "SnappyCompressor");
        // The output directory is taken from the positional arguments, not from an option.
        options.addOption("k", "key", true, "Keyspace (default: " + DEFAULT_KEYSPACE + ")");
        options.addOption("f", "format", true, "cdx format (default: " + DEFAULT_FORMAT + ")");
        options.addOption("cf", "table", true, "Table/Column Family (default: " + DEFAULT_CF + ")");

        CommandLineParser parser = new BasicParser();
        CommandLine cmd;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            // Previously execution continued with a null CommandLine and died
            // with a NullPointerException; fail cleanly instead.
            System.err.println(e);
            System.exit(1);
            return;
        }

        String keyspace = cmd.getOptionValue("k", DEFAULT_KEYSPACE);
        String cf = cmd.getOptionValue("cf", DEFAULT_CF);
        String format = cmd.getOptionValue("f", DEFAULT_FORMAT);

        // Only the "cdx09" format is routed through the converter.
        boolean convert = format.equals("cdx09");

        ICompressor compressor = chooseCompressor(cmd);

        String[] argsleft = cmd.getArgs();
        if (argsleft.length == 0) {
            System.err.println("Must supply <outdir>");
            System.exit(1);
            return;
        }

        String input;
        String outdir;
        if (argsleft.length == 1) {
            outdir = argsleft[0];
            input = null;
        } else {
            input = argsleft[0];
            outdir = argsleft[1];
        }

        CDXToSSTable cdxTosst = null;
        InputStream in = null;
        try {
            Config.setClientMode(true);

            in = openInput(input);
            cdxTosst = new CDXToSSTable(outdir, format, keyspace, cf, compressor, convert);

            // Decode explicitly as UTF-8 instead of the platform default so the
            // result does not depend on the machine's locale settings.
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                cdxTosst.insertCdxLine(line);
            }
        } finally {
            try {
                if (cdxTosst != null) {
                    cdxTosst.close();
                }
            } finally {
                // Release the file handle; standard input is left open.
                if (in != null && in != System.in) {
                    in.close();
                }
            }
        }
    }

    /**
     * Picks the compressor from the command-line flags: {@code -d} selects
     * Deflate, otherwise {@code -s} selects Snappy, otherwise LZ4 is used
     * (whether or not {@code -l} was given).
     */
    private static ICompressor chooseCompressor(CommandLine cmd) {
        if (cmd.hasOption("d")) {
            return DeflateCompressor.create(null);
        }
        if (cmd.hasOption("s")) {
            return SnappyCompressor.create(null);
        }
        return LZ4Compressor.create(null);
    }

    /**
     * Opens the input: standard input when {@code input} is null, empty or
     * {@code "-"}; otherwise the named file, gunzipped if its name ends in
     * {@code .gz}.
     */
    private static InputStream openInput(String input) throws IOException {
        if (input == null || input.isEmpty() || input.equals("-")) {
            return System.in;
        }

        InputStream raw = new FileInputStream(input);
        if (!input.endsWith(".gz")) {
            return raw;
        }

        // If wrapping fails, close the underlying file instead of leaking it.
        boolean wrapped = false;
        try {
            InputStream gz = new OpenJDK7GZIPInputStream(raw);
            wrapped = true;
            return gz;
        } finally {
            if (!wrapped) {
                raw.close();
            }
        }
    }

    /**
     * Parses one CDX line and writes it as a row.
     *
     * <p>Lines starting with {@code dns:} or {@code warcinfo:} are ignored, as
     * are lines the converter rejects and lines whose key equals the key of the
     * immediately preceding written row. Non-numeric status code, offset and
     * length fields are stored as {@code -1}.
     *
     * @param cdxline raw CDX line
     * @throws IOException if the writer fails
     */
    public void insertCdxLine(String cdxline) throws IOException {
        if (cdxline.startsWith("dns:") || cdxline.startsWith("warcinfo:")) {
            return;
        }

        if (cdxConverter != null) {
            try {
                cdxline = cdxConverter.convertLine(cdxline);
            } catch (Exception ignored) {
                // Deliberate best-effort: a line that cannot be converted is
                // skipped silently so one bad record does not abort the run.
                return;
            }
        }

        CDXLine line = cdxLineFactory.createStandardCDXLine(cdxline);

        String key = line.getUrlKey() + " " + line.getTimestamp();

        // Only consecutive duplicates are detected.
        // NOTE(review): presumably the input is sorted by key - confirm.
        if (key.equals(lastKey)) {
            System.err.println("Skipping Dupe: " + key);
            return;
        }
        lastKey = key;

        String original = line.getOriginalUrl();
        String mimetype = line.getMimeType();
        int statuscode = NumberUtils.toInt(line.getStatusCode(), -1);
        String digest = line.getDigest();
        long offset = NumberUtils.toLong(line.getOffset(), -1);
        int length = NumberUtils.toInt(line.getLength(), -1);
        String filename = line.getFilename();

        // All columns of a row share one write timestamp (wall-clock milliseconds).
        long timestamp = System.currentTimeMillis();

        writer.newRow(bytes(key));
        writer.addColumn(bytes("originalurl"), bytes(original), timestamp);
        writer.addColumn(bytes("mimetype"), bytes(mimetype), timestamp);
        writer.addColumn(bytes("statuscode"), bytes(statuscode), timestamp);
        writer.addColumn(bytes("digest"), bytes(digest), timestamp);
        writer.addColumn(bytes("offset"), bytes(offset), timestamp);
        writer.addColumn(bytes("length"), bytes(length), timestamp);
        writer.addColumn(bytes("filename"), bytes(filename), timestamp);
    }

    /** Closes the underlying SSTable writer. */
    public void close() {
        writer.close();
    }
}