package org.archive.hadoop.cdx; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.nio.charset.Charset; import java.util.Comparator; import java.util.Iterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.archive.util.iterator.SortedCompositeIterator; public class CDXClusterRangeDumper implements Tool { Charset UTF8 = Charset.forName("utf-8"); public final static String TOOL_NAME = "cluster-range"; public static final String TOOL_DESCRIPTION = "A tool for dumping ranges of a CDX cluster to STDOUT"; private Configuration conf; public void setConf(Configuration conf) { this.conf = conf; } public Configuration getConf() { return conf; } public static int USAGE(int code) { System.err.println("Usage: " + TOOL_NAME + " START END CLUSTER_HDFS_URL ..."); System.err.println("\tDump all CDX records within the cluster at CLUSTER_HDFS_URL"); System.err.println("\tstarting at START(inclusive), ending at END(exclusive)"); System.err.println("\tto STDOUT. If multiple clusters URLs are specified"); System.err.println("\ttheir results will be merged into a single sorted stream."); return code; } public int run(String[] args) throws Exception { if(args.length < 3) { return USAGE(1); } String start = args[0]; String end = args[1]; Iterator<String> itr; if(args.length == 3) { Path clusterPath = new Path(args[2]); CDXCluster c = new CDXCluster(getConf(), clusterPath); itr = c.getRange(start,end); } else { Comparator<String> comparator = new Comparator<String>() { public int compare(String s1, String s2) { return s1.compareTo(s2); } }; SortedCompositeIterator<String> scitr = new SortedCompositeIterator<String>(comparator); for(int i = 2; i < args.length; i++) { Path clusterPath = new Path(args[i]); CDXCluster c = new CDXCluster(getConf(), clusterPath); scitr.addIterator(c.getRange(start,end)); } itr = scitr; } PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, UTF8)); while(itr.hasNext()) { pw.println(itr.next()); } pw.flush(); pw.close(); return 0; } public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new CDXClusterRangeDumper(), args); System.exit(res); } }