package org.archive.hadoop.cdx;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.Charset;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.util.StreamCopy;

/**
 * Command-line tool which reads a list of HDFS paths, optionally with an
 * offset and length, and dumps the referenced bytes to STDOUT.
 */
public class HDFSRangeDumper implements Tool {
	private static final Charset UTF8 = Charset.forName("UTF-8");

	public static final String TOOL_NAME = "range-dumper";
	public static final String TOOL_DESCRIPTION =
		"A tool for dumping contents of files from HDFS to STDOUT";

	private Configuration conf;

	/**
	 * Copy the entire contents of inputPath to target.
	 */
	public void dumpPath(Path inputPath, OutputStream target) throws IOException {
		FileSystem fs = inputPath.getFileSystem(getConf());
		FSDataInputStream fsdis = fs.open(inputPath);
		try {
			StreamCopy.copy(fsdis, target);
		} finally {
			fsdis.close();
		}
	}

	/**
	 * Copy length octets of inputPath, starting at offset start, to target.
	 * Throws IOException if fewer than length octets could be copied.
	 */
	public void dumpPath(Path inputPath, OutputStream target, long start, long length)
			throws IOException {
		String inputPathString = inputPath.toUri().toASCIIString();
		FileSystem fs = inputPath.getFileSystem(getConf());
		FSDataInputStream fsdis = fs.open(inputPath);
		try {
			fsdis.seek(start);
			long amt = StreamCopy.copyLength(fsdis, target, length);
			if (amt != length) {
				throw new IOException(String.format("Short copy(%s)(%d)(%d): got(%d)\n",
						inputPathString, start, length, amt));
			}
		} finally {
			fsdis.close();
		}
	}

	public void setConf(Configuration conf) {
		this.conf = conf;
	}

	public Configuration getConf() {
		return conf;
	}

	public static void USAGE(int code) {
		System.err.println("Usage: " + TOOL_NAME + " [INPUT]");
		System.err.println("\tReads lines from local path INPUT (or STDIN if omitted) of the format:");
		System.err.println("\t\tHDFS_URL");
		System.err.println("\tOR");
		System.err.println("\t\tHDFS_URL<tab>OFFSET<tab>LENGTH");
		System.err.println("\tIn the first form, dumps the entire contents of HDFS_URL");
		System.err.println("\tIn the second, dumps LENGTH octets from HDFS_URL beginning at offset OFFSET");
		System.exit(code);
	}

	public int run(String[] args) throws Exception {
		if (args.length > 1) {
			USAGE(1);
		}
		InputStreamReader isr = null;
		File input = null;
		if (args.length == 0) {
			isr = new InputStreamReader(System.in, UTF8);
		} else {
			input = new File(args[0]);
			isr = new InputStreamReader(new FileInputStream(input), UTF8);
		}
		BufferedReader br = new BufferedReader(isr);
		String line;
		OutputStream out = new BufferedOutputStream(System.out);
		while (true) {
			line = br.readLine();
			if (line == null) {
				break;
			}
			String[] parts = line.split("\t");
			if (parts.length == 1) {
				// Whole-file form: HDFS_URL
				dumpPath(new Path(line), out);
			} else if (parts.length == 3) {
				// Range form: HDFS_URL<tab>OFFSET<tab>LENGTH
				long start = Long.parseLong(parts[1]);
				long length = Long.parseLong(parts[2]);
				// Write to the same buffered stream as the whole-file form so
				// range output cannot be interleaved ahead of still-buffered data.
				dumpPath(new Path(parts[0]), out, start, length);
			} else {
				throw new IOException("Wrong number of fields in " + line);
			}
			System.err.format("Dumped\t%s\n", parts[0]);
		}
		if (input != null) {
			br.close();
		}
		out.flush();
		return 0;
	}

	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new HDFSRangeDumper(), args);
		System.exit(res);
	}
}
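
/*
 * Illustrative usage (a sketch only; the jar name and file names below are
 * assumptions, not part of this source):
 *
 *   $ cat ranges.txt
 *   hdfs://namenode/user/archive/part-00000
 *   hdfs://namenode/user/archive/part-00001<tab>1024<tab>4096    (fields are TAB-separated)
 *
 *   $ hadoop jar archive-hadoop.jar org.archive.hadoop.cdx.HDFSRangeDumper ranges.txt > dump.bin
 *
 * The first input line dumps the whole file; the second dumps 4096 octets
 * starting at offset 1024. With no INPUT argument the tool reads the same
 * lines from STDIN.
 */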