package org.archive.hadoop.cdx;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.Charset;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.util.StreamCopy;
/**
 * Command-line {@link Tool} that dumps the contents of HDFS files (whole files
 * or byte ranges) to STDOUT, driven by a line-oriented list of targets read
 * from a local file or STDIN.
 */
public class HDFSRangeDumper implements Tool {

	// Charset used to decode the input target list (file or STDIN).
	static final Charset UTF8 = Charset.forName("utf-8");

	public final static String TOOL_NAME = "range-dumper";
	public static final String TOOL_DESCRIPTION =
		"A tool for dumping contents of files from HDFS to STDOUT";

	private Configuration conf;

	/**
	 * Copy the entire contents of inputPath to target.
	 *
	 * @param inputPath HDFS path to read
	 * @param target stream receiving the bytes; not closed by this method
	 * @throws IOException on any open/read/write failure
	 */
	public void dumpPath(Path inputPath, OutputStream target) throws IOException {
		FileSystem fs = inputPath.getFileSystem(getConf());
		FSDataInputStream fsdis = fs.open(inputPath);
		try {
			StreamCopy.copy(fsdis, target);
		} finally {
			// Close even when the copy fails, so the HDFS stream is not leaked.
			fsdis.close();
		}
	}

	/**
	 * Copy length bytes from inputPath, starting at offset start, to target.
	 *
	 * @param inputPath HDFS path to read
	 * @param target stream receiving the bytes; not closed by this method
	 * @param start byte offset at which to begin copying
	 * @param length exact number of bytes to copy
	 * @throws IOException on open/read/write failure, or if fewer than
	 *         length bytes could be copied (short read)
	 */
	public void dumpPath(Path inputPath, OutputStream target, long start, long length)
			throws IOException {
		String inputPathString = inputPath.toUri().toASCIIString();
		FileSystem fs = inputPath.getFileSystem(getConf());
		FSDataInputStream fsdis = fs.open(inputPath);
		try {
			fsdis.seek(start);
			long amt = StreamCopy.copyLength(fsdis, target, length);
			if(amt != length) {
				throw new IOException(
					String.format("Short copy(%s)(%d)(%d): got(%d)\n",
						inputPathString,start,length,amt));
			}
		} finally {
			// Close on both the success and short-copy/exception paths; the
			// original leaked the stream when the copy threw.
			fsdis.close();
		}
	}

	public void setConf(Configuration conf) {
		this.conf = conf;
	}

	public Configuration getConf() {
		return conf;
	}

	/** Print usage to STDERR and exit with the given code (never returns). */
	public static void USAGE(int code) {
		System.err.println("Usage: " + TOOL_NAME + " [INPUT]");
		System.err.println("\tReads lines from local path INPUT (or STDIN if omitted) of the format:");
		System.err.println("\t\tHDFS_URL");
		System.err.println("\tOR");
		System.err.println("\t\tHDFS_URL<tab>OFFSET<tab>LENGTH");
		System.err.println("\tIn the first form, dumps the entire contents of HDFS_URL");
		System.err.println("\tIn the second, dumps LENGTH octets from HDFS_URL beginning at offset OFFSET");
		System.exit(code);
	}

	/**
	 * Read target lines (HDFS_URL, or HDFS_URL&lt;tab&gt;OFFSET&lt;tab&gt;LENGTH)
	 * from the optional file argument or STDIN, and dump each referenced
	 * region to STDOUT. Progress is reported per-line on STDERR.
	 *
	 * @param args zero or one argument: the local input list file
	 * @return 0 on success
	 * @throws Exception on malformed input lines or any I/O failure
	 */
	public int run(String[] args) throws Exception {
		if(args.length > 1) {
			USAGE(1);
		}
		InputStreamReader isr = null;
		File input = null;
		if(args.length == 0) {
			isr = new InputStreamReader(System.in, UTF8);
		} else {
			input = new File(args[0]);
			isr = new InputStreamReader(new FileInputStream(input), UTF8);
		}
		BufferedReader br = new BufferedReader(isr);
		OutputStream out = new BufferedOutputStream(System.out);
		try {
			String line;
			while((line = br.readLine()) != null) {
				String parts[] = line.split("\t");
				if(parts.length == 1) {
					dumpPath(new Path(line), out);
				} else if(parts.length == 3) {
					long start = Long.parseLong(parts[1]);
					long length = Long.parseLong(parts[2]);
					// BUG FIX: write through the shared buffered stream. The
					// original wrote ranges directly to System.out, which could
					// interleave out of order with whole-file dumps still
					// sitting in the BufferedOutputStream.
					dumpPath(new Path(parts[0]), out, start, length);
				} else {
					throw new IOException("Wrong number of fields in " + line);
				}
				System.err.format("Dumped\t%s\n", parts[0]);
			}
		} finally {
			// Flush buffered output even on error, and close the reader only
			// when we opened a file ourselves — never close System.in.
			out.flush();
			if(input != null) {
				br.close();
			}
		}
		return 0;
	}

	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new HDFSRangeDumper(), args);
		System.exit(res);
	}
}