package org.archive.hadoop.cdx;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Comparator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.util.iterator.AbstractPeekableIterator;
import org.archive.util.iterator.SortedCompositeIterator;

/**
 * Hadoop {@link Tool} that merges the per-part manifest files found under a
 * directory into a single output stream.
 *
 * <p>The input directory must contain only sub-directories, and each
 * sub-directory must contain a file named {@link #MANIFEST_BASENAME}. The
 * lines of all manifests are merged using {@link String#compareTo(String)}
 * ordering; the tool description states that the inputs are expected to be
 * sorted already.
 */
public class ManifestAggregator implements Tool {

    public final static String TOOL_NAME = "manifest-aggregator";
    public static final String MANIFEST_BASENAME = "manifest.txt";
    public static final String TOOL_DESCRIPTION =
        "A tool for merging sorted manifest files in a CDX HDFS installation";

    /** Charset used both for reading the manifests and writing the result. */
    private static final Charset UTF8 = Charset.forName("utf-8");

    /** Orders manifest lines by their natural {@code String} ordering. */
    private static final Comparator<String> LINE_ORDER =
        new Comparator<String>() {
            public int compare(String s1, String s2) {
                return s1.compareTo(s2);
            }
        };

    private Configuration conf;

    /**
     * Merges every {@code <partsPath>/<entry>/manifest.txt} file into
     * {@code target}, one line per record, encoded as UTF-8.
     *
     * <p>On success {@code target} is flushed and closed by this method. If
     * an exception is thrown, {@code target} is left for the caller to close.
     *
     * @param partsPath directory whose immediate children are the part
     *        directories holding the manifests
     * @param target stream receiving the merged lines
     * @throws IOException if {@code partsPath} is not a directory, if any
     *         entry in it is not a directory, if a manifest file is missing,
     *         or if reading the inputs or writing the output fails
     */
    public void aggregate(Path partsPath, OutputStream target)
            throws IOException {
        FileSystem fs = partsPath.getFileSystem(getConf());
        ArrayList<Path> manifests = findManifests(fs, partsPath);

        SortedCompositeIterator<String> mergeItr =
            new SortedCompositeIterator<String>(LINE_ORDER);
        try {
            for (Path manifestPath : manifests) {
                FSDataInputStream fsdis = fs.open(manifestPath);
                BufferedReader br =
                    new BufferedReader(new InputStreamReader(fsdis, UTF8));
                mergeItr.addIterator(AbstractPeekableIterator.wrapReader(br));
            }

            PrintWriter pw =
                new PrintWriter(new OutputStreamWriter(target, UTF8));
            while (mergeItr.hasNext()) {
                pw.println(mergeItr.next());
            }
            pw.flush();
            pw.close();
            // PrintWriter never throws on I/O failure; it only records an
            // error flag. Surface it so a truncated output is not reported
            // as success.
            if (pw.checkError()) {
                throw new IOException(
                        "Error writing merged manifest output");
            }
        } finally {
            // Close the merge iterator on every path so inputs that were
            // already opened are not left dangling after a failure.
            // NOTE(review): this assumes SortedCompositeIterator.close()
            // releases the readers it was given - confirm.
            mergeItr.close();
        }
    }

    /**
     * Validates the layout of {@code partsPath} and collects the manifest
     * path of each part directory, in the order returned by the file system
     * listing.
     */
    private ArrayList<Path> findManifests(FileSystem fs, Path partsPath)
            throws IOException {
        String dirString = partsPath.toUri().toASCIIString();
        FileStatus status = fs.getFileStatus(partsPath);
        if (!status.isDir()) {
            throw new IOException(dirString + " is not a directory!");
        }

        FileStatus entries[] = fs.listStatus(partsPath);
        ArrayList<Path> manifests = new ArrayList<Path>();
        for (FileStatus entry : entries) {
            Path entryPath = entry.getPath();
            if (!entry.isDir()) {
                throw new IOException(
                        String.format("Non directory entry (%s) in %s",
                                entryPath.getName(), dirString));
            }
            Path manifestPath = new Path(entryPath, MANIFEST_BASENAME);
            if (!fs.isFile(manifestPath)) {
                throw new IOException(
                        String.format("No file at manifest path %s",
                                manifestPath.toUri().toASCIIString()));
            }
            manifests.add(manifestPath);
        }
        return manifests;
    }

    /** Stores the Hadoop configuration used to resolve file systems. */
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /** Returns the configuration previously set via {@link #setConf}. */
    public Configuration getConf() {
        return conf;
    }

    /**
     * Prints the command-line usage to standard error and terminates the JVM.
     *
     * @param code process exit status
     */
    public static void USAGE(int code) {
        System.err.println("Usage: " + TOOL_NAME + " HDFS_PATH LOCAL_PATH");
        System.exit(code);
    }

    /**
     * Tool entry point: merges the manifests under {@code args[0]} into the
     * local file {@code args[1]}.
     *
     * @param args exactly two arguments: the input directory path and the
     *        local output file path
     * @return {@code 0} on success
     * @throws Exception if the output file cannot be created or the
     *         aggregation fails
     */
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            USAGE(1);
            // Not reached when System.exit terminates the JVM; guards
            // against falling through to args[0] otherwise.
            return 1;
        }
        Path inputDir = new Path(args[0]);
        File target = new File(args[1]);
        FileOutputStream fos = new FileOutputStream(target);
        try {
            aggregate(inputDir, fos);
        } finally {
            // aggregate() closes the stream on success only; closing again
            // is harmless and releases the file handle on failure.
            fos.close();
        }
        return 0;
    }

    /** Runs the tool through {@link ToolRunner} and exits with its result. */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(),
                new ManifestAggregator(), args);
        System.exit(res);
    }
}