package org.archive.hadoop.cdx;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URI;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.logging.Logger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Command-line tool that concatenates the per-part {@code *.summary} files
 * found in one directory into a single {@code ALL.summary} file in that same
 * directory.
 *
 * <p>Every output line is tab-separated:
 * {@code url <TAB> part <TAB> offset <TAB> length}, where {@code part} is the
 * summary file name without its {@code .summary} suffix. Parts are written in
 * sorted (natural {@link String}) order of their names.
 */
public class SummaryGenerator implements Tool {

	private static final Logger LOGGER =
		Logger.getLogger(SummaryGenerator.class.getName());

	public static final String SUMMARY_SUFFIX = ".summary";
	public static final String GZ_SUFFIX = ".gz";
	public static final String ALL_SUMMARY_PREFIX = "ALL";

	public final static String TOOL_NAME = "summary-generator";
	public static final String TOOL_DESCRIPTION = "A tool for generating a meta-index summary from a set of shard partition summaries in a CDX HDFS installation";

	/** Charset used for reading summaries and writing the merged output. */
	private static final Charset UTF8 = Charset.forName("utf-8");

	/** Name of the merged output file; never treated as an input summary. */
	private static final String TARGET_NAME =
		ALL_SUMMARY_PREFIX + SUMMARY_SUFFIX;

	private Configuration conf;

	/** @return the configuration previously supplied via {@link #setConf} */
	public Configuration getConf() {
		return conf;
	}

	/** @param conf configuration used when resolving the file system */
	public void setConf(Configuration conf) {
		this.conf = conf;
	}

	/**
	 * Prints the usage message to standard error and terminates the JVM.
	 *
	 * @param code process exit status
	 */
	public static void USAGE(int code) {
		System.err.println("USAGE: " + TOOL_NAME + " HDFS_URL");
		System.exit(code);
	}

	/**
	 * Merges summaries whose lines are {@code offset length url ...}
	 * (split on single whitespace characters; at least three fields).
	 *
	 * @param fs file system containing {@code clusterPath}
	 * @param clusterPath directory holding the {@code .summary} and
	 *        {@code .gz} files
	 * @param pw destination for the merged lines; flushed but not closed
	 * @throws IOException if the directory cannot be listed, a summary has no
	 *         matching {@code .gz} part, a line has fewer than three fields,
	 *         or reading fails
	 */
	public void createSummary(FileSystem fs, Path clusterPath, PrintWriter pw)
		throws IOException {

		HashMap<String, Path> summaries = new HashMap<String, Path>();
		HashMap<String, Long> partLengths = new HashMap<String, Long>();
		scanCluster(fs, clusterPath, summaries, partLengths);

		for (String part : sortedNames(summaries)) {
			BufferedReader br = openSummary(fs, summaries.get(part));
			try {
				String line;
				while ((line = br.readLine()) != null) {
					String fields[] = splitFields(part, line);
					long offset = Long.parseLong(fields[0]);
					long length = Long.parseLong(fields[1]);
					String url = fields[2];
					pw.format("%s\t%s\t%d\t%d\n", url, part, offset, length);
				}
			} finally {
				// close even when a bad line aborts the merge
				br.close();
			}
		}
		pw.flush();
	}

	/**
	 * Merges summaries in the legacy layout, whose lines start with
	 * {@code offset url} and carry no explicit length. The length of each
	 * record is derived from the offset of the following line; the last
	 * record of a part extends to the size of the part's {@code .gz} file.
	 *
	 * @param fs file system containing {@code clusterPath}
	 * @param clusterPath directory holding the {@code .summary} and
	 *        {@code .gz} files
	 * @param pw destination for the merged lines; flushed but not closed
	 * @throws IOException if the directory cannot be listed, a summary has no
	 *         matching {@code .gz} part, a line has fewer than three fields,
	 *         or reading fails
	 */
	public void createSummaryOld(FileSystem fs, Path clusterPath,
			PrintWriter pw) throws IOException {

		HashMap<String, Path> summaries = new HashMap<String, Path>();
		HashMap<String, Long> partLengths = new HashMap<String, Long>();
		scanCluster(fs, clusterPath, summaries, partLengths);

		for (String part : sortedNames(summaries)) {
			// non-null: scanCluster verified every summary has a part
			long partLength = partLengths.get(part);
			BufferedReader br = openSummary(fs, summaries.get(part));
			try {
				String prevUrl = null;
				long prevOffset = 0;
				String line;
				while ((line = br.readLine()) != null) {
					// a third field is required, although only the first
					// two are consumed here
					String fields[] = splitFields(part, line);
					long offset = Long.parseLong(fields[0]);
					String url = fields[1];
					if (prevUrl != null) {
						pw.format("%s\t%s\t%d\t%d\n",
								prevUrl, part, prevOffset, offset - prevOffset);
					}
					prevUrl = url;
					prevOffset = offset;
				}
				if (prevUrl != null) {
					// final record runs to the end of the part file
					pw.format("%s\t%s\t%d\t%d\n",
							prevUrl, part, prevOffset, partLength - prevOffset);
				}
			} finally {
				// close even when a bad line aborts the merge
				br.close();
			}
		}
		pw.flush();
	}

	/**
	 * Lists {@code clusterPath} and records every {@code .summary} file and
	 * the length of every {@code .gz} file, keyed by file name minus suffix.
	 * Directories, the merged target file and unrecognised entries are
	 * skipped. Fails if any summary lacks a {@code .gz} with the same prefix.
	 */
	private static void scanCluster(FileSystem fs, Path clusterPath,
			HashMap<String, Path> summaries,
			HashMap<String, Long> partLengths) throws IOException {

		FileStatus entries[] = fs.listStatus(clusterPath);
		if (entries == null) {
			// NOTE(review): some FileSystem versions appear to return null
			// rather than throwing for a missing path - fail explicitly.
			throw new IOException("Unable to list " + clusterPath);
		}

		int sumLen = SUMMARY_SUFFIX.length();
		int gzLen = GZ_SUFFIX.length();
		for (FileStatus entry : entries) {
			Path entryPath = entry.getPath();
			String pathStr = entryPath.toUri().toASCIIString();
			String name = entryPath.getName();
			if (entry.isDir()) {
				LOGGER.info("Ignoring Directory entry " + pathStr);
			} else if (name.equals(TARGET_NAME)) {
				// just skip - this is our target (run() creates it before
				// the directory is listed)
			} else if (name.endsWith(SUMMARY_SUFFIX)) {
				String prefix = name.substring(0, name.length() - sumLen);
				summaries.put(prefix, entryPath);
			} else if (name.endsWith(GZ_SUFFIX)) {
				String prefix = name.substring(0, name.length() - gzLen);
				partLengths.put(prefix, entry.getLen());
			} else {
				LOGGER.info("Ignoring entry " + pathStr);
			}
		}

		// sanity check - every summary must have a matching part
		for (String name : summaries.keySet()) {
			if (!partLengths.containsKey(name)) {
				throw new IOException("Missing part for summary:" + name);
			}
		}
	}

	/** Returns the keys of {@code summaries} in natural sorted order. */
	private static String[] sortedNames(HashMap<String, Path> summaries) {
		ArrayList<String> names = new ArrayList<String>(summaries.keySet());
		String sorted[] = names.toArray(new String[0]);
		Arrays.sort(sorted);
		return sorted;
	}

	/** Opens {@code summary} as a UTF-8 line reader. */
	private static BufferedReader openSummary(FileSystem fs, Path summary)
		throws IOException {
		FSDataInputStream in = fs.open(summary);
		return new BufferedReader(new InputStreamReader(in, UTF8));
	}

	/** Splits a summary line on whitespace, requiring at least 3 fields. */
	private static String[] splitFields(String part, String line)
		throws IOException {
		String fields[] = line.split("\\s");
		if (fields.length < 3) {
			throw new IOException("Bad line in " + part + ":" + line);
		}
		return fields;
	}

	/**
	 * Creates {@code ALL.summary} in the directory named by the HDFS URL.
	 *
	 * <p>With one argument it is the HDFS URL and the current summary layout
	 * is read. With two arguments the second is the HDFS URL and the legacy
	 * layout is read; the value of the first argument is not inspected.
	 *
	 * @param args command-line arguments (one or two)
	 * @return 0 on success, 1 if the target already exists or the arguments
	 *         are invalid
	 * @throws Exception if the URL is malformed or any I/O step fails,
	 *         including a failed write to the target file
	 */
	public int run(String[] args) throws Exception {
		if (args.length < 1 || args.length > 2) {
			USAGE(1);
			// only reached if the JVM exit was prevented
			return 1;
		}
		String hdfsUrl = args[0];
		boolean isOld = false;
		if (args.length == 2) {
			hdfsUrl = args[1];
			isOld = true;
		}
		URI uri = new URI(hdfsUrl);
		FileSystem fs = FileSystem.get(uri, getConf());
		Path path = new Path(hdfsUrl);
		Path target = new Path(path, TARGET_NAME);
		if (fs.exists(target)) {
			// println, not format: the encoded path may contain '%'
			System.err.println("Error-exists: "
					+ target.toUri().toASCIIString());
			return 1;
		}

		FSDataOutputStream os = fs.create(target);
		OutputStreamWriter osw = new OutputStreamWriter(os, UTF8);
		PrintWriter pw = new PrintWriter(osw);
		boolean completed = false;
		try {
			if (isOld) {
				createSummaryOld(fs, path, pw);
			} else {
				createSummary(fs, path, pw);
			}
			// PrintWriter swallows IOExceptions; surface them here
			if (pw.checkError()) {
				throw new IOException("Error writing "
						+ target.toUri().toASCIIString());
			}
			// close the writer itself so a failed close is reported
			osw.close();
			completed = true;
		} finally {
			if (!completed) {
				try {
					osw.close();
				} catch (IOException ignored) {
					// keep the original failure as the reported cause
				}
			}
		}
		return 0;
	}

	/**
	 * Runs the tool through {@link ToolRunner} and exits with its status.
	 *
	 * @param args command-line arguments, see {@link #run(String[])}
	 * @throws Exception if the tool fails
	 */
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new SummaryGenerator(),
				args);
		System.exit(res);
	}
}