/*
 * This file is part of the Wayback archival access software
 * (http://archive-access.sourceforge.net/projects/wayback/).
 *
 * Licensed to the Internet Archive (IA) by one or more individual
 * contributors.
 *
 * The IA licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.wayback.hadoop;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.URIException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.wayback.util.ByteOp;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;

public class CDXSort extends Configured implements Tool {
    private RunningJob jobResult = null;

    static int printUsage() {
        System.out.println("cdxsort <split> <input> <output>");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
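    /*
     * Example invocation (illustrative sketch only; the jar name and all
     * paths below are placeholders, not part of this project):
     *
     *   hadoop jar wayback-hadoop.jar org.archive.wayback.hadoop.CDXSort \
     *       --canonicalize --compress-output \
     *       /local/path/split.txt /hdfs/path/input.cdx /hdfs/path/output
     *
     * <split> is read from the local filesystem, while <input> and <output>
     * are interpreted as Hadoop paths by the job configuration in run().
     */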
    /**
     * The main driver for the sort program. Invoke this method to submit the
     * map/reduce job.
     *
     * @throws IOException
     *             when there are communication problems with the job tracker.
     */
    public int run(String[] args) throws Exception {
        boolean compressOutput = false;
        boolean dereferenceInputs = false;
        boolean canonicalize = false;
        boolean funkyInput = false;

        JobConf jobConf = new JobConf(getConf(), CDXSort.class);
        jobConf.setJobName("cdxsort");

        jobConf.setMapperClass(IdentityMapper.class);
        jobConf.setReducerClass(IdentityReducer.class);

        JobClient client = new JobClient(jobConf);
        ClusterStatus cluster = client.getClusterStatus();

        List<String> otherArgs = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-m".equals(args[i])) {
                    jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
                } else if ("--compress-output".equals(args[i])) {
                    compressOutput = true;
                } else if ("--funky-input".equals(args[i])) {
                    funkyInput = true;
                } else if ("--dereference-inputs".equals(args[i])) {
                    dereferenceInputs = true;
                } else if ("--canonicalize".equals(args[i])) {
                    canonicalize = true;
                } else {
                    otherArgs.add(args[i]);
                }
            } catch (NumberFormatException except) {
                System.out.println("ERROR: Integer expected instead of "
                        + args[i]);
                return printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                System.out.println("ERROR: Required parameter missing from "
                        + args[i - 1]);
                return printUsage(); // exits
            }
        }

        // Make sure there are exactly 3 parameters left: split input output
        if (otherArgs.size() != 3) {
            System.out.println("ERROR: Wrong number of parameters: "
                    + otherArgs.size() + " instead of 3.");
            return printUsage();
        }
        String splitPath = otherArgs.get(0);
        String inputPath = otherArgs.get(1);
        String outputPath = otherArgs.get(2);

        // load the split file, find and set the number of reduces
        AlphaPartitioner partitioner = new AlphaPartitioner();
        File localSplitFile = new File(splitPath);
        FileInputStream fis = new FileInputStream(localSplitFile);
        InputStreamReader isr = new InputStreamReader(fis, ByteOp.UTF8);
        BufferedReader bis = new BufferedReader(isr);
//        try {
//            partitioner.loadBoundaries(bis);
//        } catch (IOException except) {
//            System.err.println("ERROR: Problem loading file " + splitPath);
//            return printUsage(); // exits
//        }
//        jobConf.setNumReduceTasks(partitioner.getNumPartitions());
//
//        // copy the split file into the FS, add to the DistributedCache:
////        AlphaPartitioner.setPartitionFile(jobConf, localSplitFile);
//        AlphaPartitioner.setSplitCache(jobConf, localSplitFile);
//        System.err.println("uploaded split file to FS and DistributedCache");
//
//        // Set job configs:
//        jobConf.setInputFormat(TextInputFormat.class);
//
//        jobConf.setOutputFormat(TextOutputFormat.class);
//        if (canonicalize) {
//            jobConf.setMapperClass(CDXCanonicalizerMapClass.class);
//        } else {
//            jobConf.setMapperClass(CDXMapClass.class);
//        }
//        jobConf.setOutputKeyClass(Text.class);
//        jobConf.setOutputValueClass(Text.class);
//        jobConf.set("mapred.textoutputformat.separator", " ");
//        jobConf.setPartitionerClass(AlphaPartitioner.class);

        int inputCount = 0;
        // Set job input:
        if (dereferenceInputs) {
            // SO SLOW... can't add one at a time...
//            FileReader is2 = new FileReader(new File(inputPath));
//            BufferedReader bis2 = new BufferedReader(is2);
//            while (true) {
//                String line = bis2.readLine();
//                if (line == null) {
//                    break;
//                }
//                FileInputFormat.addInputPath(jobConf, new Path(line));
//                inputCount++;
//                System.err.println("Added path(" + inputCount + "): " + line);
//            }

            // PASS 2:
//            FileReader is2 = new FileReader(new File(inputPath));
//            BufferedReader bis2 = new BufferedReader(is2);
//            ArrayList<String> list = new ArrayList<String>();
//
//            while (true) {
//                String line = bis2.readLine();
//                if (line == null) {
//                    break;
//                }
//                list.add(line);
//                inputCount++;
//            }
//            Path arr[] = new Path[list.size()];
//            for (int i = 0; i < list.size(); i++) {
//                arr[i] = new Path(list.get(i));
//            }
//            FileInputFormat.setInputPaths(jobConf, arr);

            // PASS 3:
            if (funkyInput) {
                jobConf.setMapperClass(FunkyDeReffingCDXCanonicalizerMapClass.class);
            } else {
                jobConf.setMapperClass(DeReffingCDXCanonicalizerMapClass.class);
            }
            FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
            inputCount = 1;

        } else {
            FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
            inputCount = 1;
        }

        // Set job output:
        FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
        if (compressOutput) {
            FileOutputFormat.setCompressOutput(jobConf, true);
            FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class);
        }

//        System.out.println("Running on " + cluster.getTaskTrackers()
//                + " nodes, processing " + inputCount + " files/directories"
//                + " into " + outputPath + " with "
//                + partitioner.getNumPartitions() + " reduces.");
        Date startTime = new Date();
        System.out.println("Job started: " + startTime);
        jobResult = JobClient.runJob(jobConf);
        Date end_time = new Date();
        System.out.println("Job ended: " + end_time);
        System.out.println("The job took "
                + (end_time.getTime() - startTime.getTime()) / 1000
                + " seconds.");
        return 0;
    }

    /**
     * Mapper which reads a canonicalized CDX line, splitting into:
     *   key - URL + timestamp
     *   val - everything else
     *
     * @author brad
     * @version $Date$, $Revision$
     */
    public static class CDXMapClass extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, Text> {

        private Text outKey = new Text();
        private Text outValue = new Text();

        public void map(LongWritable lineNumber, Text line,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {

            String tmp = line.toString();
            int i1 = tmp.lastIndexOf(' ');
            if (i1 > 0) {
                outKey.set(tmp.substring(0, i1));
                outValue.set(tmp.substring(i1 + 1));
                output.collect(outKey, outValue);
            } else {
                System.err.println("Problem with line(" + tmp + ")");
            }
//            output.collect(line, outValue);
//            reporter.setStatus("Running");
        }
    }
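    /**
     * Variant of DeReffingCDXCanonicalizerMapClass whose referenced files are
     * parsed with FunkyCDXCanonicalizerMapClass instead of
     * CDXCanonicalizerMapClass.
     */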
    public static class FunkyDeReffingCDXCanonicalizerMapClass extends
            DeReffingCDXCanonicalizerMapClass {

        protected Mapper<LongWritable, Text, Text, Text> getInner() {
            return new FunkyCDXCanonicalizerMapClass();
        }
    }
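    /**
     * Mapper whose input lines are themselves references (HDFS paths or
     * http:// URLs, optionally gzip-compressed) to CDX files. Each referenced
     * file is opened and its lines are fed, one at a time, to the inner
     * mapper returned by getInner().
     */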
    public static class DeReffingCDXCanonicalizerMapClass extends
            MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {

        protected Mapper<LongWritable, Text, Text, Text> getInner() {
            return new CDXCanonicalizerMapClass();
        }

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
         */
        public void map(LongWritable lineNo, Text urlText,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {

            LongWritable lw = new LongWritable();
            Text tmp = new Text();
//            CDXCanonicalizerMapClass inner = new CDXCanonicalizerMapClass();
            Mapper<LongWritable, Text, Text, Text> inner = getInner();

            // arg 1 is a URL
            String urlString = urlText.toString();
            InputStream is = null;
            FileSystem fs = null;
            if (urlString.startsWith("http://")) {
                URL u = new URL(urlString);
                System.err.println("Opening URL stream for:" + urlString);
                is = u.openStream();
            } else {
                System.err.println("Creating default Filesystem for:"
                        + urlString);
                fs = FileSystem.get(new Configuration(true));
                Path p = new Path(urlString);
//                FSDataInputStream fsdis = fs.open(p);
                is = fs.open(p);
            }
            if (urlString.endsWith(".gz")) {
                is = new GZIPInputStream(is);
            }
            try {
                BufferedReader br = new BufferedReader(new InputStreamReader(
                        is, ByteOp.UTF8));
                String tmpS = null;
                long line = 0;
                while ((tmpS = br.readLine()) != null) {
                    lw.set(line++);
                    tmp.set(tmpS);
                    inner.map(lw, tmp, output, reporter);
                }
                is.close();
                if (fs != null) {
                    fs.close();
                }
            } catch (IOException e) {
                System.err.println("IOException with url:" + urlString);
                e.printStackTrace();
                throw e;
            }
        }
    }

    /**
     * Mapper which reads an identity CDX line, outputting:
     *   key - canonicalized original URL + timestamp
     *   val - everything else
     *
     * @author brad
     * @version $Date$, $Revision$
     */
    public static class CDXCanonicalizerMapClass extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {

        private Text outKey = new Text();
        private Text outValue = new Text();

        AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();

        private StringBuilder ksb = new StringBuilder();

        private int i1 = 0;
        private int i2 = 0;
        private int i3 = 0;
        private int i4 = 0;

        public void map(LongWritable lineNumber, Text line,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {

            String s = line.toString();
            boolean problems = true;
            i1 = s.indexOf(' ');
            if (i1 > 0) {
                i2 = s.indexOf(' ', i1 + 1);
                if (i2 > 0) {
                    i3 = s.indexOf(' ', i2 + 1);
                    if (i3 > 0) {
                        i4 = s.lastIndexOf(' ');
                        if (i4 > i3) {
                            try {
                                ksb.setLength(0);
                                ksb.append(canonicalizer.urlStringToKey(s
                                        .substring(i2 + 1, i3)));
                                ksb.append(s.substring(i1, i4));
                                outKey.set(ksb.toString());
                                outValue.set(s.substring(i4 + 1));
                                output.collect(outKey, outValue);
                                problems = false;
                            } catch (URIException e) {
                                // just eat it.. problems will be true.
                            }
                        }
                    }
                }
            }
            if (problems) {
                System.err.println("CDX-Can: Problem with line(" + s + ")");
            }
        }
    }

    /**
     * Mapper which reads an identity Funky format CDX line, outputting:
     *   key - canonicalized original URL + timestamp
     *   val - everything else
     *
     * Input lines are a hybrid format:
     *
     *   ORIG_URL
     *   DATE
     *   '-' (literal)
     *   MIME
     *   HTTP_CODE
     *   SHA1
     *   REDIRECT
     *   START_OFFSET
     *   ARC_PREFIX (sans .arc.gz)
     *   ROBOT_FLAG (combo of AIF - no: Archive,Index,Follow, or '-' if none)
     *
     * Ex:
     *   http://www.myow.de:80/news_show.php? 20061126032815 - text/html 200 DVKFPTOJGCLT3G5GUVLCETHLFO3222JM - 91098929 foo A
     *
     * Need to:
     *   . replace col 3 with orig url
     *   . replace col 1 with canonicalized orig url
     *   . replace SHA1 with first 4 digits of SHA1
     *   . append .arc.gz to ARC_PREFIX
     *   . omit lines with ROBOT_FLAG containing 'A'
     *   . remove last column
     *
     * @author brad
     * @version $Date$, $Revision$
     */
    public static class FunkyCDXCanonicalizerMapClass extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {

        private static int SHA1_DIGITS = 3;

        private Text outKey = new Text();
        private Text outValue = new Text();

        AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();

        private StringBuilder ksb = new StringBuilder();
        private StringBuilder vsb = new StringBuilder();

        private int i1 = 0;
        private int i2 = 0;
        private int i3 = 0;
        private int i4 = 0;

        public void map(LongWritable lineNumber, Text line,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {

            String s = line.toString();
            String parts[] = s.split(" ");
            boolean problems = true;
            if (parts.length == 10) {
                if (!parts[9].contains("A")) {
                    ksb.setLength(0);
                    vsb.setLength(0);
                    try {
                        ksb.append(canonicalizer.urlStringToKey(parts[0]))
                                .append(" ");
                        ksb.append(parts[1]); // date

                        vsb.append(parts[0]).append(" "); // orig_url
                        vsb.append(parts[3]).append(" "); // MIME
                        vsb.append(parts[4]).append(" "); // HTTP_CODE
                        vsb.append(parts[5].substring(0, SHA1_DIGITS))
                                .append(" "); // SHA1
                        vsb.append(parts[6]).append(" "); // redirect
                        vsb.append(parts[7]).append(" "); // start_offset
                        vsb.append(parts[8]).append(".arc.gz"); // arc_prefix

                        outKey.set(ksb.toString());
                        outValue.set(vsb.toString());
                        output.collect(outKey, outValue);
                    } catch (URIException e) {
                        System.err.println("Failed Canonicalize:(" + parts[0]
                                + ") in (" + parts[8] + "):(" + parts[7] + ")");
                    }
                }
            } else {
                System.err.println("Funky: Problem with line(" + s + ")");
            }
        }
    }
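    /*
     * Sketch of programmatic submission (argument values below are
     * placeholders): the same entry point used by main() can be driven from
     * other code via ToolRunner, e.g.
     *
     *   int rc = ToolRunner.run(new Configuration(), new CDXSort(),
     *       new String[] { "--canonicalize", "split.txt", "in.cdx", "out" });
     */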
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new CDXSort(), args);
        System.exit(res);
    }

    /**
     * Get the last job that was run using this instance.
     *
     * @return the results of the last job that was run
     */
    public RunningJob getResult() {
        return jobResult;
    }
}