/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 **/

package org.commoncrawl.hadoop.template;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunner;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.hadoop.io.ARCInputFormat;
import org.commoncrawl.hadoop.io.ARCSplitCalculator;
import org.commoncrawl.hadoop.io.ARCSplitReader;
import org.commoncrawl.hadoop.io.JetS3tARCSource;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.util.ArcFileReader;

/**
 * 
 * @author rana
 * 
 */
public class SampleHadoopJob extends MapRunner<Text, ArcFileItem, Text, Text> {

  /** logging **/
  private static final Log LOG = LogFactory.getLog(SampleHadoopJob.class);

  /**
   * main routine
   * 
   * @param args
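   *          expected arguments: accessKey secretKey regEx groupNumber.
   *          As a purely illustrative sketch (the jar name and the sample
   *          regular expression below are assumptions, not part of this
   *          source), an invocation might look like:
   * 
   *          hadoop jar commoncrawl-samples.jar \
   *            org.commoncrawl.hadoop.template.SampleHadoopJob \
   *            ACCESS_KEY SECRET_KEY "href=\"([^\"]+)\"" 1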
   */
  public static void main(String[] args) {

    // amazon access key - passed on command line
    String accessKey = args[0];
    // amazon secret key - passed on command line
    String secretKey = args[1];
    // regular expression to match against - passed in on the command line
    String regEx = args[2];
    // group number to extract
    int groupNumber = Integer.parseInt(args[3]);

    /** arc file names start with year then month **/
    // we want to process all files uploaded in 2009,
    // so we will use the prefix string "2009",
    // but you could, for example, pass in a more restrictive
    // pattern such as "2008/06".
    String inputPrefix = "2009";

    LOG.info("Processing Path:" + inputPrefix);

    // allocate job config
    JobConf job = new JobConf(SampleHadoopJob.class);
    // set job name
    job.setJobName("Sample RegEx Job against path:" + inputPrefix);
    // set regular expression attributes
    job.set("mapred.mapper.regex", regEx);
    job.setInt("mapred.mapper.regex.group", groupNumber);

    // create temp file path
    Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/temp-"
        + System.currentTimeMillis());
    LOG.info("Output for job " + job.getJobName() + " is:" + tempDir);

    // we are going to be using the JetS3tARCSource as an input source to
    // the ARCInputFormat. This input source uses the multi-threaded JetS3t
    // library to request data from S3.

    /** setup s3 properties **/

    // set the number of retries per ARC file.
    // we are setting this number to one, so if an IOException
    // occurs when processing an ARC file, we are going to silently skip it
    // and continue processing the next ARC file. You should set this to be
    // a number LESS than mapred.max.tracker.failures (as defined in your
    // job config or hadoop-site.xml). Otherwise, your entire job could
    // fail if it encounters a bad ARC file in the bucket, or if the S3
    // service exhibits a failure condition specific to a single key or set
    // of keys.
    JetS3tARCSource.setMaxRetries(job, 1);

    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);

    // set the number of files per split.
    // set this number higher if the bucket contains lots of files, to reduce
    // the burden on the map-reduce system from tracking too many file splits.
    ARCSplitCalculator.setFilesPerSplit(job, 25);

    /** set up arc reader properties **/

    // again, set the timeout to something reasonable, so that your entire job
    // will not hang if a single GET request fails to complete in a reasonable
    // amount of time
    ArcFileReader.setIOTimeoutValue(30000);

    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, inputPrefix);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ARCInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);

    // now inform the job that it needs to use the ARCInputFormat
    job.setInputFormat(ARCInputFormat.class);

    // set up our map runner class.
    // we use a map runner instead of a mapper here to give us an extra level
    // of control over how we handle errors. When running a large job against
    // the crawl corpus, which may contain hundreds of thousands of ARC files,
    // it is extremely important to reduce the risk of abnormal job
    // termination.
    job.setMapRunnerClass(SampleHadoopJob.class);

    // setup reducer (identity in this case ... )
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    FileOutputFormat.setOutputPath(job, tempDir);
    // map output types: the map routine emits <Text url, Text match> pairs,
    // so both the key and value classes must be Text
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
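
    // the matches land in tempDir as SequenceFile records of
    // <Text url, Text match>. As a hypothetical follow-up step (not part of
    // this sample job), the output could be inspected from the shell with:
    //
    //   hadoop fs -text <tempDir>/part-00000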

    // run the job ...
    try {
      LOG.info("Starting Job:" + job.getJobName());
      JobClient.runJob(job);
      LOG.info("Finished Job:" + job.getJobName());
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      e.printStackTrace();
    }
  }

  /** track the task's attempt id **/
  private TaskAttemptID attemptID = null;
  /** the maximum attempt id we will tolerate before failing hard **/
  private int maxAttemptTaskId = -1;
  /** track split details for debugging purposes **/
  private String splitDetails = null;
  /** regular expression pattern - initialized from job config (per mapper) **/
  private Pattern pattern = null;
  private int group = 0;

  /** overridden to initialize class variables from job config **/
  @Override
  public void configure(JobConf job) {
    attemptID = TaskAttemptID.forName(job.get("mapred.task.id"));
    maxAttemptTaskId = job.getInt("mapred.max.tracker.failures", 4) - 1;
    splitDetails = job.get(ARCSplitReader.SPLIT_DETAILS,
        "Split Details Unknown");
    pattern = Pattern.compile(job.get("mapred.mapper.regex"));
    group = job.getInt("mapred.mapper.regex.group", 0);
  }

  /** internal map routine - called by our map runner override **/
  void map(Text key, ArcFileItem value, OutputCollector<Text, Text> output,
      Reporter reporter) throws IOException {

    try {
      // TODO: this is a very simplistic char conversion
      Charset asciiCharset = Charset.forName("ASCII");

      // if the arc file item has content ...
      if (value.getContent().getCount() != 0) {
        // attempt to convert it to ascii
        String asciiString = asciiCharset.decode(
            ByteBuffer.wrap(value.getContent().getReadOnlyBytes(), 0, value
                .getContent().getCount())).toString();

        // ok, walk the string looking for the pattern
        Matcher matcher = pattern.matcher(asciiString);
        while (matcher.find()) {
          // found a match; output it - key is url, value is matched group
          output.collect(key, new Text(matcher.group(group)));
        }
      }
    }
    // catch any type of exception and log it ONLY for now
    catch (Exception e) {
      LOG.error(StringUtils.stringifyException(e));
    }
  }

  /** we extend map runner to exert greater control over failures **/
  @Override
  public void run(RecordReader<Text, ArcFileItem> input,
      OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {

    try {
      // allocate key & value instances that are re-used for all entries
      Text key = input.createKey();
      ArcFileItem value = input.createValue();

      // read next input
      while (input.next(key, value)) {
        // call map function
        map(key, value, output, reporter);
      }
    } catch (IOException e) {
      String errorMessage = "Exception processing Split:" + splitDetails
          + " Exception:" + StringUtils.stringifyException(e);
      LOG.error(errorMessage);

      if (attemptID.getId() != maxAttemptTaskId) {
        throw new IOException(errorMessage);
      }
      // otherwise this is the last allowed attempt, so just swallow the
      // error and let the task complete
    } catch (Throwable e) {
      String errorMessage = "Unknown Exception processing Split:"
          + splitDetails + " Exception:" + StringUtils.stringifyException(e);
      LOG.error(errorMessage);

      // if attempt number is not max attempt number configured...
      if (attemptID.getId() != maxAttemptTaskId) {
        // then bubble up exception
        throw new IOException(errorMessage);
      }
    } finally {
      input.close();
    }
  }
}