package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Vector;

import static org.mockito.Mockito.mock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.S3SeekableResilientInputStream;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;

import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.google.common.collect.Lists;

/**
 * Final Merge Step is done using a non-shuffle reduce since all input segments
 * have been pre-sorted and pre-sharded with the proper shard count.
 *
 * @author rana
 */
public class CrawlDBMergeSortReducer implements Reducer<IntWritable, Text, TextBytes, TextBytes> {

  static final Log LOG = LogFactory.getLog(CrawlDBMergeSortReducer.class);

  JobConf _conf;
  AmazonS3Client _s3Client;

  @Override
  public void configure(JobConf job) {
    _conf = job;
    _s3Client = new AmazonS3Client(
        new BasicAWSCredentials(
            _conf.get("fs.s3n.awsAccessKeyId"),
            _conf.get("fs.s3n.awsSecretAccessKey")));
  }

  @Override
  public void close() throws IOException {
    _s3Client.shutdown();
  }

  static class RawValueIterator implements Iterator<TextBytes> {

    TextBytes keyBytes = new TextBytes();
    TextBytes valueBytes = new TextBytes();
    DataInputBuffer keyInputBuffer = new DataInputBuffer();
    DataInputBuffer inputBuffer = new DataInputBuffer();
    Iterator<RawRecordValue> rawIterator;

    void reset(Iterable<RawRecordValue> rawIterable) {
      this.rawIterator = rawIterable.iterator();
    }

    @Override
    public boolean hasNext() {
      return rawIterator.hasNext();
    }

    @Override
    public TextBytes next() {
      try {
        RawRecordValue nextRawValue = rawIterator.next();
        // read in text bytes key ...
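        // each RawRecordValue exposes the raw key and value buffers as they were
        // written to the underlying segment files; both payloads are VInt
        // length-prefixed, so read each length first and then point the reusable
        // TextBytes instances at the corresponding span of the raw buffer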
        keyInputBuffer.reset(nextRawValue.key.getData(), 0, nextRawValue.key.getLength());
        inputBuffer.reset(nextRawValue.data.getData(), 0, nextRawValue.data.getLength());
        int valueTextLen = WritableUtils.readVInt(inputBuffer);
        valueBytes.set(nextRawValue.data.getData(), inputBuffer.getPosition(), valueTextLen);
        int keyTextLen = WritableUtils.readVInt(keyInputBuffer);
        keyBytes.set(nextRawValue.key.getData(), keyInputBuffer.getPosition(), keyTextLen);
        System.out.println("NextKey:" + keyBytes.toString() + " Source:" + nextRawValue.source);
        return valueBytes;
      } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException("remove");
    }
  }

  /**
   * Minimal read-only FileSystem stub over s3n: only open() (backed by the
   * resilient seekable S3 stream) and getFileStatus() (backed by S3 object
   * metadata) are implemented; all other operations are no-op stubs.
   */
  private FileSystem getS3NFileSystem() throws IOException {
    return new FileSystem() {

      @Override
      public URI getUri() {
        return null;
      }

      @Override
      public FSDataInputStream open(Path f, int bufferSize) throws IOException {
        return new FSDataInputStream(
            new S3SeekableResilientInputStream(
                f.toUri(),
                _conf.get("fs.s3n.awsAccessKeyId"),
                _conf.get("fs.s3n.awsSecretAccessKey"),
                bufferSize,
                100));
      }

      @Override
      public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize,
          short replication, long blockSize, Progressable progress) throws IOException {
        return null;
      }

      @Override
      public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
        return null;
      }

      @Override
      public boolean rename(Path src, Path dst) throws IOException {
        return false;
      }

      @Override
      @Deprecated
      public boolean delete(Path f) throws IOException {
        return false;
      }

      @Override
      public boolean delete(Path f, boolean recursive) throws IOException {
        return false;
      }

      @Override
      public FileStatus[] listStatus(Path f) throws IOException {
        return null;
      }

      @Override
      public void setWorkingDirectory(Path new_dir) {
      }

      @Override
      public Path getWorkingDirectory() {
        return null;
      }

      @Override
      public boolean mkdirs(Path f, FsPermission permission) throws IOException {
        return false;
      }

      @Override
      public FileStatus getFileStatus(Path f) throws IOException {
        // get uri from path ...
        URI uri = f.toUri();
        // convert to s3 path ..
        String key = uri.getPath().substring(1);
        System.out.println("***uri path:" + key);
        ObjectMetadata metadata = _s3Client.getObjectMetadata(uri.getHost(), key);
        if (metadata != null) {
          FileStatus fileStatus = new FileStatus(metadata.getContentLength(), false, 1, 0,
              metadata.getLastModified().getTime(), 0, FsPermission.getDefault(), "", "", f);
          return fileStatus;
        }
        return null;
      }
    };
  }

  /**
   * The incoming values are paths to pre-sorted, identically sharded segment
   * files; they are merge-iterated via MultiFileInputReader and fed to the
   * actual reducer (CrawlDBMergingReducer), so no shuffle/sort is required.
   */
  @Override
  public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
      Reporter reporter) throws IOException {
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();
    Set<String> fsType = new HashSet<String>();

    while (values.hasNext()) {
      String path = values.next().toString();
      LOG.info("Found Incoming Path:" + path);
      incomingPaths.add(new Path(path));
      // convert to uri ...
      URI uri = new Path(path).toUri();
      // get scheme if present ...
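      // (every incoming segment must be readable through a single FileSystem
      // instance, so record each path's scheme here and verify below that
      // exactly one scheme is in play)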
      String scheme = uri.getScheme();
      if (scheme == null || scheme.length() == 0) {
        fsType.add("default");
      } else {
        fsType.add(scheme);
      }
    }

    if (fsType.size() != 1) {
      throw new IOException("Only One Input Scheme at a time supported!");
    }

    boolean isS3N = fsType.contains("s3n");

    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_conf);
    // we don't want to use a grouping comparator because we are using the reducer code from the intermediate
    // merge
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, CrawlDBKey.LinkKeyComparator.class,
        RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class, WritableComparable.class);

    // spawn merger
    // pick filesystem based on path ...
    FileSystem fs = null;
    if (!isS3N) {
      fs = FileSystem.get(incomingPaths.get(0).toUri(), _conf);
    } else {
      // use our custom s3n stub
      fs = getS3NFileSystem();
    }
    LOG.info("FileSystem is:" + fs.toString());

    MultiFileInputReader<TextBytes> multiFileInputReader =
        new MultiFileInputReader<TextBytes>(fs, incomingPaths, localMergeConfig);

    // create crawl db writer, which is the actual reducer we want to use ...
    CrawlDBMergingReducer crawlDBWriter = new CrawlDBMergingReducer();
    crawlDBWriter.configure(_conf);

    RawValueIterator rawValueIterator = new RawValueIterator();
    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

    // walk tuples and feed them to the actual reducer ...
    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
      System.out.println("PKey:" + nextItem.e0._keyObject);
      rawValueIterator.reset(nextItem.e1);
      // output to reducer ...
      crawlDBWriter.reduce(nextItem.e0._keyObject, rawValueIterator, output, reporter);
      reporter.progress();
    }
    // flush output
    crawlDBWriter.close();
  }

  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();

  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }

  private static List<Path> getIntermediateSegmentPaths(AmazonS3Client s3Client) throws IOException {
    ArrayList<Path> listOut = Lists.newArrayList();

    ObjectListing response = s3Client.listObjects(new ListObjectsRequest()
        .withBucketName("aws-publicdatasets")
        .withPrefix("common-crawl/crawl-db/merge/intermediate/")
        .withDelimiter("/"));

    do {
      LOG.info("Response Common Prefixes:" + response.getCommonPrefixes());
      for (String entry : response.getCommonPrefixes()) {
        try {
          Path s3nPath = new Path("s3n", "aws-publicdatasets", "/" + entry);
          //long timestamp = Long.parseLong(s3nPath.getName());
          listOut.add(s3nPath);
        } catch (Exception e) {
        }
      }
      if (response.isTruncated()) {
        response = s3Client.listNextBatchOfObjects(response);
      } else {
        break;
      }
    } while (true);

    return listOut;
  }

  /**
   * do a merge on a single shard for test purposes
   * @param args
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    String s3AccessKey = args[0];
    String s3Secret = args[1];
    int partNumber = Integer.parseInt(args[2]);

    AmazonS3Client s3Client = new AmazonS3Client(new BasicAWSCredentials(s3AccessKey, s3Secret));
    List<Path> segments = getIntermediateSegmentPaths(s3Client);

    String partName = "part-" + NUMBER_FORMAT.format(partNumber);

    List<Text> transformedPaths = Lists.newArrayList();
    for (Path path : segments) {
      transformedPaths.add(new Text(new Path(path, partName).toUri().toString()));
      if (transformedPaths.size() >= 3)
        break;
    }

    CrawlDBMergeSortReducer finalMerge = new CrawlDBMergeSortReducer();

    JobConf conf = new JobConf();
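
    // configure() reads both s3n credential keys from the JobConf to construct
    // its AmazonS3Client, so populate them before configure() is invoked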
    conf.set("fs.s3n.awsAccessKeyId", s3AccessKey);
    conf.set("fs.s3n.awsSecretAccessKey", s3Secret);
    finalMerge.configure(conf);

    finalMerge.reduce(new IntWritable(1), transformedPaths.iterator(),
        new OutputCollector<TextBytes, TextBytes>() {

          long lastValue = Long.MIN_VALUE;

          @Override
          public void collect(TextBytes key, TextBytes value) throws IOException {
            long domainHash = CrawlDBKey.getLongComponentFromKey(key,
                CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID);
            // sanity check: output keys must arrive in non-decreasing domain hash order
            if (domainHash < lastValue) {
              throw new IOException("LastValue:" + lastValue + " CurrentValue:" + domainHash + " " + value.toString());
            }
            lastValue = domainHash;
            System.out.println("OutputKey:" + key.toString());
          }
        }, mock(Reporter.class));
  }
}