package org.commoncrawl.mapred.pipelineV3.crawllistgen;

import java.io.IOException;
import java.net.MalformedURLException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCommon;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.CrawlStatsCommon;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.S3NFileSystem;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;
import org.commoncrawl.util.URLUtils;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

public class NewPartitionUrlsStep extends CrawlPipelineStep {

  public NewPartitionUrlsStep(CrawlPipelineTask task) {
    super(task, "Partition URLS", OUTPUT_DIR_NAME);
  }

  public static final String ROOTDOMAIN_METADATA_PATH = "root.meta.path";
  public static final String SUBDOMAIN_METADATA_PATH = "subdomain.meta.path";

  public static final int NUM_PARTITIONS = 100;

  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }

  private static final Log LOG = LogFactory.getLog(NewPartitionUrlsStep.class);

  public static final String OUTPUT_DIR_NAME = "paritionUrlsStep";

  @Override
  public Log getLogger() {
    return LOG;
  }

  static final String PARTITION_ID_START_PROPERTY = "listgen.partitionIdStart";
  static final String NUM_PARTITIONS_PROPERTY = "listgen.numPartitions";

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    // get entire set of input crawl db paths ...
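    // Note: Iterables.partition below sizes each group via integer division
    // (crawlDBPaths.size() / numPartitions), so the final group may be smaller,
    // the total group count can exceed numPartitions, and if there are fewer
    // input paths than partitions the computed size is zero and partition()
    // throws an IllegalArgumentException.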
    DomainMetadataTask rootTask = (DomainMetadataTask) getRootTask();
    Configuration conf = new Configuration(rootTask.getConf());

    int partitionStartId = conf.getInt(PARTITION_ID_START_PROPERTY, 0);
    int numPartitions = conf.getInt(NUM_PARTITIONS_PROPERTY, NUM_PARTITIONS);

    LOG.info("Partition Id Start:" + partitionStartId);
    LOG.info("Num Partitions:" + numPartitions);

    List<Path> crawlDBPaths = rootTask.getRestrictedMergeDBDataPaths();

    LOG.info("Input Paths are:" + crawlDBPaths);

    // partition ...
    Iterable<List<Path>> partitions = Iterables.partition(crawlDBPaths, crawlDBPaths.size() / numPartitions);

    // get a file system object ...
    FileSystem fs = FileSystem.get(outputPathLocation.toUri(), conf);

    // iterate partitions
    int partitionIndex = partitionStartId;
    for (List<Path> partitionPaths : partitions) {
      // construct output path ...
      Path partitionOutputPath = new Path(outputPathLocation, NUMBER_FORMAT.format(partitionIndex));
      // skip partitions whose output already exists, so the step can resume ...
      if (!fs.exists(partitionOutputPath)) {
        runStepForPartition(rootTask, partitionIndex, partitionPaths, partitionOutputPath);
      }
      ++partitionIndex;
    }
  }

  void runStepForPartition(DomainMetadataTask rootTask, int partitionIndex, List<Path> inputPaths, Path outputPath)
      throws IOException {
    // build the basic job config ...
    JobConf job = new JobBuilder("Partition URL List", new Configuration())
      .inputFormat(PartitionJoinInputFormat.class)
      .mapper(RankAndFilterMapper.class)
      .keyValue(CrawlListKey.class, TextBytes.class)
      .sort(CrawlListKey.CrawListKeyComparator.class)
      .partition(CrawlListKey.CrawlListKeyPartitioner.class)
      .numReducers(CrawlListGenCommon.NUM_LIST_PARTITIONS)
      .compressor(CompressionType.BLOCK, SnappyCodec.class)
      .output(outputPath)
      .outputIsSeqFile()
      .build();

    // four hour task timeout
    job.setInt("mapred.task.timeout", 4 * 60 * 60 * 1000);

    // write partition paths ...
    PartitionJoinInputFormat.writeSinglePathPerPartition(inputPaths, job);

    // ok, figure out locations of dependent metadata ...
    job.set(ROOTDOMAIN_METADATA_PATH, rootTask.getOutputDirForStep(ShardRootDomainClassificationStep.class).toString());
    job.set(SUBDOMAIN_METADATA_PATH, rootTask.getOutputDirForStep(ShardSubDomainMetadataStep.class).toString());

    // run it ...
    JobClient.runJob(job);
  }
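  // the per-partition completeness check below is commented out; as written,
  // this step always reports itself complete and relies on runStep's
  // fs.exists() guard to skip partitions that already have output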
  @Override
  public boolean isComplete() throws IOException {
    /*
    // get entire set of input crawl db paths ...
    DomainMetadataTask rootTask = (DomainMetadataTask) getRootTask();
    Configuration conf = new Configuration(rootTask.getConf());

    List<Path> crawlDBPaths = rootTask.getMergeDBDataPaths();

    int partitionStartId = conf.getInt(PARTITION_ID_START_PROPERTY, 0);
    int numPartitions = conf.getInt(NUM_PARTITIONS_PROPERTY, NUM_PARTITIONS);

    Path outputPath = getOutputDir();

    // get a file system object ...
    FileSystem fs = FileSystem.get(outputPath.toUri(), conf);

    // iterate partitions
    for (int partitionIndex = partitionStartId; partitionIndex < partitionStartId + numPartitions; ++partitionIndex) {
      // construct output path ...
      Path partitionOutputPath = new Path(outputPath, NUMBER_FORMAT.format(partitionIndex));
      // if not present ...
      if (!fs.exists(partitionOutputPath)) {
        LOG.info("Partition output path:" + partitionOutputPath + " not found!");
        return false;
      } else {
        LOG.info("Found partition output path:" + partitionOutputPath);
      }
    }
    */
    return true;
  }

  public static class RankAndFilterMapper implements Mapper<IntWritable, Text, CrawlListKey, TextBytes> {

    Path rootDomainMetaPath;
    Path subDomainMetaPath;
    JobConf _conf;

    @Override
    public void configure(JobConf job) {
      rootDomainMetaPath = new Path(job.get(ROOTDOMAIN_METADATA_PATH));
      subDomainMetaPath = new Path(job.get(SUBDOMAIN_METADATA_PATH));
      _conf = job;
    }

    @Override
    public void close() throws IOException {
    }

    OutputCollector<CrawlListKey, TextBytes> _collector;

    enum Counters {
      ROOT_DOMAIN_ID_MISMATCH,
      SKIPPING_BLACKLISTED_URL,
      SKIPPING_LIMITED_CRAWL_URL,
      SUBDOMAIN_METADATA_WITHOUT_MATCHING_ROOT_DOMAIN_METADATA,
      URL_METADATA_MISSING_URL,
      FOUND_LINK_RECORD,
      FILTERED_OUT_URL,
      SKIPPING_NON_RANKED_URL,
      BLEKKO_URL,
      CC_URL,
      EMITTED_URL_WITH_QUERY,
      COULD_NOT_RESOLVE_WWW_PREFIX,
      USING_WWW_PREFIX,
      SKIPPING_QUERY_URL,
      NO_SOURCE_URL
    }

    static void extractHTTPHeaderData(JsonObject metadataObject, JsonObject jsonOut) throws IOException {
      if (metadataObject.has(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY)) {
        JsonObject summaryRecord = metadataObject.getAsJsonObject(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY);
        if (summaryRecord.has(CrawlDBCommon.SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
          JsonArray crawlDetails = summaryRecord.getAsJsonArray(CrawlDBCommon.SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY);

          long latestAttemptTime = -1;
          long lastModifiedTime = -1;
          String etag = null;

          for (JsonElement detailElement : crawlDetails) {
            JsonObject detailRecord = detailElement.getAsJsonObject();
            int httpResult = JSONUtils.safeGetInteger(detailRecord, "http_result");
            if (httpResult >= 200 && httpResult < 300) {
              long attemptTime = JSONUtils.safeGetLong(detailRecord, "attempt_time");
              if (attemptTime != -1 && attemptTime > latestAttemptTime) {
                // track the most recent successful attempt, and take its
                // last-modified / etag values
                latestAttemptTime = attemptTime;
                if (detailRecord.has("last-modified")) {
                  lastModifiedTime = detailRecord.get("last-modified").getAsLong();
                } else {
                  lastModifiedTime = -1L;
                }
                if (detailRecord.has("etag")) {
                  etag = detailRecord.get("etag").getAsString();
                } else {
                  etag = null;
                }
              }
            }
          }

          if (lastModifiedTime != -1L) {
            jsonOut.addProperty(CrawlListGenCommon.CRAWLLIST_METADATA_LAST_MODIFIED_TIME, lastModifiedTime);
          }
          if (etag != null) {
            jsonOut.addProperty(CrawlListGenCommon.CRAWLLIST_METADATA_ETAG, etag);
          }
        }
      }
    }

    static String stripWWW(String host) {
      if (host.startsWith("www.")) {
        return host.substring("www.".length());
      }
      return host;
    }
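    /**
     * Canonicalizes a parsed URL for crawl-list output: strips any trailing
     * dot from the host, optionally restores a "www." prefix, drops path
     * parameters after ';', omits the default port 80, strips common
     * session-id patterns from the query, and re-appends the fragment only
     * for "#!" (AJAX-crawlable) URLs.
     */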
    public static String canonicalizeURL(GoogleURL urlObject, boolean prefixWithWWW) throws MalformedURLException {
      StringBuilder urlOut = new StringBuilder();

      urlOut.append(urlObject.getScheme());
      urlOut.append("://");
      if (urlObject.getUserName() != GoogleURL.emptyString) {
        urlOut.append(urlObject.getUserName());
        if (urlObject.getPassword() != GoogleURL.emptyString) {
          urlOut.append(":");
          urlOut.append(urlObject.getPassword());
        }
        urlOut.append("@");
      }

      String host = urlObject.getHost();
      if (host.endsWith(".")) {
        host = host.substring(0, host.length() - 1);
      }
      // and if we should prefix with www. add it back in ...
      if (!host.startsWith("www.") && prefixWithWWW) {
        host = "www." + host;
      }
      urlOut.append(host);

      if (urlObject.getPort() != GoogleURL.emptyString && !urlObject.getPort().equals("80")) {
        urlOut.append(":");
        urlOut.append(urlObject.getPort());
      }
      if (urlObject.getPath() != GoogleURL.emptyString) {
        int indexOfSemiColon = urlObject.getPath().indexOf(';');
        if (indexOfSemiColon != -1) {
          urlOut.append(urlObject.getPath().substring(0, indexOfSemiColon));
        } else {
          urlOut.append(urlObject.getPath());
        }
      }
      if (urlObject.getQuery() != GoogleURL.emptyString) {
        urlOut.append("?");
        urlOut.append(urlObject.getQuery());
      }

      String canonicalizedURL = urlOut.toString();

      // phase 2 - remove common session id patterns
      canonicalizedURL = URLUtils.sessionIdNormalizer.normalize(canonicalizedURL, "");

      // phase 3 - stir back in ref if #!
      if (urlObject.getRef().length() != 0 && urlObject.getRef().charAt(0) == '!') {
        canonicalizedURL += "#" + urlObject.getRef();
      }

      return canonicalizedURL;
    }

    static void addPartFileGivenPath(List<Path> paths, FileSystem fs, Path path) throws IOException {
      FileStatus files[] = fs.globStatus(new Path(path, "part-*"));
      for (FileStatus file : files) {
        paths.add(file.getPath());
      }
    }

    private static FileSystem getFileSystemForMergePath(Path path, Configuration conf) throws IOException {
      // override S3N
      if (path.toUri().getScheme().equalsIgnoreCase("s3n")) {
        FileSystem fs = new S3NFileSystem();
        fs.initialize(path.toUri(), conf);
        return fs;
      }
      // conf.setClass("fs.s3n.impl", S3NFileSystem.class, FileSystem.class);
      return FileSystem.get(path.toUri(), conf);
    }

    @Override
    public void map(IntWritable key, Text value, OutputCollector<CrawlListKey, TextBytes> output, Reporter reporter)
        throws IOException {

      // set up merge attributes
      Configuration localMergeConfig = new Configuration(_conf);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS,
          CrawlDBKey.CrawlDBKeyComparator.class, Comparator.class);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, CrawlDBKey.class, WritableComparable.class);

      // get the single input path...
      Path inputPath = new Path(value.toString());
      // get fs based on path ...
      FileSystem fs = FileSystem.get(inputPath.toUri(), _conf);

      ArrayList<Path> paths = Lists.newArrayList();
      // add join paths
      addPartFileGivenPath(paths, fs, rootDomainMetaPath);
      addPartFileGivenPath(paths, fs, subDomainMetaPath);
      paths.add(inputPath);

      LOG.info("Input Paths for Shard:" + key.get() + " Are:" + paths);

      // replace emr s3n for inputs ...
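      // the merge reader gets a single filesystem object, resolved from the
      // first join path; on EMR this swaps in S3NFileSystem in place of the
      // stock s3n implementation (all merge inputs are assumed to share the
      // same scheme)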
      FileSystem mergefs = getFileSystemForMergePath(paths.get(0), localMergeConfig);

      // ok now spawn merger
      MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(mergefs, paths,
          localMergeConfig);

      try {
        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

        TextBytes valueText = new TextBytes();
        DataInputBuffer valueStream = new DataInputBuffer();
        JsonParser parser = new JsonParser();

        _collector = output;

        long _rootDomainId = -1L;
        JsonObject _rootDomainMetadata = null;
        long _subDomainId = -1L;
        boolean _isSuperDomain = false;
        boolean _isBlacklisted = false;
        boolean _limitedCrawl = false;
        boolean _prefixWithWWW = false;

        URLFilter filter = new URLFilter();

        CrawlListKey keyOut = new CrawlListKey();
        TextBytes valueOut = new TextBytes();

        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
          // LOG.info("Key:" + nextItem.e0._keyObject.toString());
          long recordType = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
              CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);

          if (recordType == CrawlDBKey.Type.KEY_TYPE_ROOTDOMAIN_METADATA_RECORD.ordinal()) {
            RawRecordValue rawValue = Iterables.getFirst(nextItem.e1, null);
            valueStream.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            valueText.setFromRawTextBytes(valueStream);

            _rootDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID);
            // LOG.info("Got Root Domain Record:" + _rootDomainId);
            _rootDomainMetadata = parser.parse(valueText.toString()).getAsJsonObject();
            _isSuperDomain = JSONUtils.safeGetBoolean(_rootDomainMetadata,
                CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_SUPERDOMAIN);
            _isBlacklisted = JSONUtils.safeGetBoolean(_rootDomainMetadata,
                CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_BLACKLISTED);
            _limitedCrawl = JSONUtils.safeGetBoolean(_rootDomainMetadata,
                CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_LIMITED_CRAWL);
            _prefixWithWWW = false;
          } else if (recordType == CrawlDBKey.Type.KEY_TYPE_SUBDOMAIN_METADATA_RECORD.ordinal()) {
            _subDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID);

            RawRecordValue rawValue = Iterables.getFirst(nextItem.e1, null);
            valueStream.reset(rawValue.data.getData(), 0, rawValue.data.getLength());

            long rootDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID);
            if (rootDomainId != _rootDomainId) {
              LOG.error("SubDomain:" + _subDomainId + " Root Id:" + rootDomainId
                  + " did not match current root domain id:" + _rootDomainId);
              reporter.incrCounter(Counters.SUBDOMAIN_METADATA_WITHOUT_MATCHING_ROOT_DOMAIN_METADATA, 1);
              _isSuperDomain = false;
              _isBlacklisted = false;
              _limitedCrawl = false;
              _prefixWithWWW = false;
            }
            _prefixWithWWW = valueStream.readBoolean();
          } else if (recordType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
            long currentRootDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID);
            long currentDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID);

            if (currentRootDomainId == _rootDomainId && currentDomainId == _subDomainId) {
              if (_isBlacklisted) {
                reporter.incrCounter(Counters.SKIPPING_BLACKLISTED_URL, 1);
              } else if (_limitedCrawl) {
                reporter.incrCounter(Counters.SKIPPING_LIMITED_CRAWL_URL, 1);
              } else {
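                // this merge-join assumes CrawlDBKey ordering delivers the
                // root-domain and subdomain metadata records ahead of the
                // merged URL records for the same domain, so the flags
                // captured above apply to the URL record being processed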
                // get first record, which will be merge record ...
                RawRecordValue firstRawValue = Iterables.getFirst(nextItem.e1, null);

                // convert to json object ...
                valueStream.reset(firstRawValue.data.getData(), 0, firstRawValue.data.getLength());
                valueText.setFromRawTextBytes(valueStream);

                JsonObject mergedCrawlDBRecord = parser.parse(valueText.toString()).getAsJsonObject();

                int extRefCount = 0;
                int intRefCount = 0;

                // get external ref count
                if (mergedCrawlDBRecord.has(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY)) {
                  extRefCount = JSONUtils.safeGetInteger(
                      mergedCrawlDBRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY),
                      CrawlDBCommon.LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY, 0);
                  intRefCount = JSONUtils.safeGetInteger(
                      mergedCrawlDBRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY),
                      CrawlDBCommon.LINKSTATUS_INTRADOMAIN_SOURCES_COUNT_PROPERTY, 0);
                }

                String sourceURL = JSONUtils.safeGetStringFromElement(mergedCrawlDBRecord,
                    CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY);

                if (sourceURL == null || sourceURL.length() == 0) {
                  reporter.incrCounter(Counters.URL_METADATA_MISSING_URL, 1);
                } else {
                  GoogleURL urlObject = new GoogleURL(sourceURL);
                  if (filter.isURLCrawlable(urlObject, mergedCrawlDBRecord)) {
                    double ccRank = calculateScore(intRefCount, extRefCount);

                    double blekkoRank = 0.0;
                    boolean inBlekkoFrontier = false;
                    boolean crawledByBlekko = false;

                    if (mergedCrawlDBRecord.has(CrawlDBCommon.TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
                      inBlekkoFrontier = true;
                      JsonObject blekkoStatus = mergedCrawlDBRecord
                          .getAsJsonObject(CrawlDBCommon.TOPLEVEL_BLEKKO_METADATA_PROPERTY);
                      if (blekkoStatus.has(CrawlDBCommon.BLEKKO_METADATA_STATUS)) {
                        String statusStr = blekkoStatus.get(CrawlDBCommon.BLEKKO_METADATA_STATUS).getAsString();
                        if (statusStr.equalsIgnoreCase("crawled")) {
                          crawledByBlekko = true;
                        }
                      }
                      if (blekkoStatus.has(CrawlDBCommon.BLEKKO_METADATA_RANK_10)) {
                        blekkoRank = blekkoStatus.get(CrawlDBCommon.BLEKKO_METADATA_RANK_10).getAsDouble();
                      }
                    }

                    // if (inBlekkoFrontier || crawledByBlekko || blekkoRank != 0 || ((intRefCount > 2 || extRefCount != 0))) {
                    boolean allowQueryURL = ((inBlekkoFrontier && blekkoRank != 0) || crawledByBlekko);

                    if (urlObject.has_query() && !allowQueryURL) {
                      reporter.incrCounter(Counters.SKIPPING_QUERY_URL, 1);
                      /*
                      LOG.info("SKIPPED QueryURL Flags[" + " intRef:" + intRefCount + " extRef:" + extRefCount
                          + "] URL:" + urlObject.getCanonicalURL());
                      */
                    } else {
                      if (crawledByBlekko || blekkoRank != 0 || inBlekkoFrontier) {
                        reporter.incrCounter(Counters.BLEKKO_URL, 1);
                      } else {
                        reporter.incrCounter(Counters.CC_URL, 1);
                      }

                      double cumulativeRank = 10000 * (crawledByBlekko ? 0 : 1);
                      cumulativeRank += 1000 * blekkoRank;
                      cumulativeRank += 10 * ccRank;

                      // build output json
                      JsonObject outputJSON = new JsonObject();
                      // extract http metadata ...
                      extractHTTPHeaderData(mergedCrawlDBRecord, outputJSON);

                      if (urlObject.has_query()) {
                        reporter.incrCounter(Counters.EMITTED_URL_WITH_QUERY, 1);
                        /*
                        LOG.info("Query URL Flags[" + " BlekkoF:" + inBlekkoFrontier + " CByBlekko:" + crawledByBlekko
                            + " BR:" + blekkoRank + " intRef:" + intRefCount + " extRef:" + extRefCount
                            + "] URL:" + urlObject.getCanonicalURL());
                        */
                      }

                      // canonicalize url ...
                      String canonicalURL = canonicalizeURL(urlObject, _prefixWithWWW);

                      // append it to output json
                      outputJSON.addProperty(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY, canonicalURL);

                      // figure out partition domain: by default, partition on the root domain id ...
                      long partitionDomain = currentRootDomainId;
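                      // super-domains (very large multi-subdomain sites) are
                      // spread across list partitions by subdomain hash instead
                      // of being funneled into a single root-domain partition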
                      if (_isSuperDomain)
                        partitionDomain = currentDomainId;

                      // construct output key ...
                      CrawlListKey.generateKey(keyOut, partitionDomain, currentDomainId, CrawlListKey.KEY_TYPE_URL,
                          cumulativeRank, 0);
                      // set value text ...
                      valueOut.set(outputJSON.toString());
                      // emit
                      output.collect(keyOut, valueOut);
                    }
                    /*
                    else {
                      reporter.incrCounter(Counters.SKIPPING_NON_RANKED_URL, 1);
                      LOG.info("SKIPPED NonRankedURL Flags[" + " intRef:" + intRefCount + " extRef:" + extRefCount
                          + "] URL:" + urlObject.getCanonicalURL());
                    }
                    */
                  } else {
                    reporter.incrCounter(Counters.FILTERED_OUT_URL, 1);
                  }
                }
              }
            } else {
              reporter.incrCounter(Counters.ROOT_DOMAIN_ID_MISMATCH, 1);
              LOG.error("RootDomain Id Mismatch: Expected RH:" + _rootDomainId + " DH:" + _subDomainId + " Got:"
                  + currentRootDomainId + ":" + currentDomainId);
            }
          }
          // keep pump primed...
          reporter.progress();
        }
      } finally {
        multiFileInputReader.close();
      }
    }
  }

  /**
   * Maps inlink counts to a score in [0, 10]: takes the euclidean norm of the
   * logs of the two (incremented) counts, with same-root inlinks damped by a
   * factor of 0.2, caps the norm at 14, and rescales to a 0-10 range.
   */
  static double calculateScore(int inlinksFromSameRoot, int inlinksFromDifferentRoot) {
    inlinksFromDifferentRoot = inlinksFromDifferentRoot + 1;
    inlinksFromSameRoot = inlinksFromSameRoot + 1;
    return (Math.min(
        Math.sqrt(Math.pow(Math.log(inlinksFromSameRoot) * .2, 2) + Math.pow(Math.log(inlinksFromDifferentRoot), 2)),
        14) / 14.0) * 10.0;
  }
}