/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.mapred.pipelineV3.domainmeta.rank;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCommon;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.CrawlDBKeyGroupByURLComparator;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.ByteArrayUtils;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergeInputFormat;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergePartitioner;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.Tuples.Pair;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Link scanner step: merges the sharded crawl-db data and, for every URL that
 * has both a merged record and an incoming-URLs sample, emits one
 * {"from": linkingHost} tuple per sampled incoming link, keyed by the target
 * URL's host (leading "www." stripped).
 * 
 * @author rana
 * 
 */
public class LinkScannerStep extends CrawlPipelineStep implements
    Reducer<IntWritable, Text, TextBytes, TextBytes> {

  enum Counters {
    FAILED_TO_GET_LINKS_FROM_HTML, NO_HREF_FOR_HTML_LINK, EXCEPTION_IN_MAP,
    GOT_HTML_METADATA, GOT_FEED_METADATA, EMITTED_ATOM_LINK, EMITTED_HTML_LINK, EMITTED_RSS_LINK,
    GOT_PARSED_AS_ATTRIBUTE, GOT_LINK_OBJECT, NULL_CONTENT_OBJECT, NULL_LINKS_ARRAY,
    FP_NULL_IN_EMBEDDED_LINK, SKIPPED_ALREADY_EMITTED_LINK, FOUND_HTTP_DATE_HEADER,
    FOUND_HTTP_AGE_HEADER, FOUND_HTTP_LAST_MODIFIED_HEADER, FOUND_HTTP_EXPIRES_HEADER,
    FOUND_HTTP_CACHE_CONTROL_HEADER, FOUND_HTTP_PRAGMA_HEADER, REDUCER_GOT_LINK,
    REDUCER_GOT_STATUS, ONE_REDUNDANT_LINK_IN_REDUCER, TWO_REDUNDANT_LINKS_IN_REDUCER,
    THREE_REDUNDANT_LINKS_IN_REDUCER, GT_THREE_REDUNDANT_LINKS_IN_REDUCER,
    ONE_REDUNDANT_STATUS_IN_REDUCER, TWO_REDUNDANT_STATUS_IN_REDUCER,
    THREE_REDUNDANT_STATUS_IN_REDUCER, GT_THREE_REDUNDANT_STATUS_IN_REDUCER, GOT_RSS_FEED,
    GOT_ATOM_FEED, GOT_ALTERNATE_LINK_FOR_ATOM_ITEM, GOT_CONTENT_FOR_ATOM_ITEM,
    GOT_ITEM_LINK_FROM_RSS_ITEM, GOT_TOP_LEVEL_LINK_FROM_RSS_ITEM,
    GOT_TOP_LEVEL_LINK_FROM_ATOM_ITEM, EMITTED_REDIRECT_RECORD, DISCOVERED_NEW_LINK,
    GOT_LINK_FOR_ITEM_WITH_STATUS, FAILED_TO_GET_SOURCE_HREF, GOT_CRAWL_STATUS_NO_LINK,
    GOT_CRAWL_STATUS_WITH_LINK, GOT_EXTERNAL_DOMAIN_SOURCE, NO_SOURCE_URL_FOR_CRAWL_STATUS,
    OUTPUT_KEY_FROM_INTERNAL_LINK, OUTPUT_KEY_FROM_EXTERNAL_LINK, FOUND_HTML_LINKS,
    FOUND_FEED_LINKS, HAD_OUTLINK_DATA, HAD_NO_OUTLINK_DATA,
    FOUND_LINK_RECORD_BUT_NO_MERGE_RECORD, EXCEPTION_IN_MERGE, BAD_SOURCE_URL,
    NO_SOURCE_URL_IN_MERGE_RECORD
  }

  public static final String OUTPUT_DIR_NAME = "linkScannerOutput";

  private static final Log LOG = LogFactory.getLog(LinkScannerStep.class);

  public static final int NUM_HASH_FUNCTIONS = 10;
  public static final int NUM_BITS = 11;
  public static final int NUM_ELEMENTS = 1 << 28;

  static final String stripWWW(String host) {
    if (host.startsWith("www.")) {
      return host.substring("www.".length());
    }
    return host;
  }

  JsonParser parser = new JsonParser();
  JsonObject fromJsonObject = new JsonObject();
  URLFPV2 bloomKey = new URLFPV2();
  JobConf _jobConf;

  public LinkScannerStep() {
    super(null, null, null);
  }

  public LinkScannerStep(CrawlPipelineTask task) {
    super(task, "Link Scanner Step", OUTPUT_DIR_NAME);
  }

  @Override
  public void close() throws IOException {
    // TODO Auto-generated method stub
  }

  @Override
  public void configure(JobConf job) {
    _jobConf = job;
  }

  @Override
  public Log getLogger() {
    return LOG;
  }

  @Override
  public void reduce(IntWritable key, Iterator<Text> values,
      OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {
    // collect all incoming paths first
    Vector<Path> incomingPaths = new Vector<Path>();

    while (values.hasNext()) {
      String path = values.next().toString();
      LOG.info("Found Incoming Path:" + path);
      incomingPaths.add(new Path(path));
    }

    // set up merge attributes
    Configuration localMergeConfig = new Configuration(_jobConf);

    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS,
        CrawlDBKeyGroupByURLComparator.class, RawComparator.class);
    localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
        WritableComparable.class);

    FileSystem fs = FileSystem.get(incomingPaths.get(0).toUri(), _jobConf);

    // ok now spawn merger
    MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
        fs, incomingPaths, localMergeConfig);

    try {
      DataInputBuffer mergeDataBuffer = new DataInputBuffer();
      @SuppressWarnings("resource")
      DataInputBuffer linkDataBuffer = new DataInputBuffer();

      boolean foundMergeRecord = false;
      boolean foundLinksRecord = false;

      TextBytes mergeBytes = new TextBytes();
      TextBytes keyBytes = new TextBytes();
      TextBytes valueBytes = new TextBytes();
      DataInputBuffer keyInputBuffer = new DataInputBuffer();
      TextBytes outputValueBytes = new TextBytes();

      Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
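      // The parsing below assumes (based on how the buffers are consumed) that:
      //  - a KEY_TYPE_INCOMING_URLS_SAMPLE value is a sequence of
      //    "<sourceDomainHash>\t<linkURL>\n" records, and
      //  - a KEY_TYPE_MERGED_RECORD value is a JSON object that may carry the
      //    top-level source url property (CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY).
      // When both records are present for a URL, one (targetHost, {"from":"linkingHost"})
      // tuple is emitted per sampled incoming link, with any leading "www."
      // stripped from both hosts.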
      while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {

        foundMergeRecord = false;
        foundLinksRecord = false;

        // walk records
        for (RawRecordValue rawValue : nextItem.e1) {
          // init key buffer
          keyInputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
          keyBytes.setFromRawTextBytes(keyInputBuffer);

          // scan key components
          long recordType = CrawlDBKey.getLongComponentFromKey(keyBytes,
              CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);

          if (recordType == CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE.ordinal()) {
            foundLinksRecord = true;
            linkDataBuffer.reset(rawValue.data.getData(), rawValue.data.getLength());
          } else if (recordType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
            foundMergeRecord = true;
            mergeDataBuffer.reset(rawValue.data.getData(), rawValue.data.getLength());
          }

          if (foundLinksRecord && foundMergeRecord) {
            mergeBytes.setFromRawTextBytes(mergeDataBuffer);

            try {
              JsonObject mergeRecord = parser.parse(mergeBytes.toString()).getAsJsonObject();

              if (mergeRecord.has(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY)) {
                String sourceURL = mergeRecord.get(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY)
                    .getAsString();
                GoogleURL sourceURLObj = new GoogleURL(sourceURL);

                if (sourceURLObj.isValid()) {
                  // set key bytes ...
                  keyBytes.set(stripWWW(sourceURLObj.getHost()));

                  int curpos = 0;
                  int endpos = linkDataBuffer.getLength();

                  byte lfPattern[] = { 0xA };
                  byte tabPattern[] = { 0x9 };

                  while (curpos != endpos) {
                    int tabIndex = ByteArrayUtils.indexOf(linkDataBuffer.getData(), curpos,
                        endpos - curpos, tabPattern);

                    if (tabIndex == -1) {
                      break;
                    } else {
                      int lfIndex = ByteArrayUtils.indexOf(linkDataBuffer.getData(), tabIndex + 1,
                          endpos - (tabIndex + 1), lfPattern);

                      if (lfIndex == -1) {
                        break;
                      } else {
                        // skip the source domain hash
                        // long sourceDomainHash = ByteArrayUtils.parseLong(inputData.getBytes(),
                        //     curpos, tabIndex - curpos, 10);

                        // get source url
                        valueBytes.set(linkDataBuffer.getData(), tabIndex + 1,
                            lfIndex - (tabIndex + 1));

                        // convert to url object
                        GoogleURL urlObject = new GoogleURL(valueBytes.toString());

                        if (urlObject.isValid()) {
                          String hostName = stripWWW(urlObject.getHost());

                          // now emit a from tuple ...
                          fromJsonObject.addProperty("from", hostName);
                          outputValueBytes.set(fromJsonObject.toString());

                          // emit tuple
                          output.collect(keyBytes, outputValueBytes);
                        }
                        curpos = lfIndex + 1;
                      }
                    }
                  }
                } else {
                  reporter.incrCounter(Counters.BAD_SOURCE_URL, 1);
                }
              } else {
                reporter.incrCounter(Counters.NO_SOURCE_URL_IN_MERGE_RECORD, 1);
              }
            } catch (Exception e) {
              reporter.incrCounter(Counters.EXCEPTION_IN_MERGE, 1);
              LOG.error(CCStringUtils.stringifyException(e));
            }
          } else if (foundLinksRecord && !foundMergeRecord) {
            // record the anomaly ...
            reporter.incrCounter(Counters.FOUND_LINK_RECORD_BUT_NO_MERGE_RECORD, 1);
          }
        }
      }
    } finally {
      multiFileInputReader.close();
    }
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    LOG.info("Task Identity Path is:" + getTaskIdentityPath());
    LOG.info("Temp Path is:" + outputPathLocation);

    DomainMetadataTask rootTask = (DomainMetadataTask) getRootTask();

    ImmutableList<Path> paths = new ImmutableList.Builder<Path>().addAll(
        rootTask.getRestrictedMergeDBDataPaths()).build();

    JobConf jobConf = new JobBuilder("Link Scanner Job", getConf())
        .inputs(paths)
        .inputFormat(MultiFileMergeInputFormat.class)
        .mapperKeyValue(IntWritable.class, Text.class)
        .outputKeyValue(TextBytes.class, TextBytes.class)
        .outputFormat(SequenceFileOutputFormat.class)
        .reducer(LinkScannerStep.class, false)
        .partition(MultiFileMergePartitioner.class)
        .numReducers(CrawlEnvironment.NUM_DB_SHARDS)
        .speculativeExecution(false)
        .output(outputPathLocation)
        .compressMapOutput(false)
        .compressor(CompressionType.BLOCK, SnappyCodec.class)
        .build();

    LOG.info("Starting JOB");
    JobClient.runJob(jobConf);
    LOG.info("Finished JOB");
  }
}