/** * Copyright 2012 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.mapred.pipelineV3.domainmeta; import java.io.IOException; import java.text.NumberFormat; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.util.StringUtils; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep; import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask; import org.commoncrawl.mapred.pipelineV3.crawllistgen.CrawlListGeneratorTask; import org.commoncrawl.mapred.pipelineV3.domainmeta.blogs.feedurlid.FeedUrlIdStep; import org.commoncrawl.mapred.pipelineV3.domainmeta.blogs.postfrequency.GenPostFrequencyStep; import org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.CrawlStatsCollectorTask; import org.commoncrawl.mapred.pipelineV3.domainmeta.fuzzydedupe.CrossDomainDupes; import org.commoncrawl.mapred.pipelineV3.domainmeta.fuzzydedupe.FindBadIPsFromDupes; import org.commoncrawl.mapred.pipelineV3.domainmeta.fuzzydedupe.FuzzyDedupeStep1; import org.commoncrawl.mapred.pipelineV3.domainmeta.fuzzydedupe.FuzzyDedupeStep2; import org.commoncrawl.mapred.pipelineV3.domainmeta.fuzzydedupe.HostBlacklistByDupesStep; import org.commoncrawl.mapred.pipelineV3.domainmeta.iptohost.DomainIPCollectorStep; import org.commoncrawl.mapred.pipelineV3.domainmeta.iptohost.IPAddressToHostMappingStep; import org.commoncrawl.mapred.pipelineV3.domainmeta.iptohost.QuantcastIPListStep; import org.commoncrawl.mapred.pipelineV3.domainmeta.linkstats.CountInLinksStep; import org.commoncrawl.mapred.pipelineV3.domainmeta.quantcast.ImportQuantcastStep; import org.commoncrawl.mapred.pipelineV3.domainmeta.rank.RankTask; import org.commoncrawl.mapred.pipelineV3.domainmeta.subdomaincounts.SubDomainCountsStep; import org.commoncrawl.mapred.pipelineV3.domainmeta.subdomaincounts.SubDomainToQuantcastJoinStep; import com.google.common.base.Function; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; /** * * @author rana * */ public class DomainMetadataTask extends CrawlPipelineTask { private List<Integer> _partitionList = null; private String _runMode = "DEFAULT"; private Properties _properties = null; Configuration _conf = new Configuration(); private static final Log LOG = LogFactory.getLog(DomainMetadataTask.class); private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); static { NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); } public static final String OUTPUT_DIR_NAME = "domainMetadata"; static final Options options; static { options = new Options(); options.addOption(OptionBuilder.withArgName("dbroot").hasArg(true).isRequired().withDescription("Database Root Path").create("dbroot")); options.addOption("partitions", true, "comma separated list of partitions"); options.addOption("rebuild", false, "rebuild all the outputs"); options.addOption(OptionBuilder.withArgName("fsuri").hasArg(true).withDescription("set file system uri").create("fsuri")); options.addOption(OptionBuilder.withArgName("tempdir").hasArg(true).withDescription("Hadoop Temp Dir").create("tempdir")); options.addOption(OptionBuilder.withArgName("mode").hasArg(true).withDescription("Run Mode").create("mode")); options.addOption(OptionBuilder.withArgName( "property=value" ) .hasArgs(2) .withValueSeparator() .withDescription( "use value for given property" ) .create( "D" )); } public static void main(String[] args) throws Exception { DomainMetadataTask task = new DomainMetadataTask(); task.run(args); } public DomainMetadataTask() throws IOException { super(new Configuration(), "Domain Metdata Task",OUTPUT_DIR_NAME); } public DomainMetadataTask(String alternateTaskDescription) throws IOException { super(new Configuration(), alternateTaskDescription,OUTPUT_DIR_NAME); } public DomainMetadataTask(CrawlPipelineTask parentTask) throws IOException { super(parentTask, "Domain Metdata Task",OUTPUT_DIR_NAME); } public List<Path> getMergeDBDataPaths() throws IOException { return getParitionPaths(getTaskIdentityBasePath(), getLatestDatabaseTimestamp(), "part-"); } public List<Path> getParitionPaths(Path basePath, long databaseId, String prefix) throws IOException { ArrayList<Path> paths = new ArrayList<Path>(); for (FileStatus fileStatus : getFileSystem().globStatus( new Path(basePath, Long.toString(databaseId) + "/" + prefix + "*"))) { paths.add(fileStatus.getPath()); } return paths; } public final List<Integer> getPartitionList() { return _partitionList; } public final Properties getProperties() { return _properties; } public final Configuration getConf(){ return _conf; } public List<Path> getRedirectDataPaths() throws IOException { return getParitionPaths(getTaskIdentityBasePath(), getLatestDatabaseTimestamp(), "redirect-"); } public List<Path> getRestrictedMergeDBDataPaths() throws IOException { if (_partitionList == null || _partitionList.size() == 0) { throw new IOException("Invalid or NULL Partition List!"); } return getRestrictedParitionPaths(_partitionList, getTaskIdentityBasePath(), getLatestDatabaseTimestamp(), "part-"); } public List<Path> getRestrictedRedirectPaths() throws IOException { if (_partitionList == null || _partitionList.size() == 0) { throw new IOException("Invalid or NULL Partition List!"); } return getRestrictedParitionPaths(_partitionList, getTaskIdentityBasePath(), getLatestDatabaseTimestamp(), "redirect-"); } Pattern rangePartitionPattern = Pattern.compile("([0-9]*)-([0-9]*)"); @Override protected void parseArgs() throws IOException { CommandLineParser parser = new GnuParser(); try { // parse the command line arguments CommandLine line = parser.parse(options, _args,false); // default to single partition - partition zero List<Integer> partitions = ImmutableList.of(0); if (line.hasOption("partitions")) { ArrayList<Integer> restrictedSet = Lists.newArrayList(); for (String partitionSpec : line.getOptionValue("partitions").split(",")) { Matcher rangeMatcher = rangePartitionPattern.matcher(partitionSpec); if (rangeMatcher.matches()) { int rangeStart = Integer.parseInt(rangeMatcher.group(1)); int rangeEnd = Integer.parseInt(rangeMatcher.group(2)); if (rangeEnd < rangeStart) { throw new IOException("Invalid Range!"); } else { for (int i=rangeStart;i<rangeEnd;++i) { restrictedSet.add(i); } } } else { int partition = Integer.parseInt(partitionSpec); restrictedSet.add(partition); } partitions = restrictedSet; } if (partitions.size() == 0) { throw new IOException("One Parition Required At a Minimum!"); } _partitionList = partitions; } if (line.hasOption("rebuild")) { LOG.info("Rebuild Option Specified. Deleting Outputs"); for (CrawlPipelineStep step : getSteps()) { LOG.info("Deleting Output Dir:" + step.getOutputDir() + " for Step:" + step.getName()); getFileSystem().delete(step.getOutputDir(), true); } } if (line.hasOption("dbroot")) { setTaskIdentityBasePath(new Path(line.getOptionValue("dbroot"))); setRootOutputDir(getTaskIdentityBasePath()); } if (line.hasOption("fsuri")) { CrawlEnvironment.setDefaultHadoopFSURI(line.getOptionValue("fsuri")); } if (line.hasOption("tempdir")) { System.out.println("tempdir is:"+ line.getOptionValue("tempdir")); System.out.println("HadoopConf is:"+ CrawlEnvironment.getHadoopConfig()); CrawlEnvironment.getHadoopConfig().set("mapred.temp.dir", line.getOptionValue("tempdir")); } if (line.hasOption("mode")) { _runMode = line.getOptionValue("mode"); } _properties = line.getOptionProperties("D"); if (_properties != null) { for (Map.Entry<Object,Object> property : _properties.entrySet()) { _conf.set(property.getKey().toString(), property.getValue().toString()); } } } catch (Exception e) { LOG.error(StringUtils.stringifyException(e)); throw new IOException(e); } } @Override protected boolean promoteFinalStepOutput() { return false; } private static List<Path> getRestrictedParitionPaths(List<Integer> partitions, Path basePath, long databaseId, String prefix) throws IOException { ArrayList<Path> paths = new ArrayList<Path>(); for (int partition : partitions) { paths.add(new Path(basePath,Long.toString(databaseId) + "/" + prefix + NUMBER_FORMAT.format(partition))); } return paths; } @Override public Log getLogger() { return LOG; } @Override public void initTask(String[] args)throws IOException { super.initTask(args); if (_runMode.equalsIgnoreCase("DEFAULT")) { //addStep(new ImportQuantcastStep(this)); //addStep(new SubDomainCountsStep(this)); //addStep(new CountInLinksStep(this)); //addStep(new IPAddressToHostMappingStep(this)); //addStep(new QuantcastIPListStep(this)); //addStep(new DomainIPCollectorStep(this)); //addStep(new FuzzyDedupeStep1(this)); //addStep(new FuzzyDedupeStep2(this)); //addStep(new CrossDomainDupes(this)); //addStep(new FindBadIPsFromDupes(this)); //addStep(new HostBlacklistByDupesStep(this)); //addStep(new SubDomainToQuantcastJoinStep(this)); //addStep(new RankTask(this)); //addStep(new GenPostFrequencyStep(this)); //addStep(new FeedUrlIdStep(this)); //addStep(new CrawlStatsCollectorTask(this)); } else if (_runMode.equalsIgnoreCase("rank")) { addStep(new RankTask(this)); } else if (_runMode.equalsIgnoreCase("dedupe")) { addStep(new FuzzyDedupeStep1(this)); addStep(new FuzzyDedupeStep2(this)); addStep(new CrossDomainDupes(this)); } else if (_runMode.equalsIgnoreCase("stats")) { addStep(new RankTask(this)); addStep(new CrawlStatsCollectorTask(this)); } else if (_runMode.equalsIgnoreCase("list")) { addStep(new CrawlStatsCollectorTask(this)); addStep(new CrawlListGeneratorTask(this)); } } }