/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.crawl.common.internal;

import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.commoncrawl.common.Environment;
import org.mortbay.log.Log;

import com.google.common.collect.Lists;

/**
 * Static constants (property names, HDFS directory names, list paths,
 * per-service database names / ports / webapp names, size limits) together
 * with a handful of static helpers that build paths and names from them.
 *
 * <p>
 * Most of the mutable state in this class is held in private static fields
 * that default to {@code null}; the matching getters fall back to a location
 * under {@link #CC_ROOT_DIR} while no override has been set.
 */
public final class CrawlEnvironment extends Environment {

  /**
   * Shared formatter: at least five integer digits, no grouping separators.
   *
   * <p>
   * {@link NumberFormat} instances are not synchronized. The helpers in this
   * class lock on this object while formatting; code that uses the field
   * directly from several threads should do the same.
   */
  public static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }

  public static final int MAX_URL_LENGTH_ALLOWED = 2048;

  /** ccbot user agent string **/
  public static final String CCBOT_UA = "ccbot";

  // 60 * 60 * 1000 = 3,600,000 (presumably milliseconds, i.e. one hour -
  // TODO confirm the unit against the DNS cache code)
  public static final int MIN_DNS_CACHE_TIME = 60 * 60 * 1000;

  public static final int NUM_CRAWLERS = 32;

  /** OLD crawler LIST **/
  public static final String[] CRAWLERS = {
      "ccc01-01", "ccc01-02",
      "ccc02-01", "ccc02-02",
      "ccc03-01", "ccc03-02",
      "ccc04-01", "ccc04-02",
  };

  /** crawl datum metadata **/
  public static final String MetaData_FailureReason = "_cc_failCode_";
  public static final String MetaData_DatumSource = "_cc_dsrc_";
  public static final String MetaData_CrawlURLMetadata = "_cc_md_";
  public static final Text CrawlURLMetadataKey = new Text(MetaData_CrawlURLMetadata);

  /** map reduce property names **/
  public static final String PROPERTY_CRAWL_NUMBER = "crawldbserver.crawl.number";
  public static final String PROPERTY_CRAWERLDB_SEGMENT_PATH = "crawldbserver.segment.path";
  public static final String PROPERTY_CRAWERLDB_SEGMENT_ID = "crawldbserver.segment.id";
  public static final String PROPERTY_CRAWLERS = "crawldbserver.crawlers";
  public static final String PROPERTY_NUM_CRAWLERS = "crawldbserver.number.of.crawlers";
  public static final String PROPERTY_NUM_BUCKETS_PER_CRAWLER = "crawldbserver.number.of.buckets.per.crawlers";
  public static final String PROPERTY_START_SEGMENT_ID = "crawldbserver.start.segment.id";
  public static final String PROPERTY_MAX_URLS_PER_SEGMENT = "crawldbserver.max.urls.per.segment";
  public static final String PROPERTY_SEGMENT_OUTPUT_TEMP_DIR = "crawldbserver.segment.output.temp.dir";
  public static final String PROPERTY_SEGMENT_LIST_ID = "crawldbserver.segment.list.id";

  /** other system properties **/
  public static final String PROPERTY_SEGMENT_DATA_DIR = "crawlSegmentDir";
  public static final String PROPERTY_CONTENT_DATA_DIR = "crawlContentDir";

  /** hdfs paths **/
  // mutable on purpose: see setCCRootDir
  public static String CC_ROOT_DIR = "/crawl";

  public static final String HDFS_LinkDBDir = "linkdb";
  public static final String HDFS_MergedLinkDB = "current";
  public static final String HDFS_OldLinkDBDir = "old_linkDB";
  public static final String HDFS_InverseLinkDBDir = "inverse_linkdb";
  public static final String HDFS_InverseLinkDBCurrent = "current";
  public static final String HDFS_PageRankDBDir = "pageRank";
  public static final String HDFS_PageRankDir = "pagerank";
  public static final String HDFS_PageRankDB = "pagerank_db";
  public static final String HDFS_PRSeedValues = "values";
  public static final String HDFS_InternalRankPRSeedValues = "InternalRankValues";
  public static final String HDFS_PRDistributionEdges = "edges";
  public static final String HDFS_PRDistributionInternalEdges = "internal-edges";
  public static final int PR_NUMSLAVES = 46;

  // number of shards to use when creating various system databases...
  public static final int NUM_DB_SHARDS = 96;

  public static final String HDFS_HeaderDB = "header_db";
  public static final String HDFS_UploadCandidateDB = "uploadCandidates_db";
  public static final String HDFS_ArcFileHistoryDB = "arcFileHistory_db";
  public static final String HDFS_PurgeCandidateDB = "purge_db";
  public static final String HDFS_DomainDB = "domain_db";
  public static final String HDFS_SuperDomainDB = "super_domain_db";
  public static final String HDFS_SpamListsDB = "spam_lists";
  public static final String HDFS_SuperDomainTLDList = "top_level_domain_db";

  // NOTE(review): absolute path; it does not follow CC_ROOT_DIR when the
  // root is changed through setCCRootDir - confirm that this is intended.
  public static final String HDFS_HistoryServerBase = "/crawl/history";
  public static final String HDFS_HistoryServerCheckpointMutex = "checkpointMutex";

  public static final String ActiveCrawlLog = "ActiveCrawlLog";
  public static final String CheckpointCrawlLog = "CheckpointCrawlLog";
  public static final String ActiveSegmentLog = "ActiveSegmentLog";
  public static final String CheckpointSegmentLog = "SegmentLog";
  public static final String SegmentCompletionLog = "CompletionLog";

  // directory names that the getters below append to CC_ROOT_DIR
  private static final String HDFS_CrawlDBDirectory = "crawldb";
  private static final String HDFS_CrawlSegmentsDataDirectory = "crawl_segments";
  private static final String HDFS_CrawlSegmentLogsDirectory = "crawl_segment_logs";
  private static final String HDFS_ParseCandidateSegmentsDirectory = "parse_segments";
  private static final String HDFS_CheckpointDataDirectory = "checkpoint_data";
  private static final String HDFS_CheckpointStagingDirectory = "checkpoint_staging";
  // the next two are not referenced anywhere in this class
  private static final String HDFS_RobotsDBDirectory = "robotsDB";
  private static final String HDFS_StatsDirectory = "stats";

  public static final String HDFS_CrawlSegmentsFileName = "crawlSegmentStats";

  // lists
  public static final String ROOT_SUPER_DOMAIN_PATH = "/lists/super_domain_list"; // (UNUSED IN PROD)
  public static final String BLOCKED_DOMAIN_LIST = "/lists/blocked_doman_list"; // (CRAWLER)
  public static final String TEMPORARILY_BLOCKED_DOMAIN_LIST = "/lists/temporary_blocked_doman_list"; // (CRAWLER)
  public static final String IP_BLOCK_LIST = "/lists/ip_block_list"; // (CRAWLER)
  public static final String CRAWL_RATE_MOD_FILTER_PATH = "/lists/crawl_rate_override"; // (CRAWLER)
  public static final String PROXY_CRAWL_RATE_MOD_FILTER_PATH = "/lists/proxy/crawl_rate_override"; // CCPROXY
  public static final String PROXY_URL_BLOCK_LIST_FILTER_PATH = "/lists/proxy/url_block_list"; // CCPROXY
  public static final String DNS_REWRITE_RULES = "/lists/dns_rewrite_rules"; // (CRAWLER,DNSSERVICE)
  public static final String DNS_NOCACHE_RULES = "/lists/dns_nocache_rules"; // (DNSSERVICE)

  /** local paths **/
  public static final String SegmentLocalDirectory = "segments";

  /** crawler paths **/
  public static final String CrawlerResultPath = "crawl";

  // defaults shared between the servers ...
  public static final String DEFAULT_DATA_DIR = "./data";
  public static final String DEFAULT_RPC_INTERFACE = "localhost";
  public static final String DEFAULT_HTTP_INTERFACE = "localhost";

  // master specific defaults ...
  public static final String MASTER_DB = "master_state.db";
  public static final int DEFAULT_MASTER_RPC_PORT = 8020;
  public static final int DEFAULT_MASTER_HTTP_PORT = 8021;
  public static final String MASTER_WEBAPP_NAME = "master";

  // crawler history server defaults ...
  public static final String CRAWLER_HISTORY_DB = "crawler_history_state.db";
  public static final int DEFAULT_CRAWLER_HISTORY_RPC_PORT = 8032;
  public static final int DEFAULT_CRAWLER_HISTORY_HTTP_PORT = 8033;
  public static final String CRAWLER_HISTORY_WEBAPP_NAME = "crawler_history";

  // crawler specific defaults ...
  public static final String CRAWLER_DB = "crawler_state.db";
  public static final int DEFAULT_CRAWLER_RPC_PORT = 8010;
  public static final int DEFAULT_CRAWLER_HTTP_PORT = 8011;
  public static final String CRAWLER_WEBAPP_NAME = "crawler";

  // database specific defaults ...
  public static final String CRAWLDB_DB = "crawldb_state.db";
  public static final int DEFAULT_DATABASE_RPC_PORT = 8030;
  public static final int DEFAULT_DATABASE_HTTP_PORT = 8031;
  public static final String CRAWLMASTER_WEBAPP_NAME = "crawlmaster";

  // query master specific defaults ...
  public static final String QMASTER_DB = "qmaster_state.db";
  public static final int DEFAULT_QUERY_MASTER_RPC_PORT = 8040;
  public static final int DEFAULT_QUERY_MASTER_HTTP_PORT = 8041;
  public static final String QUERY_MASTER_WEBAPP_NAME = "qmaster";

  // query slave specific defaults ...
  public static final int DEFAULT_QUERY_SLAVE_RPC_PORT = 8070;
  public static final int DEFAULT_QUERY_SLAVE_HTTP_PORT = 8071;
  public static final String QUERY_SLAVE_WEBAPP_NAME = "qslave";

  // pagerank master specific defaults ...
  public static final String PRMASTER_DB = "prmaster_state.db";
  public static final int DEFAULT_PAGERANK_MASTER_RPC_PORT = 8050;
  public static final int DEFAULT_PAGERANK_MASTER_HTTP_PORT = 8051;
  public static final String PAGERANK_MASTER_WEBAPP_NAME = "prmaster";

  // pagerank slave specific defaults ...
  public static final int DEFAULT_PAGERANK_SLAVE_RPC_PORT = 8060;
  public static final int DEFAULT_PAGERANK_SLAVE_HTTP_PORT = 8061;
  public static final String PAGERANK_SLAVE_WEBAPP_NAME = "prslave";

  // directory service specific defaults
  public static final String DIRECTORY_SERVICE_DB = "directory_service_state.db";
  public static final int DIRECTORY_SERVICE_RPC_PORT = 8052;
  public static final int DIRECTORY_SERVICE_HTTP_PORT = 8053;
  public static final String DIRECTORY_SERVICE_WEBAPP_NAME = "dservice";
  public static final String DIRECTORY_SERVICE_HDFS_ROOT = "dservice_root";
  public static final String DIRECTORY_SERVICE_ADDRESS_PROPERTY = "directory.service.address";

  // dns service specific defaults
  public static final String DNS_SERVICE_DB = "dnsservice.db";
  public static final int DNS_SERVICE_RPC_PORT = 8054;
  public static final int DNS_SERVICE_HTTP_PORT = 8055;
  public static final String DNS_SERVICE_WEBAPP_NAME = "dnsservice";

  // crawler test service specific defaults
  public static final String CRAWLER_TEST_SERVICE_DB = "crawlerTestProxy.db";
  public static final int CRAWLER_TEST_SERVICE_RPC_PORT = 8056;
  public static final int CRAWLER_TEST_SERVICE_HTTP_PORT = 8057;

  // stats service specific defaults
  public static final String STATS_SERVICE_DB = "stats_service_state.db";
  public static final int STATS_SERVICE_RPC_PORT = 8058;
  public static final int STATS_SERVICE_HTTP_PORT = 8059;
  public static final String STATS_SERVICE_WEBAPP_NAME = "statsservice";
  public static final String STATS_SERVICE_HDFS_ROOT = "stats_service";

  // crawl stats collector service specific defaults
  public static final String CRAWLSTATSCOLLECTOR_SERVICE_DB = "crawlstats_service_state.db";
  public static final int CRAWLSTATSCOLLECTOR_SERVICE_RPC_PORT = 8042;
  public static final int CRAWLSTATSCOLLECTOR_SERVICE_HTTP_PORT = 8043;
  public static final String CRAWLSTATSCOLLECTOR_SERVICE_WEBAPP_NAME = "crawlstats";

  // proxy service specific defaults
  public static final String PROXY_SERVICE_DB = "proxyServer.db";
  public static final int PROXY_SERVICE_RPC_PORT = 8022;
  public static final int PROXY_SERVICE_HTTP_PORT = 8023;

  // parser slave specific defaults ...
  public static final int DEFAULT_PARSER_SLAVE_RPC_PORT = 8072;
  public static final int DEFAULT_PARSER_SLAVE_HTTP_PORT = 8073;
  public static final String DEFAULT_PARSER_SLAVE_WEBAPP_NAME = "pslave";

  // ec2 master specific defaults ...
  public static final int DEFAULT_EC2MASTER_RPC_PORT = 8074;
  public static final int DEFAULT_EC2MASTER_HTTP_PORT = 8075;
  // NOTE(review): same value as DEFAULT_PARSER_SLAVE_WEBAPP_NAME - possibly a
  // copy/paste slip; left untouched because the deployed webapp name cannot be
  // verified from this file.
  public static final String DEFAULT_EC2MASTER_WEBAPP_NAME = "pslave";

  // mutable process-wide settings; null means "no override set"
  private static boolean _unitTestMode = false;
  private static String _defaultHadoopFS = null;
  private static Configuration _hadoopConfig = null;
  private static String _crawlSegmentDataDirectory = null;
  private static String _crawlSegmentLogsDirectory = null;
  private static String _parseCandidateSegmentDataDirectory = null;
  private static String _checkpointDataDirectory = null;
  private static String _checkpointStagingDirectory = null;

  private static final String CRAWL_LOG_CHECKPOINT_PREFIX = "CrawlLog_";

  /** limits **/
  public static final int ORIGINAL_CONTENT_SIZE_LIMIT = 2 << 16; // 131072
  public static final int CONTENT_SIZE_LIMIT = 2 << 20; // 2097152
  public static final int GUNZIP_SIZE_LIMIT = CONTENT_SIZE_LIMIT * 3;

  /**
   * Turns the unit test flag on or off.
   *
   * @param unitTestMode the new flag value
   */
  public static void setUnitTestMode(boolean unitTestMode) {
    _unitTestMode = unitTestMode;
  }

  /** @return the flag last stored by {@link #setUnitTestMode(boolean)} ({@code false} by default) */
  public static boolean inUnitTestMode() {
    return _unitTestMode;
  }

  /**
   * Stores the Hadoop configuration handed out by {@link #getHadoopConfig()}.
   *
   * @param config the configuration to store
   */
  public static void setHadoopConfig(Configuration config) {
    _hadoopConfig = config;
  }

  /** @return the stored Hadoop configuration, or {@code null} if none was set */
  public static Configuration getHadoopConfig() {
    return _hadoopConfig;
  }

  /**
   * Obtains a {@link FileSystem} for the stored Hadoop configuration.
   *
   * <p>
   * When no file system URI override is set the lookup is left entirely to
   * the configuration; otherwise the override URI is passed along with it.
   * The stored configuration is handed to Hadoop as is (no null check here).
   *
   * @return the file system returned by {@code FileSystem.get}
   * @throws IOException propagated from {@code FileSystem.get}
   */
  public static FileSystem getDefaultFileSystem() throws IOException {
    // read the override once so the null check and the use see the same value
    final String overrideURI = getDefaultHadoopFSURI();

    // if an override was not specified... get the default file system via
    // hadoop-site.xml
    if (overrideURI == null) {
      return FileSystem.get(getHadoopConfig());
    }
    // otherwise ...
    return FileSystem.get(URI.create(overrideURI), getHadoopConfig());
  }

  /**
   * Replaces {@link #CC_ROOT_DIR}, the prefix used by the directory getters
   * of this class whenever no explicit override is set.
   *
   * @param directory the new root directory
   */
  public static void setCCRootDir(String directory) {
    CC_ROOT_DIR = directory;
  }

  /** @return {@code CC_ROOT_DIR + "/crawldb"} */
  public static String getCrawlDBDirectory() {
    return resolveDirectory(null, HDFS_CrawlDBDirectory);
  }

  /** @return the crawl segment data directory override, or {@code CC_ROOT_DIR + "/crawl_segments"} if none is set */
  public static String getCrawlSegmentDataDirectory() {
    return resolveDirectory(_crawlSegmentDataDirectory, HDFS_CrawlSegmentsDataDirectory);
  }

  /** @param directory override returned by {@link #getCrawlSegmentDataDirectory()}; {@code null} restores the default */
  public static void setCrawlSegmentDataDirectory(String directory) {
    _crawlSegmentDataDirectory = directory;
  }

  /** @return the crawl segment logs directory override, or {@code CC_ROOT_DIR + "/crawl_segment_logs"} if none is set */
  public static String getCrawlSegmentLogsDirectory() {
    return resolveDirectory(_crawlSegmentLogsDirectory, HDFS_CrawlSegmentLogsDirectory);
  }

  /** @param directory override returned by {@link #getCrawlSegmentLogsDirectory()}; {@code null} restores the default */
  public static void setCrawlSegmentLogsDirectory(String directory) {
    _crawlSegmentLogsDirectory = directory;
  }

  /** @return the parse candidate segment directory override, or {@code CC_ROOT_DIR + "/parse_segments"} if none is set */
  public static String getParseCandidateSegmentDataDirectory() {
    return resolveDirectory(_parseCandidateSegmentDataDirectory, HDFS_ParseCandidateSegmentsDirectory);
  }

  /**
   * Builds {@code "CrawlLog_" + nodeName + "_" + checkpointId}.
   *
   * @param nodeName     node name to embed
   * @param checkpointId checkpoint id to embed
   * @return the assembled name
   */
  public static String buildCrawlLogCheckpointName(String nodeName, long checkpointId) {
    return CRAWL_LOG_CHECKPOINT_PREFIX + nodeName + "_" + checkpointId;
  }

  /** @return the constant string {@code "*"} */
  public static String buildCrawlLogCheckpointWildcardString() {
    return "*";
  }

  /**
   * Builds {@code CheckpointSegmentLog + "_" + checkpointId}, e.g.
   * {@code "SegmentLog_42"}.
   *
   * @param checkpointId checkpoint id to embed
   * @return the assembled file name
   */
  public static String buildCrawlSegmentLogCheckpointFileName(long checkpointId) {
    return CheckpointSegmentLog + "_" + checkpointId;
  }

  /**
   * Builds {@code SegmentCompletionLog + "_" + nodeName}, e.g.
   * {@code "CompletionLog_node"}.
   *
   * @param nodeName node name to embed
   * @return the assembled file name
   */
  public static String buildCrawlSegmentCompletionLogFileName(String nodeName) {
    return SegmentCompletionLog + "_" + nodeName;
  }

  /**
   * Builds the glob {@code <rootPath>/<hostId>/*}{@code /*}{@code /SegmentLog_*}.
   *
   * <p>
   * The two wildcard levels line up with the list id and segment id levels
   * produced by
   * {@link #getRemoteCrawlSegmentLogCheckpointPath(Path, String, long, int, int)}.
   *
   * @param rootPath parent path
   * @param hostId   first path component below {@code rootPath}
   * @return the wildcard path
   */
  public static Path getRemoteCrawlSegmentLogWildcardPath(Path rootPath, String hostId) {
    // file name pattern is derived from the same constant that
    // buildCrawlSegmentLogCheckpointFileName uses, so the two cannot drift
    final String globPattern = hostId + "/" + "*" + "/" + "*" + "/" + CheckpointSegmentLog + "_*";
    return new Path(rootPath, new Path(globPattern));
  }

  /**
   * Builds
   * {@code <rootPath>/<hostId>/<formatted listId>/<segmentId>/SegmentLog_<checkpointId>}.
   *
   * @param rootPath     parent path
   * @param hostId       first path component below {@code rootPath}
   * @param checkpointId checkpoint id used in the file name
   * @param listId       list id, rendered through {@link #formatListId(int)}
   * @param segmentId    segment id, rendered as a plain decimal
   * @return the assembled path
   * @throws IOException declared for callers; nothing in the visible body
   *                     raises it
   */
  public static Path getRemoteCrawlSegmentLogCheckpointPath(Path rootPath, String hostId, long checkpointId,
      int listId, int segmentId) throws IOException {
    final String relativeLocation = hostId
        + "/" + formatListId(listId)
        + "/" + segmentId
        + "/" + buildCrawlSegmentLogCheckpointFileName(checkpointId);
    return new Path(rootPath, new Path(relativeLocation));
  }

  /** @param directory override returned by {@link #getParseCandidateSegmentDataDirectory()}; {@code null} restores the default */
  public static void setParseSegmentDataDirectory(String directory) {
    _parseCandidateSegmentDataDirectory = directory;
  }

  /** @return the checkpoint data directory override, or {@code CC_ROOT_DIR + "/checkpoint_data"} if none is set */
  public static String getCheckpointDataDirectory() {
    return resolveDirectory(_checkpointDataDirectory, HDFS_CheckpointDataDirectory);
  }

  /**
   * Sets the override returned by {@link #getCheckpointDataDirectory()}.
   *
   * <p>
   * Unlike the other setters this is an instance method even though it
   * writes static state; the signature is kept unchanged for existing
   * callers.
   *
   * @param checkpointDataDirectory the override; {@code null} restores the default
   */
  public void setCheckpointDataDirectory(String checkpointDataDirectory) {
    _checkpointDataDirectory = checkpointDataDirectory;
  }

  /** @return the checkpoint staging directory override, or {@code CC_ROOT_DIR + "/checkpoint_staging"} if none is set */
  public static String getCheckpointStagingDirectory() {
    return resolveDirectory(_checkpointStagingDirectory, HDFS_CheckpointStagingDirectory);
  }

  /**
   * Sets the override returned by {@link #getCheckpointStagingDirectory()}.
   *
   * <p>
   * Unlike the other setters this is an instance method even though it
   * writes static state; the signature is kept unchanged for existing
   * callers.
   *
   * @param checkpointStagingDirectory the override; {@code null} restores the default
   */
  public void setCheckpointStagingDirectory(String checkpointStagingDirectory) {
    _checkpointStagingDirectory = checkpointStagingDirectory;
  }

  /**
   * Sets the file system URI used by {@link #getDefaultFileSystem()}.
   *
   * @param hadoopFSURI the URI string; {@code null} removes the override
   */
  public static void setDefaultHadoopFSURI(String hadoopFSURI) {
    _defaultHadoopFS = hadoopFSURI;
  }

  /** @return the file system URI override, or {@code null} if none is set */
  public static String getDefaultHadoopFSURI() {
    return _defaultHadoopFS;
  }

  /**
   * @return {@code "crawl"}, or {@code "unitTest_crawl"} while the unit test
   *         flag is set
   */
  public static String getCrawlerLocalOutputPath() {
    return inUnitTestMode() ? "unitTest_" + CrawlerResultPath : CrawlerResultPath;
  }

  /** @return the crawl number, currently hard-coded to {@code 2} */
  public static int getCurrentCrawlNumber() {
    return 2;
  }

  /**
   * @param hostId numeric crawler id
   * @return {@code hostId} rendered through {@link #NUMBER_FORMAT}
   *         (e.g. {@code 7} becomes {@code "00007"} with Western digits)
   */
  public static String getCrawlerNameGivenId(int hostId) {
    return formatWithSharedFormat(hostId);
  }

  /**
   * @param listId numeric list id
   * @return {@code listId} rendered through {@link #NUMBER_FORMAT}
   */
  public static String formatListId(int listId) {
    return formatWithSharedFormat(listId);
  }

  /**
   * @return a new list holding the formatted ids {@code 0} through
   *         {@code NUM_CRAWLERS - 1}, in ascending order
   */
  public static ArrayList<String> getCrawlerNames() {
    ArrayList<String> names = Lists.newArrayList();
    for (int crawlerId = 0; crawlerId < NUM_CRAWLERS; ++crawlerId) {
      names.add(formatWithSharedFormat(crawlerId));
    }
    return names;
  }

  /** Returns {@code override} when set, otherwise {@code CC_ROOT_DIR + "/" + defaultName}. */
  private static String resolveDirectory(String override, String defaultName) {
    if (override != null) {
      return override;
    }
    return CC_ROOT_DIR + "/" + defaultName;
  }

  /** Formats {@code value} with the shared formatter while holding its monitor. */
  private static String formatWithSharedFormat(int value) {
    // NumberFormat is not safe for concurrent use, so serialize access
    synchronized (NUMBER_FORMAT) {
      return NUMBER_FORMAT.format(value);
    }
  }
}