/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.uci.ics.crawler4j.crawler;

public class CrawlConfig {

    /**
     * The folder which will be used by the crawler for storing the intermediate
     * crawl data. The content of this folder should not be modified manually.
     */
    private String crawlStorageFolder;

    /**
     * If this feature is enabled, you will be able to resume a previously
     * stopped/crashed crawl. However, it makes crawling slightly slower.
     */
    private boolean resumableCrawling = false;

    /**
     * Maximum depth of crawling. For unlimited depth this parameter should be
     * set to -1.
     */
    private int maxDepthOfCrawling = -1;

    /**
     * Maximum number of pages to fetch. For an unlimited number of pages, this
     * parameter should be set to -1.
     */
    private int maxPagesToFetch = -1;

    /**
     * User-agent string that is used for representing your crawler to web
     * servers. See http://en.wikipedia.org/wiki/User_agent for more details.
     */
    private String userAgentString = "crawler4j (http://code.google.com/p/crawler4j/)";

    /**
     * Politeness delay in milliseconds (delay between sending two requests to
     * the same host).
     */
    private int politenessDelay = 200;

    /**
     * Should we also crawl https pages?
     */
    private boolean includeHttpsPages = false;

    /**
     * Should we fetch binary content such as images, audio, ...?
     */
    private boolean includeBinaryContentInCrawling = false;

    /**
     * Maximum connections per host.
     */
    private int maxConnectionsPerHost = 100;

    /**
     * Maximum total connections.
     */
    private int maxTotalConnections = 100;

    /**
     * Socket timeout in milliseconds.
     */
    private int socketTimeout = 20000;

    /**
     * Connection timeout in milliseconds.
     */
    private int connectionTimeout = 30000;

    /**
     * Max number of outgoing links which are processed from a page.
     */
    private int maxOutgoingLinksToFollow = 5000;

    /**
     * Max allowed size of a page. Pages larger than this size will not be
     * fetched.
     */
    private int maxDownloadSize = 1048576;

    /**
     * Should we follow redirects?
     */
    private boolean followRedirects = true;

    /**
     * If the crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy host.
     */
    private String proxyHost = null;

    /**
     * If the crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy port.
     */
    private int proxyPort = 80;

    /**
     * If the crawler should run behind a proxy and user/pass is needed for
     * authentication with the proxy, this parameter can be used for specifying
     * the username.
     */
    private String proxyUsername = null;

    /**
     * If the crawler should run behind a proxy and user/pass is needed for
     * authentication with the proxy, this parameter can be used for specifying
     * the password.
     */
    private String proxyPassword = null;

    public CrawlConfig() {
    }

    /**
     * Validates the configs specified by this instance.
     *
     * @throws Exception
     */
    public void validate() throws Exception {
        if (crawlStorageFolder == null) {
            throw new Exception("Crawl storage folder is not set in the CrawlConfig.");
        }
        if (politenessDelay < 0) {
            throw new Exception("Invalid value for politeness delay: " + politenessDelay);
        }
        if (maxDepthOfCrawling < -1) {
            throw new Exception("Maximum crawl depth should be either a non-negative number or -1 for unlimited depth.");
        }
        if (maxDepthOfCrawling > Short.MAX_VALUE) {
            throw new Exception("Maximum value for crawl depth is " + Short.MAX_VALUE);
        }
    }

    public String getCrawlStorageFolder() {
        return crawlStorageFolder;
    }

    /**
     * The folder which will be used by the crawler for storing the intermediate
     * crawl data. The content of this folder should not be modified manually.
     */
    public void setCrawlStorageFolder(String crawlStorageFolder) {
        this.crawlStorageFolder = crawlStorageFolder;
    }

    public boolean isResumableCrawling() {
        return resumableCrawling;
    }

    /**
     * If this feature is enabled, you will be able to resume a previously
     * stopped/crashed crawl. However, it makes crawling slightly slower.
     */
    public void setResumableCrawling(boolean resumableCrawling) {
        this.resumableCrawling = resumableCrawling;
    }

    public int getMaxDepthOfCrawling() {
        return maxDepthOfCrawling;
    }

    /**
     * Maximum depth of crawling. For unlimited depth this parameter should be
     * set to -1.
     */
    public void setMaxDepthOfCrawling(int maxDepthOfCrawling) {
        this.maxDepthOfCrawling = maxDepthOfCrawling;
    }

    public int getMaxPagesToFetch() {
        return maxPagesToFetch;
    }

    /**
     * Maximum number of pages to fetch. For an unlimited number of pages, this
     * parameter should be set to -1.
     */
    public void setMaxPagesToFetch(int maxPagesToFetch) {
        this.maxPagesToFetch = maxPagesToFetch;
    }

    public String getUserAgentString() {
        return userAgentString;
    }

    /**
     * User-agent string that is used for representing your crawler to web
     * servers. See http://en.wikipedia.org/wiki/User_agent for more details.
     */
    public void setUserAgentString(String userAgentString) {
        this.userAgentString = userAgentString;
    }

    public int getPolitenessDelay() {
        return politenessDelay;
    }

    /**
     * Politeness delay in milliseconds (delay between sending two requests to
     * the same host).
     *
     * @param politenessDelay
     *            the delay in milliseconds.
     */
    public void setPolitenessDelay(int politenessDelay) {
        this.politenessDelay = politenessDelay;
    }

    public boolean isIncludeHttpsPages() {
        return includeHttpsPages;
    }

    /**
     * Should we also crawl https pages?
     */
    public void setIncludeHttpsPages(boolean includeHttpsPages) {
        this.includeHttpsPages = includeHttpsPages;
    }

    public boolean isIncludeBinaryContentInCrawling() {
        return includeBinaryContentInCrawling;
    }

    /**
     * Should we fetch binary content such as images, audio, ...?
     */
    public void setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling) {
        this.includeBinaryContentInCrawling = includeBinaryContentInCrawling;
    }

    public int getMaxConnectionsPerHost() {
        return maxConnectionsPerHost;
    }

    /**
     * Maximum connections per host.
     */
    public void setMaxConnectionsPerHost(int maxConnectionsPerHost) {
        this.maxConnectionsPerHost = maxConnectionsPerHost;
    }

    public int getMaxTotalConnections() {
        return maxTotalConnections;
    }

    /**
     * Maximum total connections.
     */
    public void setMaxTotalConnections(int maxTotalConnections) {
        this.maxTotalConnections = maxTotalConnections;
    }

    public int getSocketTimeout() {
        return socketTimeout;
    }

    /**
     * Socket timeout in milliseconds.
     */
    public void setSocketTimeout(int socketTimeout) {
        this.socketTimeout = socketTimeout;
    }

    public int getConnectionTimeout() {
        return connectionTimeout;
    }

    /**
     * Connection timeout in milliseconds.
     */
    public void setConnectionTimeout(int connectionTimeout) {
        this.connectionTimeout = connectionTimeout;
    }

    public int getMaxOutgoingLinksToFollow() {
        return maxOutgoingLinksToFollow;
    }

    /**
     * Max number of outgoing links which are processed from a page.
     */
    public void setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow) {
        this.maxOutgoingLinksToFollow = maxOutgoingLinksToFollow;
    }

    public int getMaxDownloadSize() {
        return maxDownloadSize;
    }

    /**
     * Max allowed size of a page. Pages larger than this size will not be
     * fetched.
     */
    public void setMaxDownloadSize(int maxDownloadSize) {
        this.maxDownloadSize = maxDownloadSize;
    }

    public boolean isFollowRedirects() {
        return followRedirects;
    }

    /**
     * Should we follow redirects?
     */
    public void setFollowRedirects(boolean followRedirects) {
        this.followRedirects = followRedirects;
    }

    public String getProxyHost() {
        return proxyHost;
    }

    /**
     * If the crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy host.
     */
    public void setProxyHost(String proxyHost) {
        this.proxyHost = proxyHost;
    }

    public int getProxyPort() {
        return proxyPort;
    }

    /**
     * If the crawler should run behind a proxy, this parameter can be used for
     * specifying the proxy port.
     */
    public void setProxyPort(int proxyPort) {
        this.proxyPort = proxyPort;
    }

    public String getProxyUsername() {
        return proxyUsername;
    }

    /**
     * If the crawler should run behind a proxy and user/pass is needed for
     * authentication with the proxy, this parameter can be used for specifying
     * the username.
     */
    public void setProxyUsername(String proxyUsername) {
        this.proxyUsername = proxyUsername;
    }

    public String getProxyPassword() {
        return proxyPassword;
    }

    /**
     * If the crawler should run behind a proxy and user/pass is needed for
     * authentication with the proxy, this parameter can be used for specifying
     * the password.
     */
    public void setProxyPassword(String proxyPassword) {
        this.proxyPassword = proxyPassword;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Crawl storage folder: " + getCrawlStorageFolder() + "\n");
        sb.append("Resumable crawling: " + isResumableCrawling() + "\n");
        sb.append("Max depth of crawl: " + getMaxDepthOfCrawling() + "\n");
        sb.append("Max pages to fetch: " + getMaxPagesToFetch() + "\n");
        sb.append("User agent string: " + getUserAgentString() + "\n");
        sb.append("Include https pages: " + isIncludeHttpsPages() + "\n");
        sb.append("Include binary content: " + isIncludeBinaryContentInCrawling() + "\n");
        sb.append("Max connections per host: " + getMaxConnectionsPerHost() + "\n");
        sb.append("Max total connections: " + getMaxTotalConnections() + "\n");
        sb.append("Socket timeout: " + getSocketTimeout() + "\n");
        sb.append("Connection timeout: " + getConnectionTimeout() + "\n");
        sb.append("Max outgoing links to follow: " + getMaxOutgoingLinksToFollow() + "\n");
        sb.append("Max download size: " + getMaxDownloadSize() + "\n");
        sb.append("Should follow redirects?: " + isFollowRedirects() + "\n");
        sb.append("Proxy host: " + getProxyHost() + "\n");
        sb.append("Proxy port: " + getProxyPort() + "\n");
        sb.append("Proxy username: " + getProxyUsername() + "\n");
        sb.append("Proxy password: " + getProxyPassword() + "\n");
        return sb.toString();
    }
}
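
/*
 * A minimal usage sketch, not part of crawler4j itself: it only exercises the
 * setters and validate() defined above. The class name and the storage path
 * are hypothetical example values chosen for illustration.
 */
class CrawlConfigExample {

    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j-data"); // hypothetical path; must be set or validate() throws
        config.setPolitenessDelay(500);    // wait 500 ms between requests to the same host
        config.setMaxDepthOfCrawling(3);   // follow links at most 3 hops from the seeds
        config.setMaxPagesToFetch(1000);   // stop after fetching 1000 pages
        config.setIncludeHttpsPages(true); // also crawl https:// URLs

        // Throws an Exception if the configuration is inconsistent
        // (e.g. missing storage folder or negative politeness delay).
        config.validate();

        System.out.println(config.toString());
    }
}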