package guang.crawler.crawlWorker;
import guang.crawler.centerConfig.CenterConfig;
import guang.crawler.centerConfig.workers.WorkerInfo;
import guang.crawler.localConfig.LocalConfig;
import guang.crawler.util.PropertiesHelper;
/**
* 爬虫工作者的本地配置信息
*
* @author sun
*
*/
/**
 * Local configuration for a crawler worker. Holds fetcher tuning parameters
 * (timeouts, connection limits, download caps), proxy settings, and references
 * to the center configuration and this worker's remote registration info.
 * Values are loaded from {@code /conf/crawler-worker/crawler-worker.config}
 * via {@link #initProperties()}, falling back to the field defaults below.
 */
public class WorkerConfig extends LocalConfig {
    /**
     * Lazy-initialization holder: the singleton is created on first access to
     * {@link #me()}. Class initialization is guaranteed thread-safe by the JLS,
     * so no explicit locking is needed (the previous unsynchronized null-check
     * could create two instances under concurrent access).
     */
    private static final class Holder {
        static final WorkerConfig INSTANCE = new WorkerConfig();
    }

    /**
     * Returns the singleton instance of the worker configuration.
     *
     * @return the shared {@code WorkerConfig} instance, created lazily and
     *         thread-safely on first call
     */
    public static WorkerConfig me() {
        return Holder.INSTANCE;
    }

    /**
     * User-agent string the crawler presents to remote servers.
     */
    private String userAgentString = "crawler4j (http://code.google.com/p/crawler4j/)";
    /**
     * Whether HTTPS pages should be crawled.
     */
    private boolean includeHttpsPages = true;
    /**
     * Whether binary content (images, archives, etc.) should be crawled.
     */
    private boolean includeBinaryContentInCrawling = false;
    /**
     * Maximum number of simultaneous connections allowed per target host.
     */
    private int maxConnectionsPerHost = 100;
    /**
     * Maximum total number of simultaneous connections for this worker.
     */
    private int maxTotalConnections = 100;
    /**
     * Socket read timeout, in milliseconds.
     */
    private int socketTimeout = 20000;
    /**
     * Connection establishment timeout, in milliseconds.
     */
    private int connectionTimeout = 30000;
    /**
     * Maximum number of outgoing links to extract from a single page.
     */
    private int maxOutgoingLinksToFollow = 5000;
    /**
     * Maximum download size per page in bytes; larger pages are skipped.
     */
    private int maxDownloadSize = 1048576;
    /**
     * Should HTTP redirects be followed?
     * NOTE(review): original comment says this is not yet honored by the
     * fetcher but should be — verify against the fetcher implementation.
     */
    private boolean followRedirects = true;
    /**
     * Proxy host name, or {@code null} when no proxy is used.
     */
    private String proxyHost = null;
    /**
     * Proxy port number (only meaningful when {@link #proxyHost} is set).
     */
    private int proxyPort = 80;
    /**
     * Proxy user name, or {@code null} for unauthenticated proxies.
     */
    private String proxyUsername = null;
    /**
     * Proxy password, or {@code null} for unauthenticated proxies.
     */
    private String proxyPassword = null;
    /**
     * Reference to the crawler's center configuration.
     */
    private CenterConfig crawlerController;
    /**
     * Remote registration info for this crawler worker.
     */
    private WorkerInfo workerInfo;

    /**
     * Private constructor — use {@link #me()} to obtain the singleton.
     */
    private WorkerConfig() {
    }

    /**
     * {@inheritDoc}
     *
     * @return the classpath location(s) of the worker configuration file
     */
    @Override
    protected String[] getConfigResources() {
        return new String[] { "/conf/crawler-worker/crawler-worker.config" };
    }

    /**
     * @return connection establishment timeout in milliseconds
     */
    public int getConnectionTimeout() {
        return this.connectionTimeout;
    }

    /**
     * @return the center configuration of the crawler, may be {@code null}
     *         until set via {@link #setCrawlerController(CenterConfig)}
     */
    public CenterConfig getCrawlerController() {
        return this.crawlerController;
    }

    /**
     * @return maximum simultaneous connections per target host
     */
    public int getMaxConnectionsPerHost() {
        return this.maxConnectionsPerHost;
    }

    /**
     * @return maximum download size per page in bytes
     */
    public int getMaxDownloadSize() {
        return this.maxDownloadSize;
    }

    /**
     * @return maximum number of outgoing links extracted per page
     */
    public int getMaxOutgoingLinksToFollow() {
        return this.maxOutgoingLinksToFollow;
    }

    /**
     * @return maximum total simultaneous connections for this worker
     */
    public int getMaxTotalConnections() {
        return this.maxTotalConnections;
    }

    /**
     * @return proxy host name, or {@code null} when no proxy is configured
     */
    public String getProxyHost() {
        return this.proxyHost;
    }

    /**
     * @return proxy password, or {@code null} when not set
     */
    public String getProxyPassword() {
        return this.proxyPassword;
    }

    /**
     * @return proxy port number
     */
    public int getProxyPort() {
        return this.proxyPort;
    }

    /**
     * @return proxy user name, or {@code null} when not set
     */
    public String getProxyUsername() {
        return this.proxyUsername;
    }

    /**
     * @return socket read timeout in milliseconds
     */
    public int getSocketTimeout() {
        return this.socketTimeout;
    }

    /**
     * @return the user-agent string presented to remote servers
     */
    public String getUserAgentString() {
        return this.userAgentString;
    }

    /**
     * @return remote registration info for this worker, may be {@code null}
     *         until set via {@link #setWorkerInfo(WorkerInfo)}
     */
    public WorkerInfo getWorkerInfo() {
        return this.workerInfo;
    }

    /**
     * Fluent initialization hook.
     * NOTE(review): currently a no-op beyond returning {@code this};
     * property loading happens in {@link #initProperties()} — confirm
     * callers do not expect this method to trigger loading.
     *
     * @return this instance, for call chaining
     */
    public WorkerConfig init() {
        return this;
    }

    /**
     * Loads all worker settings from {@code configProperties} (populated by
     * the superclass), keeping the current field value as the default when a
     * key is absent.
     */
    @Override
    protected void initProperties() {
        super.initProperties();
        this.includeHttpsPages = PropertiesHelper.readBoolean(this.configProperties,
                "crawler.worker.include.https",
                this.includeHttpsPages);
        this.includeBinaryContentInCrawling = PropertiesHelper.readBoolean(this.configProperties,
                "crawler.worker.include.binary",
                this.includeBinaryContentInCrawling);
        this.maxOutgoingLinksToFollow = PropertiesHelper.readInt(this.configProperties,
                "crawler.worker.page.links.outgoing.max",
                this.maxOutgoingLinksToFollow);
        this.userAgentString = PropertiesHelper.readString(this.configProperties,
                "crawler.worker.fetcher.userAgent",
                this.userAgentString);
        this.socketTimeout = PropertiesHelper.readInt(this.configProperties,
                "crawler.worker.fetcher.socket.timeout",
                this.socketTimeout);
        this.connectionTimeout = PropertiesHelper.readInt(this.configProperties,
                "crawler.worker.fetcher.connection.timeout",
                this.connectionTimeout);
        this.maxTotalConnections = PropertiesHelper.readInt(this.configProperties,
                "crawler.worker.fetcher.totalConnections.max",
                this.maxTotalConnections);
        this.maxConnectionsPerHost = PropertiesHelper.readInt(this.configProperties,
                "crawler.worker.fetcher.connectionsPerHost.max",
                this.maxConnectionsPerHost);
        this.maxDownloadSize = PropertiesHelper.readInt(this.configProperties,
                "crawler.worker.fetcher.downloadSizePerPage.max",
                this.maxDownloadSize);
        this.followRedirects = PropertiesHelper.readBoolean(this.configProperties,
                "crawler.worker.follow.redirects",
                this.followRedirects);
        this.proxyHost = PropertiesHelper.readString(this.configProperties,
                "crawler.worker.fetcher.proxy.host",
                this.proxyHost);
        this.proxyPort = PropertiesHelper.readInt(this.configProperties,
                "crawler.worker.fetcher.proxy.port",
                this.proxyPort);
        this.proxyUsername = PropertiesHelper.readString(this.configProperties,
                "crawler.worker.fetcher.proxy.user",
                this.proxyUsername);
        this.proxyPassword = PropertiesHelper.readString(this.configProperties,
                "crawler.worker.fetcher.proxy.password",
                this.proxyPassword);
    }

    /**
     * @return whether HTTP redirects should be followed
     */
    public boolean isFollowRedirects() {
        return this.followRedirects;
    }

    /**
     * @return whether binary content should be crawled
     */
    public boolean isIncludeBinaryContentInCrawling() {
        return this.includeBinaryContentInCrawling;
    }

    /**
     * @return whether HTTPS pages should be crawled
     */
    public boolean isIncludeHttpsPages() {
        return this.includeHttpsPages;
    }

    /**
     * Sets the center configuration reference.
     *
     * @param crawlerController the center configuration to associate with
     *            this worker
     */
    public void setCrawlerController(final CenterConfig crawlerController) {
        this.crawlerController = crawlerController;
    }

    /**
     * Sets this worker's remote registration info.
     *
     * @param workerInfo the remote info describing this worker
     */
    public void setWorkerInfo(final WorkerInfo workerInfo) {
        this.workerInfo = workerInfo;
    }
}