/**
 * Copyright © 2010 DocuLibre inc.
 *
 * This file is part of Constellio.
 *
 * Constellio is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Constellio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Constellio. If not, see <http://www.gnu.org/licenses/>.
 */
package com.constellio.app.modules.es.connectors.http.fetcher.config;

import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.joda.time.DateTime;

/**
 * Fetcher configuration.
 * <p>
 * Holds the URL include/exclude filters, the URL normalizers and the crawl
 * settings handed to the constructor. Arrays are copied on the way in and on
 * the way out, and the filter expressions are compiled once at construction
 * time, so an instance does not change after it has been built.
 *
 * @author Nicolas Bélisle (nicolas.belisle@doculibre.com)
 *
 */
public class FetcherConfig {

	/** Shared empty array used when no exclude pattern was supplied. */
	private static final Pattern[] NO_PATTERNS = new Pattern[0];

	/** Number of milliseconds in a 24 hour day; used for fractional delays. */
	private static final int MILLIS_PER_DAY = 24 * 60 * 60 * 1000;

	private final String[] includePatterns;

	private final String[] excludePatterns;

	/** Compiled form of {@link #includePatterns}, same order. */
	private final Pattern[] compiledIncludePatterns;

	/** Compiled form of {@link #excludePatterns}, same order; empty when none were given. */
	private final Pattern[] compiledExcludePatterns;

	private final UrlNormalizer[] normalizers;

	private final int pauseTime;

	private final int requestTimeout;

	private final int depth;

	/** Delay, in days, subtracted from the current time by {@link #getExpiration()}. */
	private final double delay;

	private final int threads;

	private final String[] startUrls;

	private final String[] onDemandUrls;

	private final String robotsUserAgent;

	/**
	 * Builds a configuration and validates it eagerly.
	 *
	 * @param includePatterns
	 *            regular expressions a URL must match to be accepted; must not
	 *            be null
	 * @param excludePatterns
	 *            regular expressions that reject a URL; may be null
	 * @param normalizers
	 *            normalizers applied in order by {@link #normalize(String)};
	 *            may be null
	 * @param pauseTime
	 *            pause time, returned as is by {@link #getPauseTime()}
	 * @param requestTimeout
	 *            request timeout, returned as is by
	 *            {@link #getRequestTimeout()}
	 * @param depth
	 *            depth, returned as is by {@link #getDepth()}
	 * @param delay
	 *            number of days used to compute {@link #getExpiration()}
	 * @param threads
	 *            number of threads; must be at least 1
	 * @param startUrls
	 *            start URLs; may be null
	 * @param onDemandUrls
	 *            on demand URLs; may be null
	 * @param robotsUserAgent
	 *            user agent, returned as is by {@link #getRobotsUserAgent()}
	 * @throws NullPointerException
	 *             if {@code includePatterns} is null
	 * @throws java.util.regex.PatternSyntaxException
	 *             if any include or exclude pattern is not a valid regular
	 *             expression
	 * @throws IllegalArgumentException
	 *             if {@code threads} is lower than 1
	 */
	FetcherConfig(String[] includePatterns, String[] excludePatterns, UrlNormalizer[] normalizers, int pauseTime,
			int requestTimeout, int depth, double delay, int threads, String[] startUrls, String[] onDemandUrls,
			String robotsUserAgent) {
		if (includePatterns == null) {
			throw new NullPointerException("includePatterns must not be null");
		}
		// Copy first so the compiled patterns and the arrays returned by the
		// getters always describe the same expressions.
		this.includePatterns = copyOf(includePatterns);
		this.compiledIncludePatterns = compileAll(this.includePatterns);
		this.excludePatterns = copyOf(excludePatterns);
		this.compiledExcludePatterns = compileAll(this.excludePatterns);
		this.normalizers = normalizers == null ? null : normalizers.clone();
		this.pauseTime = pauseTime;
		this.requestTimeout = requestTimeout;
		this.depth = depth;
		this.delay = delay;
		if (threads < 1) {
			throw new IllegalArgumentException("maxThread must be at least one, actual value: " + threads);
		}
		this.threads = threads;
		this.startUrls = copyOf(startUrls);
		this.onDemandUrls = copyOf(onDemandUrls);
		this.robotsUserAgent = robotsUserAgent;
	}

	/**
	 * We first look at rejected/black URLs, then if nothing matches look for
	 * accepted/white URLs. If nothing matches, the URL is rejected.
	 * <p>
	 * A pattern matches when it is found anywhere in the URL; it does not have
	 * to cover the whole URL. Blank URLs and strings that {@link URL} cannot
	 * parse are always rejected.
	 *
	 * @param url
	 *            the URL to test; may be null
	 * @return {@code true} if no exclude pattern matches and at least one
	 *         include pattern matches, {@code false} otherwise
	 */
	public boolean isAccepted(String url) {
		if (StringUtils.isBlank(url)) {
			return false;
		}
		try {
			// Only used as a syntax check; the parsed URL itself is not needed.
			new URL(url);
		} catch (MalformedURLException notAUrl) {
			return false;
		}
		if (matchesAny(compiledExcludePatterns, url)) {
			return false;
		}
		return matchesAny(compiledIncludePatterns, url);
	}

	/**
	 * @return the pause time given at construction
	 */
	public int getPauseTime() {
		return pauseTime;
	}

	/**
	 * @return the request timeout given at construction
	 */
	public int getRequestTimeout() {
		return requestTimeout;
	}

	/**
	 * Runs the URL through every configured normalizer, in order, feeding the
	 * output of each one to the next.
	 *
	 * @param url
	 *            the URL to normalize
	 * @return the normalized URL, or {@code url} unchanged when no normalizer
	 *         is configured
	 * @throws MalformedURLException
	 *             propagated from the normalizers
	 * @throws URISyntaxException
	 *             propagated from the normalizers
	 */
	public String normalize(String url) throws MalformedURLException, URISyntaxException {
		if (normalizers == null) {
			return url;
		}
		String normalized = url;
		for (UrlNormalizer normalizer : normalizers) {
			normalized = normalizer.normalize(normalized);
		}
		return normalized;
	}

	/**
	 * @return a copy of the include patterns
	 */
	public String[] getIncludePatterns() {
		return copyOf(includePatterns);
	}

	/**
	 * @return a copy of the exclude patterns, or null when none were given
	 */
	public String[] getExcludePatterns() {
		return copyOf(excludePatterns);
	}

	/**
	 * @return the depth given at construction
	 */
	public int getDepth() {
		return depth;
	}

	/**
	 * Computes the current time minus the configured delay (in days).
	 * <p>
	 * Whole days are subtracted as calendar days. A positive fractional part
	 * (for example the {@code .5} of {@code 1.5}) is subtracted as the
	 * matching share of a 24 hour day instead of being dropped.
	 *
	 * @return the expiration date
	 */
	public Date getExpiration() {
		int wholeDays = (int) delay;
		DateTime expiration = new DateTime().minusDays(wholeDays);
		double fractionOfDay = delay - wholeDays;
		// The range check also filters NaN, negative delays and values whose
		// cast to int saturated, all of which keep the whole-day result.
		if (fractionOfDay > 0 && fractionOfDay < 1) {
			expiration = expiration.minusMillis((int) Math.round(fractionOfDay * MILLIS_PER_DAY));
		}
		return expiration.toDate();
	}

	/**
	 * @return a copy of the start URLs, or null when none were given
	 */
	public String[] getStartUrls() {
		return copyOf(startUrls);
	}

	/**
	 * @return a copy of the on demand URLs, or null when none were given
	 */
	public String[] getOnDemandUrls() {
		return copyOf(onDemandUrls);
	}

	/**
	 * @return the robots user agent given at construction
	 */
	public String getRobotsUserAgent() {
		return robotsUserAgent;
	}

	/**
	 * @return the number of threads, always at least 1
	 */
	public int getThreads() {
		return threads;
	}

	/** Returns a copy of the array, or null when the array is null. */
	private static String[] copyOf(String[] values) {
		return values == null ? null : values.clone();
	}

	/** Compiles every expression; a null array yields an empty result. */
	private static Pattern[] compileAll(String[] regexes) {
		if (regexes == null) {
			return NO_PATTERNS;
		}
		Pattern[] compiled = new Pattern[regexes.length];
		for (int i = 0; i < regexes.length; i++) {
			compiled[i] = Pattern.compile(regexes[i]);
		}
		return compiled;
	}

	/** Tells whether at least one pattern is found somewhere in the URL. */
	private static boolean matchesAny(Pattern[] patterns, String url) {
		for (Pattern pattern : patterns) {
			Matcher matcher = pattern.matcher(url);
			if (matcher.find()) {
				return true;
			}
		}
		return false;
	}
}