/** * License Agreement for OpenSearchServer * <p/> * Copyright (C) 2008-2016 Emmanuel Keller / Jaeksoft * <p/> * http://www.open-search-server.com * <p/> * This file is part of OpenSearchServer. * <p/> * OpenSearchServer is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * <p/> * OpenSearchServer is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * <p/> * You should have received a copy of the GNU General Public License * along with OpenSearchServer. * If not, see <http://www.gnu.org/licenses/>. **/ package com.jaeksoft.searchlib.crawler.web.process; import com.jaeksoft.searchlib.ClientFactory; import com.jaeksoft.searchlib.Logging; import com.jaeksoft.searchlib.SearchLibException; import com.jaeksoft.searchlib.config.Config; import com.jaeksoft.searchlib.crawler.common.database.AbstractManager; import com.jaeksoft.searchlib.crawler.common.database.FetchStatus; import com.jaeksoft.searchlib.crawler.common.process.CrawlMasterAbstract; import com.jaeksoft.searchlib.crawler.common.process.CrawlQueueAbstract; import com.jaeksoft.searchlib.crawler.common.process.CrawlStatistics; import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus; import com.jaeksoft.searchlib.crawler.web.database.HostUrlList; import com.jaeksoft.searchlib.crawler.web.database.HostUrlList.ListType; import com.jaeksoft.searchlib.crawler.web.database.LinkItem; import com.jaeksoft.searchlib.crawler.web.database.NamedItem; import com.jaeksoft.searchlib.crawler.web.database.UrlCrawlQueue; import com.jaeksoft.searchlib.crawler.web.database.UrlItem; import com.jaeksoft.searchlib.crawler.web.database.UrlManager; import com.jaeksoft.searchlib.crawler.web.database.WebPropertyManager; import com.jaeksoft.searchlib.crawler.web.database.pattern.PatternListMatcher; import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapCache; import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapItem; import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapList; import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapUrl; import com.jaeksoft.searchlib.crawler.web.spider.Crawl; import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader; import com.jaeksoft.searchlib.function.expression.SyntaxError; import com.jaeksoft.searchlib.query.ParseException; import com.jaeksoft.searchlib.scheduler.TaskManager; import org.apache.commons.lang3.RandomUtils; import org.apache.commons.lang3.StringUtils; import java.io.IOException; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.Date; import java.util.LinkedList; import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.LinkedHashSet; public class WebCrawlMaster extends CrawlMasterAbstract<WebCrawlMaster, WebCrawlThread> { private final LinkedList<NamedItem> hostList; private volatile int maxUrlPerSession = 0; private final UrlCrawlQueue urlCrawlQueue; public WebCrawlMaster(Config config) throws SearchLibException, IOException { super(config); urlCrawlQueue = new UrlCrawlQueue(config); hostList = new LinkedList<>(); if (config.getWebPropertyManager().getCrawlEnabled().getValue()) { Logging.info("Webcrawler is starting for " + config.getIndexName()); start(false); } } @Override public void runner() throws Exception { Config config = getConfig(); WebPropertyManager propertyManager = config.getWebPropertyManager(); if (ClientFactory.INSTANCE.properties.isDisableWebCrawler()) { abort(); propertyManager.getCrawlEnabled().setValue(false); throw new InterruptedException("The webcrawler is disabled."); } urlCrawlQueue.setMaxBufferSize(propertyManager.getIndexDocumentBufferSize().getValue()); while (!isAborted()) { currentStats = new CrawlStatistics(); addStatistics(currentStats); urlCrawlQueue.setStatistiques(currentStats); final int threadNumber = propertyManager.getMaxThreadNumber().getValue(); maxUrlPerSession = propertyManager.getMaxUrlPerSession().getValue(); final int maxUrlPerHost = propertyManager.getMaxUrlPerHost().getValue(); final PatternListMatcher exclusionMatcher = propertyManager.getExclusionEnabled().getValue() ? config.getExclusionPatternManager().getPatternListMatcher() : null; final PatternListMatcher inclusionMatcher = propertyManager.getInclusionEnabled().getValue() ? config.getInclusionPatternManager().getPatternListMatcher() : null; final Integer maxDepth = propertyManager.getMaxDepth().getValue(); String schedulerJobName = propertyManager.getSchedulerAfterSession().getValue(); synchronized (hostList) { hostList.clear(); } extractSiteMapList(inclusionMatcher, exclusionMatcher); extractHostList(maxUrlPerHost, maxDepth); while (!isAborted()) { int howMany = urlLeftPerHost(maxUrlPerHost); if (howMany <= 0) break; NamedItem host = getNextHost(); if (host == null) break; HostUrlList hostUrlList = getNextUrlList(host, howMany, maxDepth); if (hostUrlList == null) continue; WebCrawlThread crawlThread = new WebCrawlThread(config, this, currentStats, hostUrlList); add(crawlThread); while (getThreadsCount() >= threadNumber && !isAborted()) sleepSec(5); } setStatus(CrawlStatus.WAITING_CHILD); while (getThreadsCount() > 0) { waitForChild(1800); if (isAborted()) break; } setStatus(CrawlStatus.INDEXATION); urlCrawlQueue.index(true); if (schedulerJobName != null && schedulerJobName.length() > 0) { setStatus(CrawlStatus.EXECUTE_SCHEDULER_JOB); TaskManager.getInstance().executeJob(config.getIndexName(), schedulerJobName); } if (isOnce()) break; sleepSec(5); } urlCrawlQueue.index(true); setStatus(CrawlStatus.NOT_RUNNING); } private void extractHostList(final int maxUrlPerHost, final Integer maxDepth) throws IOException, ParseException, SyntaxError, URISyntaxException, ClassNotFoundException, InterruptedException, SearchLibException, InstantiationException, IllegalAccessException { Config config = getConfig(); UrlManager urlManager = config.getUrlManager(); setStatus(CrawlStatus.EXTRACTING_HOSTLIST); Set<String> hostSet = new TreeSet<String>(); WebPropertyManager propertyManager = config.getWebPropertyManager(); final Date fetchIntervalDate = AbstractManager.getPastDate(propertyManager.getFetchInterval().getValue(), propertyManager.getFetchIntervalUnit().getValue()); int urlLimit = maxUrlPerSession; // First try fetch priority NamedItem.Selection selection = new NamedItem.Selection(ListType.PRIORITY_URL, FetchStatus.FETCH_FIRST, null, null); urlLimit = urlManager.getHostToFetch(selection, urlLimit, maxUrlPerHost, maxDepth, hostList, hostSet); // Second try old URLs selection = new NamedItem.Selection(ListType.OLD_URL, null, fetchIntervalDate, null); urlLimit = urlManager.getHostToFetch(selection, urlLimit, maxUrlPerHost, maxDepth, hostList, hostSet); // Finally try new unfetched URLs selection = new NamedItem.Selection(ListType.NEW_URL, FetchStatus.UN_FETCHED, null, fetchIntervalDate); urlLimit = urlManager.getHostToFetch(selection, urlLimit, maxUrlPerHost, maxDepth, hostList, hostSet); currentStats.addHostListSize(hostList.size()); } private void extractSiteMapList(final PatternListMatcher inclusionMatcher, final PatternListMatcher exclusionMatcher) throws SearchLibException, IOException { HttpDownloader httpDownloader = null; try { httpDownloader = getNewHttpDownloader(true); final SiteMapList siteMapList = getConfig().getSiteMapList(); final SiteMapCache siteMapCache = SiteMapCache.getInstance(); if (siteMapList != null && siteMapList.getArray() != null) { setStatus(CrawlStatus.LOADING_SITEMAP); final UrlManager urlManager = getConfig().getUrlManager(); final List<UrlItem> workInsertUrlList = new ArrayList<UrlItem>(); for (SiteMapItem siteMap : siteMapList.getArray()) { final LinkedHashSet<SiteMapUrl> siteMapUrlSet = new LinkedHashSet<>(); siteMap.fill(siteMapCache, getNewHttpDownloader(true), false, siteMapUrlSet); for (SiteMapUrl siteMapUrl : siteMapUrlSet) { final URI uri = siteMapUrl.getLoc(); final String sUri = uri.toString(); URL url; try { url = uri.toURL(); } catch (MalformedURLException e) { continue; } if (exclusionMatcher != null) if (exclusionMatcher.matchPattern(url, sUri)) continue; if (inclusionMatcher != null) if (!inclusionMatcher.matchPattern(url, sUri)) continue; if (!urlManager.exists(sUri)) { workInsertUrlList.add( urlManager.getNewUrlItem(new LinkItem(sUri, LinkItem.Origin.sitemap, null, 0))); } } } if (workInsertUrlList.size() > 0) urlManager.updateUrlItems(workInsertUrlList); } } finally { if (httpDownloader != null) httpDownloader.release(); } } public HttpDownloader getNewHttpDownloader(boolean followRedirect, String userAgent, boolean useProxies) throws SearchLibException, IOException { Config config = getConfig(); WebPropertyManager propertyManager = config.getWebPropertyManager(); if (StringUtils.isEmpty(userAgent)) userAgent = propertyManager.getUserAgent().getValue(); return new HttpDownloader(userAgent, followRedirect, useProxies ? propertyManager.getProxyHandler() : null, propertyManager.getConnectionTimeOut().getValue() * 1000); } final public HttpDownloader getNewHttpDownloader(final boolean followRedirect) throws SearchLibException, IOException { return getNewHttpDownloader(followRedirect, null, true); } private NamedItem getNextHost() { synchronized (hostList) { int s = hostList.size(); if (s > 0) { NamedItem host = hostList.remove(RandomUtils.nextInt(0, s)); if (host != null) { currentStats.incHostCount(); return host; } } } return null; } protected int urlLeft() { return (int) (maxUrlPerSession - currentStats.getFetchedCount()); } private int urlLeftPerHost(int maxUrlPerHost) { int leftCount = urlLeft(); if (leftCount < 0) return leftCount; if (leftCount > maxUrlPerHost) leftCount = maxUrlPerHost; return leftCount; } private HostUrlList getNextUrlList(final NamedItem host, final int count, final Integer maxDepth) throws ParseException, IOException, SyntaxError, URISyntaxException, ClassNotFoundException, InterruptedException, SearchLibException, InstantiationException, IllegalAccessException { setStatus(CrawlStatus.EXTRACTING_URLLIST); setInfo(host.getName()); UrlManager urlManager = getConfig().getUrlManager(); List<UrlItem> urlList = new ArrayList<UrlItem>(); HostUrlList hostUrlList = new HostUrlList(urlList, host); hostUrlList.setListType(host.selection.listType); urlManager.getUrlToFetch(host, count, maxDepth, urlList); setInfo(null); return hostUrlList; } public boolean isFull() throws IOException { return currentStats.getFetchedCount() >= getConfig().getWebPropertyManager().getMaxUrlPerSession().getValue(); } public Crawl getNewCrawl(WebCrawlThread crawlThread) throws SearchLibException, IOException { return new Crawl(crawlThread); } public WebCrawlThread manualCrawl(URL url, HostUrlList.ListType listType) throws SearchLibException, ParseException, IOException, SyntaxError, URISyntaxException, ClassNotFoundException, InterruptedException, InstantiationException, IllegalAccessException { Config config = getConfig(); if (currentStats == null) currentStats = new CrawlStatistics(); UrlManager urlManager = config.getUrlManager(); List<UrlItem> urlItemList = new ArrayList<UrlItem>(); UrlItem urlItem = urlManager.getUrlToFetch(url); if (urlItem == null) urlItem = urlManager.getNewUrlItem(new LinkItem(url.toExternalForm(), LinkItem.Origin.manual, null, 0)); urlItemList.add(urlItem); HostUrlList hostUrlList = new HostUrlList(urlItemList, new NamedItem(url.getHost())); hostUrlList.setListType(listType); WebCrawlThread crawlThread = new WebCrawlThread(config, this, new CrawlStatistics(), hostUrlList); crawlThread.execute(180); return crawlThread; } public CrawlQueueAbstract getCrawlQueue() { return urlCrawlQueue; } @Override protected WebCrawlThread[] getNewArray(int size) { return new WebCrawlThread[size]; } }