/*
 * Copyright 2012-2017 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.fess.helper;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import javax.annotation.Resource;

import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.app.service.BoostDocumentRuleService;
import org.codelibs.fess.app.service.FailureUrlService;
import org.codelibs.fess.app.service.FileAuthenticationService;
import org.codelibs.fess.app.service.FileConfigService;
import org.codelibs.fess.app.service.WebConfigService;
import org.codelibs.fess.crawler.Crawler;
import org.codelibs.fess.crawler.CrawlerContext;
import org.codelibs.fess.crawler.CrawlerStatus;
import org.codelibs.fess.crawler.interval.FessIntervalController;
import org.codelibs.fess.crawler.service.impl.EsDataService;
import org.codelibs.fess.crawler.service.impl.EsUrlFilterService;
import org.codelibs.fess.crawler.service.impl.EsUrlQueueService;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.es.config.exentity.FileConfig;
import org.codelibs.fess.es.config.exentity.WebConfig;
import org.codelibs.fess.indexer.DocBoostMatcher;
import org.codelibs.fess.indexer.IndexUpdater;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Helper that crawls the configured web and file-system targets and feeds the
 * results to the search index. A typical invocation (sketch) is
 * {@code webFsIndexHelper.crawl(sessionId, webConfigIdList, fileConfigIdList)};
 * passing {@code null} for both ID lists runs all available configurations.
 */
public class WebFsIndexHelper {

    private static final Logger logger = LoggerFactory.getLogger(WebFsIndexHelper.class);

    @Resource
    public WebConfigService webConfigService;

    @Resource
    protected FileConfigService fileConfigService;

    @Resource
    protected FileAuthenticationService fileAuthenticationService;

    @Resource
    public FailureUrlService failureUrlService;

    @Resource
    protected BoostDocumentRuleService boostDocumentRuleService;

    @Resource
    protected CrawlingConfigHelper crawlingConfigHelper;

    public long maxAccessCount = Long.MAX_VALUE;

    public long crawlingExecutionInterval = Constants.DEFAULT_CRAWLING_EXECUTION_INTERVAL;

    public int indexUpdaterPriority = Thread.MAX_PRIORITY;

    public int crawlerPriority = Thread.NORM_PRIORITY;

    private final List<Crawler> crawlerList = Collections.synchronizedList(new ArrayList<>());

    public void crawl(final String sessionId, final List<String> webConfigIdList, final List<String> fileConfigIdList) {
        final boolean runAll = webConfigIdList == null && fileConfigIdList == null;

        final List<WebConfig> webConfigList;
        if (runAll || webConfigIdList != null) {
            webConfigList = webConfigService.getWebConfigListByIds(webConfigIdList);
        } else {
            webConfigList = Collections.emptyList();
        }

        final List<FileConfig> fileConfigList;
        if (runAll || fileConfigIdList != null) {
            fileConfigList = fileConfigService.getFileConfigListByIds(fileConfigIdList);
        } else {
            fileConfigList = Collections.emptyList();
        }

        if (webConfigList.isEmpty() && fileConfigList.isEmpty()) {
            // nothing to crawl
            if (logger.isInfoEnabled()) {
                logger.info("No crawling target urls.");
            }
            return;
        }

        doCrawl(sessionId, webConfigList, fileConfigList);
    }
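
    /**
     * Creates one crawler per web/file configuration, runs the crawlers with
     * bounded concurrency, and hands each finished session over to the
     * {@link IndexUpdater} thread.
     */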
    protected void doCrawl(final String sessionId, final List<WebConfig> webConfigList, final List<FileConfig> fileConfigList) {
        final int multiprocessCrawlingCount = ComponentUtil.getFessConfig().getCrawlingThreadCount();
        final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
        final FessConfig fessConfig = ComponentUtil.getFessConfig();

        final long startTime = System.currentTimeMillis();
        final List<String> sessionIdList = new ArrayList<>();
        crawlerList.clear();
        final List<String> crawlerStatusList = new ArrayList<>();

        // Web
        for (final WebConfig webConfig : webConfigList) {
            final String sid = crawlingConfigHelper.store(sessionId, webConfig);

            // create crawler
            final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
            crawler.setSessionId(sid);
            sessionIdList.add(sid);

            final String urlsStr = webConfig.getUrls();
            if (StringUtil.isBlank(urlsStr)) {
                logger.warn("No target urls. Skipped");
                break;
            }

            // interval time
            final int intervalTime =
                    webConfig.getIntervalTime() != null ? webConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
            ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);

            final String includedUrlsStr = webConfig.getIncludedUrls() != null ? webConfig.getIncludedUrls() : StringUtil.EMPTY;
            final String excludedUrlsStr = webConfig.getExcludedUrls() != null ? webConfig.getExcludedUrls() : StringUtil.EMPTY;

            // num of threads
            final CrawlerContext crawlerContext = crawler.getCrawlerContext();
            final int numOfThread =
                    webConfig.getNumOfThread() != null ? webConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
            crawlerContext.setNumOfThread(numOfThread);

            // depth
            final int depth = webConfig.getDepth() != null ? webConfig.getDepth() : -1;
            crawlerContext.setMaxDepth(depth);

            // max count
            final long maxCount = webConfig.getMaxAccessCount() != null ? webConfig.getMaxAccessCount() : maxAccessCount;
            crawlerContext.setMaxAccessCount(maxCount);

            webConfig.initializeClientFactory(crawler.getClientFactory());
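
            // Optionally discard crawl data left from a previous run of this
            // config, as directed by the config parameters.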
            final Map<String, String> configParamMap = webConfig.getConfigParameterMap(ConfigName.CONFIG);
            if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_ALL))) {
                deleteCrawlData(sid);
            } else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_FILTERS))) {
                final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
                try {
                    urlFilterService.delete(sid);
                } catch (final Exception e) {
                    logger.warn("Failed to delete url filters for " + sid, e);
                }
            }

            // set urls
            final String[] urls = urlsStr.split("[\r\n]");
            for (final String u : urls) {
                if (StringUtil.isNotBlank(u)) {
                    final String urlValue = u.trim();
                    if (!urlValue.startsWith("#") && fessConfig.isValidCrawlerWebProtocol(urlValue)) {
                        crawler.addUrl(urlValue);
                        if (logger.isInfoEnabled()) {
                            logger.info("Target URL: " + urlValue);
                        }
                    }
                }
            }

            // set included urls
            final String[] includedUrls = includedUrlsStr.split("[\r\n]");
            for (final String u : includedUrls) {
                if (StringUtil.isNotBlank(u)) {
                    final String urlValue = u.trim();
                    if (!urlValue.startsWith("#")) {
                        crawler.addIncludeFilter(urlValue);
                        if (logger.isInfoEnabled()) {
                            logger.info("Included URL: " + urlValue);
                        }
                    }
                }
            }

            // set excluded urls
            final String[] excludedUrls = excludedUrlsStr.split("[\r\n]");
            for (final String u : excludedUrls) {
                if (StringUtil.isNotBlank(u)) {
                    final String urlValue = u.trim();
                    if (!urlValue.startsWith("#")) {
                        crawler.addExcludeFilter(urlValue);
                        if (logger.isInfoEnabled()) {
                            logger.info("Excluded URL: " + urlValue);
                        }
                    }
                }
            }

            // failure url
            final List<String> excludedUrlList = failureUrlService.getExcludedUrlList(webConfig.getConfigId());
            if (excludedUrlList != null) {
                for (final String u : excludedUrlList) {
                    if (StringUtil.isNotBlank(u)) {
                        final String urlValue = u.trim();
                        crawler.addExcludeFilter(urlValue);
                        if (logger.isInfoEnabled()) {
                            logger.info("Excluded URL from failures: " + urlValue);
                        }
                    }
                }
            }

            if (logger.isDebugEnabled()) {
                logger.debug("Crawling " + urlsStr);
            }

            crawler.setBackground(true);
            crawler.setThreadPriority(crawlerPriority);

            crawlerList.add(crawler);
            crawlerStatusList.add(Constants.READY);
        }

        // File
        for (final FileConfig fileConfig : fileConfigList) {
            final String sid = crawlingConfigHelper.store(sessionId, fileConfig);

            // create crawler
            final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
            crawler.setSessionId(sid);
            sessionIdList.add(sid);

            final String pathsStr = fileConfig.getPaths();
            if (StringUtil.isBlank(pathsStr)) {
                logger.warn("No target uris. Skipped");
                break;
            }

            // interval time
            final int intervalTime =
                    fileConfig.getIntervalTime() != null ? fileConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_FS;
            ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);

            final String includedPathsStr = fileConfig.getIncludedPaths() != null ? fileConfig.getIncludedPaths() : StringUtil.EMPTY;
            final String excludedPathsStr = fileConfig.getExcludedPaths() != null ? fileConfig.getExcludedPaths() : StringUtil.EMPTY;

            // num of threads
            final CrawlerContext crawlerContext = crawler.getCrawlerContext();
            final int numOfThread =
                    fileConfig.getNumOfThread() != null ? fileConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_FS;
            crawlerContext.setNumOfThread(numOfThread);

            // depth
            final int depth = fileConfig.getDepth() != null ? fileConfig.getDepth() : -1;
            crawlerContext.setMaxDepth(depth);

            // max count
            final long maxCount = fileConfig.getMaxAccessCount() != null ? fileConfig.getMaxAccessCount() : maxAccessCount;
            crawlerContext.setMaxAccessCount(maxCount);

            fileConfig.initializeClientFactory(crawler.getClientFactory());

            // Optionally discard crawl data left from a previous run, as above.
            final Map<String, String> configParamMap = fileConfig.getConfigParameterMap(ConfigName.CONFIG);
            if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_ALL))) {
                deleteCrawlData(sid);
            } else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_FILTERS))) {
                final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
                try {
                    urlFilterService.delete(sid);
                } catch (final Exception e) {
                    logger.warn("Failed to delete url filters for " + sid, e);
                }
            }
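
            // set paths: bare filesystem paths that lack a crawlable URI
            // scheme are normalized to "file:" URIs below.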
            final String[] paths = pathsStr.split("[\r\n]");
            for (String u : paths) {
                if (StringUtil.isNotBlank(u)) {
                    u = u.trim();
                    if (!u.startsWith("#")) {
                        if (!fessConfig.isValidCrawlerFileProtocol(u)) {
                            if (u.startsWith("/")) {
                                u = "file:" + u;
                            } else {
                                u = "file:/" + u;
                            }
                        }
                        crawler.addUrl(u);
                        if (logger.isInfoEnabled()) {
                            logger.info("Target Path: " + u);
                        }
                    }
                }
            }

            // set included paths
            // A line containing "#DISABLE_URL_ENCODE" turns off URL encoding
            // for the next filter line only.
            boolean urlEncodeDisabled = false;
            final String[] includedPaths = includedPathsStr.split("[\r\n]");
            for (final String u : includedPaths) {
                if (StringUtil.isNotBlank(u)) {
                    final String line = u.trim();
                    if (!line.startsWith("#")) {
                        final String urlValue;
                        if (urlEncodeDisabled) {
                            urlValue = line;
                            urlEncodeDisabled = false;
                        } else {
                            urlValue = systemHelper.encodeUrlFilter(line);
                        }
                        crawler.addIncludeFilter(urlValue);
                        if (logger.isInfoEnabled()) {
                            logger.info("Included Path: " + urlValue);
                        }
                    } else if (line.startsWith("#DISABLE_URL_ENCODE")) {
                        urlEncodeDisabled = true;
                    }
                }
            }

            // set excluded paths
            urlEncodeDisabled = false;
            final String[] excludedPaths = excludedPathsStr.split("[\r\n]");
            for (final String u : excludedPaths) {
                if (StringUtil.isNotBlank(u)) {
                    final String line = u.trim();
                    if (!line.startsWith("#")) {
                        final String urlValue;
                        if (urlEncodeDisabled) {
                            urlValue = line;
                            urlEncodeDisabled = false;
                        } else {
                            urlValue = systemHelper.encodeUrlFilter(line);
                        }
                        crawler.addExcludeFilter(urlValue);
                        if (logger.isInfoEnabled()) {
                            logger.info("Excluded Path: " + urlValue);
                        }
                    } else if (line.startsWith("#DISABLE_URL_ENCODE")) {
                        urlEncodeDisabled = true;
                    }
                }
            }

            // failure url
            final List<String> excludedUrlList = failureUrlService.getExcludedUrlList(fileConfig.getConfigId());
            if (excludedUrlList != null) {
                for (final String u : excludedUrlList) {
                    if (StringUtil.isNotBlank(u)) {
                        final String urlValue = u.trim();
                        crawler.addExcludeFilter(urlValue);
                        if (logger.isInfoEnabled()) {
                            logger.info("Excluded Path from failures: " + urlValue);
                        }
                    }
                }
            }

            if (logger.isDebugEnabled()) {
                logger.debug("Crawling " + pathsStr);
            }

            crawler.setBackground(true);
            crawler.setThreadPriority(crawlerPriority);

            crawlerList.add(crawler);
            crawlerStatusList.add(Constants.READY);
        }

        // run index update
        final IndexUpdater indexUpdater = ComponentUtil.getIndexUpdater();
        indexUpdater.setName("IndexUpdater");
        indexUpdater.setPriority(indexUpdaterPriority);
        indexUpdater.setSessionIdList(sessionIdList);
        indexUpdater.setDaemon(true);
        indexUpdater.setCrawlerList(crawlerList);
        boostDocumentRuleService.getAvailableBoostDocumentRuleList()
                .forEach(rule -> indexUpdater.addDocBoostMatcher(new DocBoostMatcher(rule)));
        indexUpdater.start();

        int startedCrawlerNum = 0;
        int activeCrawlerNum = 0;
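
        // Launch crawlers while fewer than multiprocessCrawlingCount are
        // active; otherwise poll the running ones and report each finished
        // session to the index updater to free a slot.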
        while (startedCrawlerNum < crawlerList.size()) {
            // Force to stop crawl
            if (systemHelper.isForceStop()) {
                for (final Crawler crawler : crawlerList) {
                    crawler.stop();
                }
                break;
            }

            if (activeCrawlerNum < multiprocessCrawlingCount) {
                // start crawling
                crawlerList.get(startedCrawlerNum).execute();
                crawlerStatusList.set(startedCrawlerNum, Constants.RUNNING);
                startedCrawlerNum++;
                activeCrawlerNum++;
                try {
                    Thread.sleep(crawlingExecutionInterval);
                } catch (final InterruptedException e) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Interrupted.", e);
                    }
                }
                continue;
            }

            // check status
            for (int i = 0; i < startedCrawlerNum; i++) {
                if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE
                        && crawlerStatusList.get(i).equals(Constants.RUNNING)) {
                    crawlerList.get(i).awaitTermination();
                    crawlerStatusList.set(i, Constants.DONE);
                    final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
                    indexUpdater.addFinishedSessionId(sid);
                    activeCrawlerNum--;
                }
            }
            try {
                Thread.sleep(crawlingExecutionInterval);
            } catch (final InterruptedException e) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Interrupted.", e);
                }
            }
        }

        // Wait until every started crawler is done and reported.
        boolean finishedAll = false;
        while (!finishedAll) {
            finishedAll = true;
            for (int i = 0; i < crawlerList.size(); i++) {
                crawlerList.get(i).awaitTermination(crawlingExecutionInterval);
                if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE
                        && !crawlerStatusList.get(i).equals(Constants.DONE)) {
                    crawlerStatusList.set(i, Constants.DONE);
                    final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
                    indexUpdater.addFinishedSessionId(sid);
                }
                if (!crawlerStatusList.get(i).equals(Constants.DONE)) {
                    finishedAll = false;
                }
            }
        }
        crawlerList.clear();
        crawlerStatusList.clear();

        // put crawling info
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();

        final long execTime = System.currentTimeMillis() - startTime;
        crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_CRAWLING_EXEC_TIME, Long.toString(execTime));
        if (logger.isInfoEnabled()) {
            logger.info("[EXEC TIME] crawling time: " + execTime + "ms");
        }

        indexUpdater.setFinishCrawling(true);
        try {
            indexUpdater.join();
        } catch (final InterruptedException e) {
            logger.warn("Interrupted index update.", e);
        }

        crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_EXEC_TIME, Long.toString(indexUpdater.getExecuteTime()));
        crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_SIZE, Long.toString(indexUpdater.getDocumentSize()));

        if (systemHelper.isForceStop()) {
            return;
        }

        for (final String sid : sessionIdList) {
            // remove config
            crawlingConfigHelper.remove(sid);
            deleteCrawlData(sid);
        }
    }

    protected void deleteCrawlData(final String sid) {
        final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
        final EsUrlQueueService urlQueueService = ComponentUtil.getComponent(EsUrlQueueService.class);
        final EsDataService dataService = ComponentUtil.getComponent(EsDataService.class);

        try {
            // clear url filter
            urlFilterService.delete(sid);
        } catch (final Exception e) {
            logger.warn("Failed to delete UrlFilter for " + sid, e);
        }

        try {
            // clear queue
            urlQueueService.clearCache();
            urlQueueService.delete(sid);
        } catch (final Exception e) {
            logger.warn("Failed to delete UrlQueue for " + sid, e);
        }

        try {
            // clear access results
            dataService.delete(sid);
        } catch (final Exception e) {
            logger.warn("Failed to delete AccessResult for " + sid, e);
        }
    }
}