/* * Copyright 2012-2017 CodeLibs Project and the Others. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, * either express or implied. See the License for the specific language * governing permissions and limitations under the License. */ package org.codelibs.fess.exec; import static org.codelibs.core.stream.StreamUtil.stream; import java.io.File; import java.io.IOException; import java.lang.management.ManagementFactory; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; import javax.annotation.Resource; import org.codelibs.core.CoreLibConstants; import org.codelibs.core.lang.StringUtil; import org.codelibs.core.misc.DynamicProperties; import org.codelibs.fess.Constants; import org.codelibs.fess.app.service.CrawlingInfoService; import org.codelibs.fess.app.service.PathMappingService; import org.codelibs.fess.crawler.client.EsClient; import org.codelibs.fess.es.client.FessEsClient; import org.codelibs.fess.exception.ContainerNotAvailableException; import org.codelibs.fess.helper.CrawlingInfoHelper; import org.codelibs.fess.helper.DataIndexHelper; import org.codelibs.fess.helper.DuplicateHostHelper; import org.codelibs.fess.helper.PathMappingHelper; import org.codelibs.fess.helper.WebFsIndexHelper; import org.codelibs.fess.mylasta.direction.FessConfig; import org.codelibs.fess.mylasta.mail.CrawlerPostcard; import org.codelibs.fess.util.ComponentUtil; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.Option; import org.lastaflute.core.mail.Postbox; import org.lastaflute.di.core.external.GenericExternalContext; import org.lastaflute.di.core.external.GenericExternalContextComponentDefRegister; import org.lastaflute.di.core.factory.SingletonLaContainerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class Crawler { private static final Logger logger = LoggerFactory.getLogger(Crawler.class); private static final String WEB_FS_CRAWLING_PROCESS = "WebFsCrawler"; private static final String DATA_CRAWLING_PROCESS = "DataStoreCrawler"; private static AtomicBoolean running = new AtomicBoolean(false); @Resource protected FessEsClient fessEsClient; @Resource protected WebFsIndexHelper webFsIndexHelper; @Resource protected DataIndexHelper dataIndexHelper; @Resource protected PathMappingService pathMappingService; @Resource protected CrawlingInfoService crawlingInfoService; public static class Options { @Option(name = "-s", aliases = "--sessionId", metaVar = "sessionId", usage = "Session ID") public String sessionId; @Option(name = "-n", aliases = "--name", metaVar = "name", usage = "Name") public String name; @Option(name = "-w", aliases = "--webConfigIds", metaVar = "webConfigIds", usage = "Web Config IDs") public String webConfigIds; @Option(name = "-f", aliases = "--fileConfigIds", metaVar = "fileConfigIds", usage = "File Config IDs") public String fileConfigIds; @Option(name = "-d", aliases = "--dataConfigIds", metaVar = "dataConfigIds", usage = "Data Config IDs") public String dataConfigIds; @Option(name = "-p", aliases = "--properties", metaVar = "properties", usage = "Properties File") public String propertiesPath; @Option(name = "-e", aliases = "--expires", metaVar = "expires", usage = "Expires for documents") public String expires; protected Options() { // noghing } protected List<String> getWebConfigIdList() { if (StringUtil.isNotBlank(webConfigIds)) { final String[] values = webConfigIds.split(","); return createConfigIdList(values); } return null; } protected List<String> getFileConfigIdList() { if (StringUtil.isNotBlank(fileConfigIds)) { final String[] values = fileConfigIds.split(","); return createConfigIdList(values); } return null; } protected List<String> getDataConfigIdList() { if (StringUtil.isNotBlank(dataConfigIds)) { final String[] values = dataConfigIds.split(","); return createConfigIdList(values); } return null; } private static List<String> createConfigIdList(final String[] values) { final List<String> idList = new ArrayList<>(); for (final String value : values) { idList.add(value); } return idList; } @Override public String toString() { return "Options [sessionId=" + sessionId + ", name=" + name + ", webConfigIds=" + webConfigIds + ", fileConfigIds=" + fileConfigIds + ", dataConfigIds=" + dataConfigIds + ", propertiesPath=" + propertiesPath + ", expires=" + expires + "]"; } } public static void main(final String[] args) { final Options options = new Options(); final CmdLineParser parser = new CmdLineParser(options); try { parser.parseArgument(args); } catch (final CmdLineException e) { System.err.println(e.getMessage()); System.err.println("java " + Crawler.class.getCanonicalName() + " [options...] arguments..."); parser.printUsage(System.err); return; } if (logger.isDebugEnabled()) { try { ManagementFactory.getRuntimeMXBean().getInputArguments().stream().forEach(s -> logger.debug("Parameter: " + s)); System.getProperties().entrySet().stream().forEach(e -> logger.debug("Property: " + e.getKey() + "=" + e.getValue())); System.getenv().entrySet().forEach(e -> logger.debug("Env: " + e.getKey() + "=" + e.getValue())); logger.debug("Option: " + options); } catch (final Exception e) { // ignore } } final String transportAddresses = System.getProperty(Constants.FESS_ES_TRANSPORT_ADDRESSES); if (StringUtil.isNotBlank(transportAddresses)) { System.setProperty(EsClient.TRANSPORT_ADDRESSES, transportAddresses); } final String clusterName = System.getProperty(Constants.FESS_ES_CLUSTER_NAME); if (StringUtil.isNotBlank(clusterName)) { System.setProperty(EsClient.CLUSTER_NAME, clusterName); } int exitCode; try { running.set(true); SingletonLaContainerFactory.setConfigPath("app.xml"); SingletonLaContainerFactory.setExternalContext(new GenericExternalContext()); SingletonLaContainerFactory.setExternalContextComponentDefRegister(new GenericExternalContextComponentDefRegister()); SingletonLaContainerFactory.init(); final Thread shutdownCallback = new Thread("ShutdownHook") { @Override public void run() { destroyContainer(); } }; Runtime.getRuntime().addShutdownHook(shutdownCallback); exitCode = process(options); } catch (final ContainerNotAvailableException e) { if (logger.isDebugEnabled()) { logger.debug("Crawler is stopped.", e); } else if (logger.isInfoEnabled()) { logger.info("Crawler is stopped."); } exitCode = Constants.EXIT_FAIL; } catch (final Throwable t) { logger.error("Crawler does not work correctly.", t); exitCode = Constants.EXIT_FAIL; } finally { destroyContainer(); } if (exitCode != Constants.EXIT_OK) { System.exit(exitCode); } } private static void destroyContainer() { if (running.getAndSet(false)) { if (logger.isDebugEnabled()) { logger.debug("Destroying LaContainer..."); } SingletonLaContainerFactory.destroy(); logger.info("Destroyed LaContainer."); } } private static int process(final Options options) { final Crawler crawler = ComponentUtil.getComponent(Crawler.class); if (StringUtil.isBlank(options.sessionId)) { // use a default session id final SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss"); options.sessionId = sdf.format(new Date()); } final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper(); final DynamicProperties systemProperties = ComponentUtil.getSystemProperties(); if (StringUtil.isNotBlank(options.propertiesPath)) { systemProperties.reload(options.propertiesPath); } else { try { final File propFile = File.createTempFile("crawler_", ".properties"); if (propFile.delete() && logger.isDebugEnabled()) { logger.debug("Deleted a temp file: " + propFile.getAbsolutePath()); } systemProperties.reload(propFile.getAbsolutePath()); propFile.deleteOnExit(); } catch (final IOException e) { logger.warn("Failed to create system properties file.", e); } } try { crawlingInfoHelper.store(options.sessionId, true); final String dayForCleanupStr; int dayForCleanup = -1; if (StringUtil.isNotBlank(options.expires)) { dayForCleanupStr = options.expires; try { dayForCleanup = Integer.parseInt(dayForCleanupStr); } catch (final NumberFormatException e) {} } else { dayForCleanup = ComponentUtil.getFessConfig().getDayForCleanup(); } crawlingInfoHelper.updateParams(options.sessionId, options.name, dayForCleanup); } catch (final Exception e) { logger.warn("Failed to store crawling information.", e); } try { return crawler.doCrawl(options); } finally { try { crawlingInfoHelper.store(options.sessionId, false); } catch (final Exception e) { logger.warn("Failed to store crawling information.", e); } final Map<String, String> infoMap = crawlingInfoHelper.getInfoMap(options.sessionId); final StringBuilder buf = new StringBuilder(500); for (final Map.Entry<String, String> entry : infoMap.entrySet()) { if (buf.length() != 0) { buf.append(','); } buf.append(entry.getKey()).append('=').append(entry.getValue()); } logger.info("[CRAWL INFO] " + buf.toString()); // notification try { crawler.sendMail(infoMap); } catch (final Exception e) { logger.warn("Failed to send a mail.", e); } } } protected void sendMail(final Map<String, String> infoMap) { final FessConfig fessConfig = ComponentUtil.getFessConfig(); final String toStrs = fessConfig.getNotificationTo(); if (StringUtil.isNotBlank(toStrs)) { final String[] toAddresses = toStrs.split(","); final Map<String, String> dataMap = new HashMap<>(); for (final Map.Entry<String, String> entry : infoMap.entrySet()) { dataMap.put(StringUtil.decapitalize(entry.getKey()), entry.getValue()); } dataMap.put("hostname", ComponentUtil.getSystemHelper().getHostname()); logger.debug("\ninfoMap: {}\ndataMap: {}", infoMap, dataMap); final Postbox postbox = ComponentUtil.getComponent(Postbox.class); CrawlerPostcard.droppedInto(postbox, postcard -> { postcard.setFrom(fessConfig.getMailFromAddress(), fessConfig.getMailFromName()); postcard.addReplyTo(fessConfig.getMailReturnPath()); stream(toAddresses).of(stream -> stream.forEach(address -> { postcard.addTo(address); })); postcard.setCrawlerEndTime(getValueFromMap(dataMap, "crawlerEndTime", StringUtil.EMPTY)); postcard.setCrawlerExecTime(getValueFromMap(dataMap, "crawlerExecTime", "0")); postcard.setCrawlerStartTime(getValueFromMap(dataMap, "crawlerStartTime", StringUtil.EMPTY)); postcard.setDataCrawlEndTime(getValueFromMap(dataMap, "dataCrawlEndTime", StringUtil.EMPTY)); postcard.setDataCrawlExecTime(getValueFromMap(dataMap, "dataCrawlExecTime", "0")); postcard.setDataCrawlStartTime(getValueFromMap(dataMap, "dataCrawlStartTime", StringUtil.EMPTY)); postcard.setDataIndexSize(getValueFromMap(dataMap, "dataIndexSize", "0")); postcard.setDataIndexExecTime(getValueFromMap(dataMap, "dataIndexExecTime", "0")); postcard.setHostname(getValueFromMap(dataMap, "hostname", StringUtil.EMPTY)); postcard.setWebFsCrawlEndTime(getValueFromMap(dataMap, "webFsCrawlEndTime", StringUtil.EMPTY)); postcard.setWebFsCrawlExecTime(getValueFromMap(dataMap, "webFsCrawlExecTime", "0")); postcard.setWebFsCrawlStartTime(getValueFromMap(dataMap, "webFsCrawlStartTime", StringUtil.EMPTY)); postcard.setWebFsIndexExecTime(getValueFromMap(dataMap, "webFsIndexExecTime", "0")); postcard.setWebFsIndexSize(getValueFromMap(dataMap, "webFsIndexSize", "0")); if (Constants.TRUE.equalsIgnoreCase(infoMap.get(Constants.CRAWLER_STATUS))) { postcard.setStatus(Constants.OK); } else { postcard.setStatus(Constants.FAIL); } }); } } private String getValueFromMap(final Map<String, String> dataMap, final String key, final String defaultValue) { final String value = dataMap.get(key); if (StringUtil.isBlank(value)) { return defaultValue; } return value; } public int doCrawl(final Options options) { if (logger.isInfoEnabled()) { logger.info("Starting Crawler.."); } final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper(); final long totalTime = System.currentTimeMillis(); final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper(); boolean completed = false; try { writeTimeToSessionInfo(crawlingInfoHelper, Constants.CRAWLER_START_TIME); // setup path mapping final List<String> ptList = new ArrayList<>(); ptList.add(Constants.PROCESS_TYPE_CRAWLING); ptList.add(Constants.PROCESS_TYPE_BOTH); pathMappingHelper.setPathMappingList(options.sessionId, pathMappingService.getPathMappingList(ptList)); // duplicate host try { final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper(); duplicateHostHelper.init(); } catch (final Exception e) { logger.warn("Could not initialize duplicateHostHelper.", e); } // delete expired sessions crawlingInfoService.deleteSessionIdsBefore(options.sessionId, options.name, ComponentUtil.getSystemHelper() .getCurrentTimeAsLong()); final List<String> webConfigIdList = options.getWebConfigIdList(); final List<String> fileConfigIdList = options.getFileConfigIdList(); final List<String> dataConfigIdList = options.getDataConfigIdList(); final boolean runAll = webConfigIdList == null && fileConfigIdList == null && dataConfigIdList == null; Thread webFsCrawlerThread = null; Thread dataCrawlerThread = null; if (runAll || webConfigIdList != null || fileConfigIdList != null) { webFsCrawlerThread = new Thread((Runnable) () -> { // crawl web writeTimeToSessionInfo(crawlingInfoHelper, Constants.WEB_FS_CRAWLER_START_TIME); webFsIndexHelper.crawl(options.sessionId, webConfigIdList, fileConfigIdList); writeTimeToSessionInfo(crawlingInfoHelper, Constants.WEB_FS_CRAWLER_END_TIME); }, WEB_FS_CRAWLING_PROCESS); webFsCrawlerThread.start(); } if (runAll || dataConfigIdList != null) { dataCrawlerThread = new Thread((Runnable) () -> { // crawl data system writeTimeToSessionInfo(crawlingInfoHelper, Constants.DATA_CRAWLER_START_TIME); dataIndexHelper.crawl(options.sessionId, dataConfigIdList); writeTimeToSessionInfo(crawlingInfoHelper, Constants.DATA_CRAWLER_END_TIME); }, DATA_CRAWLING_PROCESS); dataCrawlerThread.start(); } joinCrawlerThread(webFsCrawlerThread); joinCrawlerThread(dataCrawlerThread); if (logger.isInfoEnabled()) { logger.info("Finished Crawler"); } completed = true; return Constants.EXIT_OK; } catch (final Throwable t) { logger.warn("An exception occurs on the crawl task.", t); return Constants.EXIT_FAIL; } finally { pathMappingHelper.removePathMappingList(options.sessionId); crawlingInfoHelper.putToInfoMap(Constants.CRAWLER_STATUS, completed ? Constants.T.toString() : Constants.F.toString()); writeTimeToSessionInfo(crawlingInfoHelper, Constants.CRAWLER_END_TIME); crawlingInfoHelper.putToInfoMap(Constants.CRAWLER_EXEC_TIME, Long.toString(System.currentTimeMillis() - totalTime)); } } protected void writeTimeToSessionInfo(final CrawlingInfoHelper crawlingInfoHelper, final String key) { if (crawlingInfoHelper != null) { final SimpleDateFormat dateFormat = new SimpleDateFormat(CoreLibConstants.DATE_FORMAT_ISO_8601_EXTEND); crawlingInfoHelper.putToInfoMap(key, dateFormat.format(new Date())); } } private void joinCrawlerThread(final Thread crawlerThread) { if (crawlerThread != null) { try { crawlerThread.join(); } catch (final Exception e) { logger.info("Interrupted a crawling process: " + crawlerThread.getName()); } } } }