package org.codelibs.riverweb;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.IntConsumer;
import java.util.stream.Stream;
import javax.annotation.Resource;
import org.apache.http.auth.AuthScheme;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.Credentials;
import org.apache.http.auth.NTCredentials;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.auth.DigestScheme;
import org.apache.http.impl.auth.NTLMScheme;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.Crawler;
import org.codelibs.fess.crawler.CrawlerContext;
import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.client.http.Authentication;
import org.codelibs.fess.crawler.client.http.HcHttpClient;
import org.codelibs.fess.crawler.client.http.RequestHeader;
import org.codelibs.fess.crawler.client.http.impl.AuthenticationImpl;
import org.codelibs.fess.crawler.client.http.ntlm.JcifsEngine;
import org.codelibs.fess.crawler.service.impl.EsDataService;
import org.codelibs.fess.crawler.service.impl.EsUrlFilterService;
import org.codelibs.fess.crawler.service.impl.EsUrlQueueService;
import org.codelibs.riverweb.config.RiverConfig;
import org.codelibs.riverweb.config.RiverConfigManager;
import org.codelibs.riverweb.interval.WebRiverIntervalController;
import org.codelibs.riverweb.util.ConfigProperties;
import org.codelibs.riverweb.util.ScriptUtils;
import org.codelibs.riverweb.util.SettingsUtils;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.index.engine.DocumentAlreadyExistsException;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.ParserProperties;
import org.lastaflute.di.core.SingletonLaContainer;
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * River Web: a stand-alone crawler that reads crawl configurations from
 * Elasticsearch, crawls the configured web sites and stores the scraped
 * content back into Elasticsearch. Runs either in single-config mode
 * ({@code --config-id}) or in queue-polling mode (default).
 */
public class RiverWeb {

    public static final Logger logger = LoggerFactory.getLogger(RiverWeb.class);

    // Supported HTTP authentication scheme names; config values are
    // upper-cased before being compared against these constants.
    private static final String NTLM_SCHEME = "NTLM";

    private static final String DIGEST_SCHEME = "DIGEST";

    private static final String BASIC_SCHEME = "BASIC";

    // Stop queue polling when no queue entry was processed for this many
    // milliseconds; a value <= 0 disables the timeout.
    @Option(name = "--queue-timeout")
    protected long queueTimeout = 300000; // 5min

    // Number of worker threads polling the crawl queue.
    @Option(name = "--threads")
    protected int numThreads = 1;

    // Pause between queue polls, in milliseconds.
    @Option(name = "--interval")
    protected long interval = 1000;

    // When set, crawl this single config document and exit (no queue polling).
    @Option(name = "--config-id")
    protected String configId;

    // Crawl session id; a random UUID is generated when not given.
    @Option(name = "--session-id")
    protected String sessionId;

    // When set, delete url-filter/queue/access-result data after each session.
    @Option(name = "--cleanup")
    protected boolean cleanup;

    // Comma-separated Elasticsearch transport addresses.
    @Option(name = "--es-hosts")
    protected String esHosts;

    // Elasticsearch cluster name override.
    @Option(name = "--cluster-name")
    protected String clusterName;

    // Quiet mode: route status messages to the logger instead of stdout.
    @Option(name = "--quiet")
    protected boolean quiet;

    // Optional raw query string used to select queue entries.
    @Option(name = "--queue-query")
    protected String queueQuery;

    // Injected by the DI container.
    @Resource
    protected org.codelibs.fess.crawler.client.EsClient esClient;

    @Resource
    protected ConfigProperties config;

    @Resource
    protected RiverConfigManager riverConfigManager;

    @Resource
    protected String defaultUserAgent;

    // Indirection over System.exit so tests can intercept the exit code.
    protected static IntConsumer exitMethod = System::exit;
    /**
     * Entry point: boots the DI container, binds command-line options onto the
     * {@link RiverWeb} component via args4j, runs it and exits with its
     * return code (1 on parse failure or unexpected error).
     *
     * @param args command line arguments (see the {@code @Option} fields)
     */
    public static void main(final String[] args) {
        // Destroy the container even on Ctrl-C / SIGTERM.
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                synchronized (this) {
                    SingletonLaContainerFactory.destroy();
                }
            }
        });
        SingletonLaContainerFactory.init();
        final RiverWeb riverWeb = SingletonLaContainer.getComponent(RiverWeb.class);
        final CmdLineParser parser = new CmdLineParser(riverWeb, ParserProperties.defaults().withUsageWidth(80));
        try {
            parser.parseArgument(args);
        } catch (final Exception e) {
            // Invalid options: show usage and exit non-zero.
            parser.printUsage(System.out);
            exitMethod.accept(1);
            return;
        }
        try {
            // execute() returns the process exit code.
            exitMethod.accept(riverWeb.execute());
        } catch (final Exception e) {
            // NOTE(review): e.getMessage() is passed as a printf-style format
            // string; a message containing '%' would break String.format —
            // consider print("%s", e.getMessage()).
            riverWeb.print(e.getMessage());
            exitMethod.accept(1);
            logger.error("Failed to process your request.", e);
        } finally {
            SingletonLaContainerFactory.destroy();
        }
    }
private void print(final String format, final Object... args) {
final String log = String.format(format, args);
if (quiet) {
logger.info(log);
} else {
System.out.println(log);
}
}
private int execute() {
// update esClient
esClient.setClusterName(config.getElasticsearchClusterName(clusterName));
esClient.setAddresses(config.getElasticsearchHosts(esHosts));
esClient.connect();
if (StringUtil.isNotBlank(configId)) {
return crawl(SingletonLaContainer.getComponent(Crawler.class), configId, sessionId);
} else {
final String configIndex = config.getConfigIndex();
final String queueType = config.getQueueType();
final ExecutorService threadPool = Executors.newFixedThreadPool(numThreads);
final Future<?>[] results = new Future[numThreads];
for (int i = 0; i < numThreads; i++) {
final int threadId = i + 1;
results[i] = threadPool.submit(() -> {
AtomicLong lastProcessed = new AtomicLong(System.currentTimeMillis());
while (SingletonLaContainerFactory.hasContainer()
&& (queueTimeout <= 0 || lastProcessed.get() + queueTimeout > System.currentTimeMillis())) {
logger.debug("Checking queue: {}/{}", configIndex, queueType);
try {
final SearchRequestBuilder builder = esClient.prepareSearch(configIndex).setTypes(queueType);
if (StringUtil.isNotBlank(queueQuery)) {
builder.setQuery(queueQuery);
} else {
builder.setQuery(
QueryBuilders.functionScoreQuery().add(ScoreFunctionBuilders.randomFunction(System.nanoTime())));
}
builder.setSize(config.getQueueParsingSize()).execute().actionGet().getHits().forEach(hit -> {
if (esClient.prepareDelete(hit.getIndex(), hit.getType(), hit.getId()).execute().actionGet().isFound()) {
Map<String, Object> source = hit.getSource();
final Object configId = source.get("config_id");
final String sessionId = (String) source.get("session_id");
if (configId instanceof String) {
print("Config %s is started with Session %s.", configId, sessionId);
try {
crawl(SingletonLaContainer.getComponent(Crawler.class), configId.toString(), sessionId);
} finally {
print("Config %s is finished.", configId);
lastProcessed.set(System.currentTimeMillis());
}
}
} else if (logger.isDebugEnabled()) {
logger.debug("No data in queue: " + hit.getIndex() + "/" + hit.getType() + "/" + hit.getId());
}
});
} catch (IndexNotFoundException e) {
logger.debug("Index is not found.", e);
} catch (Exception e) {
logger.warn("Failed to process a queue.", e);
}
try {
Thread.sleep(interval);
} catch (InterruptedException e) {
// ignore
}
}
print("Thread %d is finished.", threadId);
});
}
Stream.of(results).forEach(f -> {
try {
f.get();
} catch (Exception e) {
// ignore
}
});
threadPool.shutdown();
return 0;
}
}
private int crawl(Crawler crawler, String configId, String sessionId) {
// Load config data
final String configIndex = config.getConfigIndex();
final String configType = config.getConfigType();
final GetResponse response = esClient.prepareGet(configIndex, configType, configId).execute().actionGet();
if (!response.isExists()) {
print("Config ID %s is not found in %s/%s.", configId, configIndex, configType);
return 1;
}
final Map<String, Object> crawlSettings = response.getSource();
if (StringUtil.isBlank(sessionId)) {
sessionId = UUID.randomUUID().toString();
}
final Map<String, Object> vars = new HashMap<String, Object>();
vars.put("configId", configId);
vars.put("client", esClient);
vars.put("sessionId", sessionId);
final RiverConfig riverConfig = riverConfigManager.get(sessionId);
final Map<String, Object> scriptSettings = SettingsUtils.get(crawlSettings, "script");
try {
// invoke execute event script
ScriptUtils.execute(scriptSettings, "execute", v -> {
v.putAll(vars);
v.put("container", SingletonLaContainerFactory.getContainer());
v.put("settings", crawlSettings);
v.put("logger", RiverWeb.logger);
});
@SuppressWarnings("unchecked")
final List<Map<String, Object>> targetList = (List<Map<String, Object>>) crawlSettings.get("target");
if (targetList == null || targetList.isEmpty()) {
print("No targets for crawling.");
return 1;
}
crawler.setSessionId(sessionId);
// HttpClient Parameters
final Map<String, Object> paramMap = new HashMap<String, Object>();
final CrawlerClientFactory clientFactory = crawler.getClientFactory();
final Integer connectionTimeout = SettingsUtils.get(crawlSettings, "connection_timeout", config.getConnectionTimeout());
if (connectionTimeout != null) {
paramMap.put(HcHttpClient.CONNECTION_TIMEOUT_PROPERTY, connectionTimeout);
}
final Integer soTimeout = SettingsUtils.get(crawlSettings, "so_timeout", config.getSoTimeout());
if (soTimeout != null) {
paramMap.put(HcHttpClient.SO_TIMEOUT_PROPERTY, soTimeout);
}
// web driver
@SuppressWarnings("unchecked")
final List<String> wdUrlList = (List<String>) crawlSettings.get("web_driver_urls");
if (wdUrlList != null) {
CrawlerClient client = SingletonLaContainer.getComponent("webDriverClient");
wdUrlList.stream().forEach(regex -> clientFactory.addClient(regex, client, 0));
}
clientFactory.setInitParameterMap(paramMap);
// user agent
final String userAgent = SettingsUtils.get(crawlSettings, "user_agent", defaultUserAgent);
if (StringUtil.isNotBlank(userAgent)) {
paramMap.put(HcHttpClient.USER_AGENT_PROPERTY, userAgent);
}
// robots.txt parser
final Boolean robotsTxtEnabled = SettingsUtils.get(crawlSettings, "robots_txt", config.isRobotsTxtEnabled());
paramMap.put(HcHttpClient.ROBOTS_TXT_ENABLED_PROPERTY, robotsTxtEnabled);
// redirect automatically
final Boolean redirectsEnabled = SettingsUtils.get(crawlSettings, "auto_redirect", config.isRedirectsEnabled());
paramMap.put(HcHttpClient.REDIRECTS_ENABLED, redirectsEnabled);
// proxy
final Map<String, Object> proxyMap = SettingsUtils.get(crawlSettings, "proxy", null);
if (proxyMap != null) {
final Object host = proxyMap.get("host");
if (host != null) {
paramMap.put(HcHttpClient.PROXY_HOST_PROPERTY, host);
final Object portObj = proxyMap.get("port");
if (portObj instanceof Integer) {
paramMap.put(HcHttpClient.PROXY_PORT_PROPERTY, portObj);
} else {
paramMap.put(HcHttpClient.PROXY_PORT_PROPERTY, Integer.valueOf(8080));
}
}
}
// authentications
// "authentications":[{"scope":{"scheme":"","host":"","port":0,"realm":""},
// "credentials":{"username":"","password":""}},{...}]
final List<Map<String, Object>> authList = SettingsUtils.get(crawlSettings, "authentications", null);
if (authList != null && !authList.isEmpty()) {
final List<Authentication> basicAuthList = new ArrayList<Authentication>();
for (final Map<String, Object> authObj : authList) {
@SuppressWarnings("unchecked")
final Map<String, Object> scopeMap = (Map<String, Object>) authObj.get("scope");
String scheme = SettingsUtils.get(scopeMap, "scheme", StringUtil.EMPTY).toUpperCase(Locale.ENGLISH);
if (StringUtil.isBlank(scheme)) {
logger.warn("Invalid authentication: " + authObj);
continue;
}
@SuppressWarnings("unchecked")
final Map<String, Object> credentialMap = (Map<String, Object>) authObj.get("credentials");
final String username = SettingsUtils.get(credentialMap, "username", null);
if (StringUtil.isBlank(username)) {
logger.warn("Invalid authentication: " + authObj);
continue;
}
final String host = SettingsUtils.get(authObj, "host", AuthScope.ANY_HOST);
final int port = SettingsUtils.get(authObj, "port", AuthScope.ANY_PORT);
final String realm = SettingsUtils.get(authObj, "realm", AuthScope.ANY_REALM);
final String password = SettingsUtils.get(credentialMap, "password", null);
AuthScheme authScheme = null;
Credentials credentials = null;
if (BASIC_SCHEME.equalsIgnoreCase(scheme)) {
authScheme = new BasicScheme();
credentials = new UsernamePasswordCredentials(username, password);
} else if (DIGEST_SCHEME.equals(scheme)) {
authScheme = new DigestScheme();
credentials = new UsernamePasswordCredentials(username, password);
} else if (NTLM_SCHEME.equals(scheme)) {
authScheme = new NTLMScheme(new JcifsEngine());
scheme = AuthScope.ANY_SCHEME;
final String workstation = SettingsUtils.get(credentialMap, "workstation", null);
final String domain = SettingsUtils.get(credentialMap, "domain", null);
credentials = new NTCredentials(username, password, workstation == null ? StringUtil.EMPTY : workstation,
domain == null ? StringUtil.EMPTY : domain);
}
final AuthenticationImpl auth =
new AuthenticationImpl(new AuthScope(host, port, realm, scheme), credentials, authScheme);
basicAuthList.add(auth);
}
paramMap.put(HcHttpClient.BASIC_AUTHENTICATIONS_PROPERTY, basicAuthList.toArray(new Authentication[basicAuthList.size()]));
}
// request header
// "headers":[{"name":"","value":""},{}]
final List<Map<String, Object>> headerList = SettingsUtils.get(crawlSettings, "headers", null);
if (headerList != null && !headerList.isEmpty()) {
final List<RequestHeader> requestHeaderList = new ArrayList<RequestHeader>();
for (final Map<String, Object> headerObj : headerList) {
final String name = SettingsUtils.get(headerObj, "name", null);
final String value = SettingsUtils.get(headerObj, "value", null);
if (name != null && value != null) {
requestHeaderList.add(new RequestHeader(name, value));
}
}
paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, requestHeaderList.toArray(new RequestHeader[requestHeaderList.size()]));
}
// url
@SuppressWarnings("unchecked")
final List<String> urlList = (List<String>) crawlSettings.get("urls");
if (urlList == null || urlList.isEmpty()) {
print("No url for crawling.");
return 1;
}
for (final String url : urlList) {
try {
crawler.addUrl(url);
} catch (DocumentAlreadyExistsException e) {
logger.warn(url + " exists in " + sessionId);
}
}
// include regex
@SuppressWarnings("unchecked")
final List<String> includeFilterList = (List<String>) crawlSettings.get("include_urls");
if (includeFilterList != null) {
for (final String regex : includeFilterList) {
try {
crawler.addIncludeFilter(regex);
} catch (DocumentAlreadyExistsException e) {
logger.warn(regex + " exists in " + sessionId);
}
}
}
// exclude regex
@SuppressWarnings("unchecked")
final List<String> excludeFilterList = (List<String>) crawlSettings.get("exclude_urls");
if (excludeFilterList != null) {
for (final String regex : excludeFilterList) {
try {
crawler.addExcludeFilter(regex);
} catch (DocumentAlreadyExistsException e) {
logger.warn(regex + " exists in " + sessionId);
}
}
}
final CrawlerContext robotContext = crawler.getCrawlerContext();
// max depth
final int maxDepth = SettingsUtils.get(crawlSettings, "max_depth", -1);
robotContext.setMaxDepth(maxDepth);
// max access count
final int maxAccessCount = SettingsUtils.get(crawlSettings, "max_access_count", 100);
robotContext.setMaxAccessCount(maxAccessCount);
// num of thread
final int numOfThread = SettingsUtils.get(crawlSettings, "num_of_thread", 5);
robotContext.setNumOfThread(numOfThread);
// interval
final long interval = SettingsUtils.get(crawlSettings, "interval", 1000L);
final WebRiverIntervalController intervalController = (WebRiverIntervalController) crawler.getIntervalController();
intervalController.setDelayMillisForWaitingNewUrl(interval);
// river params
riverConfig.setIndex(SettingsUtils.get(crawlSettings, "index", "web"));
riverConfig.setType(SettingsUtils.get(crawlSettings, "type", configId));
riverConfig.setOverwrite(SettingsUtils.get(crawlSettings, "overwrite", Boolean.FALSE));
riverConfig.setIncremental(SettingsUtils.get(crawlSettings, "incremental", Boolean.FALSE));
riverConfig.setScriptSettings(scriptSettings);
// crawl config
for (final Map<String, Object> targetMap : targetList) {
@SuppressWarnings("unchecked")
final Map<String, Object> patternMap = (Map<String, Object>) targetMap.get("pattern");
@SuppressWarnings("unchecked")
final Map<String, Map<String, Object>> propMap = (Map<String, Map<String, Object>>) targetMap.get("properties");
if (patternMap != null && propMap != null) {
if (logger.isDebugEnabled()) {
logger.debug("patternMap: " + patternMap);
logger.debug("propMap: " + propMap);
}
@SuppressWarnings("unchecked")
final Map<String, Object> settingMap = (Map<String, Object>) targetMap.get("settings");
riverConfig.addScrapingRule(settingMap, patternMap, propMap);
} else {
logger.warn("Invalid pattern or target: patternMap: " + patternMap + ", propMap: " + propMap);
}
}
// run s2robot
crawler.execute();
crawler.stop();
} finally {
// invoke finish event script
ScriptUtils.execute(scriptSettings, "finish", v -> {
v.putAll(vars);
v.put("container", SingletonLaContainerFactory.getContainer());
v.put("settings", crawlSettings);
v.put("logger", RiverWeb.logger);
});
riverConfigManager.remove(sessionId);
if (cleanup) {
final EsUrlFilterService urlFilterService = SingletonLaContainer.getComponent(EsUrlFilterService.class);
final EsUrlQueueService urlQueueService = SingletonLaContainer.getComponent(EsUrlQueueService.class);
final EsDataService dataService = SingletonLaContainer.getComponent(EsDataService.class);
try {
// clear url filter
urlFilterService.delete(sessionId);
} catch (Exception e) {
logger.warn("Failed to delete UrlFilter for " + sessionId, e);
}
try {
// clear queue
urlQueueService.clearCache();
urlQueueService.delete(sessionId);
} catch (Exception e) {
logger.warn("Failed to delete UrlQueue for " + sessionId, e);
}
try {
// clear
dataService.delete(sessionId);
} catch (Exception e) {
logger.warn("Failed to delete AccessResult for " + sessionId, e);
}
}
}
return 0;
}
}