/*
* Copyright 2012-2017 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.ds.impl;
import static org.codelibs.core.stream.StreamUtil.stream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.codelibs.core.io.SerializeUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.processor.ResponseProcessor;
import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor;
import org.codelibs.fess.crawler.rule.Rule;
import org.codelibs.fess.crawler.rule.RuleManager;
import org.codelibs.fess.crawler.transformer.Transformer;
import org.codelibs.fess.ds.IndexUpdateCallback;
import org.codelibs.fess.es.client.FessEsClient;
import org.codelibs.fess.exception.DataStoreCrawlingException;
import org.codelibs.fess.helper.IndexingHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.lastaflute.di.core.SingletonLaContainer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link IndexUpdateCallback} decorator that treats every stored record as a
 * file-list event and processes it asynchronously on an internal thread pool.
 * <p>
 * A record whose event type is "create" or "modify" is fetched through a
 * {@link CrawlerClient}, transformed, and forwarded to the wrapped callback.
 * A record whose event type is "delete" has its URL buffered in
 * {@link #deleteUrlList}; the buffer is flushed once it reaches
 * {@link #maxDeleteDocumentCacheSize} entries and again on {@link #commit()}.
 * The event field name and the event values can be overridden through the
 * parameter map (see {@link #store(Map, Map)}).
 * <p>
 * Access to the wrapped callback and to {@link #deleteUrlList} is serialized
 * on the monitor of {@link #indexUpdateCallback}.
 */
public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
    private static final Logger logger = LoggerFactory.getLogger(FileListIndexUpdateCallbackImpl.class);

    /** Wrapped callback that receives the documents to index; also used as the lock object. */
    protected IndexUpdateCallback indexUpdateCallback;

    /** Factory used to look up the crawler client for a document URL. */
    protected CrawlerClientFactory crawlerClientFactory;

    /** URLs waiting to be deleted; only touched while holding the monitor of {@link #indexUpdateCallback}. */
    protected List<String> deleteUrlList = new ArrayList<>(100);

    /** Number of buffered delete URLs that triggers a flush. */
    protected int maxDeleteDocumentCacheSize = 100;

    /** Maximum number of requests issued for one document while following redirects. */
    protected int maxRedirectCount = 10;

    private final ExecutorService executor;

    /** Time in seconds that {@link #commit()} waits for queued tasks to finish. */
    private int executorTerminationTimeout = 300;

    /**
     * Creates the callback and its worker pool.
     * <p>
     * Note that {@link #newFixedThreadPool(int)} is invoked from this
     * constructor, so an overriding implementation runs before the subclass
     * constructor body has executed.
     *
     * @param indexUpdateCallback callback that finally stores the documents
     * @param crawlerClientFactory factory providing clients for document URLs
     * @param nThreads requested number of worker threads; values below 1 are treated as 1
     */
    protected FileListIndexUpdateCallbackImpl(final IndexUpdateCallback indexUpdateCallback,
            final CrawlerClientFactory crawlerClientFactory, final int nThreads) {
        this.indexUpdateCallback = indexUpdateCallback;
        this.crawlerClientFactory = crawlerClientFactory;
        executor = newFixedThreadPool(Math.max(1, nThreads));
    }

    /**
     * Builds the executor used by {@link #store(Map, Map)}.
     * <p>
     * The pool has a fixed size and a work queue bounded to {@code nThreads}
     * entries. When the queue is full, {@link ThreadPoolExecutor.CallerRunsPolicy}
     * makes the submitting thread run the task itself, which throttles the producer.
     *
     * @param nThreads number of worker threads (and queue capacity)
     * @return a new executor
     */
    protected ExecutorService newFixedThreadPool(final int nThreads) {
        if (logger.isDebugEnabled()) {
            logger.debug("Executor Thread Pool: " + nThreads);
        }
        return new ThreadPoolExecutor(nThreads, nThreads, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>(nThreads),
                new ThreadPoolExecutor.CallerRunsPolicy());
    }

    /**
     * Schedules one file-list event for processing.
     * <p>
     * The event type is removed from {@code dataMap} using the field name
     * configured by {@code field.event_type} (default {@code event_type}) and
     * compared with the values configured by {@code event.create},
     * {@code event.modify} and {@code event.delete} (defaults {@code create},
     * {@code modify}, {@code delete}). Unknown events are logged and dropped.
     * <p>
     * NOTE(review): a failure thrown while the task runs on a pool thread is
     * not propagated to the caller of this method; it only reaches the caller
     * when the task is executed in the calling thread by the rejection policy.
     *
     * @param paramMap data store parameters
     * @param dataMap event record; it is modified by the processing
     */
    @Override
    public void store(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
        executor.execute(() -> {
            final Object eventType = dataMap.remove(getParamValue(paramMap, "field.event_type", "event_type"));
            if (getParamValue(paramMap, "event.create", "create").equals(eventType)
                    || getParamValue(paramMap, "event.modify", "modify").equals(eventType)) {
                // updated file
                addDocument(paramMap, dataMap);
            } else if (getParamValue(paramMap, "event.delete", "delete").equals(eventType)) {
                // deleted file
                deleteDocument(paramMap, dataMap);
            } else {
                logger.warn("unknown event: " + eventType + ", data: " + dataMap);
            }
        });
    }

    /**
     * Looks up a parameter, falling back to a default.
     *
     * @param paramMap data store parameters
     * @param key parameter name
     * @param defaultValue value used when the key is absent or mapped to {@code null}
     * @return the configured value, or {@code defaultValue}
     */
    protected String getParamValue(final Map<String, String> paramMap, final String key, final String defaultValue) {
        // A key that is present but mapped to null must also yield the default;
        // otherwise callers that invoke equals() on the result would fail.
        final String value = paramMap.get(key);
        return value != null ? value : defaultValue;
    }

    /**
     * Fetches the document referenced by the record's URL field and stores it
     * through the wrapped callback, following redirects up to
     * {@link #maxRedirectCount} requests. Each redirect target replaces the
     * URL field in {@code dataMap}.
     * <p>
     * The whole operation runs while holding the monitor of
     * {@link #indexUpdateCallback}.
     *
     * @param paramMap data store parameters
     * @param dataMap event record containing the URL field
     * @throws DataStoreCrawlingException if a request fails
     */
    protected void addDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final String urlField = fessConfig.getIndexFieldUrl();
        synchronized (indexUpdateCallback) {
            // required check
            final Object urlValue = dataMap.get(urlField);
            if (urlValue == null) {
                logger.warn("Could not add a doc. Invalid data: " + dataMap);
                return;
            }
            final String url = urlValue.toString();
            final CrawlerClient client = crawlerClientFactory.getClient(url);
            if (client == null) {
                logger.warn("CrawlerClient is null. Data: " + dataMap);
                return;
            }
            String processingUrl = url;
            for (int i = 0; i < maxRedirectCount; i++) {
                processingUrl = processRequest(paramMap, dataMap, processingUrl, client);
                if (processingUrl == null) {
                    // final response handled (stored or skipped with a warning)
                    return;
                }
                dataMap.put(urlField, processingUrl);
            }
            // Every permitted request ended in another redirect: nothing was stored.
            logger.warn("Could not add a doc. Gave up after " + maxRedirectCount + " request(s) without a final response. Data: "
                    + dataMap);
        }
    }

    /**
     * Issues one GET request for {@code url} and, unless the response is a
     * redirect, transforms the response and passes the merged record to the
     * wrapped callback.
     *
     * @param paramMap data store parameters; {@code ignore.field.names} may hold a
     *        comma-separated list of fields to strip before storing
     * @param dataMap event record; transformed fields are merged into it
     * @param url URL to request
     * @param client client used to execute the request
     * @return the redirect location when the response is a redirect, otherwise {@code null}
     * @throws DataStoreCrawlingException if the request or the processing fails
     */
    protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url,
            final CrawlerClient client) {
        final long startTime = System.currentTimeMillis();
        try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
            if (responseData.getRedirectLocation() != null) {
                return responseData.getRedirectLocation();
            }
            responseData.setExecutionTime(System.currentTimeMillis() - startTime);
            if (dataMap.containsKey(Constants.SESSION_ID)) {
                responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
            } else {
                responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
            }

            final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
            final Rule rule = ruleManager.getRule(responseData);
            if (rule == null) {
                logger.warn("No url rule. Data: " + dataMap);
                return null;
            }

            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (!(responseProcessor instanceof DefaultResponseProcessor)) {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor
                        + ", Data: " + dataMap);
                return null;
            }

            final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
            final ResultData resultData = transformer.transform(responseData);
            mergeTransformedData(dataMap, resultData.getData());

            // remove
            final String[] ignoreFields = resolveIgnoreFields(paramMap);
            stream(ignoreFields).of(stream -> stream.map(s -> s.trim()).forEach(s -> dataMap.remove(s)));
            indexUpdateCallback.store(paramMap, dataMap);
            return null;
        } catch (final ChildUrlsException e) {
            throw new DataStoreCrawlingException(url, "Redirected to "
                    + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
        } catch (final Exception e) {
            throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
        }
    }

    /** Deserializes the transformer output, if any, and merges it into {@code dataMap}. */
    private void mergeTransformedData(final Map<String, Object> dataMap, final byte[] data) {
        if (data == null) {
            return;
        }
        try {
            @SuppressWarnings("unchecked")
            final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
            dataMap.putAll(responseDataMap);
        } catch (final Exception e) {
            throw new CrawlerSystemException("Could not create an instance from bytes.", e);
        }
    }

    /** Returns the field names to strip before storing: the configured list, or the built-in defaults. */
    private String[] resolveIgnoreFields(final Map<String, String> paramMap) {
        // Use get() rather than containsKey() so a key mapped to null falls back to the defaults.
        final String ignoreFieldNames = paramMap.get("ignore.field.names");
        if (ignoreFieldNames != null) {
            return ignoreFieldNames.split(",");
        }
        return new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
    }

    /**
     * Buffers the record's URL for deletion and flushes the buffer when it has
     * reached {@link #maxDeleteDocumentCacheSize} entries.
     *
     * @param paramMap data store parameters (not used by this implementation)
     * @param dataMap event record containing the URL field
     * @return {@code true} if the URL was buffered, {@code false} if the record has no URL
     */
    protected boolean deleteDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
        if (logger.isDebugEnabled()) {
            logger.debug("Deleting " + dataMap);
        }
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        // required check
        final Object urlValue = dataMap.get(fessConfig.getIndexFieldUrl());
        if (urlValue == null) {
            logger.warn("Could not delete a doc. Invalid data: " + dataMap);
            return false;
        }
        synchronized (indexUpdateCallback) {
            deleteUrlList.add(urlValue.toString());
            if (deleteUrlList.size() >= maxDeleteDocumentCacheSize) {
                deleteDocuments();
            }
        }
        return true;
    }

    /**
     * Stops accepting events, waits up to the configured timeout (seconds) for
     * queued tasks, flushes the remaining buffered deletions and commits the
     * wrapped callback.
     * <p>
     * If the calling thread is interrupted while waiting, the flush and the
     * commit are still performed and the interrupt status is restored before
     * this method returns.
     */
    @Override
    public void commit() {
        boolean interrupted = false;
        try {
            if (logger.isDebugEnabled()) {
                logger.debug("Shutting down thread executor.");
            }
            executor.shutdown();
            if (!executor.awaitTermination(executorTerminationTimeout, TimeUnit.SECONDS)) {
                logger.warn("Executor did not terminate within " + executorTerminationTimeout
                        + " seconds. Pending tasks are discarded.");
            }
        } catch (final InterruptedException e) {
            // Remember the interrupt instead of swallowing it; it is restored
            // after the final flush so that the flush itself is not disturbed.
            interrupted = true;
            logger.warn("Interrupted while waiting for executor termination.", e);
        } finally {
            executor.shutdownNow();
        }

        try {
            // Tasks may still be running after a timeout, so the buffer must be
            // accessed under the same monitor deleteDocument() uses.
            synchronized (indexUpdateCallback) {
                if (!deleteUrlList.isEmpty()) {
                    deleteDocuments();
                }
            }
            indexUpdateCallback.commit();
        } finally {
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Deletes every buffered URL from the index and clears the buffer.
     * Callers are expected to hold the monitor of {@link #indexUpdateCallback}.
     */
    protected void deleteDocuments() {
        final FessEsClient fessEsClient = ComponentUtil.getFessEsClient();
        final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
        for (final String url : deleteUrlList) {
            indexingHelper.deleteDocumentByUrl(fessEsClient, url);
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Deleted " + deleteUrlList);
        }
        deleteUrlList.clear();
    }

    /**
     * @return the document size reported by the wrapped callback
     */
    @Override
    public long getDocumentSize() {
        return indexUpdateCallback.getDocumentSize();
    }

    /**
     * @return the execute time reported by the wrapped callback
     */
    @Override
    public long getExecuteTime() {
        return indexUpdateCallback.getExecuteTime();
    }

    /**
     * @param maxDeleteDocumentCacheSize number of buffered delete URLs that triggers a flush
     */
    public void setMaxDeleteDocumentCacheSize(final int maxDeleteDocumentCacheSize) {
        this.maxDeleteDocumentCacheSize = maxDeleteDocumentCacheSize;
    }

    /**
     * @param maxRedirectCount maximum number of requests issued per document while following redirects
     */
    public void setMaxRedirectCount(final int maxRedirectCount) {
        this.maxRedirectCount = maxRedirectCount;
    }

    /**
     * @param executorTerminationTimeout seconds {@link #commit()} waits for queued tasks to finish
     */
    public void setExecutorTerminationTimeout(final int executorTerminationTimeout) {
        this.executorTerminationTimeout = executorTerminationTimeout;
    }
}