/*
* Copyright 2012-2017 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.ds.impl;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
import java.util.stream.Collectors;

import org.codelibs.core.lang.StringUtil;
import org.codelibs.elasticsearch.runner.net.Curl;
import org.codelibs.elasticsearch.runner.net.CurlResponse;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.client.http.HcHttpClient;
import org.codelibs.fess.crawler.client.http.RequestHeader;
import org.codelibs.fess.ds.IndexUpdateCallback;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfigWrapper;
import org.codelibs.fess.es.config.exentity.DataConfig;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.json.JsonXContent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * DataStore implementation that crawls GitBucket repositories (source files,
 * issues, and wiki pages) through the GitBucket API and its fess plugin
 * endpoints.
 *
 * @author Keiichi Watanabe
 */
public class GitBucketDataStoreImpl extends AbstractDataStoreImpl {
private static final Logger logger = LoggerFactory.getLogger(GitBucketDataStoreImpl.class);

private static final int MAX_DEPTH = 20;
protected static final String TOKEN_PARAM = "token";
protected static final String GITBUCKET_URL_PARAM = "url";
protected static final String PRIVATE_REPOSITORY_PARAM = "is_private";
protected static final String COLLABORATORS_PARAM = "collaborators";
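
/**
 * Crawls every repository returned by the GitBucket fess plugin: file
 * contents, issues/pull requests, and wiki pages are converted into
 * documents and passed to the given callback. A minimal parameter map
 * looks like the following (illustrative values, not from a real setup):
 *
 * <pre>
 * url=http://localhost:8080/gitbucket/
 * token=xxxxxxxxxxxxxxxxxxxx
 * </pre>
 */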
@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
final String rootURL = getRootURL(paramMap);
final String authToken = getAuthToken(paramMap);
final long readInterval = getReadInterval(paramMap);
// Both the GitBucket URL and the access token are required
if (rootURL.isEmpty() || authToken.isEmpty()) {
logger.warn("parameter \"" + TOKEN_PARAM + "\" and \"" + GITBUCKET_URL_PARAM + "\" are required");
return;
}
// Get List of Repositories
final List<Map<String, Object>> repositoryList = getRepositoryList(rootURL, authToken);
if (repositoryList.isEmpty()) {
logger.warn("Token is invalid or no Repository");
return;
}
// Get Labels
final Map<String, String> pluginInfo = getFessPluginInfo(rootURL, authToken);
final String sourceLabel = pluginInfo.get("source_label");
final String issueLabel = pluginInfo.get("issue_label");
final String wikiLabel = pluginInfo.get("wiki_label");
final CrawlingConfig crawlingConfig = new CrawlingConfigWrapper(dataConfig) {
@Override
public Map<String, Object> initializeClientFactory(final CrawlerClientFactory crawlerClientFactory) {
final Map<String, Object> paramMap = super.initializeClientFactory(crawlerClientFactory);
final List<RequestHeader> headerList = new ArrayList<>();
final RequestHeader[] headers = (RequestHeader[]) paramMap.get(HcHttpClient.REQUERT_HEADERS_PROPERTY);
if (headers != null) {
Collections.addAll(headerList, headers);
}
headerList.add(new RequestHeader("Authorization", "token " + authToken));
headerList.add(new RequestHeader("Accept", "application/vnd.github.v3.raw"));
paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, headerList.toArray(new RequestHeader[headerList.size()]));
return paramMap;
}
};
// Crawl each repository
for (final Map<String, Object> repository : repositoryList) {
try {
final String owner = (String) repository.get("owner");
final String name = (String) repository.get("name");
final String refStr = getGitRef(rootURL, authToken, owner, name, "master");
final int issueCount = (int) repository.get("issue_count");
final int pullCount = (int) repository.get("pull_count");
final List<String> roleList = createRoleList(owner, repository);
logger.info("Crawl " + owner + "/" + name);
// crawl and store file contents recursively
crawlFileContents(
rootURL,
authToken,
owner,
name,
refStr,
StringUtil.EMPTY,
0,
readInterval,
path -> {
storeFileContent(rootURL, authToken, sourceLabel, owner, name, refStr, roleList, path, crawlingConfig,
callback, paramMap, scriptMap, defaultDataMap);
if (readInterval > 0) {
sleep(readInterval);
}
});
logger.info("Crawl issues in " + owner + "/" + name);
// store issues
for (int issueId = 1; issueId <= issueCount + pullCount; issueId++) {
storeIssueById(rootURL, authToken, issueLabel, owner, name, issueId, roleList, crawlingConfig, callback,
paramMap, scriptMap, defaultDataMap);
if (readInterval > 0) {
sleep(readInterval);
}
}
logger.info("Crawl Wiki in " + owner + "/" + name);
// crawl Wiki
storeWikiContents(rootURL, authToken, wikiLabel, owner, name, roleList, crawlingConfig, callback, paramMap, scriptMap,
defaultDataMap, readInterval);
} catch (final Exception e) {
logger.warn("Failed to access to " + repository, e);
}
}
}
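
/**
 * Returns the GitBucket base URL from the parameter map, normalized to end
 * with a slash, or an empty string if the "url" parameter is missing.
 */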
protected String getRootURL(final Map<String, String> paramMap) {
if (paramMap.containsKey(GITBUCKET_URL_PARAM)) {
final String url = paramMap.get(GITBUCKET_URL_PARAM);
if (!url.endsWith("/")) {
return url + "/";
}
return url;
}
return StringUtil.EMPTY;
}
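
/**
 * Returns the GitBucket access token from the parameter map, or an empty
 * string if the "token" parameter is missing.
 */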
protected String getAuthToken(final Map<String, String> paramMap) {
if (paramMap.containsKey(TOKEN_PARAM)) {
return paramMap.get(TOKEN_PARAM);
}
return StringUtil.EMPTY;
}
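
/**
 * Fetches plugin metadata (version and the source/issue/wiki labels) from
 * the fess plugin endpoint api/v3/fess/info. Returns an empty map on failure.
 */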
protected Map<String, String> getFessPluginInfo(final String rootURL, final String authToken) {
final String url = rootURL + "api/v3/fess/info";
try (CurlResponse curlResponse = Curl.get(url).header("Authorization", "token " + authToken).execute()) {
@SuppressWarnings({ "rawtypes", "unchecked" })
final Map<String, String> map = (Map) curlResponse.getContentAsMap();
assert (map.containsKey("version"));
assert (map.containsKey("source_label") && map.containsKey("wiki_label") & map.containsKey("issue_label"));
return map;
} catch (final Exception e) {
logger.warn("Failed to access to " + rootURL, e);
return Collections.emptyMap();
}
}
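
/**
 * Fetches the list of source labels from the fess plugin endpoint
 * api/v3/fess/label. Returns an empty list on failure.
 */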
protected List<String> getSourceLabelList(final String rootURL, final String authToken) {
final String url = rootURL + "api/v3/fess/label";
try (CurlResponse curlResponse = Curl.get(url).header("Authorization", "token " + authToken).execute()) {
final Map<String, Object> map = curlResponse.getContentAsMap();
assert (map.containsKey("source_label"));
@SuppressWarnings("unchecked")
final List<String> sourceLabels = (List<String>) map.get("source_label");
return sourceLabels;
} catch (final Exception e) {
logger.warn("Failed to access to " + rootURL, e);
return Collections.emptyList();
}
}
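
/**
 * Fetches the repositories visible to the token from the fess plugin
 * endpoint api/v3/fess/repos. Returns an empty list on failure.
 */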
protected List<Map<String, Object>> getRepositoryList(final String rootURL, final String authToken) {
final String url = rootURL + "api/v3/fess/repos";
try (CurlResponse curlResponse = Curl.get(url).header("Authorization", "token " + authToken).execute()) {
final Map<String, Object> map = curlResponse.getContentAsMap();
assert (map.containsKey("repositories"));
@SuppressWarnings("unchecked")
final List<Map<String, Object>> repoList = (List<Map<String, Object>>) map.get("repositories");
return repoList;
} catch (final Exception e) {
logger.warn("Failed to access to " + rootURL, e);
return Collections.emptyList();
}
}
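
/**
 * Resolves a branch name to its commit SHA via the Git refs API, falling
 * back to the branch name itself if the lookup fails.
 */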
protected String getGitRef(final String rootURL, final String authToken, final String owner, final String name, final String branch) {
final String url = rootURL + "api/v3/repos/" + owner + "/" + name + "/git/refs/heads/" + branch;
try (CurlResponse curlResponse = Curl.get(url).header("Authorization", "token " + authToken).execute()) {
final Map<String, Object> map = curlResponse.getContentAsMap();
assert (map.containsKey("object"));
@SuppressWarnings("unchecked")
final Map<String, String> objmap = (Map<String, String>) map.get("object");
assert (objmap.containsKey("sha"));
return objmap.get("sha");
} catch (final Exception e) {
logger.warn("Failed to access to " + rootURL, e);
return branch;
}
}
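
/**
 * Builds the search role list for a repository: a public repository is
 * readable by the guest role, a private one only by its owner and
 * collaborators.
 */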
private List<String> createRoleList(final String owner, final Map<String, Object> repository) {
Boolean isPrivate = true;
if (repository.containsKey(PRIVATE_REPOSITORY_PARAM)) {
isPrivate = (Boolean) repository.get(PRIVATE_REPOSITORY_PARAM);
}
if (!isPrivate) {
return Collections.singletonList("Rguest");
}
@SuppressWarnings("unchecked")
final List<String> collaboratorList = (List<String>) repository.get(COLLABORATORS_PARAM);
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
collaboratorList.add(owner);
return collaboratorList.stream().map(user -> systemHelper.getSearchRoleByUser(user)).collect(Collectors.toList());
}
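
/**
 * Parses a JSON array from the given stream into a list, returning an
 * empty list on failure.
 */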
private List<Object> parseList(final InputStream is) { // TODO This function should be moved to CurlResponse
try {
return JsonXContent.jsonXContent.createParser(NamedXContentRegistry.EMPTY, is).list();
} catch (final Exception e) {
logger.warn("Failed to parse a list.", e);
return Collections.emptyList();
}
}
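
/**
 * Fetches a single file through the contents API and stores it as a
 * document carrying the view URL, the role list, and the source label.
 */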
private void storeFileContent(final String rootURL, final String authToken, final String sourceLabel, final String owner,
final String name, final String refStr, final List<String> roleList, final String path, final CrawlingConfig crawlingConfig,
final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap,
final Map<String, Object> defaultDataMap) {
final String apiUrl = rootURL + "api/v3/repos/" + owner + "/" + name + "/contents/" + path;
final String viewUrl = rootURL + owner + "/" + name + "/blob/" + refStr + "/" + path;
if (logger.isInfoEnabled()) {
logger.info("Get a content from " + apiUrl);
}
final Map<String, Object> dataMap = new HashMap<>();
dataMap.putAll(defaultDataMap);
dataMap.putAll(ComponentUtil.getDocumentHelper().processRequest(crawlingConfig, paramMap.get("crawlingInfoId"),
apiUrl + "?ref=" + refStr + "&large_file=true"));
dataMap.put("url", viewUrl);
dataMap.put("role", roleList);
dataMap.put("label", Collections.singletonList(sourceLabel));
// TODO scriptMap
callback.store(paramMap, dataMap);
}
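
/**
 * Fetches the description of one issue (or pull request) through the
 * issues API and stores it with the issue label. Comments are not indexed
 * yet; see the FIXME below.
 */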
private void storeIssueById(final String rootURL, final String authToken, final String issueLabel, final String owner,
final String name, final Integer issueId, final List<String> roleList, final CrawlingConfig crawlingConfig,
final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap,
final Map<String, Object> defaultDataMap) {
final String issueUrl = rootURL + "api/v3/repos/" + owner + "/" + name + "/issues/" + issueId.toString();
// final String commentsUrl = issueUrl + "/comments";
final String viewUrl = rootURL + owner + "/" + name + "/issues/" + issueId.toString();
if (logger.isInfoEnabled()) {
logger.info("Get a content from " + issueUrl);
}
final Map<String, Object> dataMap = new HashMap<>();
String contentStr = "";
dataMap.putAll(defaultDataMap);
// Get issue description
// FIXME: Use `ComponentUtil.getDocumentHelper().processRequest` instead of `Curl.get`
try (CurlResponse curlResponse = Curl.get(issueUrl).header("Authorization", "token " + authToken).execute()) {
final Map<String, Object> map = curlResponse.getContentAsMap();
dataMap.put("title", map.getOrDefault("title", ""));
contentStr = (String) map.getOrDefault("body", "");
} catch (final Exception e) {
logger.warn("Failed to access to " + issueUrl, e);
}
// FIXME: Get issue comments from `commentsUrl`
// How to parse JSON-style list?
dataMap.put("content", contentStr);
dataMap.put("url", viewUrl);
dataMap.put("role", roleList);
dataMap.put("label", Collections.singletonList(issueLabel));
// TODO scriptMap
callback.store(paramMap, dataMap);
}
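
/**
 * Fetches the page list from the fess plugin wiki endpoint and stores the
 * Markdown content of each page with the wiki label.
 */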
@SuppressWarnings("unchecked")
private void storeWikiContents(final String rootURL, final String authToken, final String wikiLabel, final String owner,
final String name, final List<String> roleList, final CrawlingConfig crawlingConfig, final IndexUpdateCallback callback,
final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap,
final long readInterval) {
final String wikiUrl = rootURL + "api/v3/fess/" + owner + "/" + name + "/wiki";
List<String> pageList = Collections.emptyList();
// Get list of pages
try (CurlResponse curlResponse = Curl.get(wikiUrl).header("Authorization", "token " + authToken).execute()) {
final Map<String, Object> map = curlResponse.getContentAsMap();
pageList = (List<String>) map.get("pages");
} catch (final Exception e) {
logger.warn("Failed to access to " + wikiUrl, e);
}
for (final String page : pageList) {
// FIXME: URL encoding (e.g. page name that contains spaces)
final String pageUrl = wikiUrl + "/contents/" + page + ".md";
final String viewUrl = rootURL + owner + "/" + name + "/wiki/" + page;
if (logger.isInfoEnabled()) {
logger.info("Get a content from " + pageUrl);
}
final Map<String, Object> dataMap = new HashMap<>();
dataMap.putAll(defaultDataMap);
dataMap.putAll(ComponentUtil.getDocumentHelper().processRequest(crawlingConfig, paramMap.get("crawlingInfoId"), pageUrl));
dataMap.put("url", viewUrl);
dataMap.put("role", roleList);
dataMap.put("label", Collections.singletonList(wikiLabel));
// TODO scriptMap
callback.store(paramMap, dataMap);
logger.info("Stored " + pageUrl);
if (readInterval > 0) {
sleep(readInterval);
}
}
}
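
/**
 * Recursively walks the repository tree through the contents API, up to
 * MAX_DEPTH levels deep, invoking the consumer with the path of every file
 * found.
 */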
protected void crawlFileContents(final String rootURL, final String authToken, final String owner, final String name,
final String refStr, final String path, final int depth, final long readInterval, final Consumer<String> consumer) {
if (MAX_DEPTH <= depth) {
return;
}
final String url = rootURL + "api/v3/repos/" + owner + "/" + name + "/contents/" + path + "?ref=" + refStr;
try (CurlResponse curlResponse = Curl.get(url).header("Authorization", "token " + authToken).execute()) {
final InputStream iStream = curlResponse.getContentAsStream();
final List<Object> fileList = parseList(iStream);
for (final Object element : fileList) {
@SuppressWarnings("unchecked")
final Map<String, String> file = (Map<String, String>) element;
final String newPath = path.isEmpty() ? file.get("name") : path + "/" + file.get("name");
switch (file.get("type")) {
case "file":
consumer.accept(newPath);
break;
case "dir":
if (readInterval > 0) {
sleep(readInterval);
}
crawlFileContents(rootURL, authToken, owner, name, refStr, newPath, depth + 1, readInterval, consumer);
break;
}
}
} catch (final Exception e) {
logger.warn("Failed to access to " + url, e);
}
}
}