package org.codelibs.riverweb.transformer; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import javax.annotation.PostConstruct; import org.codelibs.core.beans.BeanDesc; import org.codelibs.core.beans.factory.BeanDescFactory; import org.codelibs.core.beans.util.BeanUtil; import org.codelibs.core.io.CopyUtil; import org.codelibs.core.io.FileUtil; import org.codelibs.core.lang.MethodUtil; import org.codelibs.core.lang.StringUtil; import org.codelibs.core.misc.Base64Util; import org.codelibs.fess.crawler.Constants; import org.codelibs.fess.crawler.builder.RequestDataBuilder; import org.codelibs.fess.crawler.client.EsClient; import org.codelibs.fess.crawler.entity.AccessResultData; import org.codelibs.fess.crawler.entity.RequestData; import org.codelibs.fess.crawler.entity.ResponseData; import org.codelibs.fess.crawler.entity.ResultData; import org.codelibs.fess.crawler.exception.CrawlingAccessException; import org.codelibs.fess.crawler.helper.EncodingHelper; import org.codelibs.fess.crawler.transformer.impl.HtmlTransformer; import org.codelibs.riverweb.WebRiverConstants; import org.codelibs.riverweb.app.service.ScriptService; import org.codelibs.riverweb.config.RiverConfig; import org.codelibs.riverweb.config.RiverConfigManager; import org.codelibs.riverweb.entity.ScrapingRule; import org.codelibs.riverweb.util.SettingsUtils; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.script.ScriptService.ScriptType; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.lastaflute.di.core.SingletonLaContainer; import org.lastaflute.di.core.factory.SingletonLaContainerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class ScrapingTransformer extends HtmlTransformer { private static final long DEFAULT_MAX_ATTACHMENT_SIZE = 1000 * 1000; // 1M private static final String VALUE_QUERY_TYPE = "value"; private static final String TYPE_QUERY_TYPE = "type"; private static final String SCRIPT_QUERY_TYPE = "script"; private static final String ARGS_QUERY_TYPE = "args"; private static final String IS_ARRAY_PROP_NAME = "is_array"; private static final String IS_DISTINCT_PROP_NAME = "is_distinct"; private static final String IS_CHILD_URL_PROP_NAME = "is_child"; private static final String TRIM_SPACES_PROP_NAME = "trim_spaces"; private static final String TIMESTAMP_FIELD = "@timestamp"; private static final String POSITION_FIELD = "position"; private static final String ARRAY_PROPERTY_PREFIX = "[]"; private static final Logger logger = LoggerFactory.getLogger(ScrapingTransformer.class); private static final String[] queryTypes = new String[] { "className", "data", "html", "id", "ownText", "tagName", "text", "val", "nodeName", "outerHtml", "attr", "baseUri", "absUrl" }; public String[] copiedResonseDataFields = new String[] { "url", "parentUrl", "httpStatusCode", "method", "charSet", "contentLength", "mimeType", "executionTime", "lastModified" }; private EsClient esClient; protected RiverConfigManager riverConfigManager; protected ThreadLocal<Set<String>> childUrlSetLocal = new ThreadLocal<Set<String>>(); protected ThreadLocal<RiverConfig> riverConfigLocal = new ThreadLocal<>(); @PostConstruct public void init() { esClient = SingletonLaContainer.getComponent(EsClient.class); riverConfigManager = SingletonLaContainer.getComponent(RiverConfigManager.class); } @Override public ResultData transform(final ResponseData responseData) { final RiverConfig riverConfig = riverConfigManager.get(responseData.getSessionId()); try { riverConfigLocal.set(riverConfig); return super.transform(responseData); } finally { riverConfigLocal.remove(); childUrlSetLocal.remove(); } } @Override protected void updateCharset(final ResponseData responseData) { int preloadSize = preloadSizeForCharset; final ScrapingRule scrapingRule = riverConfigLocal.get().getScrapingRule(responseData); if (scrapingRule != null) { final Integer s = scrapingRule.getSetting("preloadSizeForCharset", Integer.valueOf(0)); if (s.intValue() > 0) { preloadSize = s.intValue(); } } final String encoding = loadCharset(responseData.getResponseBody(), preloadSize); if (encoding == null) { if (defaultEncoding == null) { responseData.setCharSet(Constants.UTF_8); } else if (responseData.getCharSet() == null) { responseData.setCharSet(defaultEncoding); } } else { responseData.setCharSet(encoding.trim()); } if (!isSupportedCharset(responseData.getCharSet())) { responseData.setCharSet(Constants.UTF_8); } } protected String loadCharset(final InputStream inputStream, final int preloadSize) { BufferedInputStream bis = null; String encoding = null; try { bis = new BufferedInputStream(inputStream); final byte[] buffer = new byte[preloadSize]; final int size = bis.read(buffer); if (size != -1) { final String content = new String(buffer, 0, size); encoding = parseCharset(content); } } catch (final IOException e) { throw new CrawlingAccessException("Could not load a content.", e); } try { final EncodingHelper encodingHelper = SingletonLaContainer.getComponent(EncodingHelper.class); encoding = encodingHelper.normalize(encoding); } catch (final Exception e) { // NOP } return encoding; } @Override protected void storeData(final ResponseData responseData, final ResultData resultData) { File file = null; try { final ScrapingRule scrapingRule = riverConfigLocal.get().getScrapingRule(responseData); if (scrapingRule == null) { logger.info("Skip Scraping: " + responseData.getUrl()); return; } file = File.createTempFile("river-web-", ".tmp"); CopyUtil.copy(responseData.getResponseBody(), file); processData(scrapingRule, file, responseData, resultData); } catch (final IOException e) { throw new CrawlingAccessException("Failed to create a temp file.", e); } finally { if (file != null && !file.delete()) { logger.warn("Failed to delete " + file.getAbsolutePath()); } } } protected void processData(final ScrapingRule scrapingRule, final File file, final ResponseData responseData, final ResultData resultData) { final Map<String, Map<String, Object>> scrapingRuleMap = scrapingRule.getRuleMap(); org.jsoup.nodes.Document document = null; String charsetName = responseData.getCharSet(); if (charsetName == null) { charsetName = Constants.UTF_8; } final Boolean isHtmlParsed = scrapingRule.getSetting("html", Boolean.TRUE); if (isHtmlParsed.booleanValue()) { try (InputStream is = new BufferedInputStream(new FileInputStream(file))) { document = Jsoup.parse(is, charsetName, responseData.getUrl()); } catch (final IOException e) { throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e); } } final Map<String, Object> dataMap = new LinkedHashMap<String, Object>(); BeanUtil.copyBeanToMap(responseData, dataMap, op -> { op.include(copiedResonseDataFields).excludeNull().excludeWhitespace(); }); if (logger.isDebugEnabled()) { logger.debug("ruleMap: " + scrapingRuleMap); logger.debug("dataMap: " + dataMap); } for (final Map.Entry<String, Map<String, Object>> entry : scrapingRuleMap.entrySet()) { final String propName = entry.getKey(); final Map<String, Object> params = entry.getValue(); final boolean isTrimSpaces = SettingsUtils.get(params, TRIM_SPACES_PROP_NAME, Boolean.FALSE).booleanValue(); boolean isArray = SettingsUtils.get(params, IS_ARRAY_PROP_NAME, Boolean.FALSE).booleanValue(); boolean isChildUrl = SettingsUtils.get(params, IS_CHILD_URL_PROP_NAME, Boolean.FALSE).booleanValue(); boolean isDistinct = SettingsUtils.get(params, IS_DISTINCT_PROP_NAME, Boolean.FALSE).booleanValue(); final List<String> strList = new ArrayList<String>(); final Object value = SettingsUtils.get(params, VALUE_QUERY_TYPE, null); final String type = SettingsUtils.get(params, TYPE_QUERY_TYPE, null); if (value != null) { if (value instanceof String) { strList.add(trimSpaces(value.toString(), isTrimSpaces)); } else if (value instanceof List) { @SuppressWarnings("unchecked") final List<Object> list = (List<Object>) value; for (final Object obj : list) { strList.add(trimSpaces(obj.toString(), isTrimSpaces)); } } } else if ("data".equals(type) || "attachment".equals(type)) { final long maxFileSize = SettingsUtils.get(params, "maxFileSize", DEFAULT_MAX_ATTACHMENT_SIZE); final long fileSize = file.length(); if (fileSize <= maxFileSize) { strList.add(Base64Util.encode(FileUtil.readBytes(file))); isArray = false; isChildUrl = false; isDistinct = false; } else { logger.info("The max file size(" + fileSize + "/" + maxFileSize + " is exceeded: " + responseData.getUrl()); } } else if ("source".equals(type)) { try { strList.add(trimSpaces(FileUtil.readText(file, charsetName), isTrimSpaces)); } catch (Exception e) { logger.warn("Failed to read type:source from " + responseData.getUrl(), e); } } else if (document != null) { processCssQuery(document, propName, params, isTrimSpaces, strList); } Object propertyValue; final ScriptInfo scriptInfo = getScriptValue(params); if (isDistinct) { final Set<String> strSet = new HashSet<>(); final List<String> distinctList = strList.stream().filter(s -> strSet.add(s) && (!isTrimSpaces || StringUtil.isNotBlank(s))) .collect(Collectors.toList()); strList.clear(); strList.addAll(distinctList); } if (scriptInfo == null) { propertyValue = isArray ? strList : String.join(" ", strList); } else { final Map<String, Object> vars = new HashMap<String, Object>(); vars.put("container", SingletonLaContainerFactory.getContainer()); vars.put("client", esClient); vars.put("data", responseData); vars.put("result", resultData); vars.put("property", propName); vars.put("parameters", params); vars.put("array", isArray); vars.put("list", strList); if (isArray) { final List<Object> list = new ArrayList<Object>(); for (int i = 0; i < strList.size(); i++) { final Map<String, Object> localVars = new HashMap<String, Object>(vars); localVars.put("index", i); localVars.put("value", String.join(" ", strList)); list.add(executeScript(scriptInfo.getLang(), scriptInfo.getScript(), scriptInfo.getScriptType(), localVars)); } propertyValue = list; } else { vars.put("value", String.join(" ", strList)); propertyValue = executeScript(scriptInfo.getLang(), scriptInfo.getScript(), scriptInfo.getScriptType(), vars); } } addPropertyData(dataMap, propName, propertyValue); if (isChildUrl) { Set<String> childUrlSet = childUrlSetLocal.get(); if (childUrlSet == null) { childUrlSet = new HashSet<String>(); childUrlSetLocal.set(childUrlSet); } if (propertyValue instanceof String) { final String str = (String) propertyValue; if (StringUtil.isNotBlank(str)) { childUrlSet.add(str); } } else if (propertyValue instanceof List) { @SuppressWarnings("unchecked") final List<Object> list = (List<Object>) propertyValue; for (final Object obj : list) { final String str = obj.toString(); if (StringUtil.isNotBlank(str)) { childUrlSet.add(str); } } } } } storeIndex(responseData, dataMap); } private Object executeScript(final String lang, final String script, final String scriptTypeValue, final Map<String, Object> vars) { ScriptType scriptType; if (ScriptType.FILE.toString().equalsIgnoreCase(scriptTypeValue)) { scriptType = ScriptType.FILE; } else if (ScriptType.INDEXED.toString().equalsIgnoreCase(scriptTypeValue)) { scriptType = ScriptType.INDEXED; } else { scriptType = ScriptType.INLINE; } vars.put("logger", logger); final ScriptService scriptService = SingletonLaContainer.getComponent(ScriptService.class); return scriptService.execute(lang, script, scriptType, vars); } protected ScriptInfo getScriptValue(final Map<String, Object> params) { final Object value = SettingsUtils.get(params, SCRIPT_QUERY_TYPE, null); if (value == null) { return null; } else if (value instanceof String) { return new ScriptInfo(value.toString()); } else if (value instanceof List) { @SuppressWarnings("unchecked") final List<CharSequence> list = (List<CharSequence>) value; return new ScriptInfo(String.join("", list)); } else if (value instanceof Map) { @SuppressWarnings("unchecked") final Map<String, Object> scriptMap = (Map<String, Object>) value; final String script = SettingsUtils.get(scriptMap, SCRIPT_QUERY_TYPE); if (script == null) { return null; } return new ScriptInfo(script, SettingsUtils.get(scriptMap, "lang", WebRiverConstants.DEFAULT_SCRIPT_LANG), SettingsUtils.get(scriptMap, "script_type", "inline")); } return null; } private static class ScriptInfo { private final String script; private final String lang; private final String scriptType; ScriptInfo(final String script) { this(script, WebRiverConstants.DEFAULT_SCRIPT_LANG, "inline"); } ScriptInfo(final String script, final String lang, final String scriptType) { this.script = script; this.lang = lang; this.scriptType = scriptType; } public String getScript() { return script; } public String getLang() { return lang; } public String getScriptType() { return scriptType; } } protected void processCssQuery(final org.jsoup.nodes.Document document, final String propName, final Map<String, Object> params, final boolean isTrimSpaces, final List<String> strList) { for (final String queryType : queryTypes) { final Object queryObj = SettingsUtils.get(params, queryType, null); Element[] elements = null; if (queryObj instanceof String) { elements = getElements(new Element[] { document }, queryObj.toString()); } else if (queryObj instanceof List) { @SuppressWarnings("unchecked") final List<String> queryList = (List<String>) queryObj; elements = getElements(new Element[] { document }, queryList, propName.startsWith(ARRAY_PROPERTY_PREFIX)); } if (elements != null) { for (final Element element : elements) { if (element == null) { strList.add(null); } else { final List<Object> argList = SettingsUtils.get(params, ARGS_QUERY_TYPE, Collections.emptyList()); try { final Method queryMethod = getQueryMethod(element, queryType, argList); strList.add(trimSpaces( (String) MethodUtil.invoke(queryMethod, element, argList.toArray(new Object[argList.size()])), isTrimSpaces)); } catch (final Exception e) { logger.warn("Could not invoke " + queryType + " on " + element, e); strList.add(null); } } } break; } } } protected Method getQueryMethod(final Element element, final String queryType, final List<Object> argList) { final BeanDesc elementDesc = BeanDescFactory.getBeanDesc(element.getClass()); if (argList == null || argList.isEmpty()) { return elementDesc.getMethodDesc(queryType).getMethod(); } else { final Class<?>[] paramTypes = new Class[argList.size()]; for (int i = 0; i < paramTypes.length; i++) { paramTypes[i] = String.class; } return elementDesc.getMethodDesc(queryType, paramTypes).getMethod(); } } protected Element[] getElements(final Element[] elements, final List<String> queries, final boolean isArrayProperty) { Element[] targets = elements; for (final String query : queries) { final List<Element> elementList = new ArrayList<Element>(); for (final Element element : targets) { if (element == null) { elementList.add(null); } else { final Element[] childElements = getElements(new Element[] { element }, query); if (childElements.length == 0 && isArrayProperty) { elementList.add(null); } else { for (final Element childElement : childElements) { elementList.add(childElement); } } } } targets = elementList.toArray(new Element[elementList.size()]); } return targets; } protected Element[] getElements(final Element[] elements, final String query) { Element[] targets = elements; final Pattern pattern = Pattern.compile(":eq\\(([0-9]+)\\)|:lt\\(([0-9]+)\\)|:gt\\(([0-9]+)\\)"); final Matcher matcher = pattern.matcher(query); final StringBuffer buf = new StringBuffer(); while (matcher.find()) { final String value = matcher.group(); matcher.appendReplacement(buf, ""); if (buf.charAt(buf.length() - 1) != ' ') { try { final int index = Integer.parseInt(matcher.group(1)); final List<Element> elementList = new ArrayList<Element>(); final String childQuery = buf.toString(); for (final Element element : targets) { final Elements childElements = element.select(childQuery); if (value.startsWith(":eq")) { if (index < childElements.size()) { elementList.add(childElements.get(index)); } } else if (value.startsWith(":lt")) { for (int i = 0; i < childElements.size() && i < index; i++) { elementList.add(childElements.get(i)); } } else if (value.startsWith(":gt")) { for (int i = index + 1; i < childElements.size(); i++) { elementList.add(childElements.get(i)); } } } targets = elementList.toArray(new Element[elementList.size()]); buf.setLength(0); } catch (final NumberFormatException e) { logger.warn("Invalid number: " + query, e); buf.append(value); } } else { buf.append(value); } } matcher.appendTail(buf); final String lastQuery = buf.toString(); if (StringUtil.isNotBlank(lastQuery)) { final List<Element> elementList = new ArrayList<Element>(); for (final Element element : targets) { if (element == null) { elementList.add(null); } else { final Elements childElements = element.select(lastQuery); for (int i = 0; i < childElements.size(); i++) { elementList.add(childElements.get(i)); } } } targets = elementList.toArray(new Element[elementList.size()]); } return targets; } protected String trimSpaces(final String value, final boolean trimSpaces) { if (value == null) { return null; } if (trimSpaces) { return value.replaceAll("\\s+", " ").trim(); } return value; } protected void addPropertyData(final Map<String, Object> dataMap, final String key, final Object value) { Map<String, Object> currentDataMap = dataMap; final String[] keys = key.split("\\."); for (int i = 0; i < keys.length - 1; i++) { final String currentKey = keys[i]; @SuppressWarnings("unchecked") Map<String, Object> map = (Map<String, Object>) currentDataMap.get(currentKey); if (map == null) { map = new LinkedHashMap<String, Object>(); currentDataMap.put(currentKey, map); } currentDataMap = map; } currentDataMap.put(keys[keys.length - 1], value); } protected void storeIndex(final ResponseData responseData, final Map<String, Object> dataMap) { final String sessionId = responseData.getSessionId(); final RiverConfig riverConfig = riverConfigLocal.get(); final String indexName = riverConfig.getIndex(); final String typeName = riverConfig.getType(); final boolean overwrite = riverConfig.isOverwrite(); if (logger.isDebugEnabled()) { logger.debug("Index: " + indexName + ", sessionId: " + sessionId + ", Data: " + dataMap); } if (overwrite) { final int count = esClient.deleteByQuery(indexName, typeName, QueryBuilders.termQuery("url", responseData.getUrl())); if (count > 0) { esClient.admin().indices().prepareRefresh(indexName).execute().actionGet(); } } @SuppressWarnings("unchecked") final Map<String, Object> arrayDataMap = (Map<String, Object>) dataMap.remove(ARRAY_PROPERTY_PREFIX); if (arrayDataMap != null) { final Map<String, Object> flatArrayDataMap = new LinkedHashMap<String, Object>(); convertFlatMap("", arrayDataMap, flatArrayDataMap); int maxSize = 0; for (final Map.Entry<String, Object> entry : flatArrayDataMap.entrySet()) { final Object value = entry.getValue(); if (value instanceof List) { @SuppressWarnings("rawtypes") final int size = ((List) value).size(); if (size > maxSize) { maxSize = size; } } } for (int i = 0; i < maxSize; i++) { final Map<String, Object> newDataMap = new LinkedHashMap<String, Object>(); newDataMap.put(POSITION_FIELD, i); deepCopy(dataMap, newDataMap); for (final Map.Entry<String, Object> entry : flatArrayDataMap.entrySet()) { final Object value = entry.getValue(); if (value instanceof List) { @SuppressWarnings("unchecked") final List<Object> list = (List<Object>) value; if (i < list.size()) { addPropertyData(newDataMap, entry.getKey(), list.get(i)); } } else if (i == 0) { addPropertyData(newDataMap, entry.getKey(), value); } } storeIndex(indexName, typeName, newDataMap); } } else { storeIndex(indexName, typeName, dataMap); } } protected void storeIndex(final String indexName, final String typeName, final Map<String, Object> dataMap) { dataMap.put(TIMESTAMP_FIELD, new Date()); if (logger.isDebugEnabled()) { logger.debug(indexName + "/" + typeName + " : dataMap" + dataMap); } try { esClient.prepareIndex(indexName, typeName).setRefresh(true).setSource(jsonBuilder().value(dataMap)).execute().actionGet(); } catch (final Exception e) { logger.warn("Could not write a content into index.", e); } } protected void deepCopy(final Map<String, Object> oldMap, final Map<String, Object> newMap) { final Map<String, Object> flatMap = new LinkedHashMap<String, Object>(); convertFlatMap("", oldMap, flatMap); for (final Map.Entry<String, Object> entry : flatMap.entrySet()) { addPropertyData(newMap, entry.getKey(), entry.getValue()); } } @SuppressWarnings("unchecked") protected void convertFlatMap(final String prefix, final Map<String, Object> oldMap, final Map<String, Object> newMap) { for (final Map.Entry<String, Object> entry : oldMap.entrySet()) { final Object value = entry.getValue(); if (value instanceof Map) { convertFlatMap(prefix + entry.getKey() + ".", (Map<String, Object>) value, newMap); } else { newMap.put(prefix + entry.getKey(), value); } } } @Override protected void storeChildUrls(final ResponseData responseData, final ResultData resultData) { final Set<String> childLinkSet = childUrlSetLocal.get(); if (childLinkSet != null) { List<RequestData> requestDataList = convertChildUrlList(childLinkSet.stream().filter(u -> StringUtil.isNotBlank(u)) .map(u -> RequestDataBuilder.newRequestData().get().url(u).build()).collect(Collectors.toList())); resultData.addAllUrl(requestDataList); final RequestData requestData = responseData.getRequestData(); resultData.removeUrl(requestData); resultData.removeUrl(getDuplicateUrl(requestData)); } else { super.storeChildUrls(responseData, resultData); } } /** * Returns data as XML content of String. * * @return XML content of String. */ @Override public Object getData(final AccessResultData accessResultData) { return null; } }