/*
 * Copyright 2012-2017 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.fess.crawler.transformer;

import static org.codelibs.core.stream.StreamUtil.stream;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import javax.annotation.PostConstruct;
import javax.xml.transform.TransformerException;

import org.apache.xpath.objects.XObject;
import org.codelibs.core.io.InputStreamUtil;
import org.codelibs.core.io.SerializeUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.ValueHolder;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.helper.CrawlingConfigHelper;
import org.codelibs.fess.helper.CrawlingInfoHelper;
import org.codelibs.fess.helper.DocumentHelper;
import org.codelibs.fess.helper.DuplicateHostHelper;
import org.codelibs.fess.helper.FileTypeHelper;
import org.codelibs.fess.helper.LabelTypeHelper;
import org.codelibs.fess.helper.PathMappingHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.PrunedTag;
import org.cyberneko.html.parsers.DOMParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/**
 * XPath based transformer that parses an HTML response into a DOM, evaluates
 * the configured field rules against it and serializes the resulting field map
 * into the {@link ResultData}.
 */
public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
    private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);

    private static final String META_NAME_ROBOTS_CONTENT = "//META[@name=\"robots\" or @name=\"ROBOTS\"]/@content";

    private static final String META_ROBOTS_NONE = "none";

    private static final String META_ROBOTS_NOINDEX = "noindex";

    private static final String META_ROBOTS_NOFOLLOW = "nofollow";

    private static final int UTF8_BOM_SIZE = 3;

    private static final String COMMENT_START = "<!--";

    private static final String COMMENT_END = "-->";

    /** When {@code true}, pruned tags are removed from the node selected as the document content. */
    public boolean prunedContent = true;

    /** Regular expression to replacement pairs applied to every child URL in {@link #convertChildUrlList(List)}. */
    public Map<String, String> convertUrlMap = new HashMap<>();

    /** Configuration looked up from {@link ComponentUtil} in {@link #init()}. */
    protected FessConfig fessConfig;

    /** When {@code true}, text nodes following a {@code googleoff:} comment are dropped until a {@code googleon:} comment. */
    protected boolean useGoogleOffOn = true;

    /**
     * Looks up the {@link FessConfig} once the component has been constructed.
     */
    @PostConstruct
    public void init() {
        fessConfig = ComponentUtil.getFessConfig();
    }

    @Override
    public FessConfig getFessConfig() {
        return fessConfig;
    }

    @Override
    public Logger getLogger() {
        return logger;
    }

    /**
     * Parses the response body, evaluates every field rule and stores the
     * serialized field map in {@code resultData}.
     *
     * @param responseData the crawled response
     * @param resultData the result that receives the serialized data and the encoding
     * @throws CrawlingAccessException if the body cannot be parsed or the field map cannot be serialized
     */
    @Override
    protected void storeData(final ResponseData responseData, final ResultData resultData) {
        final DOMParser parser = getDomParser();
        try (final BufferedInputStream bis = new BufferedInputStream(responseData.getResponseBody())) {
            skipUtf8Bom(bis);
            final InputSource is = new InputSource(bis);
            if (responseData.getCharSet() != null) {
                is.setEncoding(responseData.getCharSet());
            }
            parser.parse(is);
        } catch (final Exception e) {
            throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
        }
        final Document document = parser.getDocument();

        if (!fessConfig.isCrawlerIgnoreMetaRobots()) {
            processMetaRobots(responseData, resultData, document);
        }

        final Map<String, Object> dataMap = new LinkedHashMap<>();
        for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
            final String path = entry.getValue();
            try {
                final XObject xObj = getXPathAPI().eval(document, path);
                switch (xObj.getType()) {
                case XObject.CLASS_BOOLEAN:
                    putResultDataBody(dataMap, entry.getKey(), Boolean.toString(xObj.bool()));
                    break;
                case XObject.CLASS_NUMBER:
                    putResultDataBody(dataMap, entry.getKey(), Double.toString(xObj.num()));
                    break;
                case XObject.CLASS_STRING:
                    putResultDataBody(dataMap, entry.getKey(), xObj.str());
                    break;
                default:
                    // every other result type (null, unknown, node set, tree fragment,
                    // unresolved variable) is read as the text of the first matching node
                    final Node value = getXPathAPI().selectSingleNode(document, path);
                    putResultDataBody(dataMap, entry.getKey(), value != null ? value.getTextContent() : null);
                    break;
                }
            } catch (final TransformerException e) {
                logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue(), e);
            }
        }

        putAdditionalData(dataMap, responseData, document);

        try {
            resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
        } catch (final Exception e) {
            throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
        }
        resultData.setEncoding(charsetName);
    }

    /**
     * Consumes a leading UTF-8 byte order mark, leaving the stream at its
     * original position when there is none.
     *
     * @param bis the stream positioned at the start of the response body
     * @throws IOException if reading or resetting the stream fails
     */
    private void skipUtf8Bom(final BufferedInputStream bis) throws IOException {
        final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
        bis.mark(UTF8_BOM_SIZE);
        // a single read may return fewer bytes than requested, so read until
        // the buffer is full or the stream ends
        int size = 0;
        while (size < UTF8_BOM_SIZE) {
            final int count = bis.read(bomBytes, size, UTF8_BOM_SIZE - size);
            if (count < 0) {
                break;
            }
            size += count;
        }
        if (size < UTF8_BOM_SIZE || !isUtf8BomBytes(bomBytes)) {
            bis.reset();
        }
    }

    /**
     * Applies the robots meta tag of the document.
     * <ul>
     * <li>{@code none}, or both {@code noindex} and {@code nofollow}: throws a
     * {@link ChildUrlsException} with an empty URL set.</li>
     * <li>{@code noindex} only: stores the child URLs and throws a
     * {@link ChildUrlsException} carrying them.</li>
     * <li>{@code nofollow} only: marks the response as no-follow.</li>
     * </ul>
     *
     * @param responseData the crawled response
     * @param resultData the result that receives the child URLs
     * @param document the parsed document
     */
    protected void processMetaRobots(final ResponseData responseData, final ResultData resultData, final Document document) {
        try {
            final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
            if (value == null) {
                return;
            }
            final String content = value.getTextContent().toLowerCase(Locale.ROOT);
            final boolean none = content.contains(META_ROBOTS_NONE);
            final boolean noindex = none || content.contains(META_ROBOTS_NOINDEX);
            final boolean nofollow = none || content.contains(META_ROBOTS_NOFOLLOW);
            if (noindex && nofollow) {
                logger.info("META(robots=noindex,nofollow): {}", responseData.getUrl());
                throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots(Document)");
            } else if (noindex) {
                logger.info("META(robots=noindex): {}", responseData.getUrl());
                storeChildUrls(responseData, resultData);
                throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots(Document)");
            } else if (nofollow) {
                logger.info("META(robots=nofollow): {}", responseData.getUrl());
                responseData.setNoFollow(true);
            }
        } catch (final TransformerException e) {
            logger.warn("Could not parse a value of " + META_NAME_ROBOTS_CONTENT, e);
        }
    }

    /**
     * Adds the index fields that are derived from the response, the document
     * and the crawling configuration to {@code dataMap}.
     *
     * @param dataMap the field map to fill
     * @param responseData the crawled response
     * @param document the parsed document
     * @throws ChildUrlsException if the document declares a canonical URL that differs from the response URL
     */
    protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
        // canonical
        if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
            final String canonicalUrl = getCanonicalUrl(responseData, document);
            if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
                final Set<RequestData> childUrlSet = new HashSet<>();
                childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
                throw new ChildUrlsException(childUrlSet, this.getClass().getName()
                        + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
            }
        }

        // NOTE: from here on this local shadows the field of the same name
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
        final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
        final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
        final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
        final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
        final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
        final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
        final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
        final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
        String url = responseData.getUrl();
        // the indexing target is resolved from the URL before path mapping is applied
        final String indexingTarget = crawlingConfig.getIndexingTarget(url);
        url = pathMappingHelper.replaceUrl(sessionId, url);

        final String mimeType = responseData.getMimeType();

        final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
        final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);

        String urlEncoding;
        final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
        if (urlQueue != null && urlQueue.getEncoding() != null) {
            urlEncoding = urlQueue.getEncoding();
        } else {
            urlEncoding = responseData.getCharSet();
        }

        // cid
        final String configId = crawlingConfig.getConfigId();
        if (configId != null) {
            putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
        }
        // expires
        if (documentExpires != null) {
            putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
        }
        // lang
        final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
        if (lang != null) {
            putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
        }
        // content
        final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
        putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
        final boolean cacheRequested =
                Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
                        || fessConfig.isCrawlerDocumentCacheEnabled();
        if (cacheRequested && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
            if (responseData.getContentLength() > 0
                    && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
                String charSet = responseData.getCharSet();
                if (charSet == null) {
                    charSet = Constants.UTF_8;
                }
                try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
                    // cache
                    putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
                    putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
                } catch (final Exception e) {
                    // a failed cache must not fail the whole document
                    logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
                }
            } else {
                logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(),
                        fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
            }
        }
        // digest
        final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
        if (StringUtil.isNotBlank(digest)) {
            putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
        } else {
            putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
                    documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
        }
        // segment
        putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
        // host
        putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
        // site
        putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
        // filename
        final String fileName = getFileName(url, urlEncoding);
        if (StringUtil.isNotBlank(fileName)) {
            putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
        }
        // url
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
        // created
        final Date now = systemHelper.getCurrentTime();
        putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
        // anchor
        putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
        // mimetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
        if (fileTypeHelper != null) {
            // filetype
            putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
        }
        // content_length
        putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
        // last_modified
        final Date lastModified = responseData.getLastModified();
        if (lastModified != null) {
            putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
            // timestamp
            putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
        } else {
            // timestamp
            putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
        }
        // indexingTarget
        putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
        // boost
        putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
        // label: labelType
        final Set<String> labelTypeSet = new HashSet<>();
        for (final String labelType : crawlingConfig.getLabelTypeValues()) {
            labelTypeSet.add(labelType);
        }
        labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
        putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
        // role: roleType
        final List<String> roleTypeList = new ArrayList<>();
        stream(crawlingConfig.getPermissions()).of(s -> s.forEach(roleTypeList::add));
        putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
        // id
        putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
        // parentId
        String parentUrl = responseData.getParentUrl();
        if (StringUtil.isNotBlank(parentUrl)) {
            parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
            // generateId works on the map, so the url field temporarily holds the parent URL
            putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
            putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
            putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url); // set again
        }

        // from config
        final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
        xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
            final String key = e.getKey();
            final String value = getSingleNodeValue(document, e.getValue(), true);
            putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
        });
        crawlingConfig.getConfigParameterMap(ConfigName.VALUE).forEach(
                (key, value) -> putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key)));
    }

    /**
     * Returns the XPath used to read the document language.
     *
     * @param fessConfig the configuration providing the fallback XPath
     * @param xpathConfigMap the XPath parameters of the crawling configuration
     * @return the {@code default.lang} entry if it is not blank, otherwise the configured language XPath
     */
    protected String getLangXpath(final FessConfig fessConfig, final Map<String, String> xpathConfigMap) {
        final String xpath = xpathConfigMap.get("default.lang");
        if (StringUtil.isNotBlank(xpath)) {
            return xpath;
        }
        return fessConfig.getCrawlerDocumentHtmlLangXpath();
    }

    /**
     * Returns the XPath used to read the document content.
     *
     * @param fessConfig the configuration providing the fallback XPath
     * @param xpathConfigMap the XPath parameters of the crawling configuration
     * @return the {@code default.content} entry if it is not blank, otherwise the configured content XPath
     */
    protected String getContentXpath(final FessConfig fessConfig, final Map<String, String> xpathConfigMap) {
        final String xpath = xpathConfigMap.get("default.content");
        if (StringUtil.isNotBlank(xpath)) {
            return xpath;
        }
        return fessConfig.getCrawlerDocumentHtmlContentXpath();
    }

    /**
     * Returns the XPath used to read the document digest.
     *
     * @param fessConfig the configuration providing the fallback XPath
     * @param xpathConfigMap the XPath parameters of the crawling configuration
     * @return the {@code default.digest} entry if it is not blank, otherwise the configured digest XPath
     */
    protected String getDigestXpath(final FessConfig fessConfig, final Map<String, String> xpathConfigMap) {
        final String xpath = xpathConfigMap.get("default.digest");
        if (StringUtil.isNotBlank(xpath)) {
            return xpath;
        }
        return fessConfig.getCrawlerDocumentHtmlDigestXpath();
    }

    /**
     * Reads the canonical URL of the document.
     *
     * @param responseData the crawled response, used as the base for a canonical URL starting with {@code /}
     * @param document the parsed document
     * @return the canonical URL, or {@code null} if the document has none or it cannot be resolved
     */
    protected String getCanonicalUrl(final ResponseData responseData, final Document document) {
        final String canonicalUrl = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlCanonicalXpath(), false);
        if (StringUtil.isBlank(canonicalUrl)) {
            return null;
        }
        if (canonicalUrl.startsWith("/")) {
            return normalizeCanonicalUrl(responseData.getUrl(), canonicalUrl);
        }
        return canonicalUrl;
    }

    /**
     * Resolves {@code canonicalUrl} against {@code baseUrl}.
     *
     * @param baseUrl the URL of the crawled document
     * @param canonicalUrl the canonical URL found in the document
     * @return the resolved URL, or {@code null} if either URL is malformed
     */
    protected String normalizeCanonicalUrl(final String baseUrl, final String canonicalUrl) {
        try {
            return new URL(new URL(baseUrl), canonicalUrl).toString();
        } catch (final MalformedURLException e) {
            logger.warn("Invalid canonical url: " + baseUrl + " : " + canonicalUrl, e);
        }
        return null;
    }

    /**
     * Replaces every terminated {@code <!-- ... -->} comment with a single
     * space. An unterminated comment and everything after it is kept as is.
     *
     * @param content the text to clean, may be {@code null}
     * @return the text without comments, or an empty string for {@code null}
     */
    protected String removeCommentTag(final String content) {
        if (content == null) {
            return StringUtil.EMPTY;
        }
        int start = content.indexOf(COMMENT_START);
        if (start < 0) {
            return content;
        }
        final StringBuilder buf = new StringBuilder(content.length());
        int from = 0;
        while (start >= 0) {
            // the end marker is searched from the start of the comment, so "<!-->" counts as terminated
            final int end = content.indexOf(COMMENT_END, start);
            if (end < 0) {
                break;
            }
            buf.append(content, from, start).append(' ');
            from = end + COMMENT_END.length();
            start = content.indexOf(COMMENT_START, from);
        }
        return buf.append(content, from, content.length()).toString();
    }

    /**
     * Collects the text of all nodes matching {@code xpath}. Matching nodes are
     * cloned before being filtered, so the document itself is not modified.
     *
     * @param document the parsed document
     * @param xpath the XPath selecting the nodes
     * @param pruned whether pruned tags are removed from the matching nodes
     * @return the trimmed text, or {@code null} if no node matched
     */
    protected String getSingleNodeValue(final Document document, final String xpath, final boolean pruned) {
        StringBuilder buf = null;
        try {
            final NodeList list = getXPathAPI().selectNodeList(document, xpath);
            for (int i = 0; i < list.getLength(); i++) {
                if (buf == null) {
                    buf = new StringBuilder(1000);
                }
                Node node = list.item(i).cloneNode(true);
                if (useGoogleOffOn) {
                    node = processGoogleOffOn(node, new ValueHolder<>(true));
                }
                if (pruned) {
                    node = pruneNode(node);
                }
                parseTextContent(node, buf);
            }
        } catch (final Exception e) {
            logger.warn("Could not parse a value of " + xpath, e);
        }
        if (buf == null) {
            return null;
        }
        return buf.toString().trim();
    }

    /**
     * Appends the trimmed text of every non-empty text node below {@code node}
     * to {@code buf}, each preceded by a space.
     *
     * @param node the node to walk
     * @param buf the buffer receiving the text
     */
    protected void parseTextContent(final Node node, final StringBuilder buf) {
        if (node.hasChildNodes()) {
            final NodeList nodeList = node.getChildNodes();
            for (int i = 0; i < nodeList.getLength(); i++) {
                parseTextContent(nodeList.item(i), buf);
            }
        } else if (node.getNodeType() == Node.TEXT_NODE) {
            final String value = node.getTextContent();
            if (value != null) {
                final String content = value.trim();
                if (content.length() > 0) {
                    buf.append(' ').append(content);
                }
            }
        }
    }

    /**
     * Removes the text nodes located between a comment starting with
     * {@code googleoff:} and the next comment starting with {@code googleon:}.
     *
     * @param node the node to filter, modified in place
     * @param flag the current state, {@code false} while text is being dropped; shared across the recursion
     * @return {@code node}
     */
    protected Node processGoogleOffOn(final Node node, final ValueHolder<Boolean> flag) {
        final NodeList nodeList = node.getChildNodes();
        List<Node> removedNodeList = null;
        for (int i = 0; i < nodeList.getLength(); i++) {
            final Node childNode = nodeList.item(i);
            if (childNode.getNodeType() == Node.COMMENT_NODE) {
                final String comment = childNode.getNodeValue().trim();
                if (comment.startsWith("googleoff:")) {
                    flag.setValue(false);
                } else if (comment.startsWith("googleon:")) {
                    flag.setValue(true);
                }
            }

            if (!flag.getValue() && childNode.getNodeType() == Node.TEXT_NODE) {
                if (removedNodeList == null) {
                    removedNodeList = new ArrayList<>();
                }
                removedNodeList.add(childNode);
            } else {
                processGoogleOffOn(childNode, flag);
            }
        }

        // removal is deferred so the child list is not modified while it is iterated
        if (removedNodeList != null) {
            removedNodeList.forEach(node::removeChild);
        }

        return node;
    }

    /**
     * Recursively removes every descendant of {@code node} that matches a pruned tag.
     *
     * @param node the node to prune, modified in place
     * @return {@code node}
     */
    protected Node pruneNode(final Node node) {
        final NodeList nodeList = node.getChildNodes();
        final List<Node> childNodeList = new ArrayList<>();
        final List<Node> removedNodeList = new ArrayList<>();
        for (int i = 0; i < nodeList.getLength(); i++) {
            final Node childNode = nodeList.item(i);
            if (isPrunedTag(childNode)) {
                removedNodeList.add(childNode);
            } else {
                childNodeList.add(childNode);
            }
        }

        for (final Node childNode : removedNodeList) {
            node.removeChild(childNode);
        }

        for (final Node childNode : childNodeList) {
            pruneNode(childNode);
        }

        return node;
    }

    /**
     * Checks whether {@code node} matches one of the configured pruned tags.
     *
     * @param node the node to check
     * @return {@code true} if the node matches a pruned tag
     */
    protected boolean isPrunedTag(final Node node) {
        for (final PrunedTag prunedTag : fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray()) {
            if (prunedTag.matches(node)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Joins the text content of all nodes matching {@code xpath}, one per line.
     *
     * @param document the parsed document
     * @param xpath the XPath selecting the nodes
     * @return the trimmed text, empty if no node matched
     */
    protected String getMultipleNodeValue(final Document document, final String xpath) {
        final StringBuilder buf = new StringBuilder(100);
        try {
            final NodeList nodeList = getXPathAPI().selectNodeList(document, xpath);
            for (int i = 0; i < nodeList.getLength(); i++) {
                final Node node = nodeList.item(i);
                buf.append(node.getTextContent());
                buf.append("\n");
            }
        } catch (final Exception e) {
            logger.warn("Could not parse a value of " + xpath, e);
        }
        return buf.toString().trim();
    }

    /**
     * Converts {@code url} with the {@link DuplicateHostHelper}.
     *
     * @param url the URL to convert
     * @return the converted URL, or {@code url} unchanged if the conversion fails
     */
    protected String replaceDuplicateHost(final String url) {
        try {
            // remove duplicate host
            final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper();
            return duplicateHostHelper.convert(url);
        } catch (final Exception e) {
            // best effort: fall back to the original URL
            logger.debug("Failed to replace a duplicate host: {}", url, e);
            return url;
        }
    }

    /**
     * Collects the URLs referenced by the document according to the child URL rules.
     *
     * @param document the parsed document
     * @param responseData the crawled response; its URL is the base when the document has no base href
     * @return the converted URLs, empty if the tags could not be parsed
     */
    protected List<String> getAnchorList(final Document document, final ResponseData responseData) {
        List<RequestData> anchorList = new ArrayList<>();
        final String baseHref = getBaseHref(document);
        try {
            final URL url = new URL(baseHref != null ? baseHref : responseData.getUrl());
            for (final Map.Entry<String, String> entry : childUrlRuleMap.entrySet()) {
                for (final String u : getUrlFromTagAttribute(url, document, entry.getKey(), entry.getValue(), responseData.getCharSet())) {
                    anchorList.add(RequestDataBuilder.newRequestData().get().url(u).build());
                }
            }
            anchorList = convertChildUrlList(anchorList);
        } catch (final Exception e) {
            logger.warn("Could not parse anchor tags.", e);
        }

        final List<String> urlList = new ArrayList<>(anchorList.size());
        for (final RequestData requestData : anchorList) {
            urlList.add(requestData.getUrl());
        }
        return urlList;
    }

    /**
     * Applies {@link #convertUrlMap} and the duplicate host replacement to every URL in the list.
     *
     * @param urlList the requests to convert, modified in place; may be {@code null}
     * @return {@code urlList}
     */
    @Override
    protected List<RequestData> convertChildUrlList(final List<RequestData> urlList) {
        if (urlList != null) {
            for (final RequestData requestData : urlList) {
                String url = requestData.getUrl();
                // keys are regular expressions, see String#replaceAll
                for (final Map.Entry<String, String> entry : convertUrlMap.entrySet()) {
                    url = url.replaceAll(entry.getKey(), entry.getValue());
                }

                requestData.setUrl(replaceDuplicateHost(url));
            }
        }
        return urlList;
    }

    /**
     * Deserializes the data stored by {@link #storeData(ResponseData, ResultData)}.
     *
     * @param accessResultData the stored access result
     * @return the deserialized object, or an empty map if no data is stored
     * @throws CrawlerSystemException if the data cannot be deserialized
     */
    @Override
    public Object getData(final AccessResultData<?> accessResultData) {
        final byte[] data = accessResultData.getData();
        if (data != null) {
            try {
                return SerializeUtil.fromBinaryToObject(data);
            } catch (final Exception e) {
                throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
            }
        }
        return new HashMap<String, Object>();
    }

    @Override
    protected boolean isValidPath(final String path) {
        return super.isValidPath(path);
    }

    /**
     * Resolves an attribute value against the base URL and adds the result to
     * {@code urlList} unless it is blank.
     *
     * @param urlList the list receiving the child URL
     * @param url the base URL
     * @param attrValue the raw attribute value
     * @param encoding the encoding passed to the URL encoder
     */
    @Override
    protected void addChildUrlFromTagAttribute(final List<String> urlList, final URL url, final String attrValue, final String encoding) {
        final String urlValue = attrValue.trim();
        String u = null;
        try {
            final URL childUrl = new URL(url, urlValue);
            u = encodeUrl(normalizeUrl(childUrl.toExternalForm()), encoding);
        } catch (final MalformedURLException e) {
            // java.net.URL rejected the value; keep it anyway when a colon appears
            // within the first ten characters (presumably a scheme prefix)
            final int pos = urlValue.indexOf(':');
            if (pos > 0 && pos < 10) {
                u = encodeUrl(normalizeUrl(urlValue), encoding);
            }
        }

        if (u == null) {
            logger.warn("Ignored child URL: {} in {}", attrValue, url);
            return;
        }

        logger.debug("{} -> {}", attrValue, u);
        if (StringUtil.isNotBlank(u)) {
            logger.debug("Add Child: {}", u);
            urlList.add(u);
        } else {
            logger.debug("Skip Child: {}", u);
        }
    }

    /**
     * Checks whether the first three bytes of {@code b} are the UTF-8 byte order mark.
     *
     * @param b an array of at least three bytes
     * @return {@code true} for {@code EF BB BF}
     */
    private boolean isUtf8BomBytes(final byte[] b) {
        return b[0] == (byte) 0xEF && b[1] == (byte) 0xBB && b[2] == (byte) 0xBF;
    }

    /**
     * Enables or disables the handling of {@code googleoff:}/{@code googleon:} comments.
     *
     * @param useGoogleOffOn {@code true} to drop text between the two comments
     */
    public void setUseGoogleOffOn(final boolean useGoogleOffOn) {
        this.useGoogleOffOn = useGoogleOffOn;
    }
}