/* * Copyright 2012-2017 CodeLibs Project and the Others. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, * either express or implied. See the License for the specific language * governing permissions and limitations under the License. */ package org.codelibs.fess.crawler.transformer; import static org.codelibs.core.stream.StreamUtil.stream; import java.io.InputStream; import java.net.URLDecoder; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.lang3.StringUtils; import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.TikaMetadataKeys; import org.codelibs.core.io.SerializeUtil; import org.codelibs.core.lang.StringUtil; import org.codelibs.core.misc.Pair; import org.codelibs.fess.Constants; import org.codelibs.fess.crawler.client.smb.SmbClient; import org.codelibs.fess.crawler.entity.AccessResultData; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.entity.ResponseData; import org.codelibs.fess.crawler.entity.ResultData; import org.codelibs.fess.crawler.entity.UrlQueue; import org.codelibs.fess.crawler.exception.CrawlerSystemException; import org.codelibs.fess.crawler.exception.CrawlingAccessException; import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer; import org.codelibs.fess.crawler.util.CrawlingParameterUtil; import org.codelibs.fess.es.config.exentity.CrawlingConfig; import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName; import org.codelibs.fess.helper.CrawlingConfigHelper; import org.codelibs.fess.helper.CrawlingInfoHelper; import org.codelibs.fess.helper.DocumentHelper; import org.codelibs.fess.helper.FileTypeHelper; import org.codelibs.fess.helper.LabelTypeHelper; import org.codelibs.fess.helper.PathMappingHelper; import org.codelibs.fess.helper.SambaHelper; import org.codelibs.fess.helper.SystemHelper; import org.codelibs.fess.mylasta.direction.FessConfig; import org.codelibs.fess.util.ComponentUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import jcifs.smb.ACE; import jcifs.smb.SID; public abstract class AbstractFessFileTransformer extends AbstractTransformer implements FessTransformer { private static final Logger logger = LoggerFactory.getLogger(AbstractFessFileTransformer.class); protected Map<String, String> metaContentMapping; protected FessConfig fessConfig; protected abstract Extractor getExtractor(ResponseData responseData); @Override public ResultData transform(final ResponseData responseData) { if (responseData == null || !responseData.hasResponseBody()) { throw new CrawlingAccessException("No response body."); } final ResultData resultData = new ResultData(); resultData.setTransformerName(getName()); try { resultData.setData(SerializeUtil.fromObjectToBinary(generateData(responseData))); } catch (final Exception e) { throw new CrawlingAccessException("Could not serialize object", e); } resultData.setEncoding(fessConfig.getCrawlerCrawlingDataEncoding()); return resultData; } protected Map<String, Object> generateData(final ResponseData responseData) { final Extractor extractor = getExtractor(responseData); final Map<String, String> params = new HashMap<>(); params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData)); final String mimeType = responseData.getMimeType(); params.put(HttpHeaders.CONTENT_TYPE, mimeType); params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet()); final StringBuilder contentMetaBuf = new StringBuilder(1000); final Map<String, Object> dataMap = new HashMap<>(); final Map<String, Object> metaDataMap = new HashMap<>(); String content; try (final InputStream in = responseData.getResponseBody()) { final ExtractData extractData = getExtractData(extractor, in, params); content = extractData.getContent(); if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) { return null; } if (getLogger().isDebugEnabled()) { getLogger().debug("ExtractData: " + extractData); } // meta extractData.getKeySet().stream()// .filter(k -> extractData.getValues(k) != null)// .forEach(key -> { final String[] values = extractData.getValues(key); metaDataMap.put(key, values); // meta -> content if (fessConfig.isCrawlerMetadataContentIncluded(key)) { final String joinedValue = StringUtils.join(values, ' '); if (StringUtil.isNotBlank(joinedValue)) { if (contentMetaBuf.length() > 0) { contentMetaBuf.append(' '); } contentMetaBuf.append(joinedValue.trim()); } } final Pair<String, String> mapping = fessConfig.getCrawlerMetadataNameMapping(key); if (mapping != null) { if (Constants.MAPPING_TYPE_ARRAY.equalsIgnoreCase(mapping.getSecond())) { dataMap.put(mapping.getFirst(), values); } else if (Constants.MAPPING_TYPE_STRING.equalsIgnoreCase(mapping.getSecond())) { final String joinedValue = StringUtils.join(values, ' '); dataMap.put(mapping.getFirst(), joinedValue.trim()); } else if (values.length == 1) { try { if (Constants.MAPPING_TYPE_LONG.equalsIgnoreCase(mapping.getSecond())) { dataMap.put(mapping.getFirst(), Long.parseLong(values[0])); } else if (Constants.MAPPING_TYPE_DOUBLE.equalsIgnoreCase(mapping.getSecond())) { dataMap.put(mapping.getFirst(), Double.parseDouble(values[0])); } else { logger.warn("Unknown mapping type: {}={}", key, mapping); } } catch (final NumberFormatException e) { logger.warn("Failed to parse " + values[0], e); } } } }); } catch (final Exception e) { final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e); rcae.setLogLevel(CrawlingAccessException.WARN); throw rcae; } if (content == null) { content = StringUtil.EMPTY; } final String contentMeta = contentMetaBuf.toString().trim(); final FessConfig fessConfig = ComponentUtil.getFessConfig(); final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper(); final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId()); final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper(); final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper(); final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId()); final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig); final SystemHelper systemHelper = ComponentUtil.getSystemHelper(); final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper(); final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper(); String url = responseData.getUrl(); final String indexingTarget = crawlingConfig.getIndexingTarget(url); url = pathMappingHelper.replaceUrl(sessionId, url); final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD); String urlEncoding; final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue(); if (urlQueue != null && urlQueue.getEncoding() != null) { urlEncoding = urlQueue.getEncoding(); } else { urlEncoding = responseData.getCharSet(); } // cid final String configId = crawlingConfig.getConfigId(); if (configId != null) { putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId); } // expires if (documentExpires != null) { putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires); } // segment putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId); // content final StringBuilder buf = new StringBuilder(content.length() + 1000); if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) { buf.append(content); } if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) { if (buf.length() > 0) { buf.append(' '); } buf.append(contentMeta); } final String bodyBase = buf.toString().trim(); final String body = documentHelper.getContent(responseData, bodyBase, dataMap); putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body); if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig .isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) { if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) { final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " "); // text cache putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache); putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE); } } // digest putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger())); // title final String fileName = getFileName(url, urlEncoding); if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) { if (url.endsWith("/")) { if (StringUtil.isNotBlank(content)) { putResultDataBody( dataMap, fessConfig.getIndexFieldTitle(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger())); } else { putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel()); } } else { if (StringUtil.isBlank(fileName)) { putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:"))); } else { putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName); } } } // host putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url)); // site putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding)); // filename if (StringUtil.isNotBlank(fileName)) { putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName); } // url putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url); // created final Date now = systemHelper.getCurrentTime(); putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now); // TODO anchor putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), StringUtil.EMPTY); // mimetype putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType); if (fileTypeHelper != null) { // filetype putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType)); } // content_length putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength())); // last_modified final Date lastModified = responseData.getLastModified(); if (lastModified != null) { putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified); // timestamp putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified); } else { // timestamp putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now); } // indexingTarget putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget); // boost putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost()); // label: labelType final Set<String> labelTypeSet = new HashSet<>(); for (final String labelType : crawlingConfig.getLabelTypeValues()) { labelTypeSet.add(labelType); } final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper(); labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url)); putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet); // role: roleType final List<String> roleTypeList = getRoleTypes(responseData); stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p))); putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList); // TODO date // lang if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) { putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang()); } // id putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap)); // parentId String parentUrl = responseData.getParentUrl(); if (StringUtil.isNotBlank(parentUrl)) { parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl); putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl); putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap)); putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url); // set again } // from config final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT); final Map<String, String> metaConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.META); for (final Map.Entry<String, String> entry : metaConfigMap.entrySet()) { final String key = entry.getKey(); final String[] values = entry.getValue().split(","); for (final String value : values) { putResultDataWithTemplate(dataMap, key, metaDataMap.get(value), scriptConfigMap.get(key)); } } final Map<String, String> valueConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.VALUE); for (final Map.Entry<String, String> entry : valueConfigMap.entrySet()) { final String key = entry.getKey(); putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key)); } return dataMap; } private ExtractData getExtractData(final Extractor extractor, final InputStream in, final Map<String, String> params) { try { return extractor.getText(in, params); } catch (final RuntimeException e) { if (!fessConfig.isCrawlerIgnoreContentException()) { throw e; } if (logger.isDebugEnabled()) { logger.debug("Could not get a text.", e); } } return new ExtractData(); } private String getResourceName(final ResponseData responseData) { String name = responseData.getUrl(); final String enc = responseData.getCharSet(); if (name == null || enc == null) { return null; } name = name.replaceAll("/+$", StringUtil.EMPTY); final int idx = name.lastIndexOf('/'); if (idx >= 0) { name = name.substring(idx + 1); } try { return URLDecoder.decode(name, enc); } catch (final Exception e) { return name; } } protected String getHostOnFile(final String url) { if (StringUtil.isBlank(url)) { return StringUtil.EMPTY; // empty } if (url.startsWith("file:////")) { final String value = decodeUrlAsName(url.substring(9), true); final int pos = value.indexOf('/'); if (pos > 0) { return value.substring(0, pos); } else if (pos == -1) { return value; } else { return "localhost"; } } else if (url.startsWith("file:")) { return "localhost"; } return getHost(url); } protected List<String> getRoleTypes(final ResponseData responseData) { final List<String> roleTypeList = new ArrayList<>(); if (fessConfig.isSmbRoleFromFile() && responseData.getUrl().startsWith("smb://")) { final SambaHelper sambaHelper = ComponentUtil.getSambaHelper(); final ACE[] aces = (ACE[]) responseData.getMetaDataMap().get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES); if (aces != null) { for (final ACE item : aces) { final SID sid = item.getSID(); final String accountId = sambaHelper.getAccountId(sid); if (accountId != null) { roleTypeList.add(accountId); } } if (getLogger().isDebugEnabled()) { getLogger().debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString()); } } } return roleTypeList; } protected String getSiteOnFile(final String url, final String encoding) { if (StringUtil.isBlank(url)) { return StringUtil.EMPTY; // empty } if (url.startsWith("file:////")) { final String value = decodeUrlAsName(url.substring(9), true); return StringUtils.abbreviate("\\\\" + value.replace('/', '\\'), getMaxSiteLength()); } else if (url.startsWith("file:")) { final String value = decodeUrlAsName(url.substring(5), true); if (value.length() > 2 && value.charAt(2) == ':') { // Windows return StringUtils.abbreviate(value.substring(1).replace('/', '\\'), getMaxSiteLength()); } else { // Unix return StringUtils.abbreviate(value, getMaxSiteLength()); } } return getSite(url, encoding); } @Override public Object getData(final AccessResultData<?> accessResultData) { final byte[] data = accessResultData.getData(); if (data != null) { try { return SerializeUtil.fromBinaryToObject(data); } catch (final Exception e) { throw new CrawlerSystemException("Could not create an instanced from bytes.", e); } } return new HashMap<String, Object>(); } public void addMetaContentMapping(final String metaname, final String dynamicField) { if (metaContentMapping == null) { metaContentMapping = new HashMap<>(); } metaContentMapping.put(metaname, dynamicField); } }