/* * Copyright 2012-2017 CodeLibs Project and the Others. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, * either express or implied. See the License for the specific language * governing permissions and limitations under the License. */ package org.codelibs.fess.es.config.exentity; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import org.apache.http.auth.UsernamePasswordCredentials; import org.codelibs.core.lang.StringUtil; import org.codelibs.fess.Constants; import org.codelibs.fess.app.service.RequestHeaderService; import org.codelibs.fess.app.service.WebAuthenticationService; import org.codelibs.fess.crawler.client.CrawlerClientFactory; import org.codelibs.fess.crawler.client.http.Authentication; import org.codelibs.fess.crawler.client.http.HcHttpClient; import org.codelibs.fess.es.config.bsentity.BsWebConfig; import org.codelibs.fess.es.config.exbhv.LabelTypeBhv; import org.codelibs.fess.es.config.exbhv.WebConfigToLabelBhv; import org.codelibs.fess.mylasta.direction.FessConfig; import org.codelibs.fess.util.ComponentUtil; import org.codelibs.fess.util.ParameterUtil; import org.dbflute.cbean.result.ListResultBean; /** * @author FreeGen */ public class WebConfig extends BsWebConfig implements CrawlingConfig { private static final long serialVersionUID = 1L; private String[] labelTypeIds; protected volatile Pattern[] includedDocUrlPatterns; protected volatile Pattern[] excludedDocUrlPatterns; protected transient volatile Map<ConfigName, Map<String, String>> configParameterMap; private volatile List<LabelType> labelTypeList; public WebConfig() { super(); setBoost(1.0f); } /* (non-Javadoc) * @see org.codelibs.fess.db.exentity.CrawlingConfig#getLabelTypeIds() */ public String[] getLabelTypeIds() { if (labelTypeIds == null) { return StringUtil.EMPTY_STRINGS; } return labelTypeIds; } public void setLabelTypeIds(final String[] labelTypeIds) { this.labelTypeIds = labelTypeIds; } public List<LabelType> getLabelTypeList() { if (labelTypeList == null) { synchronized (this) { if (labelTypeList == null) { final FessConfig fessConfig = ComponentUtil.getFessConfig(); final WebConfigToLabelBhv webConfigToLabelBhv = ComponentUtil.getComponent(WebConfigToLabelBhv.class); final ListResultBean<WebConfigToLabel> mappingList = webConfigToLabelBhv.selectList(cb -> { cb.query().setWebConfigId_Equal(getId()); cb.specify().columnLabelTypeId(); cb.paging(fessConfig.getPageLabeltypeMaxFetchSizeAsInteger().intValue(), 1); }); final List<String> labelIdList = new ArrayList<>(); for (final WebConfigToLabel mapping : mappingList) { labelIdList.add(mapping.getLabelTypeId()); } final LabelTypeBhv labelTypeBhv = ComponentUtil.getComponent(LabelTypeBhv.class); labelTypeList = labelIdList.isEmpty() ? Collections.emptyList() : labelTypeBhv.selectList(cb -> { cb.query().setId_InScope(labelIdList); cb.query().addOrderBy_SortOrder_Asc(); cb.fetchFirst(fessConfig.getPageLabeltypeMaxFetchSizeAsInteger()); }); } } } return labelTypeList; } @Override public String[] getLabelTypeValues() { final List<LabelType> list = getLabelTypeList(); final List<String> labelValueList = new ArrayList<>(list.size()); for (final LabelType labelType : list) { labelValueList.add(labelType.getValue()); } return labelValueList.toArray(new String[labelValueList.size()]); } @Override public String getDocumentBoost() { return Float.valueOf(getBoost().floatValue()).toString(); } @Override public String getIndexingTarget(final String input) { if (includedDocUrlPatterns == null || excludedDocUrlPatterns == null) { initDocUrlPattern(); } if (includedDocUrlPatterns.length == 0 && excludedDocUrlPatterns.length == 0) { return Constants.TRUE; } for (final Pattern pattern : includedDocUrlPatterns) { if (pattern.matcher(input).matches()) { return Constants.TRUE; } } for (final Pattern pattern : excludedDocUrlPatterns) { if (pattern.matcher(input).matches()) { return Constants.FALSE; } } return Constants.TRUE; } protected synchronized void initDocUrlPattern() { if (includedDocUrlPatterns == null) { if (StringUtil.isNotBlank(getIncludedDocUrls())) { final List<Pattern> urlPatterList = new ArrayList<>(); final String[] urls = getIncludedDocUrls().split("[\r\n]"); for (final String u : urls) { if (StringUtil.isNotBlank(u) && !u.trim().startsWith("#")) { urlPatterList.add(Pattern.compile(u.trim())); } } includedDocUrlPatterns = urlPatterList.toArray(new Pattern[urlPatterList.size()]); } else { includedDocUrlPatterns = new Pattern[0]; } } if (excludedDocUrlPatterns == null) { if (StringUtil.isNotBlank(getExcludedDocUrls())) { final List<Pattern> urlPatterList = new ArrayList<>(); final String[] urls = getExcludedDocUrls().split("[\r\n]"); for (final String u : urls) { if (StringUtil.isNotBlank(u) && !u.trim().startsWith("#")) { urlPatterList.add(Pattern.compile(u.trim())); } } excludedDocUrlPatterns = urlPatterList.toArray(new Pattern[urlPatterList.size()]); } else if (includedDocUrlPatterns.length > 0) { excludedDocUrlPatterns = new Pattern[] { Pattern.compile(".*") }; } else { excludedDocUrlPatterns = new Pattern[0]; } } } public String getBoostValue() { if (boost != null) { return boost.toString(); } return null; } public void setBoostValue(final String value) { if (value != null) { try { boost = Float.parseFloat(value); } catch (final Exception e) {} } } @Override public String getConfigId() { return ConfigType.WEB.getConfigId(getId()); } @Override public Map<String, Object> initializeClientFactory(final CrawlerClientFactory clientFactory) { final WebAuthenticationService webAuthenticationService = ComponentUtil.getComponent(WebAuthenticationService.class); final RequestHeaderService requestHeaderService = ComponentUtil.getComponent(RequestHeaderService.class); final FessConfig fessConfig = ComponentUtil.getFessConfig(); // HttpClient Parameters final Map<String, Object> paramMap = new HashMap<>(); clientFactory.setInitParameterMap(paramMap); final Map<String, String> clientConfigMap = getConfigParameterMap(ConfigName.CLIENT); if (clientConfigMap != null) { paramMap.putAll(clientConfigMap); } // robots txt enabled if (paramMap.get(HcHttpClient.ROBOTS_TXT_ENABLED_PROPERTY) == null) { paramMap.put(HcHttpClient.ROBOTS_TXT_ENABLED_PROPERTY, !fessConfig.isCrawlerIgnoreRobotsTxt()); } final String userAgent = getUserAgent(); if (StringUtil.isNotBlank(userAgent)) { paramMap.put(HcHttpClient.USER_AGENT_PROPERTY, userAgent); } final List<WebAuthentication> webAuthList = webAuthenticationService.getWebAuthenticationList(getId()); final List<Authentication> basicAuthList = new ArrayList<>(); for (final WebAuthentication webAuth : webAuthList) { basicAuthList.add(webAuth.getAuthentication()); } paramMap.put(HcHttpClient.BASIC_AUTHENTICATIONS_PROPERTY, basicAuthList.toArray(new Authentication[basicAuthList.size()])); // request header final List<RequestHeader> requestHeaderList = requestHeaderService.getRequestHeaderList(getId()); final List<org.codelibs.fess.crawler.client.http.RequestHeader> rhList = new ArrayList<>(); for (final RequestHeader requestHeader : requestHeaderList) { rhList.add(requestHeader.getCrawlerRequestHeader()); } paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, rhList.toArray(new org.codelibs.fess.crawler.client.http.RequestHeader[rhList.size()])); // proxy credentials if (paramMap.get("proxyUsername") != null && paramMap.get("proxyPassword") != null) { paramMap.put(HcHttpClient.PROXY_CREDENTIALS_PROPERTY, new UsernamePasswordCredentials(paramMap.remove("proxyUsername") .toString(), paramMap.remove("proxyPassword").toString())); } return paramMap; } @Override public Map<String, String> getConfigParameterMap(final ConfigName name) { if (configParameterMap == null) { configParameterMap = ParameterUtil.createConfigParameterMap(getConfigParameter()); } final Map<String, String> configMap = configParameterMap.get(name); if (configMap == null) { return Collections.emptyMap(); } return configMap; } @Override public String getId() { return asDocMeta().id(); } public void setId(final String id) { asDocMeta().id(id); } public Long getVersionNo() { return asDocMeta().version(); } public void setVersionNo(final Long version) { asDocMeta().version(version); } @Override public String toString() { return "WebConfig [available=" + available + ", boost=" + boost + ", configParameter=" + configParameter + ", createdBy=" + createdBy + ", createdTime=" + createdTime + ", depth=" + depth + ", excludedDocUrls=" + excludedDocUrls + ", excludedUrls=" + excludedUrls + ", includedDocUrls=" + includedDocUrls + ", includedUrls=" + includedUrls + ", intervalTime=" + intervalTime + ", timeToLive=" + timeToLive + ", maxAccessCount=" + maxAccessCount + ", name=" + name + ", numOfThread=" + numOfThread + ", permissions=" + Arrays.toString(permissions) + ", sortOrder=" + sortOrder + ", updatedBy=" + updatedBy + ", updatedTime=" + updatedTime + ", urls=" + urls + ", userAgent=" + userAgent + "]"; } }