/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.source.etools; import java.util.Collections; import java.util.Map; import org.apache.http.client.HttpResponseException; import org.carrot2.core.Document; import org.carrot2.core.LanguageCode; import org.carrot2.core.ProcessingException; import org.carrot2.core.attribute.Internal; import org.carrot2.core.attribute.Processing; import org.carrot2.source.SearchEngineResponse; import org.carrot2.source.xml.RemoteXmlSimpleSearchEngineBase; import org.carrot2.util.StringUtils; import org.carrot2.util.attribute.Attribute; import org.carrot2.util.attribute.AttributeLevel; import org.carrot2.util.attribute.Bindable; import org.carrot2.util.attribute.DefaultGroups; import org.carrot2.util.attribute.Group; import org.carrot2.util.attribute.Input; import org.carrot2.util.attribute.Label; import org.carrot2.util.attribute.Level; import org.carrot2.util.attribute.constraint.IntRange; import org.carrot2.util.resource.ClassResource; import org.carrot2.util.resource.IResource; import org.carrot2.shaded.guava.common.base.Joiner; import org.carrot2.shaded.guava.common.base.Strings; import org.carrot2.shaded.guava.common.collect.Maps; /** * A Carrot2 input component for the eTools service (http://www.etools.ch). For commercial * licensing of the eTools feed, please e-mail: <code>contact@comcepta.com</code>. */ @Bindable(prefix = "EToolsDocumentSource") public class EToolsDocumentSource extends RemoteXmlSimpleSearchEngineBase { /** * Base URL for the eTools service */ @Input @Processing @Internal @Attribute @Label("Service URL") @Level(AttributeLevel.ADVANCED) @Group(SERVICE) public String serviceUrlBase = "http://www.etools.ch/partnerSearch.do"; /** * Enumeration for countries supported by {@link EToolsDocumentSource}, see * {@link EToolsDocumentSource#country}. */ public enum Country { ALL("web"), AUSTRIA("AT"), FRANCE("FR"), GERMANY("DE"), GREAT_BRITAIN("GB"), ITALY("IT"), LICHTENSTEIN("LI"), SPAIN("ES"), SWITZERLAND("CH"); private String code; private Country(String code) { this.code = code; } @Override public String toString() { return StringUtils.identifierToHumanReadable(name()); } public String getCode() { return code; } } /** * Determines the country of origin for the returned search results. */ @Input @Processing @Attribute @Label("Country") @Level(AttributeLevel.MEDIUM) @Group(DefaultGroups.FILTERING) public Country country = Country.ALL; /** * Enumeration for languages supported by {@link EToolsDocumentSource}, see * {@link EToolsDocumentSource#language}. */ public enum Language { ALL("all"), ENGLISH("en"), FRENCH("fr"), GERMAN("de"), ITALIAN("it"), SPANISH("es"); /** * Maps <b>some</b> of the values of this enum to {@link LanguageCode}s. */ private final static Map<Language, LanguageCode> TO_LANGUAGE_CODE; static { final Map<Language, LanguageCode> map = Maps.newEnumMap(Language.class); map.put(ENGLISH, LanguageCode.ENGLISH); map.put(FRENCH, LanguageCode.FRENCH); map.put(GERMAN, LanguageCode.GERMAN); map.put(ITALIAN, LanguageCode.ITALIAN); map.put(SPANISH, LanguageCode.SPANISH); TO_LANGUAGE_CODE = Collections.unmodifiableMap(map); } private String code; private Language(String code) { this.code = code; } @Override public String toString() { return StringUtils.identifierToHumanReadable(name()); } public String getCode() { return code; } /** * Returns a corresponding {@link LanguageCode} or <code>null</code> if no * {@link LanguageCode} corresponds to this {@link Language} constant. */ public LanguageCode toLanguageCode() { return TO_LANGUAGE_CODE.get(this); } } /** * Determines the language of the returned search results. */ @Input @Processing @Attribute @Label("Language") @Level(AttributeLevel.MEDIUM) @Group(DefaultGroups.FILTERING) public Language language = Language.ENGLISH; /** * Maximum time in milliseconds to wait for all data sources to return results. */ @Input @Processing @Attribute @IntRange(min = 0) @Label("Timeout") @Level(AttributeLevel.ADVANCED) @Group(SERVICE) public int timeout = 4000; /** * Determines which data sources to search. */ @Input @Processing @Attribute @Label("Data sources") @Level(AttributeLevel.ADVANCED) @Group(SERVICE) public DataSources dataSources = DataSources.ALL; /** * Enumeration for the data sources modes supported by {@link EToolsDocumentSource}, * see {@link EToolsDocumentSource#dataSources}. */ public enum DataSources { /** * All eTools data sources will be searched. */ ALL("all"), /** * Five fastest eTools data sources at the moment will be searched. */ FASTEST("fastest"); private String code; private DataSources(String code) { this.code = code; } @Override public String toString() { return StringUtils.identifierToHumanReadable(name()); } public String getCode() { return code; } } /** * If enabled, excludes offensive content from the results. */ @Input @Processing @Attribute @Label("Safe search") @Level(AttributeLevel.BASIC) @Group(DefaultGroups.FILTERING) public boolean safeSearch = false; /** * Site URL or comma-separated list of site site URLs to which the returned results * should be restricted. For example: <tt>wikipedia.org</tt> or * <tt>en.wikipedia.org,de.wikipedia.org</tt>. Very larger lists of site restrictions * (larger than 2000 characters) may result in a processing exception. */ @Input @Processing @Attribute @Label("Site restriction") @Level(AttributeLevel.ADVANCED) @Group(DefaultGroups.FILTERING) public String site = null; /** * eTools partner identifier. If you have commercial arrangements with eTools, specify * your partner id here. */ @Input @Processing @Attribute @Internal @Label("Partner ID") @Level(AttributeLevel.ADVANCED) @Group(SERVICE) public String partnerId = "Carrot2"; /** * eTools customer identifier. For commercial use of eTools, please e-mail: * <code>contact@comcepta.com</code> to obtain your customer identifier. */ @Input @Processing @Attribute @Label("Customer ID") @Level(AttributeLevel.MEDIUM) @Group(SERVICE) public String customerId = ""; /** Some constants for calculation of request parameters */ private static final int MAX_DATA_SOURCE_RESULTS = 40; private static final int FASTEST_SOURCES_COUNT = 5; private static final int ALL_SOURCES_COUNT = 10; @Override protected IResource getXsltResource() { return new ClassResource(EToolsDocumentSource.class, "etools-to-c2.xsl"); } @Override protected String buildServiceUrl() { String urlBase = serviceUrlBase; if (urlBase.endsWith("/")) { urlBase = urlBase.substring(0, urlBase.length() - 1); } return urlBase + "?partner=" + partnerId + "&query=" + org.carrot2.util.StringUtils.urlEncodeWrapException(query, "UTF-8") + "&dataSourceResults=" + Integer.toString(getDataSourceResultsCount()) + "&maxRecords=" + results + "&language=" + language.getCode() + "&timeout=" + Integer.toString(timeout) + "&dataSources=" + dataSources.getCode() + "&safeSearch=" + Boolean.toString(safeSearch) + "&country=" + country.getCode() + "&customerId=" + StringUtils.urlEncodeWrapException(customerId, "UTF-8"); } @Override protected SearchEngineResponse fetchSearchResponse() throws Exception { try { return super.fetchSearchResponse(); } catch (Exception e) { if (e instanceof HttpResponseException) { HttpResponseException httpException = (HttpResponseException) e; int sCode = httpException.getStatusCode(); if (sCode == 302 || sCode == 403) { throw new IpBannedException(httpException); } } throw e; } } /** * Returns the number of results per data source, estimated based on the total * requested results. */ int getDataSourceResultsCount() { int sources = DataSources.ALL.equals(dataSources) ? ALL_SOURCES_COUNT : FASTEST_SOURCES_COUNT; if (results == 0) { return 0; } int rawDataSourceResults = results / sources; return Math.min(((rawDataSourceResults + 9) / 10 + 1) * 10, MAX_DATA_SOURCE_RESULTS); } @Override public void beforeProcessing() throws ProcessingException { super.beforeProcessing(); if (!Strings.isNullOrEmpty(site)) { String [] sites = site.split(",\\s*"); for (int i = 0; i < sites.length; i++) { if (!sites[i].startsWith("site:")) { sites[i] = "site:" + sites[i]; } } this.query = "(" + this.query + ") AND (" + Joiner.on(" OR ").join(sites) + ")"; if (this.query.length() > 2048) { throw new ProcessingException( "Query length must not exceed 2048 characters"); } } } @Override protected void afterFetch(SearchEngineResponse response) { // Set document's language if (language != Language.ALL) { for (Document document : response.results) { document.setLanguage(language.toLanguageCode()); } } } }