/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.source.opensearch; import java.util.Map; import java.util.concurrent.Callable; import org.carrot2.core.Document; import org.carrot2.core.IDocumentSource; import org.carrot2.core.ProcessingException; import org.carrot2.core.attribute.Init; import org.carrot2.core.attribute.Processing; import org.carrot2.source.MultipageSearchEngine; import org.carrot2.source.MultipageSearchEngineMetadata; import org.carrot2.source.SearchEngineResponse; import org.carrot2.util.StringUtils; import org.carrot2.util.attribute.Attribute; import org.carrot2.util.attribute.AttributeLevel; import org.carrot2.util.attribute.Bindable; import org.carrot2.util.attribute.Group; import org.carrot2.util.attribute.Input; import org.carrot2.util.attribute.Label; import org.carrot2.util.attribute.Level; import org.carrot2.util.attribute.Required; import org.carrot2.util.attribute.constraint.IntRange; import org.carrot2.util.resource.URLResourceWithParams; import org.slf4j.Logger; import org.carrot2.shaded.guava.common.collect.Maps; import com.sun.syndication.fetcher.FeedFetcher; import com.sun.syndication.fetcher.impl.HttpURLFeedFetcher; /** * A {@link IDocumentSource} fetching {@link Document}s (search results) from an * OpenSearch feed. * <p> * Based on code donated by Julien Nioche. * * @see <a href="http://www.opensearch.org">OpenSearch.org</a> */ @Bindable(prefix = "OpenSearchDocumentSource") public class OpenSearchDocumentSource extends MultipageSearchEngine { /** Logger for this class. */ final static Logger logger = org.slf4j.LoggerFactory.getLogger(OpenSearchDocumentSource.class); /** * Maximum concurrent threads from all instances of this component. */ private static final int MAX_CONCURRENT_THREADS = 10; /** * URL to fetch the search feed from. The URL template can contain variable place * holders as defined by the OpenSearch specification that will be replaced during * runtime. The format of the place holder is <code>${variable}</code>. The following * variables are supported: * <ul> * <li><code>searchTerms</code> will be replaced by the query</li> * <li><code>startIndex</code> index of the first result to be searched. Mutually * exclusive with <code>startPage</code></li> * <li><code>startPage</code> index of the first result * to be searched. Mutually exclusive with <code>startIndex</code>.</li> * <li><code>count</code> the number of search results per page</li> * </ul> * * <p>Example URL feed templates for public services:</p> * <dl> * <dt>nature.com</dt> * <dd><code>http://www.nature.com/opensearch/request?interface=opensearch&operation=searchRetrieve&query=${searchTerms}&startRecord=${startIndex}&maximumRecords=${count}&httpAccept=application/rss%2Bxml</code></dd> * <dt>indeed.com</dt> * <dd><code>http://www.indeed.com/opensearch?q=${searchTerms}&start=${startIndex}&limit=${count}</code></dd> * </dl> */ @Input @Processing @Init @Attribute @Required @Label("Feed URL template") @Level(AttributeLevel.BASIC) @Group(SERVICE) public String feedUrlTemplate; /** * Results per page. The number of results per page the document source will expect * the feed to return. */ @Input @Processing @Init @Attribute @Required @IntRange(min = 1) @Label("Results per page") @Level(AttributeLevel.BASIC) @Group(SERVICE) public int resultsPerPage = 50; /** * Maximum number of results. The maximum number of results the document source can * deliver. */ @Input @Processing @Init @Attribute @IntRange(min = 1) @Label("Maximum results") @Level(AttributeLevel.BASIC) @Group(SERVICE) public int maximumResults = 1000; /** * Additional parameters to be appended to {@link #feedUrlTemplate} on each request. */ @Input @Init @Processing @Attribute @Label("Feed URL parameters") @Level(AttributeLevel.ADVANCED) @Group(SERVICE) public Map<String, String> feedUrlParams = null; /** * User agent header. The contents of the User-Agent HTTP header to use when making * requests to the feed URL. If empty or <code>null</code> value is provided, * the following User-Agent will be sent: <code>Rome Client (http://tinyurl.com/64t5n) * Ver: UNKNOWN</code>. */ @Input @Init @Processing @Attribute @Label("User agent") @Level(AttributeLevel.ADVANCED) @Group(SERVICE) public String userAgent = null; /** * Search engine metadata create upon initialization. */ private MultipageSearchEngineMetadata metadata; /** Fetcher for OpenSearch feed. */ private FeedFetcher feedFetcher; /** searchTerms variable */ private static final String SEARCH_TERMS_VARIABLE_NAME = "searchTerms"; /** startIndex variable */ private static final String START_INDEX_VARIABLE_NAME = "startIndex"; /** startPage variable */ private static final String START_PAGE_VARIABLE_NAME = "startPage"; /** count variable */ private static final String COUNT_VARIABLE_NAME = "count"; @Override public void beforeProcessing() { // Verify that the attributes are legal final boolean hasStartPage = URLResourceWithParams.containsAttributePlaceholder( feedUrlTemplate, START_PAGE_VARIABLE_NAME); final boolean hasStartIndex = URLResourceWithParams.containsAttributePlaceholder( feedUrlTemplate, START_INDEX_VARIABLE_NAME); if (!(hasStartPage ^ hasStartIndex)) { throw new ProcessingException( "The feedUrlTemplate must contain either " + URLResourceWithParams .formatAttributePlaceholder(START_INDEX_VARIABLE_NAME) + " or " + URLResourceWithParams .formatAttributePlaceholder(START_PAGE_VARIABLE_NAME) + " variable"); } if (!URLResourceWithParams.containsAttributePlaceholder(feedUrlTemplate, SEARCH_TERMS_VARIABLE_NAME)) { throw new ProcessingException( "The feedUrlTemplate must contain " + URLResourceWithParams .formatAttributePlaceholder(SEARCH_TERMS_VARIABLE_NAME) + " variable"); } if (resultsPerPage == 0) { throw new ProcessingException("resultsPerPage must be set"); } this.metadata = new MultipageSearchEngineMetadata(resultsPerPage, maximumResults, hasStartPage); this.feedFetcher = new HttpURLFeedFetcher(); if (org.apache.commons.lang.StringUtils.isNotBlank(this.userAgent)) { this.feedFetcher.setUserAgent(this.userAgent); } } @Override public void process() throws ProcessingException { super.process(metadata, getSharedExecutor(MAX_CONCURRENT_THREADS, this.getClass())); } @Override protected Callable<SearchEngineResponse> createFetcher(final SearchRange bucket) { return new SearchEngineResponseCallable() { public SearchEngineResponse search() throws Exception { // Replace variables in the URL final Map<String, Object> values = Maps.newHashMap(); values.put(SEARCH_TERMS_VARIABLE_NAME, query); values.put(START_INDEX_VARIABLE_NAME, bucket.start + 1); values.put(START_PAGE_VARIABLE_NAME, bucket.start + 1); values.put(COUNT_VARIABLE_NAME, bucket.results); final StringBuilder urlExtension = new StringBuilder( URLResourceWithParams.substituteAttributes(feedUrlTemplate, values)); if (feedUrlParams != null) { for (Map.Entry<String, String> entry : feedUrlParams.entrySet()) { urlExtension.append('&'); urlExtension.append(entry.getKey()); urlExtension.append('='); urlExtension.append(StringUtils.urlEncodeWrapException(entry .getValue(), "UTF-8")); } } final String url = urlExtension.toString(); logger.debug("Fetching URL: " + url); return RomeFetcherUtils.fetchUrl(url, feedFetcher); } }; } }