/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.source.microsoft.v5;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import org.apache.http.Header;
import org.apache.http.NameValuePair;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.carrot2.core.Document;
import org.carrot2.core.IDocumentSource;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.CommonAttributes;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.shaded.guava.common.base.Strings;
import org.carrot2.shaded.guava.common.util.concurrent.RateLimiter;
import org.carrot2.source.MultipageSearchEngine;
import org.carrot2.source.MultipageSearchEngineMetadata;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.source.SimpleSearchEngine;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.Required;
import org.carrot2.util.attribute.constraint.NotBlank;
import org.carrot2.util.httpclient.HttpRedirectStrategy;
import org.carrot2.util.httpclient.HttpUtils;
/**
* A {@link IDocumentSource} fetching web page search results from Bing,
* using Search API V5.
*
* <p>Important: there are limits for free use of the above API (beyond which it is a
* paid service).
*
* @see "https://msdn.microsoft.com/en-us/library/mt604056.aspx"
*/
@Bindable(prefix = "Bing5DocumentSource", inherit = CommonAttributes.class)
public class Bing5DocumentSource extends MultipageSearchEngine
{
/**
* System property name for passing Bing API key.
*
* You can also override the key per-controller or request
* via init or runtime attributes.
*/
public static final String SYSPROP_BING5_API = "bing5.key";
/**
* Default timeout.
*/
private static final int BING_TIMEOUT = (int) TimeUnit.SECONDS.toMillis(10);
/**
* Max concurrent requests to Bing.
*
* @see #RATE_LIMITER
*/
private static final int MAX_CONCURRENT_REQUESTS = 4;
/**
* As per Bing's official guidelines, limit the rate to a maximum of 5 requests per second.
*/
static final RateLimiter RATE_LIMITER = RateLimiter.create(MAX_CONCURRENT_REQUESTS);
/**
* REST endpoint.
*/
private final static String SERVICE_URL = "https://api.cognitive.microsoft.com/bing/v5.0/search";
/** Web search specific metadata. */
final static MultipageSearchEngineMetadata METADATA = new MultipageSearchEngineMetadata(30, 200);
/**
* The API key used to authenticate requests. You will have to provide your own API key.
* There is a free monthly grace request limit.
*
* <p>By default takes the system property's value under key: <code>bing5.key</code>.</p>
*/
@Init
@Processing
@Input
@Attribute
@Label("Application API key")
@Level(AttributeLevel.BASIC)
@Group(SERVICE)
@Required
@NotBlank
public String apiKey = System.getProperty(SYSPROP_BING5_API);
/**
* Search type filter. We only use webpages since news and images on a generic
* websearch query are always returned in very small numbers and cannot be windowed.
*/
private final SourceType sourceType = SourceType.WEBPAGES;
/**
* Site restriction to return value under a given URL. Example:
* <tt>http://www.wikipedia.org</tt> or simply <tt>wikipedia.org</tt>.
*/
@Processing
@Input
@Attribute
@Label("Site restriction")
@Level(AttributeLevel.ADVANCED)
@Group(DefaultGroups.FILTERING)
public String site;
/**
* Language and country/region information for the request.
*/
@Input
@Processing
@Attribute
@Label("Market")
@Level(AttributeLevel.BASIC)
@Group(DefaultGroups.FILTERING)
public MarketOption market = MarketOption.ENGLISH_UNITED_STATES;
/**
* Adult search restriction (porn filter).
*/
@Processing
@Input
@Attribute
@Label("Safe search")
@Level(AttributeLevel.MEDIUM)
@Group(DefaultGroups.FILTERING)
public AdultOption adult;
/**
* HTTP redirect response strategy (follow or throw an error).
*/
@Input
@Processing
@Attribute
@Label("HTTP redirect strategy")
@Level(AttributeLevel.MEDIUM)
@Group(SimpleSearchEngine.SERVICE)
@Internal
public HttpRedirectStrategy redirectStrategy = HttpRedirectStrategy.NO_REDIRECTS;
/**
* Respect official guidelines concerning rate limits. If set to false,
* rate limits are not observed.
*/
@Input
@Processing
@Attribute
@Label("Respect request rate limits")
@Level(AttributeLevel.ADVANCED)
@Group(SimpleSearchEngine.SERVICE)
public boolean respectRateLimits = true;
private final MultipageSearchEngineMetadata metadata;
private final String serviceURL;
public Bing5DocumentSource() {
this(METADATA, SERVICE_URL);
}
protected Bing5DocumentSource(MultipageSearchEngineMetadata metadata, String serviceURL) {
this.metadata = metadata;
this.serviceURL = serviceURL;
}
@Override
public final void process() throws ProcessingException {
process(metadata, getSharedExecutor(MAX_CONCURRENT_REQUESTS, this.getClass()));
}
@Override
protected final void process(MultipageSearchEngineMetadata metadata, ExecutorService executor) throws ProcessingException
{
if (Strings.isNullOrEmpty(apiKey)) {
throw new ProcessingException("Bing V5 API requires a key. See "
+ Bing5DocumentSource.class.getSimpleName() + " class documentation.");
}
super.process(metadata, executor);
}
/**
* Create a single page fetcher for the search range.
*/
@Override
protected final Callable<SearchEngineResponse> createFetcher(final SearchRange bucket)
{
return new SearchEngineResponseCallable()
{
public SearchEngineResponse search() throws Exception
{
return doSearch(query, bucket.start, bucket.results);
}
};
}
/**
* Run a single request to Bing API V5.
*/
private final SearchEngineResponse doSearch(String query, int startAt, int totalResultsRequested)
throws Exception
{
if (respectRateLimits) {
RATE_LIMITER.acquire();
}
if (!Strings.isNullOrEmpty(site)) {
query = Strings.nullToEmpty(query) + " site:" + site;
}
final ArrayList<NameValuePair> params = new ArrayList<>();
params.add(new BasicNameValuePair("q", query == null ? "" : query.trim()));
params.add(new BasicNameValuePair("offset", Integer.toString(startAt)));
params.add(new BasicNameValuePair("count", Integer.toString(totalResultsRequested)));
if (market != null) {
params.add(new BasicNameValuePair("mkt", market.marketCode));
}
if (adult != null) {
params.add(new BasicNameValuePair("safeSearch", adult.name()));
}
augmentSearchParameters(params);
List<Header> headers = Arrays.<Header> asList(
new BasicHeader("Ocp-Apim-Subscription-Key", apiKey));
augmentSearchHeaders(headers);
HttpUtils.Response response = null;
retry:
do {
response = HttpUtils.doGET(
serviceURL,
params,
headers,
/* user */ null,
/* pwd */ null,
BING_TIMEOUT,
redirectStrategy.value());
if (response.status == 429) {
for (String [] header : response.headers) {
if ("Retry-After".equalsIgnoreCase(header[0])) {
long secs = Long.parseLong(header[1]);
if (secs <= 5) {
Thread.sleep(TimeUnit.SECONDS.toMillis(secs));
continue retry;
} else {
// We'd have to wait too long, key saturated or something else is wrong.
// break out.
}
}
}
}
// Always break
break;
} while (true);
BingResponse parsed;
InputStream is = response.getPayloadAsStream();
try {
parsed = BingResponse.parse(is);
} finally {
is.close();
}
if (parsed instanceof ErrorResponse) {
throw new IOException(((ErrorResponse) parsed).errors.get(0).message);
} else if (parsed instanceof UnstructuredResponse) {
throw new IOException(((UnstructuredResponse) parsed).message);
} else {
SearchEngineResponse ser = new SearchEngineResponse();
ser.metadata.put(SearchEngineResponse.COMPRESSION_KEY, response.compression);
handleResponse(parsed, ser);
if (market != null) {
LanguageCode languageCode = market.toLanguageCode();
for (Document doc : ser.results) {
doc.setLanguage(languageCode);
}
}
return ser;
}
}
protected void augmentSearchHeaders(List<Header> headers) {
}
protected void augmentSearchParameters(List<NameValuePair> params) {
params.add(new BasicNameValuePair("responseFilter", sourceType.responseFilter()));
}
protected void handleResponse(BingResponse response, SearchEngineResponse ser) {
SearchResponse searchResponse = (SearchResponse) response;
if (searchResponse.webPages != null) {
ser.metadata.put(SearchEngineResponse.RESULTS_TOTAL_KEY, searchResponse.webPages.totalEstimatedMatches);
for (SearchResponse.WebPages.Result r : searchResponse.webPages.value) {
Document doc = new Document(r.name, r.snippet, r.displayUrl);
if (r.displayUrl != null) {
doc.setField(Document.CLICK_URL, r.url);
}
ser.results.add(doc);
}
}
}
}