package edu.uncc.cs.watsonsim.search; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; import org.apache.http.client.fluent.*; import org.apache.http.client.utils.URIBuilder; import org.apache.log4j.Logger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import edu.uncc.cs.watsonsim.Environment; import edu.uncc.cs.watsonsim.Passage; import edu.uncc.cs.watsonsim.Score; import edu.uncc.cs.watsonsim.scorers.Merge; /** * Internet-enabled Searcher for Bing. * * You will need a Bing api key, which you can (as of the time of this writing) * get from <a href="http://datamarket.azure.com">Microsoft</a> * * Bing gives around 5000 queries per month, which means that in most cases for * sustained development you will need to use CachingSearcher. * * @see CachingSearcher * @see privatedata.bingAPIKey * @author Sean Gallagher * @author Stephen Stanton * @author D Haval */ public class BingSearcher extends Searcher { private final String key; private final Logger log = Logger.getLogger(getClass()); public BingSearcher(Environment env) { super(env); Score.register("BING_ANSWER_RANK", -1, Merge.Mean); Score.register("BING_ANSWER_PRESENT", 0.0, Merge.Sum); key = env.getConfOrDie("bing_api_key"); } public List<Passage> query(String query) { URI uri = URI.create(""); // A bogus workaround for "may not have been initialized" try { uri = new URIBuilder() .setScheme("https") .setHost("api.datamarket.azure.com") .setPath("/Data.ashx/Bing/Search/v1/Web") .addParameter("Query", String.format("'%s'", query)).build(); // Should we place it in quotes? //.addParameter("$top", "50") //.addParameter("$format", "Atom").build(); } catch (URISyntaxException e1) { /* This bogus block is required by Java, * but strictly speaking new URIBuilder() can't actually throw * this error because it has no input (so there can be no syntax * error). Hence, this block is unreachable. */ e1.printStackTrace(); } List<Passage> results = new ArrayList<Passage>(); try { String resp = Executor .newInstance() .auth(key, key) .execute(Request.Get(uri)) .returnContent().asString(); Document doc = Jsoup.parse(resp); List<Element> elements = doc.select("entry"); // Perhaps limit to MAX_RESULTS? for (int i=0; i < elements.size(); i++) { Element e = elements.get(i); results.add(new Passage( "bing", // Engine e.select("d|Title").text(), // Title e.select("d|Description").text(), // Full Text e.select("d|Url").text()) // Reference .score("BING_ANSWER_RANK", (double) i) // Score .score("BING_ANSWER_PRESENT", 1.0) ); } log.info("Retrieved " + elements.size() + " candidates from Bing."); } catch (IOException e) { log.error("Error while searching with Bing. Ignoring. Details follow."); log.error(e.getMessage()); } return results; } }