/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer.bing; import act.server.BingCacheMongoDB; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.mongodb.BasicDBList; import com.mongodb.BasicDBObject; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.utils.URIBuilder; import org.apache.http.entity.ContentType; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.HttpStatus; import org.apache.http.impl.conn.BasicHttpClientConnectionManager; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.InputStreamReader; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.net.URLEncoder; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.HashSet; import java.util.Set; /** * BingSearchResults provides methods for: * - querying the Bing Search API, * - caching its results in a Mongo database * - returning searched or cached results * - finding the best name for a molecule */ public class BingSearchResults { private static final Logger LOGGER = LogManager.getFormatterLogger(BingSearchResults.class); // Full path to the account key for the Bing Search API (on the NAS) private static final String ACCOUNT_KEY_FILEPATH = "data/bing/bing_search_api_account_key.txt"; // Maximum number of results possible per API call. This is the maximum value for URL parameter "count" private static final Integer MAX_RESULTS_PER_CALL = 50; // How many search results should be retrieved when getting topSearchResults private static final Integer TOP_N = 50; // The centralized location for caching Bing Search queries. // TODO: make this changeable without a code change (with CLI maybe?) private static final String BING_CACHE_HOST = "localhost"; private static final int BING_CACHE_MONGO_PORT = 27777; private static final String BING_CACHE_MONGO_DATABASE = "bingsearch"; private static final String BING_API_HOST = "api.cognitive.microsoft.com"; private static final String BING_API_PATH = "/bing/v5.0/search"; private static ObjectMapper mapper = new ObjectMapper(); private BingCacheMongoDB bingCacheMongoDB; private BasicHttpClientConnectionManager basicConnManager; private String accountKey; private boolean cacheOnly; public BingSearchResults() { this(ACCOUNT_KEY_FILEPATH); } public BingSearchResults(boolean cacheOnly) { this.cacheOnly = cacheOnly; this.bingCacheMongoDB = new BingCacheMongoDB(BING_CACHE_HOST, BING_CACHE_MONGO_PORT, BING_CACHE_MONGO_DATABASE); } public BingSearchResults(String accountKeyFilepath) { this.cacheOnly = false; this.bingCacheMongoDB = new BingCacheMongoDB(BING_CACHE_HOST, BING_CACHE_MONGO_PORT, BING_CACHE_MONGO_DATABASE); this.basicConnManager = new BasicHttpClientConnectionManager(); try { this.accountKey = getAccountKey(accountKeyFilepath); } catch (IOException e) { String msg = String.format("Bing Searcher could not find account key at %s", accountKeyFilepath); LOGGER.error(msg); throw new RuntimeException(msg); } } /** This function gets the account key located on the NAS * @return the account key to be used for authentication purposes * @throws IOException */ private static String getAccountKey(String accountKeyFilename) throws IOException { FileInputStream fs = new FileInputStream(accountKeyFilename); BufferedReader br = new BufferedReader(new InputStreamReader(fs)); String account_key = br.readLine(); return account_key; } /** This function fetches the total number of Bing search results and return the "totalCountSearchResult". * @param formattedName name that will be used as search query, lowercase formatted * @return the total count search results from Bing search * @throws IOException */ private Long fetchTotalCountSearchResults(String formattedName) throws IOException { LOGGER.debug("Updating totalCountSearchResults for name: %s.", formattedName); final String queryTerm = URLEncoder.encode(formattedName, StandardCharsets.UTF_8.name()); // Set count to 1 and offset to 0 since we need only one search result to extract the estimated count. final int count = 1; final int offset = 0; JsonNode results = fetchBingSearchAPIResponse(queryTerm, count, offset); return results.path("totalEstimatedMatches").asLong(); } /** This function fetches the topN Bing search results for the current instance of NameSearchResult object * and updates the "topSearchResults" instance variable. Existing value is overridden. * @param formattedName name that will be used as search query, lowercase formatted * @param topN number of Web results to fetch from Bing Search API * @return returns a set of SearchResults containing the topN Bing search results * @throws IOException */ private Set<SearchResult> fetchTopSearchResults(String formattedName, Integer topN) throws IOException { LOGGER.debug("Updating topSearchResults for name: %s.", formattedName); Set<SearchResult> topSearchResults = new HashSet<>(); final String queryTerm = URLEncoder.encode(formattedName, StandardCharsets.UTF_8.name()); // The Bing search API cannot return more than 100 results at once, but it is possible to iterate // through the results. // For example, if we need topN = 230 results, we will issue the following queries // (count and offset are URL parameters) // QUERY 1: count = 100, offset = 0 // QUERY 2: count = 100, offset = 100 // QUERY 3: count = 30, offset = 200 Integer iterations = topN / MAX_RESULTS_PER_CALL; Integer remainder = topN % MAX_RESULTS_PER_CALL; for (int i = 0; i < iterations; i++) { topSearchResults.addAll(fetchSearchResults(queryTerm, MAX_RESULTS_PER_CALL, MAX_RESULTS_PER_CALL * i)); } if (remainder > 0) { topSearchResults.addAll(fetchSearchResults(queryTerm, remainder, MAX_RESULTS_PER_CALL * iterations)); } return topSearchResults; } /** This function issues a Bing Search API call and parses the response to extract a set of SearchResults. * @param query (String) the term to query for. * @param count (int) URL parameter indicating how many results to return. Max value is 100. * @param offset (int) URL parameter indicating the offset for results. * @return returns a set of SearchResults containing [count] search results with offset [offset] * @throws IOException */ private Set<SearchResult> fetchSearchResults(String query, int count, int offset) throws IOException { if (count > MAX_RESULTS_PER_CALL) { LOGGER.warn("Number of results requested (%d) was too high. Will get only %d", count, MAX_RESULTS_PER_CALL); } Set<SearchResult> searchResults = new HashSet<>(); JsonNode results = fetchBingSearchAPIResponse(query, count, offset); final JsonNode webResults = results.path("value"); for (JsonNode webResult : webResults) { SearchResult searchResult = new SearchResult(); searchResult.populateFromJsonNode(webResult); searchResults.add(searchResult); } return searchResults; } /** This function issues a Bing search API call and gets the JSONObject containing the relevant results * (including TotalCounts and SearchResults) * @param queryTerm (String) the term to query for. * @param count (int) URL parameter indicating how many results to return. Max value is 100. * @param offset (int) URL parameter indicating the offset for results. * @return a JSONObject containing the response. * @throws IOException */ private JsonNode fetchBingSearchAPIResponse(String queryTerm, Integer count, Integer offset) throws IOException { if (count <= 0) { LOGGER.error("Bing Search API was called with \"count\" URL parameter = 0. Please request at least one result."); return null; } URI uri = null; try { // Bing URL pattern. Note that we use composite queries to allow retrieval of the total results count. // Transaction cost is [count] bings, where [count] is the value of the URL parameter "count". // In other words, we can make 5M calls with [count]=1 per month. // Example: https://api.cognitive.microsoft.com/bing/v5.0/search?q=porsche&responseFilter=webpages uri = new URIBuilder() .setScheme("https") .setHost(BING_API_HOST) .setPath(BING_API_PATH) // Wrap the query term (%s) with double quotes (%%22) for exact search .setParameter("q", String.format("%s", queryTerm)) // Restrict response to Web Pages only .setParameter("responseFilter", "webpages") // "count" parameter. .setParameter("count", count.toString()) // "offset" parameter. .setParameter("offset", offset.toString()) .build(); } catch (URISyntaxException e) { LOGGER.error("An error occurred when trying to build the Bing Search API URI", e); } JsonNode results; HttpGet httpget = new HttpGet(uri); // Yay for un-encrypted account key! // TODO: actually is there a way to encrypt it? httpget.setHeader("Ocp-Apim-Subscription-Key", accountKey); CloseableHttpClient httpclient = HttpClients.custom().setConnectionManager(basicConnManager).build(); try (CloseableHttpResponse response = httpclient.execute(httpget)) { Integer statusCode = response.getStatusLine().getStatusCode(); // TODO: The Web Search API returns useful error messages, we could use them to have better insights on failures. // See: https://dev.cognitive.microsoft.com/docs/services/56b43eeccf5ff8098cef3807/operations/56b4447dcf5ff8098cef380d if (!statusCode.equals(HttpStatus.SC_OK)) { LOGGER.error("Bing Search API call returned an unexpected status code (%d) for URI: %s", statusCode, uri); return null; } HttpEntity entity = response.getEntity(); ContentType contentType = ContentType.getOrDefault(entity); Charset charset = contentType.getCharset(); if (charset == null) { charset = StandardCharsets.UTF_8; } try (final BufferedReader in = new BufferedReader(new InputStreamReader(entity.getContent(), charset))) { String inputLine; final StringBuilder stringResponse = new StringBuilder(); while ((inputLine = in.readLine()) != null) { stringResponse.append(inputLine); } JsonNode rootNode = mapper.readValue(stringResponse.toString(), JsonNode.class); results = rootNode.path("webPages"); } } return results; } /** This key function caches in a MongoDB collection and returns a set of SearchResults. * If present, the results are returned from the cache. If not, the results are queried and returned after updating * the cache. * @param name (String) the name to return results for. Will be normalized to lower case. * @return a set of SearchResults * @throws IOException */ public Set<SearchResult> getAndCacheTopSearchResults(String name) throws IOException { String formattedName = name.toLowerCase(); BasicDBObject nameSearchResultDBObject = bingCacheMongoDB.getNameSearchResultDBObjectFromName(formattedName); Set<SearchResult> searchResults = new HashSet<>(); // There are 3 cases: // 1) There is a corresponding entry in the cache AND the topSearchResults are populated. // In this case, we read from the cache and return the results. // 2) There is a corresponding entry in the cache BUT the topSearchResults are not populated. // This case occurs when only totalCountSearchResults is populated. // In this case, perform the relevant query, update the cache and return the results // 3) There is no corresponding entry in the cache. // In this case, perform the relevant query, create a new entry in the cache and return the results. if (nameSearchResultDBObject == null) { // Case 3) LOGGER.debug("No corresponding entry in the cache. Fetching results and populating cache."); // Query the results searchResults = fetchTopSearchResults(formattedName, TOP_N); // Create new object and update it NameSearchResults nameSearchResults = new NameSearchResults(formattedName); nameSearchResults.setTopSearchResults(searchResults); // Save new document in the cache bingCacheMongoDB.cacheNameSearchResult(nameSearchResults); return searchResults; } // There is an existing entry in the DB BasicDBList topSearchResultsList = (BasicDBList) nameSearchResultDBObject.get("topSearchResults"); if (topSearchResultsList == null) { // Case 2) LOGGER.debug("Existing entry in the cache, with empty topSearchResults. Fetching results and updating cache."); // Query the results searchResults = fetchTopSearchResults(formattedName, TOP_N); // Create new object and update its instance variable NameSearchResults nameSearchResults = new NameSearchResults(formattedName); nameSearchResults.setTopSearchResults(searchResults); // Update the cache bingCacheMongoDB.updateTopSearchResults(formattedName, nameSearchResults); return searchResults; } // Case 1) LOGGER.debug("Existing entry in the cache, with populated topSearchResults. Returning from the cache."); for (Object topSearchResult : topSearchResultsList) { SearchResult searchResult = new SearchResult(); BasicDBObject topSearchResultDBObject = (BasicDBObject) topSearchResult; searchResult.populateFromBasicDBObject(topSearchResultDBObject); searchResults.add(searchResult); } return searchResults; } public Set<SearchResult> getTopSearchResultsFromCache(String name) { Set<SearchResult> searchResults = new HashSet<>(); String formattedName = name.toLowerCase(); BasicDBObject nameSearchResultDBObject = bingCacheMongoDB.getNameSearchResultDBObjectFromName(formattedName); if (nameSearchResultDBObject == null) { return searchResults; } BasicDBList topSearchResultsList = (BasicDBList) nameSearchResultDBObject.get("topSearchResults"); if (topSearchResultsList == null) { return searchResults; } for (Object topSearchResult : topSearchResultsList) { SearchResult searchResult = new SearchResult(); BasicDBObject topSearchResultDBObject = (BasicDBObject) topSearchResult; searchResult.populateFromBasicDBObject(topSearchResultDBObject); searchResults.add(searchResult); } return searchResults; } public Long getTotalCountSearchResultsFromCache(String name) { String formattedName = name.toLowerCase(); BasicDBObject nameSearchResultDBObject = bingCacheMongoDB.getNameSearchResultDBObjectFromName(formattedName); Long totalCountSearchResults; if (nameSearchResultDBObject == null) { return -1L; } totalCountSearchResults = (Long) nameSearchResultDBObject.get("totalCountSearchResults"); if (totalCountSearchResults == null) { return -1L; } return totalCountSearchResults; } /** This key function caches in a MongoDB collection and returns the total count of Bing search results. * If present, the results are returned from the cache. If not, the results are queried and returned after updating * the cache. * @param name (String) the name to return results for. Will be normalized to lower case. * @return the total search result count * @throws IOException */ public Long getAndCacheTotalCountSearchResults(String name) throws IOException { String formattedName = name.toLowerCase(); BasicDBObject nameSearchResultDBObject = bingCacheMongoDB.getNameSearchResultDBObjectFromName(formattedName); Long totalCountSearchResults; // There are 3 cases: // 1) There is a corresponding entry in the cache AND the totalCountSearchResults are populated. // In this case, we read from the cache and return the results. // 2) There is a corresponding entry in the cache BUT the totalCountSearchResults are not populated. // This case occurs when only topSearchResults is populated. // In this case, perform the relevant query, update the cache and return the total count // 3) There is no corresponding entry in the cache. // In this case, perform the relevant query, create a new entry in the cache and return the total count. if (nameSearchResultDBObject == null) { // Case 3) LOGGER.debug("No corresponding entry in the cache. Fetching results and populating cache."); // Query the results totalCountSearchResults = fetchTotalCountSearchResults(formattedName); // Create new object and update it NameSearchResults nameSearchResults = new NameSearchResults(formattedName); nameSearchResults.setTotalCountSearchResults(totalCountSearchResults); // Save new document in the cache bingCacheMongoDB.cacheNameSearchResult(nameSearchResults); return totalCountSearchResults; } // There is an existing entry in the cache totalCountSearchResults = (Long) nameSearchResultDBObject.get("totalCountSearchResults"); if (totalCountSearchResults == null || totalCountSearchResults < 0) { // Case 2) LOGGER.debug("Existing entry in the cache, with empty totalCountSearchResults. " + "Fetching results and updating cache."); // Query the results totalCountSearchResults = fetchTotalCountSearchResults(formattedName); // Create new object and update its instance variable NameSearchResults nameSearchResults = new NameSearchResults(formattedName); nameSearchResults.setTotalCountSearchResults(totalCountSearchResults); // Update the cache bingCacheMongoDB.updateTotalCountSearchResults(formattedName, nameSearchResults); return totalCountSearchResults; } // Case 1) LOGGER.debug("Existing entry in the cache, with populated totalCountSearchResults. Returning from the cache."); return totalCountSearchResults; } /** Heuristic to find the best name for a given InChI, based on the total number of search results * @param namesOfMolecule (NamesOfMolecule) Java object containing Brenda, MetaCyc, ChEBI and DrugBank names for a given * InChI. * @return the name with the highest total number of search results, called Best Name * @throws IOException */ public String findBestMoleculeName(NamesOfMolecule namesOfMolecule) throws IOException { Long maxCount = -1L; String bestName = ""; String inchi = namesOfMolecule.getInchi(); String[] splittedInchi = inchi.split("/"); String formulaFromInchi = null; if (splittedInchi.length >= 2) { formulaFromInchi = inchi.split("/")[1]; } LOGGER.debug("Formula %s extracted from %s", formulaFromInchi, inchi); String wikipediaName = namesOfMolecule.getWikipediaName(); if (wikipediaName != null) { bestName = wikipediaName; } else { Set<String> names = namesOfMolecule.getAllNames(); names.remove(formulaFromInchi); for (String name : names) { // Ignore name if <= 4 characters if (name.length() <= 4) { continue; } LOGGER.debug("Getting search hits for %s", name); Long count = (cacheOnly) ? getTotalCountSearchResultsFromCache(name) : getAndCacheTotalCountSearchResults(name); // Ignore name if there was a previous better candidate if (count <= maxCount) { continue; } maxCount = count; bestName = name; } } // Note we don't use ChEBI or DrugBank names to keep this function simple. // If Brenda and MetaCyc names are not populated, it is very rare that ChEBI or DrugBank would be. LOGGER.debug("Best name found for %s is %s", namesOfMolecule.getInchi(), bestName); return bestName; } public static void main(String[] args) { String apiKeyFilepath = "MNT_SHARED_DATA/Thomas/test-bing/microsoft-cognitive-service-api-key"; BingSearchResults bingSearchResults = new BingSearchResults(apiKeyFilepath); try { Set<SearchResult> res = bingSearchResults.getAndCacheTopSearchResults("new query"); Long count = bingSearchResults.getAndCacheTotalCountSearchResults("new query"); } catch (IOException e) { throw new RuntimeException("Exception occurred when computing example query, %s", e); } } }