/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer.bing;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.act.utils.CLIUtil;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.mongodb.BasicDBObject;
import act.server.MongoDB;
/**
 * This class contains the main logic for installing Bing Search results in the Installer DB.
 *
 * <p>For every chemical (identified by its InChI) the searcher:
 * <ol>
 *   <li>fetches the known names of the molecule from the database,</li>
 *   <li>chooses the best name according to Bing search results,</li>
 *   <li>retrieves the total hit count and the top search results for that name,</li>
 *   <li>intersects the usage terms corpus with those search results,</li>
 *   <li>annotates the chemical in the database with the resulting metadata.</li>
 * </ol>
 *
 * <p>It can be run from the command line (see {@link #main(String[])}) to process every chemical of a database,
 * or used programmatically through {@link #addBingSearchResultsForInchiSet(Set)}.
 */
public class BingSearcher {
  // Formatter logger: messages use printf-style placeholders (%s, %d). A trailing Throwable argument that has no
  // matching placeholder is attached to the log event as its exception.
  private static final Logger LOGGER = LogManager.getFormatterLogger(BingSearcher.class);
  private static final String USAGE_TERMS_FILENAME = "usage_terms.txt";
  // InChIs containing this marker are skipped by both annotation entry points.
  private static final String FAKE_INCHI_MARKER = "FAKE";

  public static final String HELP_MESSAGE =
      "This class contains the main logic for installing Bing Search results in the Installer DB.";

  public static final String OPTION_DB_NAME = "n";
  public static final String OPTION_DB_PORT = "p";
  public static final String OPTION_DB_HOST = "h";
  public static final String OPTION_CACHE_ONLY = "c";

  public static final String DEFAULT_HOST = "localhost";
  public static final String DEFAULT_PORT = "27017";
  public static final String DEFAULT_DATABASE = "actv01";

  /** Command line options understood by {@link #main(String[])}. */
  public static final List<Option.Builder> OPTION_BUILDERS = buildOptionBuilders();

  private final MongoDB db;
  private final BingSearchResults bingSearchResults;
  private final Set<String> usageTerms;
  private final boolean forceUpdate;
  private final boolean cacheOnly;

  /**
   * Builds the list of command line option builders.
   *
   * <p>A plain {@link ArrayList} is populated here instead of relying on double-brace initialization, which would
   * create an anonymous {@code ArrayList} subclass. The returned list stays mutable, as it was before.
   *
   * @return the option builders, in the order they are declared
   */
  private static List<Option.Builder> buildOptionBuilders() {
    List<Option.Builder> builders = new ArrayList<>();
    builders.add(Option.builder(OPTION_DB_NAME)
        .argName("db name")
        .desc(String.format("The name of the database from which to fetch chemicals inchis and " +
            "write Bing cross-references (default: %s). It needs to contains a 'chemicals' collection.",
            DEFAULT_DATABASE))
        .hasArg()
        .longOpt("db")
        .type(String.class)
    );
    builders.add(Option.builder(OPTION_DB_HOST)
        .argName("DB host")
        .desc(String.format("The database host to which to connect (default: %s)", DEFAULT_HOST))
        .hasArg()
        .longOpt("db-host")
    );
    builders.add(Option.builder(OPTION_DB_PORT)
        .argName("DB port")
        .desc(String.format("The port on which to connect to the database (default: %s)", DEFAULT_PORT))
        .hasArg()
        .longOpt("db-port")
    );
    builders.add(Option.builder(OPTION_CACHE_ONLY)
        .argName("CACHE_ONLY")
        .desc("Use the cache only. If that option is used, they will not be any queries against the Bing search API.")
        .longOpt("cache-only")
        .type(String.class)
    );
    return builders;
  }

  /**
   * Command line entry point: annotates every chemical of the selected database with Bing Search results.
   * The searcher is created with {@code forceUpdate} enabled; cache-only mode follows the
   * {@link #OPTION_CACHE_ONLY} flag.
   *
   * @param args command line arguments, parsed against {@link #OPTION_BUILDERS}
   */
  public static void main(String[] args) {
    CLIUtil cliUtil = new CLIUtil(BingSearcher.class, HELP_MESSAGE, OPTION_BUILDERS);
    CommandLine cl = cliUtil.parseCommandLine(args);

    // The port comes from user input: fail with a clear message rather than an uncaught NumberFormatException.
    String portValue = cl.getOptionValue(OPTION_DB_PORT, DEFAULT_PORT);
    int port;
    try {
      port = Integer.parseInt(portValue);
    } catch (NumberFormatException e) {
      LOGGER.error("Invalid database port '%s': an integer is expected.", portValue);
      System.exit(1);
      return; // Unreachable at runtime; needed for definite assignment of 'port'.
    }

    MongoDB db = new MongoDB(
        cl.getOptionValue(OPTION_DB_HOST, DEFAULT_HOST),
        port,
        cl.getOptionValue(OPTION_DB_NAME, DEFAULT_DATABASE)
    );

    BingSearcher bingSearcher = new BingSearcher(db, true, cl.hasOption(OPTION_CACHE_ONLY));
    bingSearcher.addBingSearchResultsForEntireDatabase();
  }

  /**
   * Creates a searcher that neither forces updates nor restricts itself to the cache.
   *
   * @param db the database holding the chemicals to annotate
   */
  public BingSearcher(MongoDB db) {
    this(db, false, false);
  }

  /**
   * Creates a searcher and loads the usage terms corpus from {@code usage_terms.txt}.
   *
   * <p>If the corpus cannot be built because of an {@link IOException}, the error is logged and the JVM is
   * terminated with exit status 1.
   *
   * @param db the database holding the chemicals to annotate
   * @param forceUpdate when {@code false}, {@link #addBingSearchResultsForInchiSet(Set)} skips chemicals that
   *     already have Bing search results
   * @param cacheOnly when {@code true}, hit counts and top results are read through the "FromCache" accessors of
   *     {@link BingSearchResults} instead of the "getAndCache" ones
   */
  public BingSearcher(MongoDB db, boolean forceUpdate, boolean cacheOnly) {
    this.db = db;
    this.forceUpdate = forceUpdate;
    this.cacheOnly = cacheOnly;
    this.bingSearchResults = new BingSearchResults(true);

    // Get the usage terms
    LOGGER.debug("Getting usage terms corpus.");
    UsageTermsCorpus usageTermsCorpus = new UsageTermsCorpus(USAGE_TERMS_FILENAME);
    try {
      usageTermsCorpus.buildCorpus();
    } catch (IOException e) {
      // Pass the exception along so that the underlying cause is not lost.
      LOGGER.error("Usage term corpus source file not found in class resources: %s", USAGE_TERMS_FILENAME, e);
      System.exit(1);
    }
    this.usageTerms = usageTermsCorpus.getUsageTerms();
  }

  /**
   * Tells whether an InChI should be annotated.
   *
   * @param inchi the InChI to test, possibly {@code null}
   * @return {@code false} for {@code null} InChIs and for InChIs containing the "FAKE" marker
   */
  private static boolean isAnnotatable(String inchi) {
    return inchi != null && !inchi.contains(FAKE_INCHI_MARKER);
  }

  /**
   * Annotates every chemical returned by the database InChI iterator, skipping fake InChIs.
   * A failure on one chemical is logged and does not stop the iteration.
   */
  private void addBingSearchResultsForEntireDatabase() {
    Iterator<String> it = db.getIteratorOverInchis(new BasicDBObject());
    while (it.hasNext()) {
      String inchi = it.next();
      if (!isAnnotatable(inchi)) {
        continue;
      }
      try {
        addBingSearchResultsForInChI(inchi);
      } catch (IOException e) {
        LOGGER.error("Bing Search results could not be added for: %s", inchi, e);
      }
    }
  }

  /**
   * Annotates a single chemical with its Bing Search results.
   *
   * <p>Nothing is written when the molecule is not found in the database or when no best name could be chosen.
   *
   * @param inchi the InChI of the chemical to annotate
   * @throws IOException propagated from the Bing search result lookups
   */
  private void addBingSearchResultsForInChI(String inchi) throws IOException {
    LOGGER.debug("Processing InChI %s", inchi);

    // Fetches the names (Brenda, Metacyc, Chebi, Drugbank)
    NamesOfMolecule namesOfMolecule = db.fetchNamesFromInchi(inchi);
    if (namesOfMolecule == null) {
      LOGGER.debug("Molecule corresponding to %s was not found in the database. Skipping.", inchi);
      return;
    }

    // Chooses the best name according to Bing search results.
    // A null name is treated like an empty one so that a missing result cannot raise a NullPointerException.
    String bestName = bingSearchResults.findBestMoleculeName(namesOfMolecule);
    if (bestName == null || bestName.isEmpty()) {
      return;
    }

    // Get the total number of hits and the top search results
    // NOTE(review): whether the cache accessors can return null on a cache miss is not visible here - confirm.
    Long totalCountSearchResults;
    Set<SearchResult> topSearchResults;
    if (cacheOnly) {
      totalCountSearchResults = bingSearchResults.getTotalCountSearchResultsFromCache(bestName);
      topSearchResults = bingSearchResults.getTopSearchResultsFromCache(bestName);
    } else {
      totalCountSearchResults = bingSearchResults.getAndCacheTotalCountSearchResults(bestName);
      topSearchResults = bingSearchResults.getAndCacheTopSearchResults(bestName);
    }

    NameSearchResults nameSearchResults = new NameSearchResults(bestName);
    nameSearchResults.setTotalCountSearchResults(totalCountSearchResults);
    nameSearchResults.setTopSearchResults(topSearchResults);

    // Intersect usage names with search results: only usage terms with at least one URL are kept.
    Set<UsageTermUrlSet> moleculeUsageTerms = new HashSet<>();
    for (String usageTerm : usageTerms) {
      UsageTermUrlSet usageTermUrlSet = new UsageTermUrlSet(usageTerm);
      usageTermUrlSet.populateUrlsFromNameSearchResults(nameSearchResults);
      if (usageTermUrlSet.getUrlSet().size() > 0) {
        moleculeUsageTerms.add(usageTermUrlSet);
      }
    }

    // Annotate the chemical with Bing Search Results
    BasicDBObject doc = db.createBingMetadataDoc(moleculeUsageTerms, totalCountSearchResults, bestName);
    db.updateChemicalWithBingSearchResults(inchi, bestName, doc);
  }

  /**
   * Annotates the given chemicals with Bing Search results and usage terms.
   *
   * <p>Fake (and {@code null}) InChIs are ignored. Unless the searcher was created with {@code forceUpdate},
   * chemicals that already have Bing search results are skipped. A failure on one chemical is logged and does
   * not stop the processing of the others.
   *
   * @param inchis the InChIs of the chemicals to annotate
   */
  public void addBingSearchResultsForInchiSet(Set<String> inchis) {
    Set<String> filteredInchis = inchis.stream()
        .filter(BingSearcher::isAnnotatable)
        .collect(Collectors.toSet());
    LOGGER.info("Annotating %d chemicals with Bing Search results and usage terms.", filteredInchis.size());

    // Progress counter: counts every chemical that went through an annotation attempt, failed ones included.
    int counter = 0;
    for (String inchi : filteredInchis) {
      if (!forceUpdate && db.hasBingSearchResultsFromInchi(inchi)) {
        LOGGER.debug("Existing Bing search results found for %s. Skipping.", inchi);
        continue;
      }
      try {
        addBingSearchResultsForInChI(inchi);
      } catch (IOException e) {
        LOGGER.error("Could not add bing results for %s. Skipping.", inchi, e);
      }
      if (++counter % 100 == 0) {
        LOGGER.info("Added Bing Search results for %d chemicals (total %d)", counter, filteredInchis.size());
      }
    }
  }
}