/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer.reachablesexplorer; import act.installer.pubchem.PubchemSynonymType; import com.act.utils.CLIUtil; import com.twentyn.patentSearch.Searcher; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Option; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.mongojack.DBCursor; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Set; public class PatentFinder { private static final Logger LOGGER = LogManager.getFormatterLogger(PatentFinder.class); private static final String OPTION_DB_HOST = "H"; private static final String OPTION_DB_PORT = "p"; private static final String OPTION_TARGET_DB = "t"; private static final String OPTION_TARGET_REACHABLES_COLLECTION = "c"; private static final String OPTION_PATENT_INDEX_DIR = "i"; // Default host. If running on a laptop, please set a SSH bridge to access speakeasy private static final String DEFAULT_HOST = "localhost"; private static final Integer DEFAULT_PORT = 27017; // Target database and collection. We populate these with reachables // TODO These should all be turned into more long-term collections private static final String DEFAULT_TARGET_DATABASE = "wiki_reachables"; private static final String DEFAULT_TARGET_COLLECTION = "reachablesv6_test_thomas"; private static final String UNUSED_SEQUENCES_COLLECTION = null; // "dummy_sequences_v0"; // We won't touch these but need private static final String UNUSED_ASSETS_DIR = "/tmp"; // them for Loader's constructor. private static final String UNUSED_SOURCE_DB = null; // "dummy_source_db"; // A directory of directories. Each directory is one year's index, and ends in `.index`. private static final String DEFAULT_PATENT_INDEX_LOCATION = "data/patents"; public static final String HELP_MESSAGE = StringUtils.join(new String[]{ "This class searches for patents related to molecules in a reachables DB, and updates the reachable documents ", "with references to those patents. Patents are filtered by a manually selected relevance threshold." }, " "); public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{ add(Option.builder(OPTION_DB_HOST) .argName("DB host") .desc(String.format("The database host to which to connect (default: %s)", DEFAULT_HOST)) .hasArg() .longOpt("db-host") ); add(Option.builder(OPTION_DB_PORT) .argName("DB port") .desc(String.format("The port on which to connect to the database (default: %d)", DEFAULT_PORT)) .hasArg() .longOpt("db-port") ); add(Option.builder(OPTION_PATENT_INDEX_DIR) .argName("path") .desc(String.format( "A path to a directory of per-year indexes (directories) of patents ending in `.index` (default: %s)", DEFAULT_PATENT_INDEX_LOCATION)) .hasArg() .longOpt("indexes-dir") ); add(Option.builder(OPTION_TARGET_DB) .argName("DB name") .desc(String.format("The name of the DB into which to write reachable molecule documents (default: %s)", DEFAULT_TARGET_DATABASE)) .hasArg() .longOpt("dest-db-name") ); add(Option.builder(OPTION_TARGET_REACHABLES_COLLECTION) .argName("collection name") .desc(String.format( "The name of the collection in the dest DB to which to add patent references (default: %s)", DEFAULT_TARGET_COLLECTION)) .hasArg() .longOpt("reachables-collection") ); }}; private static final List<PubchemSynonymType> SYNONYM_TYPE_PREFERENCE = Collections.unmodifiableList(Arrays.asList( PubchemSynonymType.TRIVIAL_NAME, PubchemSynonymType.INTL_NONPROPRIETARY_NAME, PubchemSynonymType.DEPOSITORY_NAME, // Beware: this list can be huge. TODO: be clever and shorten it? PubchemSynonymType.DRUG_TRADE_NAME, PubchemSynonymType.IUPAC_NAME )); public static void main(String[] args) throws Exception { CLIUtil cliUtil = new CLIUtil(Loader.class, HELP_MESSAGE, OPTION_BUILDERS); CommandLine cl = cliUtil.parseCommandLine(args); String host = cl.getOptionValue(OPTION_DB_HOST, DEFAULT_HOST); Integer port = Integer.parseInt(cl.getOptionValue(OPTION_DB_PORT, DEFAULT_PORT.toString())); String targetDB = cl.getOptionValue(OPTION_TARGET_DB, DEFAULT_TARGET_DATABASE); String collection = cl.getOptionValue(OPTION_TARGET_REACHABLES_COLLECTION, DEFAULT_TARGET_COLLECTION); LOGGER.info("Connecting to %s:%d/%s, using collection %s", host, port, targetDB, collection); Loader loader = new Loader(host, port, UNUSED_SOURCE_DB, targetDB, collection, UNUSED_SEQUENCES_COLLECTION, UNUSED_ASSETS_DIR); File indexesTopDir = new File(cl.getOptionValue(OPTION_PATENT_INDEX_DIR, DEFAULT_PATENT_INDEX_LOCATION)); if (!indexesTopDir.exists() || !indexesTopDir.isDirectory()) { cliUtil.failWithMessage("Index top-level directory at %s is not a directory", indexesTopDir.getAbsolutePath()); } LOGGER.info("Using index top level dir: %s", indexesTopDir.getAbsolutePath()); PatentFinder finder = new PatentFinder(); try (Searcher searcher = Searcher.Factory.getInstance().build(indexesTopDir)) { finder.run(loader, searcher); } } private void run(Loader loader, Searcher searcher) throws IOException { DBCursor<Reachable> reachableDBCursor = loader.getJacksonReachablesCollection().find(); while (reachableDBCursor.hasNext()) { Reachable reachable = reachableDBCursor.next(); SynonymData synonyms = reachable.getSynonyms(); Set<String> preferredSynonyms = null; if (synonyms != null) { Map<PubchemSynonymType, Set<String>> pubchemSynonyms = synonyms.getPubchemSynonyms(); /* Search for different kinds of synonyms in order of preference (where preference tries to strike a balance * between verbosity and specificity). Stop when we've found a type of synonym that is available for this * molecule, and use that in the patent search. */ for (PubchemSynonymType type : SYNONYM_TYPE_PREFERENCE) { if (pubchemSynonyms.containsKey(type)) { preferredSynonyms = pubchemSynonyms.get(type); break; } } } if (preferredSynonyms == null) { LOGGER.warn("No synonyms for molecule %s", reachable.getInchi()); preferredSynonyms = Collections.emptySet(); } List<String> allNames = new ArrayList<>(reachable.getNames()); allNames.addAll(preferredSynonyms); allNames.removeIf(s -> s == null || s.length() < 3); // Eliminate potential garbage rankings for short names. // Note: stop words should not appear in the index, so no need to filter on terms. Collections.sort(allNames); LOGGER.info("Running query with terms: %s", StringUtils.join(allNames, ", ")); List<Searcher.SearchResult> results = searcher.searchInClaims(allNames); if (results.size() > 0) { LOGGER.info("Results (%d) for %s:", results.size(), reachable.getPageName()); List<PatentSummary> summaries = new ArrayList<>(results.size()); for (Searcher.SearchResult result : results) { LOGGER.info("(%.3f) %s: %s", result.getRelevanceScore(), result.getId(), result.getTitle()); summaries.add(new PatentSummary(result.getId(), result.getTitle(), result.getRelevanceScore())); } reachable.setPatentSummaries(summaries); loader.upsert(reachable); } else { LOGGER.info("No results for %s", reachable.getPageName()); } } } }