/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.twentyn.patentSearch; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Triple; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; public class Searcher implements AutoCloseable { public static final Logger LOGGER = LogManager.getFormatterLogger(Searcher.class); private static final List<String> KEYWORDS = Collections.unmodifiableList(Arrays.asList( "yeast", "cerevisiae", "coli", "biosynthesis", "biogenesis", "anabolism", "catalysis", "ferment", "fermenter", "fermentor", "fermentation", "fermentive" )); private static final String CLAIMS_FIELD = "claims"; private static final int MAX_RESULTS_PER_QUERY = 100; // Note: this score is likely dependent on the set of keywords above. Adjust this if KEYWORDS change. private static final float DEFAULT_SCORE_THRESHOLD = 0.1f; private List<Pair<IndexReader, IndexSearcher>> indexReadersAndSearchers = new ArrayList<>(); private float scoreThreshold = DEFAULT_SCORE_THRESHOLD; private Searcher() { } private Searcher(float scoreThreshold) { this.scoreThreshold = scoreThreshold; } private void init(List<File> indexDirectories) throws IOException { for (File indexDirectory : indexDirectories) { LOGGER.info("Opening index dir at %s", indexDirectory.getAbsolutePath()); Directory indexDir = FSDirectory.open(indexDirectory.toPath()); IndexReader indexReader = DirectoryReader.open(indexDir); IndexSearcher searcher = new IndexSearcher(indexReader); // Only add to the list if both of these calls work. indexReadersAndSearchers.add(Pair.of(indexReader, searcher)); } } @Override public void close() throws IOException { for (IndexReader reader : indexReadersAndSearchers.stream().map(Pair::getLeft).collect(Collectors.toList())) { try { reader.close(); } catch (IOException e) { LOGGER.error("Unable to close index reader, but continuing to try closing others: %s", e.getMessage()); } } } public static class Factory { private static final Factory INSTANCE = new Factory(); private Factory() { } public static Factory getInstance() { return INSTANCE; } public Searcher build(File indexTopDir, float scoreThreshold) throws IOException { Searcher s = new Searcher(scoreThreshold); runInit(indexTopDir, s); return s; } public Searcher build(File indexTopDir) throws IOException { Searcher s = new Searcher(); runInit(indexTopDir, s); return s; } private void runInit(File indexTopDir, Searcher s) throws IOException { if (!indexTopDir.isDirectory()) { String msg = String.format("Top level directory at %s is not a directory", indexTopDir.getAbsolutePath()); LOGGER.error(msg); throw new IOException(msg); } List<File> individualIndexes = Arrays.stream(indexTopDir.listFiles()). filter(f -> f.getName().endsWith(".index")).collect(Collectors.toList()); if (individualIndexes.size() == 0) { String msg = String.format("Top level directory at %s contains no index sub-directories", indexTopDir.getAbsolutePath()); LOGGER.error(msg); throw new IOException(msg); } s.init(individualIndexes); } } /** * Search for patents that contain any of the specified chemical synonyms, scored based on synonym and biosynthesis * keyword occurrence. Results are filtered by score. * @param synonyms A list of chemical synonyms to use in the search. * @return A list of search results whose relevance scores are above the searcher's score threshold. * @throws IOException */ public List<SearchResult> searchInClaims(List<String> synonyms) throws IOException { if (synonyms.size() == 0) { LOGGER.info("Not running search for no synonyms!"); return Collections.emptyList(); } // Make queries for all synonyms. final List<BooleanQuery> queries = makeQueries(synonyms, CLAIMS_FIELD).collect(Collectors.toList()); // Reuse the compiled queries for all indices. try { Set<Triple<Float, String, String>> uniqueResults = indexReadersAndSearchers.stream(). map(p -> runSearch(p, queries)). // Search to get per-query streams... flatMap(Function.identity()). // combine all the streams into one... collect(Collectors.toSet()); // and collect the merged results in a list. /* Uniq-ify! It is completely reasonable for a patent to appear for multiple queries. * TODO: we haven't seen results appear multiple times with different scores. We should probably unique-ify * on id and take the result with the best score just to be safe. */ List<Triple<Float, String, String>> results = new ArrayList<>(uniqueResults); Collections.sort(results); return results.stream(). map(t -> new SearchResult(t.getMiddle(), t.getRight(), t.getLeft())). collect(Collectors.toList()); } catch (UncheckedIOException e) { throw e.getCause(); // Promote back to a regular exception for handling by the caller. } } // Run a set of queries over a single reader + searcher. private Stream<Triple<Float, String, String>> runSearch( Pair<IndexReader, IndexSearcher> readerSearcher, List<BooleanQuery> queries) throws UncheckedIOException { // With hints from http://stackoverflow.com/questions/22382453/java-8-streams-flatmap-method-example return queries.stream().map(q -> executeQuery(readerSearcher, q)).flatMap(Collection::stream); } // Run a single query on a single reader + searcher. private List<Triple<Float, String, String>> executeQuery( Pair<IndexReader, IndexSearcher> readerSearcher, BooleanQuery query) throws UncheckedIOException { TopDocs topDocs; try { topDocs = readerSearcher.getRight().search(query, MAX_RESULTS_PER_QUERY); } catch (IOException e) { LOGGER.error("Caught IO exception when trying to run search for %s: %s", query, e.getMessage()); /* Wrap `e` in an unchecked exception to allow it to escape our call stack. The top level function with catch * and rethrow it as a normal IOException. */ throw new UncheckedIOException(e); } ScoreDoc[] scoreDocs = topDocs.scoreDocs; if (scoreDocs.length == 0) { LOGGER.debug("Search returned no results."); return Collections.emptyList(); } // ScoreDoc just contains a score and an id. We need to retrieve the documents' content using that id. /* Crux of the next bit: * Filter by score and convert from scoreDocs to document features. * No need to use `limit` here since we already had Lucene cap the result set size. */ return Arrays.stream(scoreDocs). filter(scoreDoc -> scoreDoc.score >= scoreThreshold). map(scoreDoc -> { // try { Pair<String, String> features = this.extractDocFeatures(readerSearcher.getLeft().document(scoreDoc.doc)); // Put the score first so the natural sort order is based on score. return Triple.of(scoreDoc.score, features.getLeft(), features.getRight()); } catch (IOException e) { // Yikes, this is v. bad. LOGGER.error("Caught IO exception when trying to read doc id %d: %s", scoreDoc.doc, e.getMessage()); throw new UncheckedIOException(e); // Same as above. } }).collect(Collectors.toList()); } // Just extract the id and title for now. The id contains the patent number, and the title is enough for display. private Pair<String, String> extractDocFeatures(Document doc) { return Pair.of(doc.get("id"), doc.get("title")); } private Stream<BooleanQuery> makeQueries(List<String> synonyms, String field) { return synonyms.stream(). filter(syn -> syn != null && !syn.isEmpty()). map(syn -> makeQuery(syn, field)); } private BooleanQuery makeQuery(String synonym, String field) { BooleanQuery bq = new BooleanQuery(); // Set the synonym as a required phrase query. Phrase queries handle multi-word synonyms, but require construction. String queryString = synonym.trim().toLowerCase(); String[] parts = queryString.split("\\s+"); PhraseQuery query = new PhraseQuery(); Arrays.stream(parts).forEach(p -> query.add(new Term(field, p))); bq.add(query, BooleanClause.Occur.MUST); // Append all keywords as optional clauses. The more of these we find, the higher the score will be. KEYWORDS.forEach(term -> bq.add(new TermQuery(new Term(field, term)), BooleanClause.Occur.SHOULD)); return bq; } public static class SearchResult { String id; String title; Float relevanceScore; public SearchResult(String id, String title, Float relevanceScore) { this.id = id; this.title = title; /* Relevance scores are defined by Apache Lucene, and are dependent on the structure of the query. * See the scoring docs at * https://lucene.apache.org/core/5_2_1/core/org/apache/lucene/search/package-summary.html#package_description * for details. */ this.relevanceScore = relevanceScore; } public String getId() { return id; } public String getTitle() { return title; } public Float getRelevanceScore() { return relevanceScore; } } }