/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.twentyn.patentSearch;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Searches the {@code claims} field of one or more Lucene indexes for chemical synonyms.
 *
 * <p>Each synonym becomes a required phrase query; a fixed list of biosynthesis-related keywords is attached as
 * optional clauses so that documents mentioning them score higher. Hits scoring below the configured threshold are
 * dropped.
 *
 * <p>Instances are created through {@link Factory} and hold open index readers, so they must be closed when no
 * longer needed (the class is {@link AutoCloseable}).
 */
public class Searcher implements AutoCloseable {
  public static final Logger LOGGER = LogManager.getFormatterLogger(Searcher.class);

  /** Keywords added to every query as optional (SHOULD) term clauses. */
  private static final List<String> KEYWORDS = Collections.unmodifiableList(Arrays.asList(
      "yeast",
      "cerevisiae",
      "coli",
      "biosynthesis",
      "biogenesis",
      "anabolism",
      "catalysis",
      "ferment",
      "fermenter",
      "fermentor",
      "fermentation",
      "fermentive"
  ));

  private static final String CLAIMS_FIELD = "claims";
  private static final int MAX_RESULTS_PER_QUERY = 100;
  // Note: this score is likely dependent on the set of keywords above. Adjust this if KEYWORDS change.
  private static final float DEFAULT_SCORE_THRESHOLD = 0.1f;

  /** One (reader, searcher) pair per successfully opened index. */
  private final List<Pair<IndexReader, IndexSearcher>> indexReadersAndSearchers = new ArrayList<>();
  /* Directories opened by init(). Tracked separately because, per the Lucene documentation, closing a reader does
   * not close the Directory it was opened on. */
  private final List<Directory> openDirectories = new ArrayList<>();
  /** Minimum (inclusive) Lucene score a hit needs in order to be returned. */
  private final float scoreThreshold;

  private Searcher() {
    this(DEFAULT_SCORE_THRESHOLD);
  }

  private Searcher(float scoreThreshold) {
    this.scoreThreshold = scoreThreshold;
  }

  /**
   * Opens a reader and a searcher on every given index directory.
   *
   * <p>If any index fails to open, everything opened so far is closed again before the exception propagates, since
   * the caller never receives an instance it could close itself.
   *
   * @param indexDirectories The index directories to open.
   * @throws IOException If any of the indexes cannot be opened.
   */
  private void init(List<File> indexDirectories) throws IOException {
    try {
      for (File indexDirectory : indexDirectories) {
        LOGGER.info("Opening index dir at %s", indexDirectory.getAbsolutePath());
        Directory indexDir = FSDirectory.open(indexDirectory.toPath());
        // Track the directory right away so it is released even if opening the reader below fails.
        openDirectories.add(indexDir);
        IndexReader indexReader = DirectoryReader.open(indexDir);
        IndexSearcher searcher = new IndexSearcher(indexReader);
        // Only add to the list if both of these calls work.
        indexReadersAndSearchers.add(Pair.of(indexReader, searcher));
      }
    } catch (IOException | RuntimeException e) {
      try {
        close();
      } catch (IOException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }
  }

  /**
   * Closes all index readers and then all index directories opened by this searcher. Failures are logged and do not
   * prevent the remaining resources from being closed.
   */
  @Override
  public void close() throws IOException {
    for (Pair<IndexReader, IndexSearcher> readerAndSearcher : indexReadersAndSearchers) {
      try {
        readerAndSearcher.getLeft().close();
      } catch (IOException e) {
        LOGGER.error("Unable to close index reader, but continuing to try closing others: %s", e.getMessage());
      }
    }
    for (Directory directory : openDirectories) {
      try {
        directory.close();
      } catch (IOException e) {
        LOGGER.error("Unable to close index directory, but continuing to try closing others: %s", e.getMessage());
      }
    }
  }

  /** Singleton factory that builds {@link Searcher} instances over a directory of {@code *.index} entries. */
  public static class Factory {
    private static final Factory INSTANCE = new Factory();

    private Factory() {
    }

    public static Factory getInstance() {
      return INSTANCE;
    }

    /**
     * Builds a searcher with a custom score threshold.
     *
     * @param indexTopDir A directory whose entries named {@code *.index} are the indexes to search.
     * @param scoreThreshold The minimum (inclusive) score a result must have to be returned.
     * @return A searcher with all indexes opened.
     * @throws IOException If the directory is unusable, contains no indexes, or an index cannot be opened.
     */
    public Searcher build(File indexTopDir, float scoreThreshold) throws IOException {
      Searcher s = new Searcher(scoreThreshold);
      runInit(indexTopDir, s);
      return s;
    }

    /**
     * Builds a searcher with the default score threshold.
     *
     * @param indexTopDir A directory whose entries named {@code *.index} are the indexes to search.
     * @return A searcher with all indexes opened.
     * @throws IOException If the directory is unusable, contains no indexes, or an index cannot be opened.
     */
    public Searcher build(File indexTopDir) throws IOException {
      Searcher s = new Searcher();
      runInit(indexTopDir, s);
      return s;
    }

    // Locate the `*.index` entries under the top-level directory and open them all on the given searcher.
    private void runInit(File indexTopDir, Searcher s) throws IOException {
      if (!indexTopDir.isDirectory()) {
        String msg = String.format("Top level directory at %s is not a directory", indexTopDir.getAbsolutePath());
        LOGGER.error(msg);
        throw new IOException(msg);
      }

      // listFiles() returns null (rather than throwing) when the directory cannot be read.
      File[] children = indexTopDir.listFiles();
      if (children == null) {
        String msg = String.format("Unable to list contents of top level directory at %s",
            indexTopDir.getAbsolutePath());
        LOGGER.error(msg);
        throw new IOException(msg);
      }

      List<File> individualIndexes = Arrays.stream(children).
          filter(f -> f.getName().endsWith(".index")).collect(Collectors.toList());
      if (individualIndexes.isEmpty()) {
        String msg = String.format("Top level directory at %s contains no index sub-directories",
            indexTopDir.getAbsolutePath());
        LOGGER.error(msg);
        throw new IOException(msg);
      }

      s.init(individualIndexes);
    }
  }

  /**
   * Search for patents that contain any of the specified chemical synonyms, scored based on synonym and biosynthesis
   * keyword occurrence. Results are filtered by score.
   *
   * <p>A patent that matches several queries or appears in several indexes is returned once, with its best score.
   * Results are sorted by the natural ordering of their (score, id, title) triples, i.e. ascending by score.
   *
   * @param synonyms A list of chemical synonyms to use in the search. Null and blank entries are ignored.
   * @return A list of search results whose relevance scores are at or above the searcher's score threshold.
   * @throws IOException If running a query or reading a matching document fails.
   */
  public List<SearchResult> searchInClaims(List<String> synonyms) throws IOException {
    if (synonyms == null || synonyms.isEmpty()) {
      LOGGER.info("Not running search for no synonyms!");
      return Collections.emptyList();
    }

    // Make queries for all synonyms.
    final List<BooleanQuery> queries = makeQueries(synonyms, CLAIMS_FIELD).collect(Collectors.toList());

    // Reuse the compiled queries for all indices.
    try {
      Set<Triple<Float, String, String>> uniqueResults = indexReadersAndSearchers.stream().
          flatMap(p -> runSearch(p, queries)). // Search each index and combine all the per-query streams...
          collect(Collectors.toSet());         // and collect the merged results in a set.

      /* Uniq-ify on id! It is completely reasonable for a patent to appear for multiple queries, possibly with a
       * different score each time; keep only the best-scoring hit for each patent. */
      List<Triple<Float, String, String>> results = keepBestScorePerId(uniqueResults);

      Collections.sort(results);
      return results.stream().
          map(t -> new SearchResult(t.getMiddle(), t.getRight(), t.getLeft())).
          collect(Collectors.toList());
    } catch (UncheckedIOException e) {
      throw e.getCause(); // Promote back to a regular exception for handling by the caller.
    }
  }

  // Reduce a collection of (score, id, title) hits to the highest-ordered hit per id. Hits without an id are kept.
  private static List<Triple<Float, String, String>> keepBestScorePerId(
      Collection<Triple<Float, String, String>> hits) {
    List<Triple<Float, String, String>> kept = new ArrayList<>();
    Map<String, Triple<Float, String, String>> bestById = new HashMap<>();
    for (Triple<Float, String, String> hit : hits) {
      String id = hit.getMiddle();
      if (id == null) {
        // Without an id there is nothing to de-duplicate on, so pass the hit through untouched.
        kept.add(hit);
        continue;
      }
      Triple<Float, String, String> incumbent = bestById.get(id);
      // Triples compare on score first, so the greater triple is the better-scoring hit (ties broken by title).
      if (incumbent == null || hit.compareTo(incumbent) > 0) {
        bestById.put(id, hit);
      }
    }
    kept.addAll(bestById.values());
    return kept;
  }

  // Run a set of queries over a single reader + searcher.
  private Stream<Triple<Float, String, String>> runSearch(
      Pair<IndexReader, IndexSearcher> readerSearcher, List<BooleanQuery> queries) throws UncheckedIOException {
    // With hints from http://stackoverflow.com/questions/22382453/java-8-streams-flatmap-method-example
    return queries.stream().map(q -> executeQuery(readerSearcher, q)).flatMap(Collection::stream);
  }

  // Run a single query on a single reader + searcher.
  private List<Triple<Float, String, String>> executeQuery(
      Pair<IndexReader, IndexSearcher> readerSearcher, BooleanQuery query) throws UncheckedIOException {
    TopDocs topDocs;
    try {
      topDocs = readerSearcher.getRight().search(query, MAX_RESULTS_PER_QUERY);
    } catch (IOException e) {
      LOGGER.error("Caught IO exception when trying to run search for %s: %s", query, e.getMessage());
      /* Wrap `e` in an unchecked exception to allow it to escape our call stack. The top level function will catch
       * and rethrow it as a normal IOException. */
      throw new UncheckedIOException(e);
    }

    ScoreDoc[] scoreDocs = topDocs.scoreDocs;
    if (scoreDocs.length == 0) {
      LOGGER.debug("Search returned no results.");
      return Collections.emptyList();
    }

    // ScoreDoc just contains a score and an id. We need to retrieve the documents' content using that id.
    /* Crux of the next bit:
     * Filter by score and convert from scoreDocs to document features.
     * No need to use `limit` here since we already had Lucene cap the result set size. */
    return Arrays.stream(scoreDocs).
        filter(scoreDoc -> scoreDoc.score >= scoreThreshold).
        map(scoreDoc -> toScoredFeatures(readerSearcher.getLeft(), scoreDoc)).
        collect(Collectors.toList());
  }

  // Load the document behind a hit and pair its features with the hit's score.
  private Triple<Float, String, String> toScoredFeatures(IndexReader reader, ScoreDoc scoreDoc)
      throws UncheckedIOException {
    try {
      Pair<String, String> features = extractDocFeatures(reader.document(scoreDoc.doc));
      // Put the score first so the natural sort order is based on score.
      return Triple.of(scoreDoc.score, features.getLeft(), features.getRight());
    } catch (IOException e) {
      // Yikes, this is v. bad.
      LOGGER.error("Caught IO exception when trying to read doc id %d: %s", scoreDoc.doc, e.getMessage());
      throw new UncheckedIOException(e); // Unwrapped again by searchInClaims.
    }
  }

  // Just extract the id and title for now. The id contains the patent number, and the title is enough for display.
  private Pair<String, String> extractDocFeatures(Document doc) {
    return Pair.of(doc.get("id"), doc.get("title"));
  }

  // Build one query per usable synonym; null and blank (empty or whitespace-only) synonyms are skipped.
  private Stream<BooleanQuery> makeQueries(List<String> synonyms, String field) {
    return synonyms.stream().
        filter(syn -> syn != null && !syn.trim().isEmpty()).
        map(syn -> makeQuery(syn, field));
  }

  // Build the query for a single synonym: a required phrase match plus optional keyword matches.
  private BooleanQuery makeQuery(String synonym, String field) {
    BooleanQuery bq = new BooleanQuery();

    // Set the synonym as a required phrase query. Phrase queries handle multi-word synonyms, but require construction.
    // Lower-case with a fixed locale so the resulting terms do not depend on the JVM's default locale.
    String queryString = synonym.trim().toLowerCase(Locale.ROOT);
    String[] parts = queryString.split("\\s+");
    PhraseQuery query = new PhraseQuery();
    for (String part : parts) {
      query.add(new Term(field, part));
    }
    bq.add(query, BooleanClause.Occur.MUST);

    // Append all keywords as optional clauses. The more of these we find, the higher the score will be.
    KEYWORDS.forEach(term -> bq.add(new TermQuery(new Term(field, term)), BooleanClause.Occur.SHOULD));
    return bq;
  }

  /** A single search hit: the document's id and title plus its Lucene relevance score. */
  public static class SearchResult {
    String id;
    String title;
    Float relevanceScore;

    /**
     * @param id The value of the document's {@code id} field.
     * @param title The value of the document's {@code title} field.
     * @param relevanceScore The Lucene score of the hit.
     */
    public SearchResult(String id, String title, Float relevanceScore) {
      this.id = id;
      this.title = title;
      /* Relevance scores are defined by Apache Lucene, and are dependent on the structure of the query.
       * See the scoring docs at
       * https://lucene.apache.org/core/5_2_1/core/org/apache/lucene/search/package-summary.html#package_description
       * for details. */
      this.relevanceScore = relevanceScore;
    }

    public String getId() {
      return id;
    }

    public String getTitle() {
      return title;
    }

    public Float getRelevanceScore() {
      return relevanceScore;
    }
  }
}