/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer.patents;
import act.installer.WebData;
import java.net.URL;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Set;
import java.util.Map;
import org.json.JSONObject;
import org.json.JSONArray;
import org.json.XML;
import act.shared.ConsistentInChI;
import act.server.MongoDB;
import act.shared.helpers.MongoDBToJSON;
import com.mongodb.DBObject;
import java.io.FileInputStream;
import java.io.BufferedWriter;
import java.io.PrintWriter;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.FileNotFoundException;
/**
 * Installs patent ("freedom to operate") data for the chemicals in the DB.
 *
 * Patent data comes from two places: a local tab-separated cache file written
 * by previous runs (see cache_format), and, for chemicals not found in that
 * cache, a fresh Google patent search whose result is installed into the DB
 * and appended to the cache file.
 *
 * NOTE(review): readChemicalsFromFile, logStatusToConsole and status_total are
 * not defined in this file; presumably they are inherited from WebData.
 */
public class FTO extends WebData {

  /**
   * Installs patents for every chemical in the DB.
   *
   * Steps, in order:
   *   1. install everything already present in the cache file patents_file,
   *   2. look up the still-missing priority chemicals,
   *   3. look up all other still-missing chemicals.
   * Every fresh lookup is appended to patents_file.
   *
   * The process exits with status -1 if the cache file cannot be opened for
   * append, or if a lookup fails with an IOException.
   *
   * @param db                   database that is read for the chemical list and updated with patents
   * @param patents_file         path of the cache file (inchi TAB count TAB json per line)
   * @param priority_chems_files files listing chemicals to look up before all others
   */
  public void addPatents(MongoDB db, String patents_file, Set<String> priority_chems_files) {
    // first get all chemicals in the db
    System.out.println("reading all chemicals for patent lookup");
    Map<String, Long> all_db_chems = db.constructAllInChIs();

    System.out.println("reading patents for chemicals");

    // the set tagged as priority to be looked up first;
    // these could be the reachables, or others...
    Set<String> priority_chemicals = readPriorityChemicals(priority_chems_files);

    // install chemicals for whom the patents were pulled in a past run
    installFromCache(db, patents_file, all_db_chems, priority_chemicals);
    System.out.println("\nFTO Search: Installing from patents cached file: Done.\n");

    // whatever remains in the two collections has no patents installed yet
    lookupRemaining(db, patents_file, all_db_chems, priority_chemicals);
  }

  /**
   * Reads the union of the chemicals listed in the given files.
   * A file that cannot be read is reported and skipped, so that it neither
   * hides the remaining files nor prevents the cache from being installed.
   */
  private Set<String> readPriorityChemicals(Set<String> priority_chems_files) {
    Set<String> priority_chemicals = new HashSet<String>();
    for (String priority_chems_file : priority_chems_files) {
      try {
        priority_chemicals.addAll(readChemicalsFromFile(priority_chems_file));
      } catch (Exception e) {
        System.out.println("FTO Search: could not read priority chemicals from " + priority_chems_file);
        e.printStackTrace();
      }
    }
    return priority_chemicals;
  }

  /**
   * Installs into the DB every entry of the cache file, and removes each
   * installed chemical from all_db_chems and priority_chemicals so that it is
   * not looked up again.
   *
   * A missing cache file is not an error. Blank or malformed lines are
   * reported and skipped; the affected chemicals simply stay in the pending
   * collections.
   */
  private void installFromCache(MongoDB db, String patents_file,
      Map<String, Long> all_db_chems, Set<String> priority_chemicals) {
    BufferedReader br = null;
    try {
      br = new BufferedReader(new InputStreamReader(new FileInputStream(patents_file)));
      String patentline;
      int line_num = 0;
      while ((patentline = br.readLine()) != null) {
        line_num++;
        if (patentline.trim().isEmpty())
          continue;

        String chem;
        Integer num_patents;
        JSONArray patents_json_cached;
        try {
          // the keys read here must match those written by deconstruct_cache_format
          JSONObject cached = deconstruct_cache_format(patentline);
          chem = cached.getString("inchi");
          num_patents = cached.getInt("count");
          patents_json_cached = cached.getJSONArray("json");
        } catch (RuntimeException e) {
          System.out.println("FTO Search: skipping malformed line " + line_num
              + " of " + patents_file + ": " + e);
          continue;
        }

        // now install the data (that we just paged in) into the DB
        DBObject patents = MongoDBToJSON.conv(patents_json_cached);
        String inchi = ConsistentInChI.consistentInChI(chem, "Adding patents");

        // install the patents data into the db
        // 1. update the chemical entry to point to all these patents
        // 2. update the patents collection with the (patent_id, scores, patent_text)
        db.updateChemicalWithPatents(inchi, num_patents, patents);

        // mark this chemical as installed in the db
        all_db_chems.remove(inchi);
        // in case this was a priority chemical, remove from that set too
        priority_chemicals.remove(inchi);
      }
    } catch (FileNotFoundException e) {
      // this happens when initializing the DB completely from
      // scratch, and not even a single chemical has been looked up
      // Ignore, as the lookups that follow will initialize a file...
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      if (br != null) {
        try {
          br.close();
        } catch (IOException ignored) {
          // nothing useful can be done if closing a reader fails
        }
      }
    }
  }

  /**
   * Looks up patents for the pending chemicals, priority chemicals first,
   * appending every result to the cache file.
   */
  private void lookupRemaining(MongoDB db, String patents_file,
      Map<String, Long> all_db_chems, Set<String> priority_chemicals) {
    // open the patents cache file for append now
    PrintWriter patents_cache;
    try {
      patents_cache = new PrintWriter(new BufferedWriter(new FileWriter(patents_file, true)));
    } catch (IOException e) {
      System.out.println("FTO Search: CRITICAL ERROR. Opening cache file " + patents_file + " failed. Abort.");
      System.exit(-1);
      return; // not reached at runtime; keeps patents_cache definitely assigned
    }

    // chemical currently being looked up, kept for error reporting
    String current = null;
    try {
      status_total = all_db_chems.size();

      for (String chem : priority_chemicals) {
        current = chem;
        retrieveFromGooglePatents(chem, patents_cache, db);
        // mark this chemical as installed in the db
        all_db_chems.remove(chem);
      }
      System.out.println("\nFTO Search: Priority chemicals: Done.\n");

      // now pull the remaining chemicals in the dataset
      for (String chem : all_db_chems.keySet()) {
        current = chem;
        retrieveFromGooglePatents(chem, patents_cache, db);
      }
      System.out.println("\nFTO Search: All chemicals: Done.\n");
    } catch (IOException e) {
      // close explicitly: System.exit does not run the finally block
      patents_cache.close();
      System.out.println("FTO Search: CRITICAL ERROR. Patent lookup failed. Abort."
          + "\n\tinchi: " + current + "\n\tException: " + e);
      System.exit(-1);
    } catch (Exception e) {
      System.out.println("FTO Search: SHOULD NOT HAPPEN. Unexpected error. Exception: " + e.getMessage()
          + "\n\tinchi: " + current);
      // blocks until a byte can be read from stdin; presumably a deliberate
      // pause so that an interactive operator notices the error
      try { System.in.read(); } catch (Exception edum) {}
    } finally {
      patents_cache.close();
    }
  }

  /**
   * Looks up one chemical, caches the result and reports progress.
   *
   * @throws IOException if the patent search fails
   */
  private void retrieveFromGooglePatents(String chem, PrintWriter patents_cache, MongoDB db) throws IOException {
    // call the web api to retrieve the results
    // and write to the cache
    int num_patents = apiCallCacheResults(chem, patents_cache, db);

    // report status to screen for running count
    logStatusToConsole(num_patents);
  }

  /**
   * Searches patents for one chemical, installs them into the DB and appends
   * them to the cache so the search need not be repeated in a future run.
   *
   * @param inchi         chemical to search for
   * @param patents_cache open writer on the cache file; flushed after the write
   * @param db            database to update
   * @return number of patents found; 0 without any lookup for InChIs starting with "InChI=/FAKE"
   * @throws IOException if the patent search fails
   */
  int apiCallCacheResults(String inchi, PrintWriter patents_cache, MongoDB db) throws IOException {
    // Dont waste time processing big molecules from MetaCyc with FAKE inchi
    if (inchi.startsWith("InChI=/FAKE"))
      return 0;

    // get patents by searching Google Patent
    // note that this can return an empty JSON
    JSONArray patents_json = googleQuery(inchi);
    Integer num_patents = patents_json.length();
    DBObject patents = MongoDBToJSON.conv(patents_json);

    // add these patents to db
    // 1. update the chemical entry to point to all these patents
    // 2. update the patents collection with the (patent_id, scores, patent_text)
    db.updateChemicalWithPatents(inchi, num_patents, patents);

    // append the retrieved patents to the cache file and flush right away,
    // so that the entry is on disk even if a later lookup aborts the run
    patents_cache.println(cache_format(inchi, num_patents, patents_json));
    patents_cache.flush();

    return num_patents;
  }

  /**
   * Queries Google for the patents matching the given InChI.
   *
   * @return one JSON object per patent id, with keys "patent_num",
   *         "patent_txt" and "likely_biosynthesis" (the score computed by the
   *         trained model from the patent text)
   * @throws IOException if the search or a text download fails
   */
  JSONArray googleQuery(String inchi) throws IOException {
    FTO_GoogleNonAPISearch google = new FTO_GoogleNonAPISearch();

    // get all patent #s corresponding to these keywords
    Set<String> patentIDs = google.GetPatentIDs(inchi);

    JSONArray patents = new JSONArray();
    for (String patentID : patentIDs) {
      // get the plain text of the patent ...
      String plaintext = google.GetPatentText(patentID);
      // ... and use it to score the patent's relevance to biosynthesis
      double score = FTO_PatentScorer_TrainedModel.getModel().ProbabilityOf(plaintext);

      JSONObject found = new JSONObject();
      found.put("patent_num", patentID);
      found.put("patent_txt", plaintext);
      found.put("likely_biosynthesis", score);
      patents.put(found);
    }
    return patents;
  }

  /**
   * Serializes one cache entry as "inchi TAB count TAB json".
   * This function should be in sync with the fn deconstruct_cache_format below.
   */
  String cache_format(String inchi, Integer count, JSONArray json) {
    return inchi + "\t" +
           count + "\t" +
           json.toString();
  }

  /**
   * Parses one line written by cache_format into a JSON object with the keys
   * "inchi" (String), "count" (int) and "json" (JSONArray).
   * This function should be in sync with the fn cache_format above.
   *
   * @throws IllegalArgumentException if the line does not have three tab-separated fields
   * @throws NumberFormatException    if the count field is not an integer
   */
  JSONObject deconstruct_cache_format(String cache_line) {
    // limit of 3: everything after the second tab belongs to the json field
    String[] tokens = cache_line.split("\t", 3);
    if (tokens.length != 3)
      throw new IllegalArgumentException("Malformed patents cache line: expected 3 tab-separated fields, found " + tokens.length);

    JSONObject cache_read = new JSONObject();
    cache_read.put("inchi" , tokens[0]);
    cache_read.put("count" , Integer.parseInt(tokens[1]));
    cache_read.put("json"  , new JSONArray(tokens[2]));
    return cache_read;
  }
}