/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer.patents;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.InputStreamReader;
import java.io.Writer;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.json.JSONArray;
import org.json.JSONObject;
import act.shared.helpers.P;
public class FTO_GoogleNonAPISearch {
private final boolean _AddSynonymsToQuery = true;
private final boolean _UseGoogleCustomSearchAPI = false;
private final String _PatentCacheRootDir = "FTO_patents_cached";
public FTO_GoogleNonAPISearch() { }
public String GetPatentText(String id) throws IOException {
return FTO_Utils.GetPatentText(id);
}
public Set<String> GetPatentIDsForCompanyPatents(String inchi, String company_name) throws IOException {
// use the inchi to get all synonyms
List<String> names = namesFromPubchem(inchi, true);
// get all patent #s that mention these names AND the customer company_name
Set<String> idSet = queryGoogleForPatentsOfCustomer(company_name, names);
return idSet;
}
private Set<String> queryGoogleForPatentsOfCustomer(String company, List<String> names) throws IOException {
String searchPhrase = "inassignee:\"" + company + "\" AND (";
searchPhrase+= "\"" + names.get(0) + "\"";
for(int i=1; i<names.size(); i++)
searchPhrase+= " OR \"" + names.get(i) + "\"";
searchPhrase+= ")";
return QueryGooglePatents_NonAPI.query(searchPhrase);
}
public Set<String> GetPatentIDs(String inchi) throws IOException {
// use the inchi to get all synonyms
List<String> names = namesFromPubchem(inchi, true);
// get all patent #s that mention these names
Set<String> idSet = queryGoogleForPatentIDs(null, names);
return idSet;
}
private Set<String> queryGoogleForPatentIDs(String common_name, List<String> names) throws IOException {
if (!_AddSynonymsToQuery && common_name != null) {
names = new ArrayList<String>();
names.add(common_name);
}
// String searchPhrase = "(cerevisiae OR coli) AND (";
String searchPhrase = "(yeast OR cerevisiae OR coli) AND (";
searchPhrase+= "\"" + names.get(0) + "\"";
for(int i=1; i<names.size(); i++)
searchPhrase+= " OR \"" + names.get(i) + "\"";
searchPhrase+= ")";
Set<String> idSet = new HashSet<String>();
if (_UseGoogleCustomSearchAPI)
idSet.addAll(QueryGoogleAPI.query(searchPhrase));
idSet.addAll(QueryGooglePatents_NonAPI.query(searchPhrase));
return idSet;
}
private List<String> namesFromPubchem(String name, boolean inputIsInChI) throws IOException {
List<String> out = new ArrayList<>();
// Query pubchem for synonyms
String jsonStr;
if (inputIsInChI) {
String base = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchi/synonyms/json";
List<P<String, String>> post_data = new ArrayList<>();
post_data.add(new P<String, String>("inchi", name));
jsonStr = FTO_Utils.fetch(base, post_data);
} else {
String base = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/";
name = URLEncoder.encode(name, "UTF-8");
String pubchem_query = base + name + "/synonyms/json";
jsonStr = FTO_Utils.fetch(pubchem_query);
}
JSONObject json = new JSONObject(jsonStr);
JSONObject InformationList = json.getJSONObject("InformationList");
JSONArray Information = InformationList.getJSONArray("Information");
JSONObject data = Information.getJSONObject(0);
JSONArray Synonym = data.getJSONArray("Synonym");
for(int i=0; i < Synonym.length(); i++) {
String syn = Synonym.getString(i);
out.add(syn);
if(out.size() > 5) {
break;
}
}
return out;
}
private Map<String, Double> scorePatents(Set<String> idSet) {
// Collect up all the patent IDs
Map<String, Double> patentScores = new HashMap<>();
// For each patent ID
// Fetch the text of the patent
// Score the patent based on FTO
// Add the score and the patent ID to the Map
for(String id : idSet) {
String text = null;
try {
// Try to fetch it from disk first
// text = readPatentFromDisk(id);
// If not, get it online
if (text == null) {
text = FTO_Utils.GetPatentText(id);
}
// The Raw score is now not exposed by the PatentScorer
// Instead it provides a computed probability
// int rawScore = FTO_PatentScorer_TrainedModel.getModel().ScoreText(text);
// savePatentToDisk(id, text, rawScore);
// normalize to a probability of it being a biosynthesis patent
double probability = FTO_PatentScorer_TrainedModel.getModel().ProbabilityOf(text);
patentScores.put(id, probability);
} catch (Exception ex) {
System.err.println("error on " + id);
ex.printStackTrace();
System.err.println("\t" + ex.getMessage());
}
}
return patentScores;
}
// Standalone wrapper around the entire functionality in this file
// Only used for testing, and maybe calls from experimental/
// But in main trunk, this is not used. Instead, in FTO.java calls
// other functions directly to take:
// inchis -> patent IDs -> patent texts -> scores
private Map<String, Double> FTO_WriteToDisk(String common_name) throws Exception {
// use the "common_name" to get all synonyms
List<String> names = namesFromPubchem(common_name, false);
// get all patent #s that mention these names
Set<String> idSet = queryGoogleForPatentIDs(common_name, names);
// score each patent based on its text
Map<String, Double> results = scorePatents(idSet);
System.err.println("Scored patents: " + results.size());
// write the output to the directory with "common_name"
File dir1 = new File(_PatentCacheRootDir);
File dir2 = new File(_PatentCacheRootDir + "/" + common_name);
boolean made1 = true, made2 = true;
if(!dir1.exists()) { made1 = dir1.mkdir(); }
if(!dir2.exists()) { made2 = dir2.mkdir(); }
if (!made1 || !made2) {
throw new Exception("Could not make cache dir: " + dir2.getAbsolutePath());
}
StringBuilder sb = new StringBuilder();
for(String str : results.keySet()) {
Double probability = results.get(str);
sb.append(str + "\t" + probability + System.getProperty("line.separator"));
}
FTO_Utils.writeFile(sb.toString(), dir2.getAbsolutePath() + "/" + common_name + "_fto.txt");
return results;
}
// Only used for testing. See comment on FTO_WriteToDisk(..)
public Map<String, Double> FTO_WriteToDisk(String common_name, double probability_threshold) throws Exception {
Map<String, Double> all_patents = FTO_WriteToDisk(common_name);
Map<String, Double> thresholded = new HashMap<>();
for (String pid : all_patents.keySet())
if (all_patents.get(pid) <= probability_threshold)
thresholded.put(pid, all_patents.get(pid));
return thresholded;
}
public static void main(String[] args) throws Exception {
FTO_GoogleNonAPISearch google = new FTO_GoogleNonAPISearch();
google.FTO_WriteToDisk(args[0]);
}
}
class QueryGoogleAPI {
// You can find the CSE and API_KEY here:
// https://cse.google.com/cse/all
// - Go to the custom engine you created (& paid)
// - Edit Search Engine -> Business -> XML & JSON
// Custom search engine identifier from above
// API_KEY is the key allowing us to access google apis
private static final String CSE="XXXXXXXXXXXXXXXXXXXXX:xxxxxxxxxxx";
private static final String API_KEY="AAAAAAAAAAAAPPPPPPPPPPPPPPIIIIIIIIIIIII";
// curl -s "https://www.googleapis.com/customsearch/v1?key=$API_KEY&cx=$CSE&q=$QUERY" > $QUERY.firstpage
// "template": "https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&cref={cref?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&relatedSite={relatedSite?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json"
public static Set<String> query(String searchPhrase) throws IOException {
Set<String> idSet = new HashSet<>();
searchPhrase = URLEncoder.encode(searchPhrase, "UTF-8");
String base = "https://www.googleapis.com/customsearch/v1?key=" + API_KEY + "&cx=" + CSE + "&q=" + searchPhrase;
boolean hasNext = true;
int start = 0;
int page = 0;
while (hasNext) {
String url = base + "&num=10" + (start == 0 ? "" : "&start=" + start);
String json_str = FTO_Utils.fetch(url);
JSONObject json = new JSONObject(json_str);
System.err.println("Total: " + json.getJSONObject("searchInformation").get("totalResults"));
String filepath = "google_cse:page" + page;
FTO_Utils.writeFile(json.toString(2), filepath);
JSONObject meta = json.getJSONObject("queries");
Set<String> patentIDs = extractAllIDs(json.getJSONArray("items"));
idSet.addAll(patentIDs);
int this_count = meta.getJSONArray("request").getJSONObject(0).getInt("count");
System.err.println("\t Got: " + start + " -> " + (start + this_count));
if (meta.has("nextPage")) {
// start = meta.getJSONArray("nextPage").getJSONObject(0).getInt("startIndex");
start += this_count;
page++;
} else {
hasNext = false;
}
}
return idSet;
}
private static Set<String> extractAllIDs(JSONArray items) {
Set<String> idSet = new HashSet<>();
String prefix = "patents/";
for (int i=0; i<items.length(); i++) {
JSONObject item = items.getJSONObject(i);
String url = item.getString("link");
// e.g., http://www.google.com/patents/CN102406082A?cl=en"
// e.g., https://www.google.com/patents/US7201928
// because it might be https or http, we look for patents/ and chop..
String pid = url.substring(url.indexOf(prefix) + prefix.length());
int suffix = pid.indexOf("?");
if (suffix != -1)
pid = pid.substring(0, suffix);
idSet.add(pid);
}
return idSet;
}
}
/*
This class is the naive crawler that traverses google search result pages
pretending to be a human (which is why the delay(5) seconds.
We can make this much faster by using Google's custom search engines and
querying within our quota, without delays...
*/
class QueryGooglePatents_NonAPI {
private static final int MAX_RESULTS = 10000;
public static Set<String> query(String searchPhrase) throws IOException {
Set<String> idSet = new HashSet<>();
searchPhrase = URLEncoder.encode(searchPhrase, "UTF-8");
String base = "https://www.google.com/search?q=" + searchPhrase + "&num=100&biw=1440&bih=557&tbm=pts&start=INSERTSTARTINDEX&sa=N";
// String base = "https://www.google.com/?tbm=pts&gws_rd=ssl#tbm=pts&q=Bactericide+composition+and+abietic+methanol+bacteria+coli&num=100&biw=1440&bih=557&tbm=pts&start=INSERTSTARTINDEX&sa=N";
outer: for(int i=0; i<MAX_RESULTS; i++) {
String pageStart = Integer.toString(i*100);
String url = base.replaceAll("INSERTSTARTINDEX", pageStart);
try {
FTO_Utils.delay(5);
String text = FTO_Utils.fetch(url);
List<String> ids = extractAllIDs(text);
//Break if it is duplicating a page
boolean breakit = true;
inner: for(String str : ids) {
if(!idSet.contains(str)) {
breakit = false;
break inner;
}
}
if(breakit) {
break outer;
}
idSet.addAll(ids);
} catch (Exception ex) {
if (ex.getMessage().startsWith("StatusCode = 503")
|| ex.getMessage().startsWith("StatusCode = 403")) {
// google is blocking us now. no point in continuing, abort
throw ex;
} else {
// not blocked, but some other error: dump to log, and continue
ex.printStackTrace();
}
}
}
return idSet;
}
private static List<String> extractAllIDs(String text) {
//First extract all urls that are within quotes
List<String> results = new ArrayList<>();
String patternString = "(?<=\")https://www.google.com/patents/[^\"]+(?=\")";
Pattern patt = Pattern.compile(patternString);
Matcher matcher = patt.matcher(text);
boolean matches = matcher.matches();
int count = 0;
while(matcher.find()) {
count++;
results.add(text.substring(matcher.start(), matcher.end()));
}
// Clean up each URL to just the id
List<String> out = new ArrayList<>();
for(String rawurl : results) {
String[] split = rawurl.split("[^0-9a-zA-Z]");
String id = split[7];
if(id.equals("related")) {
continue;
}
if(!out.contains(id)) {
out.add(id);
}
}
return out;
}
}
class FTO_PatentScorer_TrainedModel {
private Map<String, Integer> model;
private Double modelNormalizationParam;
private final String _RootDir = "FTO_training";
private final String _NegDataSet = _RootDir + "/bioneg";
private final String _PosDataSet = _RootDir + "/biopos";
private final String _ChemNegDataSet = _RootDir + "/chemneg";
private final String _ChemPosDataSet = _RootDir + "/chempos";
private static FTO_PatentScorer_TrainedModel instance = null;
public static FTO_PatentScorer_TrainedModel getModel() {
if (instance == null) {
instance = new FTO_PatentScorer_TrainedModel();
}
return instance;
}
private FTO_PatentScorer_TrainedModel() {
initModel();
dumpValidationAgainstTrainingData();
}
public double ProbabilityOf(String text) {
return NormalizeScoreToProbability(ScoreText(text));
}
private double NormalizeScoreToProbability(int score) {
// normalization function is 1-e(-B x score)
// where B is calculated optimally from the dataset
return 1 - Math.exp(-this.modelNormalizationParam * score);
}
private int ScoreText(String text) {
int out = 0;
Set<String> extract = extractTokens(text);
for(String str : this.model.keySet()) {
if (extract.contains(str)) {
out += this.model.get(str);
}
}
return out;
}
private void initModel() {
// check that there are training files in the positive, negative datasets
if (!FTO_Utils.filesPresentIn(_PosDataSet) || !FTO_Utils.filesPresentIn(_NegDataSet)) {
System.err.println("First time initialization. Downloading training set.");
DownloadTrainingDataSets();
}
Map<String, Integer> pattern = calculatePattern(_NegDataSet, _PosDataSet);
this.model = pattern;
Double normParam = calculateNormalizationParam(_NegDataSet, _PosDataSet);
this.modelNormalizationParam = normParam;
System.err.println("FTO: Pattern size = " + pattern.size());
System.err.println("FTO: 1-exp(-Bx) norm. B = " + normParam);
}
private void DownloadTrainingDataSets() {
// download text for biosynthesis and chemosynthesis datasets
CreateBiosynthesisDataSet();
CreateChemosynthesisDataSet();
}
private void dumpValidationAgainstTrainingData() {
try {
// dump all scores and probabilities for training negatives
File dir = new File(_NegDataSet);
for(File fily : dir.listFiles()) {
String text = FTO_Utils.readFile(fily.getAbsolutePath());
dumpScoreProbability("-", fily.getName(), text);
}
// dump all scores and probabilities for training positives
dir = new File(_PosDataSet);
for(File fily : dir.listFiles()) {
String text = FTO_Utils.readFile(fily.getAbsolutePath());
dumpScoreProbability("+", fily.getName(), text);
}
} catch (IOException e) {
e.printStackTrace();
}
}
private void dumpScoreProbability(String posOrNeg, String name, String text) {
double probability = ProbabilityOf(text);
System.err.println(posOrNeg + "\t" + name + "\t" + probability);
}
private final int CUTOFF = 5;
private Map<String, Integer> calculatePattern(String negDir, String posDir) {
Map<String, Integer> negs = readFolderAndHashOut(negDir);
Map<String, Integer> poss = readFolderAndHashOut(posDir);
Map<String, Integer> pattern = new HashMap<>();
for(String str : negs.keySet()) {
Integer negvalue = negs.get(str);
if(poss.containsKey(str)) {
Integer posvalue = poss.get(str);
Integer newval = posvalue-negvalue;
if (newval > CUTOFF) {
pattern.put(str, newval);
}
}
}
return pattern;
}
private Double calculateNormalizationParam(String negDir, String posDir) {
Set<Integer> negs = scoreFolder(negDir);
Set<Integer> poss = scoreFolder(posDir);
Double Lp = average(poss), Hn = average(negs);
if (Lp < Hn) {
System.err.println("FTO: Error. Centroid of +ves < -ves. Bad training data.");
System.err.println("FTO: This means that on average the +ve patents score.");
System.err.println("FTO: less than the -ve patents; but higher scores are");
System.err.println("FTO: supposed to mean more +ve. Abort!");
System.exit(-1);
}
// fit a 1-e(-B * x) curve to the positive and negative dataset
// where B is a positive real, which is learnt by maximizing
// the distance between the average of the negatives Hn and
// the average of the positives Lp.
// Maximization occurs where
// d/dB( e(-Hn * B) - e(-Lp * B) ) = 0
// i.e., Hn * e(-Hn * B) = Lp * e(-Lp * B) - solve for B
// Or Log(Lp/Hn) = B(Lp - Hn)
// Or B = Log(Lp/Hn)/(Lp - Hn)
Double B = Math.log(Lp/Hn) / (Lp - Hn);
return B;
}
private Double average(Set<Integer> S) {
Double avg = 0.0;
int sz = S.size();
for (Integer i : S) avg += (double)i/(double)sz;
return avg;
}
private Set<Integer> scoreFolder(String path) {
try {
Set<Integer> out = new HashSet<>();
File dir = new File(path);
for(File afile : dir.listFiles()) {
String text = FTO_Utils.readFile(afile.getAbsolutePath());
out.add(ScoreText(text));
}
return out;
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
return null;
}
}
private Map<String, Integer> readFolderAndHashOut(String path) {
try {
Map<String, Integer> out = new HashMap<>();
File dir = new File(path);
for(File afile : dir.listFiles()) {
String text = FTO_Utils.readFile(afile.getAbsolutePath());
Set<String> extract = extractTokens(text);
for(String str : extract) {
Integer value = 0;
if(out.containsKey(str)) {
value = out.get(str);
}
value++;
out.put(str, value);
}
}
return out;
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
return null;
}
}
private Set<String> extractTokens(String text) {
Set<String> out = new HashSet<>();
String patternString = "[0-9a-zA-Z]+";
Pattern patt = Pattern.compile(patternString);
Matcher matcher = patt.matcher(text);
boolean matches = matcher.matches();
int count = 0;
while(matcher.find()) {
count++;
out.add((text.substring(matcher.start(), matcher.end())).toLowerCase());
}
return out;
}
private void CreateBiosynthesisDataSet() {
File training = new File(_RootDir);
if(!training.exists()) {
training.mkdir();
}
// Create a list of patent urls talking about biosynthesis
List<String> positives = new ArrayList<>();
positives.add("WO2012016177A2"); //Amyris farnesene
positives.add("WO2013192543A2"); //Phytogene styrene
positives.add("EP2438178A2"); //Genomatica BDO
positives.add("US6194185"); //Wash U limonene
positives.add("US8828693"); //isopropanol
positives.add("EP1799828B1"); //phloroglucinol
positives.add("CA2112374C"); //yeast xylitol
positives.add("WO2013071112A1"); //yeast xylose
positives.add("WO2014066892A1"); //Dupont isoprene
positives.add("EP2252691B1"); //santalene Firmenich
positives.add("US7374920"); //
positives.add("US20120107893"); //Stephanopoulus very broad claim about something with indole and coli specifically
positives.add("US8889381"); //A host cell, comprising a nucleic acid molecule encoding a cis-abienol synthase
positives.add("US7238514"); //
positives.add("US20130302861"); //mitochondrial targeting
positives.add("US8062878"); //levopimaradiene synthase
positives.add("US5994114"); //taxadiene synthase
List<String> negatives = new ArrayList<>();
negatives.add("US5274029"); //
negatives.add("US3284393"); //
negatives.add("US7141615"); //
negatives.add("US3632822"); //
negatives.add("US3787335"); //
negatives.add("US8017658"); //
negatives.add("WO2012173477A1"); //
negatives.add("CN103275146A"); //
negatives.add("CN103113443A"); //
negatives.add("CN103755556A"); //
negatives.add("US20130143826"); //
negatives.add("WO2014078168A1"); //
negatives.add("US20130005581"); //
negatives.add("US20140303361"); //
negatives.add("CN103193799A"); //
negatives.add("CN103467567A"); //
negatives.add("WO2002044197A2"); //
negatives.add("US20140058063"); //
negatives.add("EP2729123A2"); //
negatives.add("US8470822"); //
negatives.add("WO2014031646A3"); //
negatives.add("CN102558143B"); //
negatives.add("WO2000026174A2"); //
negatives.add("US20110250626"); //coatings incorporating bioactive enzymes
// NOT IN CHRIS' DATASET THAT HE SENT OVER....
// negatives.add("US20130189677"); //terpenoid transporters
negatives.add("US20090238811"); //Enzymatic antimicrobial and antifouling coatings
negatives.add("US8846351"); //degrading cellulose
negatives.add("US20100248334"); //Biological active coating components
negatives.add("US20130338330"); //chemical synthesis
negatives.add("US20130331342"); //hair/scalp care compositions
negatives.add("CA2595380A1"); //Stabilized liquid polypeptide formulations
File afile = new File(_PosDataSet);
if(!afile.exists()) {
afile.mkdir();
}
for(String id : positives) {
try {
String text = FTO_Utils.GetPatentText(id);
FTO_Utils.writeFile(text, _PosDataSet + "/" + id + ".txt");
} catch(Exception err) {
err.printStackTrace();
}
}
afile = new File(_NegDataSet);
if(!afile.exists()) {
afile.mkdir();
}
for(String id : negatives) {
try {
String text = FTO_Utils.GetPatentText(id);
FTO_Utils.writeFile(text, _NegDataSet + "/" + id+".txt");
} catch(Exception err) {
err.printStackTrace();
}
}
}
private void CreateChemosynthesisDataSet() {
File training = new File(_RootDir);
if(!training.exists()) {
training.mkdir();
}
// Create a list of patent urls talking about biosynthesis
List<String> positives = new ArrayList<>();
positives.add("US2623897"); //Galllic acid esters
positives.add("WO2008065527A2"); //
positives.add("US7045654"); //
positives.add("US4788331"); //
positives.add("EP0771782A1"); //
positives.add("US2606186"); //
positives.add("US1836568"); //
positives.add("US2886438"); //
positives.add("US6399810"); //
positives.add("US2945068"); //
positives.add("US2155856"); //
List<String> negatives = new ArrayList<>();
negatives.add("US6180666"); //use
negatives.add("EP1159007A1"); //use
negatives.add("EP2753336A1"); //use
negatives.add("US3792014"); //use
negatives.add("WO2011138345A2");
negatives.add("US20100034762");
negatives.add("WO2012131348A1");
negatives.add("US6669964");
negatives.add("WO2009084020A2");
negatives.add("US2211485");
negatives.add("US5223179");
negatives.add("US20060286061");
negatives.add("US5756446");
negatives.add("EP2595599A1");
negatives.add("US4368056");
negatives.add("EP2582775A1");
negatives.add("US4379168");
negatives.add("US4915707");
negatives.add("US6200625");
negatives.add("USRE36982");
negatives.add("US4818250");
negatives.add("CA2118071C");
negatives.add("US6194185");
negatives.add("US5849680");
negatives.add("WO1999021891A1");
negatives.add("CA2492498C");
negatives.add("US6342535");
negatives.add("US5344776");
negatives.add("US7622269");
negatives.add("US20020058075");
negatives.add("US20040204497");
negatives.add("WO2014151732A1");
negatives.add("EP2502621A1");
negatives.add("US5427798");
negatives.add("US6312716");
negatives.add("WO1999038502A1");
negatives.add("US6462237");
negatives.add("EP2316456A1");
negatives.add("WO1999038503A1");
negatives.add("US4820522");
negatives.add("EP2649993A1");
negatives.add("US20110124718");
negatives.add("US8518438");
negatives.add("US20070237816");
negatives.add("US8658631");
negatives.add("US8609684");
negatives.add("US20080293804");
File afile = new File(_ChemPosDataSet);
if(!afile.exists()) {
afile.mkdir();
}
for(String id : positives) {
try {
String text = FTO_Utils.GetPatentText(id);
FTO_Utils.writeFile(text, _ChemPosDataSet + "/" + id + ".txt");
} catch(Exception err) {
err.printStackTrace();
}
}
afile = new File(_ChemNegDataSet);
if(!afile.exists()) {
afile.mkdir();
}
for(String id : negatives) {
try {
String text = FTO_Utils.GetPatentText(id);
FTO_Utils.writeFile(text, _ChemNegDataSet + "/" + id + ".txt");
// We use the negatives in this training set to also serve as
// training for the bio dataset; in addition to the chem dataset
// This is because the bioalgorithm has already been seeded with
// the positives, and could do with more negatives
// FTO_Utils.writeFile(text, _NegDataSet + "/" + id + ".txt");
} catch(Exception err) {
err.printStackTrace();
}
}
}
}
class FTO_Utils {
public static String GetPatentText(String id) throws IOException {
return fetch("https://www.google.com/patents/" + id);
}
public static String fetch(String link) throws IOException {
URL url = new URL(link);
String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36";
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("GET");
conn.setRequestProperty("User-Agent", USER_AGENT);
int respCode = conn.getResponseCode();
System.err.println("\nSearch Sending 'GET' request to URL : " + url);
System.err.println("Response Code : " + respCode);
if (respCode != 200) {
throw new IOException("StatusCode = " + respCode + " - GET returned not OK.\n" + url);
}
BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream()));
StringBuffer resp = new StringBuffer();
String inputLine;
while ((inputLine = in.readLine()) != null)
resp.append(inputLine);
in.close();
return resp.toString();
}
public static String fetch(String link, List<P<String, String>> data) throws IOException {
StringBuilder postData = new StringBuilder();
for (P<String,String> param : data) {
if (postData.length() != 0) postData.append('&');
postData.append(URLEncoder.encode(param.fst(), "UTF-8"));
postData.append('=');
postData.append(URLEncoder.encode(String.valueOf(param.snd()), "UTF-8"));
}
byte[] postDataBytes = postData.toString().getBytes("UTF-8");
URL url = new URL(link);
String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36";
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("POST");
conn.setRequestProperty("User-Agent", USER_AGENT);
conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
conn.setRequestProperty("Content-Length", String.valueOf(postDataBytes.length));
conn.setDoOutput(true);
conn.getOutputStream().write(postDataBytes);
int respCode = conn.getResponseCode();
System.err.println("\nSearch Sending 'GET' request to URL : " + url);
System.err.println("Response Code : " + respCode);
if (respCode != 200) {
throw new IOException("StatusCode = " + respCode + " - GET returned not OK.\n" + url);
}
BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream()));
StringBuffer resp = new StringBuffer();
String inputLine;
while ((inputLine = in.readLine()) != null)
resp.append(inputLine);
in.close();
return resp.toString();
}
// NOT USED
private static String readPatentFromDisk(String id) throws IOException {
String out = FTO_Utils.readFile("patents" + "/" + id.substring(0,4) + "/" + id + ".txt");;
if (out == null || out.isEmpty()) {
return null;
}
return out;
}
// NOT USED
private static void savePatentToDisk(String id, String text, int score) {
// Wierd! What does saving to disk have to do with CUTOFF_SCORES!
// final int CUTOFF_SCORE = 700;
// if(score < CUTOFF_SCORE) {
// text = Integer.toString(score);
// }
File dir = new File("patents");
if(!dir.exists()) {
dir.mkdir();
}
File subdir = new File("patents" + "/" + id.substring(0,4));
if(!subdir.exists()) {
subdir.mkdir();
}
String filename = "patents" + "/" + id.substring(0,4) + "/" + id + ".txt";
String filepath = new File(filename).getAbsolutePath();
FTO_Utils.writeFile(text, filepath);
}
public static boolean filesPresentIn(String dir) {
File dirf = new File(dir);
return dirf.isDirectory() && dirf.listFiles().length > 0;
}
public static String readFile(String path) throws IOException {
BufferedReader br = new BufferedReader(new InputStreamReader(new DataInputStream(new FileInputStream(path))));
String line;
StringBuffer sb = new StringBuffer();
while ((line = br.readLine()) != null) {
sb.append(line);
}
return sb.toString();
}
public static void writeFile(String datafile, String filePath) {
try {
Writer output = null;
File file = new File(filePath);
output = new FileWriter(file);
output.write(datafile);
output.close();
} catch (IOException ex) {
ex.printStackTrace();
}
}
public static void delay(int seconds) {
try {
// long rand = 0;
// while(rand < seconds*1000 || rand > seconds*2000) {
// rand = (long) (Math.random()*seconds*4000);
// }
long ms = seconds * (1000 + ((long) Math.random() * 1000));
Thread.sleep(ms);
} catch (InterruptedException ex) {
}
}
}