/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.twentyn.patentScorer;
import com.twentyn.patentExtractor.PatentDocument;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This class is very slightly modified copy of PatentMiner/src/org/twentyn/patentminer/GoogleSearcher.java from the
* experimental git repo.
*/
public class PatentModel {
private Map<String, Integer> model;
private Double modelNormalizationParam;
private final String _RootDir = "FTO_training";
private final String _NegDataSet = _RootDir + "/bioneg";
private final String _PosDataSet = _RootDir + "/biopos";
private final String _ChemNegDataSet = _RootDir + "/chemneg";
private final String _ChemPosDataSet = _RootDir + "/chempos";
private static PatentModel instance = null;
public static PatentModel getModel() {
if (instance == null) {
instance = new PatentModel();
}
return instance;
}
private PatentModel() {
initModel();
dumpValidationAgainstTrainingData();
}
public double ProbabilityOf(String text) {
return NormalizeScoreToProbability(ScoreText(text));
}
public double ProbabilityOf(PatentDocument patentDocument) {
StringBuilder builder = new StringBuilder();
for (String text : patentDocument.getClaimsText()) {
builder.append(text).append("\n");
}
for (String text : patentDocument.getTextContent()) {
builder.append(text).append("\n");
}
builder.append(patentDocument.getClaimsText());
return ProbabilityOf(builder.toString());
}
private double NormalizeScoreToProbability(int score) {
// normalization function is 1-e(-B x score)
// where B is calculated optimally from the dataset
return 1 - Math.exp(-this.modelNormalizationParam * score);
}
private int ScoreText(String text) {
int out = 0;
Set<String> extract = extractTokens(text);
for (String str : this.model.keySet()) {
if (extract.contains(str)) {
out += this.model.get(str);
}
}
return out;
}
private void initModel() {
// check that there are training files in the positive, negative datasets
if (!Utils.filesPresentIn(_PosDataSet) || !Utils.filesPresentIn(_NegDataSet)) {
System.err.println("First time initialization. Downloading training set.");
DownloadTrainingDataSets();
}
Map<String, Integer> pattern = calculatePattern(_NegDataSet, _PosDataSet);
this.model = pattern;
Double normParam = calculateNormalizationParam(_NegDataSet, _PosDataSet);
this.modelNormalizationParam = normParam;
System.err.println("FTO: Pattern size = " + pattern.size());
System.err.println("FTO: 1-exp(-Bx) norm. B = " + normParam);
}
private void DownloadTrainingDataSets() {
// download text for biosynthesis and chemosynthesis datasets
CreateBiosynthesisDataSet();
CreateChemosynthesisDataSet();
}
private void dumpValidationAgainstTrainingData() {
try {
// dump all scores and probabilities for training negatives
File dir = new File(_NegDataSet);
for (File fily : dir.listFiles()) {
String text = Utils.readFile(fily.getAbsolutePath());
dumpScoreProbability("-", fily.getName(), text);
}
// dump all scores and probabilities for training positives
dir = new File(_PosDataSet);
for (File fily : dir.listFiles()) {
String text = Utils.readFile(fily.getAbsolutePath());
dumpScoreProbability("+", fily.getName(), text);
}
} catch (IOException e) {
e.printStackTrace();
}
}
private void dumpScoreProbability(String posOrNeg, String name, String text) {
double probability = ProbabilityOf(text);
System.err.println(posOrNeg + "\t" + name + "\t" + probability);
}
private final int CUTOFF = 5;
private Map<String, Integer> calculatePattern(String negDir, String posDir) {
Map<String, Integer> negs = readFolderAndHashOut(negDir);
Map<String, Integer> poss = readFolderAndHashOut(posDir);
Map<String, Integer> pattern = new HashMap<>();
for (String str : negs.keySet()) {
Integer negvalue = negs.get(str);
if (poss.containsKey(str)) {
Integer posvalue = poss.get(str);
Integer newval = posvalue - negvalue;
if (newval > CUTOFF) {
pattern.put(str, newval);
}
}
}
return pattern;
}
private Double calculateNormalizationParam(String negDir, String posDir) {
Set<Integer> negs = scoreFolder(negDir);
Set<Integer> poss = scoreFolder(posDir);
Double Lp = average(poss), Hn = average(negs);
if (Lp < Hn) {
System.err.println("FTO: Error. Centroid of +ves < -ves. Bad training data.");
System.err.println("FTO: This means that on average the +ve patents score.");
System.err.println("FTO: less than the -ve patents; but higher scores are");
System.err.println("FTO: supposed to mean more +ve. Abort!");
System.exit(-1);
}
// fit a 1-e(-B * x) curve to the positive and negative dataset
// where B is a positive real, which is learnt by maximizing
// the distance between the average of the negatives Hn and
// the average of the positives Lp.
// Maximization occurs where
// d/dB( e(-Hn * B) - e(-Lp * B) ) = 0
// i.e., Hn * e(-Hn * B) = Lp * e(-Lp * B) - solve for B
// Or Log(Lp/Hn) = B(Lp - Hn)
// Or B = Log(Lp/Hn)/(Lp - Hn)
Double B = Math.log(Lp / Hn) / (Lp - Hn);
return B;
}
private Double average(Set<Integer> S) {
Double avg = 0.0;
int sz = S.size();
for (Integer i : S) avg += (double) i / (double) sz;
return avg;
}
private Set<Integer> scoreFolder(String path) {
try {
Set<Integer> out = new HashSet<>();
File dir = new File(path);
for (File afile : dir.listFiles()) {
String text = Utils.readFile(afile.getAbsolutePath());
out.add(ScoreText(text));
}
return out;
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
return null;
}
}
private Map<String, Integer> readFolderAndHashOut(String path) {
try {
Map<String, Integer> out = new HashMap<>();
File dir = new File(path);
for (File afile : dir.listFiles()) {
String text = Utils.readFile(afile.getAbsolutePath());
Set<String> extract = extractTokens(text);
for (String str : extract) {
Integer value = 0;
if (out.containsKey(str)) {
value = out.get(str);
}
value++;
out.put(str, value);
}
}
return out;
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
return null;
}
}
private Set<String> extractTokens(String text) {
Set<String> out = new HashSet<>();
String patternString = "[0-9a-zA-Z]+";
Pattern patt = Pattern.compile(patternString);
Matcher matcher = patt.matcher(text);
boolean matches = matcher.matches();
int count = 0;
while (matcher.find()) {
count++;
out.add((text.substring(matcher.start(), matcher.end())).toLowerCase());
}
return out;
}
private void CreateBiosynthesisDataSet() {
File training = new File(_RootDir);
if (!training.exists()) {
training.mkdir();
}
// Create a list of patent urls talking about biosynthesis
List<String> positives = new ArrayList<>();
positives.add("WO2012016177A2"); //Amyris farnesene
positives.add("WO2013192543A2"); //Phytogene styrene
positives.add("EP2438178A2"); //Genomatica BDO
positives.add("US6194185"); //Wash U limonene
positives.add("US8828693"); //isopropanol
positives.add("EP1799828B1"); //phloroglucinol
positives.add("CA2112374C"); //yeast xylitol
positives.add("WO2013071112A1"); //yeast xylose
positives.add("WO2014066892A1"); //Dupont isoprene
positives.add("EP2252691B1"); //santalene Firmenich
positives.add("US7374920"); //
positives.add("US20120107893"); //Stephanopoulus very broad claim about something with indole and coli specifically
positives.add("US8889381"); //A host cell, comprising a nucleic acid molecule encoding a cis-abienol synthase
positives.add("US7238514"); //
positives.add("US20130302861"); //mitochondrial targeting
positives.add("US8062878"); //levopimaradiene synthase
positives.add("US5994114"); //taxadiene synthase
List<String> negatives = new ArrayList<>();
negatives.add("US5274029"); //
negatives.add("US3284393"); //
negatives.add("US7141615"); //
negatives.add("US3632822"); //
negatives.add("US3787335"); //
negatives.add("US8017658"); //
negatives.add("WO2012173477A1"); //
negatives.add("CN103275146A"); //
negatives.add("CN103113443A"); //
negatives.add("CN103755556A"); //
negatives.add("US20130143826"); //
negatives.add("WO2014078168A1"); //
negatives.add("US20130005581"); //
negatives.add("US20140303361"); //
negatives.add("CN103193799A"); //
negatives.add("CN103467567A"); //
negatives.add("WO2002044197A2"); //
negatives.add("US20140058063"); //
negatives.add("EP2729123A2"); //
negatives.add("US8470822"); //
negatives.add("WO2014031646A3"); //
negatives.add("CN102558143B"); //
negatives.add("WO2000026174A2"); //
negatives.add("US20110250626"); //coatings incorporating bioactive enzymes
// NOT IN CHRIS' DATASET THAT HE SENT OVER....
// negatives.add("US20130189677"); //terpenoid transporters
negatives.add("US20090238811"); //Enzymatic antimicrobial and antifouling coatings
negatives.add("US8846351"); //degrading cellulose
negatives.add("US20100248334"); //Biological active coating components
negatives.add("US20130338330"); //chemical synthesis
negatives.add("US20130331342"); //hair/scalp care compositions
negatives.add("CA2595380A1"); //Stabilized liquid polypeptide formulations
File afile = new File(_PosDataSet);
if (!afile.exists()) {
afile.mkdir();
}
for (String id : positives) {
try {
String text = Utils.GetPatentText(id);
Utils.writeFile(text, _PosDataSet + "/" + id + ".txt");
} catch (Exception err) {
err.printStackTrace();
}
}
afile = new File(_NegDataSet);
if (!afile.exists()) {
afile.mkdir();
}
for (String id : negatives) {
try {
String text = Utils.GetPatentText(id);
Utils.writeFile(text, _NegDataSet + "/" + id + ".txt");
} catch (Exception err) {
err.printStackTrace();
}
}
}
private void CreateChemosynthesisDataSet() {
File training = new File(_RootDir);
if (!training.exists()) {
training.mkdir();
}
// Create a list of patent urls talking about biosynthesis
List<String> positives = new ArrayList<>();
positives.add("US2623897"); //Galllic acid esters
positives.add("WO2008065527A2"); //
positives.add("US7045654"); //
positives.add("US4788331"); //
positives.add("EP0771782A1"); //
positives.add("US2606186"); //
positives.add("US1836568"); //
positives.add("US2886438"); //
positives.add("US6399810"); //
positives.add("US2945068"); //
positives.add("US2155856"); //
List<String> negatives = new ArrayList<>();
negatives.add("US6180666"); //use
negatives.add("EP1159007A1"); //use
negatives.add("EP2753336A1"); //use
negatives.add("US3792014"); //use
negatives.add("WO2011138345A2");
negatives.add("US20100034762");
negatives.add("WO2012131348A1");
negatives.add("US6669964");
negatives.add("WO2009084020A2");
negatives.add("US2211485");
negatives.add("US5223179");
negatives.add("US20060286061");
negatives.add("US5756446");
negatives.add("EP2595599A1");
negatives.add("US4368056");
negatives.add("EP2582775A1");
negatives.add("US4379168");
negatives.add("US4915707");
negatives.add("US6200625");
negatives.add("USRE36982");
negatives.add("US4818250");
negatives.add("CA2118071C");
negatives.add("US6194185");
negatives.add("US5849680");
negatives.add("WO1999021891A1");
negatives.add("CA2492498C");
negatives.add("US6342535");
negatives.add("US5344776");
negatives.add("US7622269");
negatives.add("US20020058075");
negatives.add("US20040204497");
negatives.add("WO2014151732A1");
negatives.add("EP2502621A1");
negatives.add("US5427798");
negatives.add("US6312716");
negatives.add("WO1999038502A1");
negatives.add("US6462237");
negatives.add("EP2316456A1");
negatives.add("WO1999038503A1");
negatives.add("US4820522");
negatives.add("EP2649993A1");
negatives.add("US20110124718");
negatives.add("US8518438");
negatives.add("US20070237816");
negatives.add("US8658631");
negatives.add("US8609684");
negatives.add("US20080293804");
File afile = new File(_ChemPosDataSet);
if (!afile.exists()) {
afile.mkdir();
}
for (String id : positives) {
try {
String text = Utils.GetPatentText(id);
Utils.writeFile(text, _ChemPosDataSet + "/" + id + ".txt");
} catch (Exception err) {
err.printStackTrace();
}
}
afile = new File(_ChemNegDataSet);
if (!afile.exists()) {
afile.mkdir();
}
for (String id : negatives) {
try {
String text = Utils.GetPatentText(id);
Utils.writeFile(text, _ChemNegDataSet + "/" + id + ".txt");
// We use the negatives in this training set to also serve as
// training for the bio dataset; in addition to the chem dataset
// This is because the bioalgorithm has already been seeded with
// the positives, and could do with more negatives
// FTO_Utils.writeFile(text, _NegDataSet + "/" + id + ".txt");
} catch (Exception err) {
err.printStackTrace();
}
}
}
}