/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.metadata;
import act.server.NoSQLAPI;
import act.shared.Reaction;
import org.apache.commons.io.FileUtils;
import org.json.JSONArray;
import org.json.JSONObject;
import java.io.File;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
/**
* Requires newer database:
* mongod --dbpath /Users/jca20n/Downloads/2016-11-22-full-run-w-validation & disown %1
* <p>
* This one has the modification and cloning data also included
* mongod --dbpath /Users/jca20n/Downloads/2016-12-08-actv01_vijay_proteins_only & disown %1
* Created by jca20n on 12/1/16.
*/
public class ProteinMetadataFactory {
// Localization terms encountered in the data that have no entry in termToHostLocalization;
// filled in by handleLocalization and written to a file by main
private Set<String> dataList = new HashSet<>();
// Occurrence counts keyed by the tab-joined leading words of 'cloned' comments;
// only populated by printoutHosts
private Map<String, Integer> dataMap = new HashMap<>();
//Data used in algorithms
// Lower-cased modification terms whose value in the term file parsed as true
private Set<String> modificationTermsTrue;
// Lower-cased modification terms whose value in the term file did not parse as true
private Set<String> modificationTermsFalse;
// For each 'cloned' term, the Genus.similarity score of the term's genus against every Host
private Map<String, Map<Host, Integer>> clonedtermToScore;
// For each localization term, the Localization assigned to every Host
private Map<String, Map<Host, Localization>> termToHostLocalization;
// Instances are only created through initiate(), which fills in the lookup tables above
private ProteinMetadataFactory() {}
/**
 * Builds a factory whose lookup tables are loaded from the tab-separated files under
 * data/ProteinMetadata (paths are relative to the working directory).
 * <p>
 * Three tables are loaded: modification terms (term, boolean), cloned terms (term, genus name)
 * and localization terms (term followed by one Localization name per host column). The first
 * line of every file is skipped. Rows that are blank or lack a required column are ignored.
 *
 * @return a factory with all lookup tables populated
 * @throws Exception if a data file cannot be read or the genus data cannot be parsed
 */
public static ProteinMetadataFactory initiate() throws Exception {
  //Construct the Sets to hold modification data terms
  Set<String> modTrue = new HashSet<>();
  Set<String> modFalse = new HashSet<>();
  // TODO: Move this to resources directory?
  String[] lines = readTermFileLines("data/ProteinMetadata/2016_12_07-modification_terms.txt");
  for (int i = 1; i < lines.length; i++) {
    String[] tabs = lines[i].split("\t");
    //String.split drops trailing empty columns, so a short row must be skipped, not indexed
    if (tabs.length < 2) {
      continue;
    }
    String term = tabs[0].toLowerCase();
    if (Boolean.parseBoolean(tabs[1])) {
      modTrue.add(term);
    } else {
      modFalse.add(term);
    }
  }

  //Construct term data for 'cloned'
  Map<String, String> termToGenus = new HashMap<>();
  // TODO: Move this to resources directory?
  lines = readTermFileLines("data/ProteinMetadata/2016_12_07-cloned_term_to_genus.txt");
  for (int i = 1; i < lines.length; i++) {
    String[] tabs = lines[i].split("\t");
    if (tabs.length < 2 || tabs[0].isEmpty() || tabs[1].isEmpty()) {
      continue;
    }
    termToGenus.put(tabs[0], tabs[1]);
  }

  //Pre-Compute distance to hosts for all terms for "cloned"
  Map<String, Genus> nameToGenus = Genus.parseGenuses();
  Map<String, Map<Host, Integer>> termToScore = new HashMap<>();
  for (Map.Entry<String, String> entry : termToGenus.entrySet()) {
    Genus ggenus = nameToGenus.get(entry.getValue());
    Map<Host, Integer> hostToScore = new HashMap<>();
    for (Host host : Host.values()) {
      Genus hostgenus = nameToGenus.get(host.toString());
      Integer score = Genus.similarity(ggenus, hostgenus);
      hostToScore.put(host, score);
    }
    termToScore.put(entry.getKey(), hostToScore);
  }

  //Construct the data to handle localization
  //Column x (1-based) of the localization file belongs to columnHosts[x - 1]
  Host[] columnHosts = {
      Host.Ecoli, Host.Bsubtilis, Host.Cglutamicum, Host.Scerevisiae,
      Host.Ppasteuris, Host.Aniger, Host.Hsapiens, Host.Sfrugiperda
  };
  Map<String, Map<Host, Localization>> locMap = new HashMap<>();
  // TODO: Move this to resources directory?
  lines = readTermFileLines("data/ProteinMetadata/2016_12_06-localization.txt");
  for (int i = 1; i < lines.length; i++) {
    String[] tabs = lines[i].split("\t");
    String term = tabs[0];
    //A blank row carries no term; do not register an empty-string term
    if (term.isEmpty()) {
      continue;
    }
    //Create the host map for each term and initially assume all values are 'questionable'
    Map<Host, Localization> hostToLoc = new HashMap<>();
    for (Host host : Host.values()) {
      hostToLoc.put(host, Localization.questionable);
    }
    //Parse the line and put in replacement values per host
    for (int x = 1; x < tabs.length; x++) {
      try {
        Localization loc = Localization.valueOf(tabs[x]);
        if (x <= columnHosts.length) {
          hostToLoc.put(columnHosts[x - 1], loc);
        }
      } catch (Exception err) {
        //Unrecognized cell: report it and leave this host as 'questionable'
        err.printStackTrace();
      }
    }
    locMap.put(term, hostToLoc);
  }

  //Create the factory and put in data
  ProteinMetadataFactory factory = new ProteinMetadataFactory();
  factory.modificationTermsTrue = modTrue;
  factory.modificationTermsFalse = modFalse;
  factory.clonedtermToScore = termToScore;
  factory.termToHostLocalization = locMap;
  return factory;
}

/**
 * Reads a whole text file and splits it into lines.
 * CRLF is matched before a lone CR so that Windows line endings do not yield empty lines.
 */
private static String[] readTermFileLines(String path) throws Exception {
  String data = FileUtils.readFileToString(new File(path));
  return data.split("\\r?\\n|\\r");
}
/**
 * Produces a ProteinMetadata record from one protein annotation object by running each
 * per-field handler on it and storing the results in the record.
 *
 * @param json the protein annotation to interpret
 * @return a newly populated ProteinMetadata
 * @throws Exception declared for callers; propagates anything a handler throws
 */
public ProteinMetadata create(JSONObject json) throws Exception {
  ProteinMetadata result = new ProteinMetadata();
  //Handlers are invoked in a fixed order; handleLocalization records missing terms as it runs
  result.kcatkm = handleKcatKm(json);
  result.specificActivity = handleSpecificActivity(json);
  result.heteroSubunits = handlesubunits(json);
  result.modifications = handleModifications(json);
  result.cloned = handleCloned(json);
  result.localization = handleLocalization(json);
  result.sequences = handleSequences(json);
  return result;
}
/**
 * Extracts the sequence ids listed under the "sequences" key.
 *
 * @param json the protein annotation to read
 * @return the ids in array order; an empty list when the key is absent, is not an array,
 *         or the array is empty
 */
private List<Long> handleSequences(JSONObject json) {
  List<Long> seqIds = new ArrayList<>();
  //Try to pull the data
  JSONArray jarray;
  try {
    jarray = json.getJSONArray("sequences");
  } catch (Exception err) {
    //No usable "sequences" entry: report no sequences
    return seqIds;
  }
  for (int i = 0; i < jarray.length(); i++) {
    //getLong accepts any numeric element; a (Long) cast fails when the value is held as an Integer
    seqIds.add(jarray.getLong(i));
  }
  return seqIds;
}
/**
 * Reads the "kcat/km" observations and reduces them to a single number.
 * <p>
 * A lone observation is returned as-is. With several observations the largest readable "val"
 * is returned, but only when it is greater than zero; otherwise the result is null.
 * <p>
 * TODO: the name of chemical in the substrate field could be matched to the substrate of the
 * reaction. Currently the highest value is picked, but it would be best to pick the highest
 * value for the substrate being examined. This complicates the wiring, so it was not attempted.
 *
 * @param json the protein annotation to read
 * @return the selected value, or null when the key is missing, the array is empty, or no
 *         usable value is found
 */
private Double handleKcatKm(JSONObject json) {
  final JSONArray observations;
  try {
    observations = json.getJSONArray("kcat/km");
  } catch (Exception err) {
    return null;
  }

  final int count = observations.length();
  if (count == 0) {
    //No data means the value is undefined
    return null;
  }

  if (count == 1) {
    //Exactly one observation: hand it back unfiltered
    try {
      return observations.getJSONObject(0).getDouble("val");
    } catch (Exception err) {
      return null;
    }
  }

  //Several observations: keep the largest one that can be read
  double best = -1.0;
  for (int idx = 0; idx < count; idx++) {
    try {
      double candidate = observations.getJSONObject(idx).getDouble("val");
      if (candidate > best) {
        best = candidate;
      }
    } catch (Exception err) {
      //Unreadable observation, move on to the next one
    }
  }
  if (best > 0) {
    return best;
  }
  return null;
}
/**
 * Reads the "specific_activity" observations and reduces them to a single number.
 * <p>
 * One observation is returned directly. For two or more, the maximum readable "val" is
 * returned provided it is greater than zero; otherwise null.
 *
 * @param json the protein annotation to read
 * @return the selected value, or null when the key is missing, the array is empty, or nothing
 *         usable was found
 */
private Double handleSpecificActivity(JSONObject json) {
  JSONArray entries;
  try {
    entries = json.getJSONArray("specific_activity");
  } catch (Exception err) {
    return null;
  }

  int total = entries.length();
  if (total == 0) {
    return null;
  }

  if (total > 1) {
    //Scan every entry, remembering the biggest value seen so far
    double max = -1.0;
    int pos = 0;
    while (pos < total) {
      try {
        JSONObject entry = entries.getJSONObject(pos);
        double current = entry.getDouble("val");
        max = (current > max) ? current : max;
      } catch (Exception err) {
        //Entry could not be read; ignore it
      }
      pos++;
    }
    Double answer = null;
    if (max > 0) {
      answer = max;
    }
    return answer;
  }

  //Single entry case
  try {
    JSONObject only = entries.getJSONObject(0);
    return only.getDouble("val");
  } catch (Exception err) {
    return null;
  }
}
/**
 * Decides whether the protein is reported to have hetero subunits, based on the "subunits"
 * observations.
 * <p>
 * Each observation is classified by assignSubunitHelper. Any true observation makes the result
 * true; otherwise the result is false if at least one observation classified as false, and
 * null if none could be classified.
 *
 * @param json the protein annotation to read
 * @return true, false, or null when the key is missing, the array is empty, or no observation
 *         could be classified
 */
private Boolean handlesubunits(JSONObject json) {
  //Try to pull the data
  JSONArray jarray;
  try {
    jarray = json.getJSONArray("subunits");
  } catch (Exception err) {
    return null;
  }
  //Consensus over all observations with a true value dominating
  Boolean out = null;
  for (int i = 0; i < jarray.length(); i++) {
    JSONObject obj;
    try {
      obj = jarray.getJSONObject(i);
    } catch (Exception err) {
      //Element is not a JSON object; it carries no information
      continue;
    }
    Boolean bval = assignSubunitHelper(obj);
    //Check for null explicitly instead of relying on an unboxing NullPointerException
    if (bval == null) {
      continue;
    }
    if (bval) {
      return true;
    }
    out = false;
  }
  return out;
}
/**
 * Classifies one subunit observation from its "val" and "comment" strings.
 *
 * @param json a single element of the "subunits" array
 * @return true when the observation mentions "hetero" (in either string) or its comment mentions
 *         both "alpha" and "beta"; false otherwise; null when "val" is "?" or either string
 *         cannot be read
 */
private Boolean assignSubunitHelper(JSONObject json) {
  final String value;
  final String note;
  try {
    value = json.getString("val");
    note = json.getString("comment");
  } catch (Exception err) {
    return null;
  }

  if ("?".equals(value)) {
    return null;
  }
  //Any mention of "hetero" wins over everything else
  if (value.contains("hetero") || note.contains("hetero")) {
    return true;
  }
  //Monomers and homo-oligomers are not hetero, whatever the comment says
  if (value.contains("monomer") || value.startsWith("homo")) {
    return false;
  }
  //Last resort: a comment naming both alpha and beta chains
  return note.contains("alpha") && note.contains("beta");
}
/**
 * This is a unit test for handlesubunits, driven by a tab-separated test-set file.
 * <p>
 * The first line of the file is skipped. On every other non-blank line, column 0 holds the JSON
 * array to place under "subunits" (optionally wrapped in quotes, with doubled quotes inside) and
 * column 1 holds the expected result: "null", or a value read with Boolean.parseBoolean.
 *
 * @return true when every row produced the expected result, false at the first mismatch
 * @throws Exception if the file cannot be read, a row lacks the expected-result column, or the
 *         JSON in a row cannot be parsed
 */
private boolean testHandlesubunits() throws Exception {
  // TODO: Move this to resources directory?
  File testfile = new File("data/ProteinMetadata/2016_12_07-subunit_testset.txt");
  String data = FileUtils.readFileToString(testfile);
  data = data.replaceAll("\"\"", "\"");
  //Match CRLF before a lone CR so Windows line endings do not produce empty lines
  String[] lines = data.split("\\r?\\n|\\r");
  for (int i = 1; i < lines.length; i++) {
    String line = lines[i];
    if (line.trim().isEmpty()) {
      continue;
    }
    String[] tabs = line.split("\t");
    if (tabs.length < 2) {
      throw new IllegalArgumentException("Malformed subunit test row " + i + ": " + line);
    }
    //Pull out the json for each test and re-wrap it
    String arrayStr = tabs[0];
    if (arrayStr.startsWith("\"")) {
      arrayStr = arrayStr.substring(1);
    }
    if (arrayStr.endsWith("\"")) {
      arrayStr = arrayStr.substring(0, arrayStr.length() - 1);
    }
    String jsonstr = "{\"subunits\":" + arrayStr + "}";
    JSONObject json = new JSONObject(jsonstr);
    //Run the test and compare result to expected
    Boolean result = this.handlesubunits(json);
    Boolean expected = null;
    if (!tabs[1].equals("null")) {
      expected = Boolean.parseBoolean(tabs[1]);
    }
    //Compare by value; == on boxed Booleans compares references
    boolean matches = (result == null) ? (expected == null) : result.equals(expected);
    if (!matches) {
      System.err.println("Subunit testing error:\n" + json.toString() + " " + tabs[1]);
      System.err.println("Expect: " + expected + " Found: " + result);
      return false;
    }
  }
  return true;
}
/**
 * Decides whether the protein is reported to carry a post-translational modification, based
 * on the "post_translational_modification" observations.
 * <p>
 * Each observation is classified by assignModificationHelper. Any true observation makes the
 * result true; otherwise the result is false if at least one observation classified as false,
 * and null if none could be classified.
 *
 * @param json the protein annotation to read
 * @return true, false, or null when the key is missing, the array is empty, or no observation
 *         could be classified
 */
private Boolean handleModifications(JSONObject json) {
  //Try to pull the data
  JSONArray jarray;
  try {
    jarray = json.getJSONArray("post_translational_modification");
  } catch (Exception err) {
    return null;
  }
  //Consensus over all observations with a true value dominating
  Boolean out = null;
  for (int i = 0; i < jarray.length(); i++) {
    JSONObject obj;
    try {
      obj = jarray.getJSONObject(i);
    } catch (Exception err) {
      //Element is not a JSON object; it carries no information
      continue;
    }
    Boolean bval = assignModificationHelper(obj);
    //Check for null explicitly instead of relying on an unboxing NullPointerException
    if (bval == null) {
      continue;
    }
    if (bval) {
      return true;
    }
    out = false;
  }
  return out;
}
/**
 * Classifies one modification observation by looking its term up in the two term sets.
 *
 * @param json a single element of the "post_translational_modification" array
 * @return true when the term is in modificationTermsTrue, false when it is in
 *         modificationTermsFalse, null when it is in neither or cannot be read
 */
private Boolean assignModificationHelper(JSONObject json) {
  try {
    //The term sets are lower-cased when loaded, so the lookup key must be lower-cased too
    String ptm = json.getString("post_translational_modification").toLowerCase();
    if (this.modificationTermsTrue.contains(ptm)) {
      return true;
    }
    if (this.modificationTermsFalse.contains(ptm)) {
      return false;
    }
    return null;
  } catch (Exception err) {
    return null;
  }
}
/**
 * Scores each host from the "cloned" observations.
 * <p>
 * Every comment is lower-cased and split on whitespace, commas and semicolons. Each resulting
 * word that is a known cloned term contributes its per-host scores, and the largest score seen
 * for a host is kept. Any exception while reading the data stops processing and returns what
 * has been gathered so far.
 *
 * @param json the protein annotation to read
 * @return a map holding every Host; the value is null where no score was found
 */
private Map<Host, Integer> handleCloned(JSONObject json) {
  //Start from "no prediction" for every host
  Map<Host, Integer> best = new HashMap<>();
  for (Host h : Host.values()) {
    best.put(h, null);
  }

  try {
    JSONArray observations = json.getJSONArray("cloned");
    for (int idx = 0; idx < observations.length(); idx++) {
      String text = observations.getJSONObject(idx).getString("comment");
      for (String token : text.toLowerCase().split("[\\s,;]+")) {
        Map<Host, Integer> scores = this.clonedtermToScore.get(token);
        if (scores != null) {
          for (Map.Entry<Host, Integer> scored : scores.entrySet()) {
            Integer existing = best.get(scored.getKey());
            //A host without a score yet competes with a very low floor
            int floor = (existing == null) ? -99999 : existing;
            Integer candidate = scored.getValue();
            if (candidate > floor) {
              best.put(scored.getKey(), candidate);
            }
          }
        }
      }
    }
  } catch (Exception err) {
    //Best effort: keep whatever was collected before the failure
  }
  return best;
}
// Dead code: This was being called from within `handleCloned` but not now.
/**
 * Tallies, in dataMap, the phrase that opens at the first "in " of each "cloned" comment.
 * The tally key is made of up to the first two lower-cased words of that phrase, each followed
 * by a tab. Comments without "in " are skipped; any exception ends the scan silently.
 *
 * @param json the protein annotation to read
 */
private void printoutHosts(JSONObject json) {
  try {
    JSONArray cloned = json.getJSONArray("cloned");
    for (int idx = 0; idx < cloned.length(); idx++) {
      String text = cloned.getJSONObject(idx).getString("comment");
      int start = text.indexOf("in ");
      if (start >= 0) {
        String[] tokens = text.substring(start).toLowerCase().split("[\\s,;]+");
        //Join at most two leading tokens, tab-terminated
        StringBuilder key = new StringBuilder();
        int take = Math.min(tokens.length, 2);
        for (int t = 0; t < take; t++) {
          key.append(tokens[t]).append("\t");
        }
        String datapoint = key.toString();
        Integer seen = dataMap.get(datapoint);
        dataMap.put(datapoint, seen == null ? 1 : seen + 1);
      }
    }
  } catch (Exception err) {
    //Nothing to tally when the data is absent or malformed
  }
}
/**
 * Predicts a Localization per host from the "localization" observations.
 * <p>
 * Every host starts as unknown. Each observation's "val" term is looked up in
 * termToHostLocalization; a term with no entry is printed, recorded in dataList and skipped.
 * For a known term, a host that is still unknown takes the term's value, a host that is already
 * questionable stays questionable, and a host whose current value differs from the term's value
 * becomes questionable.
 *
 * @param json the protein annotation to read
 * @return a map holding every Host; all values are unknown when the "localization" key is
 *         missing, is not an array, or the array is empty
 */
private Map<Host, Localization> handleLocalization(JSONObject json) {
  //Construct the output with unknown for all Hosts
  Map<Host, Localization> out = new HashMap<>();
  for (Host host : Host.values()) {
    out.put(host, Localization.unknown);
  }
  //Parse out localization information from json and populate the output map with predictions
  JSONArray jarray;
  try {
    jarray = json.getJSONArray("localization");
  } catch (Exception err) {
    //If there is no metadata, all values should be "unknown" (previously this fell through
    //to a NullPointerException on the unset array)
    return out;
  }
  //Scan through each observation
  for (int i = 0; i < jarray.length(); i++) {
    try {
      JSONObject obj = jarray.getJSONObject(i);
      String term = obj.getString("val");
      Map<Host, Localization> hostToLoc = this.termToHostLocalization.get(term);
      if (hostToLoc == null) {
        System.out.println("missing term: " + term);
        dataList.add(term);
        continue;
      }
      for (Map.Entry<Host, Localization> entry : hostToLoc.entrySet()) {
        Host host = entry.getKey();
        Localization currval = out.get(host); // Whatever is currently in the Map
        Localization newval = entry.getValue(); // The potential new value
        if (currval == Localization.unknown) {
          //Nothing known yet for this host: take the value that comes with the term
          out.put(host, newval);
        } else if (currval != Localization.questionable && currval != newval) {
          //Two non-identical predictions for the same host: this becomes questionable.
          //A host that is already questionable is left untouched.
          out.put(host, Localization.questionable);
        }
      }
    } catch (Exception err) {
      err.printStackTrace();
    }
  }
  return out;
}
/**
 * Scans the reactions in the database, builds a ProteinMetadata for every protein annotation of
 * each BRENDA-sourced reaction, writes the factory's collected messages to files under
 * output/ProteinMetadata, and prints summary counts for the modification and cloned fields.
 *
 * @param args unused
 * @throws Exception if the lookup tables cannot be loaded, a record cannot be processed, or an
 *         output file cannot be written
 */
public static void main(String[] args) throws Exception {
  // TODO: This is referencing a temporary collection. Change it!
  // TODO: FIX THIS BEFORE MERGE!
  NoSQLAPI api = new NoSQLAPI("actv01_vijay_proteins", "actv01_vijay_proteins");
  Iterator<Reaction> iterator = api.readRxnsFromInKnowledgeGraph();

  //Create a single instance of the factory method to use for all json
  ProteinMetadataFactory factory = ProteinMetadataFactory.initiate();

  //Run some tests
  try {
    if (factory.testHandlesubunits()) {
      System.out.println("Subunit test OK");
    }
  } catch (Exception err) {
    System.err.println("Failed to test subunits");
    //Show why the self-test could not run instead of discarding the cause
    err.printStackTrace();
  }

  //Create a list to aggregate the results of the database scan
  List<ProteinMetadata> agg = new ArrayList<>();

  //Scan the database and store ProteinMetadata objects
  while (iterator.hasNext()) {
    Reaction rxn = iterator.next();
    //Constant-first equals also skips reactions whose data source is null
    if (!Reaction.RxnDataSource.BRENDA.equals(rxn.getDataSource())) {
      continue;
    }
    Set<JSONObject> jsons = rxn.getProteinData();
    for (JSONObject json : jsons) {
      agg.add(factory.create(json));
    }
  }

  //Write out any messages to file
  StringBuilder sb = new StringBuilder();
  for (String aline : factory.dataList) {
    sb.append(aline).append("\n");
  }
  File outfile = new File("output/ProteinMetadata/Factory_output.txt");
  if (outfile.exists()) {
    outfile.delete();
  }
  FileUtils.writeStringToFile(outfile, sb.toString());

  sb = new StringBuilder();
  for (Map.Entry<String, Integer> entry : factory.dataMap.entrySet()) {
    sb.append(entry.getKey()).append("\t").append(entry.getValue()).append("\n");
  }
  outfile = new File("output/ProteinMetadata/Factory_output_map.txt");
  if (outfile.exists()) {
    outfile.delete();
  }
  FileUtils.writeStringToFile(outfile, sb.toString());

  //Count up the results of modifications to get statistics
  int falsecount = 0;
  int truecount = 0;
  int nullcount = 0;
  for (ProteinMetadata datum : agg) {
    if (datum == null) {
      System.err.println("null datum");
      continue;
    }
    if (datum.modifications == null) {
      nullcount++;
    } else if (datum.modifications) {
      truecount++;
    } else {
      falsecount++;
    }
  }
  System.out.println("Total # protein metadata: " + agg.size());
  System.out.println();
  System.out.println("modification true count: " + truecount);
  System.out.println("modification false count: " + falsecount);
  System.out.println("modification null count: " + nullcount);
  System.out.println();

  //Get some statistics for cloned
  nullcount = 0;
  int emptycount = 0;
  int colicount = 0;
  int humancount = 0;
  int bothcount = 0;
  for (ProteinMetadata datum : agg) {
    if (datum == null) {
      System.err.println("null datum");
      continue;
    }
    if (datum.cloned == null) {
      nullcount++;
      continue;
    }
    if (datum.cloned.isEmpty()) {
      emptycount++;
      continue;
    }
    Integer human = datum.cloned.get(Host.Hsapiens);
    boolean inHuman = human != null && human > 0;
    if (inHuman) {
      humancount++;
    }
    Integer coli = datum.cloned.get(Host.Ecoli);
    if (coli != null && coli > 0) {
      colicount++;
      if (inHuman) {
        bothcount++;
      }
    }
  }
  System.out.println("cloned null count: " + nullcount);
  System.out.println("cloned empty count: " + emptycount);
  System.out.println("cloned coli count: " + colicount);
  System.out.println("cloned human count: " + humancount);
  System.out.println("cloned both count: " + bothcount);
  System.out.println();
}
}