/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer.sequence;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.ArrayList;
import java.util.Set;
import java.util.HashSet;
import java.util.HashMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ThreadFactory;
import act.server.MongoDB;
import act.shared.Seq;
import act.shared.helpers.MongoDBToJSON;
import act.shared.sar.SAR;
import com.mongodb.DBObject;
import org.json.JSONObject;
import org.json.JSONArray;
import org.json.XML;
import org.json.JSONException;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
/**
 * One UniProt/SwissProt record, held as the JSON conversion of the XML found between an
 * {@code <entry>} start tag and its matching end tag.
 *
 * <p>Instances are only created by the {@code parsePossiblyMany} methods, which stream an XML
 * document and build one {@code SwissProtEntry} per {@code <entry>} element. The accessor methods
 * read fields out of the JSON held in {@link #data}.
 */
public class SwissProtEntry extends SequenceEntry {
  /** Local name of the XML element that delimits one record in the input. */
  private static final String ENTRY_ELEMENT = "entry";

  /** A progress line is printed every time this many entries have been handled. */
  private static final int PROGRESS_INTERVAL = 10000;

  /** JSON form of everything that appeared inside this record's {@code <entry>} element. */
  JSONObject data;

  /**
   * Parses every {@code <entry>} in the given file and writes each one to the database via
   * {@code writeToDB(db, Seq.AccDB.swissprot)}.
   *
   * @param uniprot_file the XML file to read
   * @param db the database each parsed entry is written to
   * @throws IOException if the file cannot be read or the XML stream is malformed
   */
  public static void parsePossiblyMany(File uniprot_file, final MongoDB db) throws IOException {
    try (FileInputStream fis = new FileInputStream(uniprot_file)) {
      SwissProtEntryHandler handler = new SwissProtEntryHandler() {
        @Override
        public void handle(SwissProtEntry entry) {
          // TODO: run this in a separate thread w/ a synchronized queue to connect it to the parser.
          entry.writeToDB(db, Seq.AccDB.swissprot);
        }
      };
      parsePossiblyMany(handler, fis, uniprot_file.toString());
    }
  }

  /**
   * Parses every {@code <entry>} in the given XML text and returns the resulting entries.
   *
   * @param is the XML document, as a string (encoded to UTF-8 before parsing)
   * @return the set of entries found; empty if none were found
   * @throws IOException if the XML stream is malformed
   */
  public static Set<SequenceEntry> parsePossiblyMany(String is) throws IOException {
    final Set<SequenceEntry> results = new HashSet<>();
    SwissProtEntryHandler handler = new SwissProtEntryHandler() {
      @Override
      public void handle(SwissProtEntry entry) {
        results.add(entry);
      }
    };
    InputStream sis = new ByteArrayInputStream(is.getBytes(StandardCharsets.UTF_8));
    parsePossiblyMany(handler, sis, "[String input]");
    return results;
  }

  /**
   * Streams the XML in {@code is}, and for each {@code <entry>...</entry>} span re-serializes the
   * events inside it to text, converts that text to JSON, and passes the resulting entry to
   * {@code handler}.
   *
   * <p>A {@link JSONException} raised while converting or handling an entry is reported on stdout
   * and ends processing of this source without being rethrown. The XML reader and writer are
   * always released, including when the handler throws. The input stream itself is not closed
   * here; that remains the caller's responsibility.
   *
   * @param handler receives each parsed entry
   * @param is the XML input, read as UTF-8
   * @param debugSource a label for the input, used only in progress and error messages
   * @throws IOException wrapping any {@link XMLStreamException} raised while reading or writing XML
   */
  private static void parsePossiblyMany(SwissProtEntryHandler handler,
                                        InputStream is, String debugSource) throws IOException {
    XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
    // The input may come from an arbitrary string or file; never resolve external entities (XXE).
    xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
    XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();

    XMLEventReader xr = null;
    XMLEventWriter xw = null;

    try {
      xr = xmlInputFactory.createXMLEventReader(is, "utf-8");

      /* The following few lines would look more natural as `new XMLEventWriter(new StringWriter())`, which sets
       * up an event chain that looks like `[Xml Event] -> {XMLEventWriter -> StringWriter} -> String`. The
       * XMLEventWriter does the interpretation and serialization of the events, and the StringWriter acts as a
       * buffer (which is why we need a reference to the StringWriter). */
      StringWriter w = new StringWriter();
      xw = xmlOutputFactory.createXMLEventWriter(w);

      boolean inEntry = false;
      int processedEntries = 0;
      while (xr.hasNext()) {
        XMLEvent e = xr.nextEvent();
        if (!inEntry && e.isStartElement()
            && e.asStartElement().getName().getLocalPart().equals(ENTRY_ELEMENT)) {
          // Found <entry>. The start tag itself is not forwarded; only its contents are.
          inEntry = true;
        } else if (e.isEndElement() && e.asEndElement().getName().getLocalPart().equals(ENTRY_ELEMENT)) {
          // Found </entry>.
          // Ensure that the XMLEventWriter has processed all events and sent them to the StringWriter it wraps.
          xw.flush();
          // w.toString() gets a textual representation of all the XML events we've sent to xw.
          String xmlText = w.toString();
          SwissProtEntry entry = new SwissProtEntry(XML.toJSONObject(xmlText));
          handler.handle(entry);

          /* Reset the XMLEventWriter(StringWriter()) chain to prepare for the next <entry> we find.
           * Note: this can also be accomplished with `w.getBuffer().setLength(0);`, but using a new event
           * writer seems safer. The reference is cleared before closing so the cleanup in `finally` never
           * touches a writer that has already been closed. */
          XMLEventWriter finishedWriter = xw;
          xw = null;
          finishedWriter.close();
          w = new StringWriter();
          xw = xmlOutputFactory.createXMLEventWriter(w);

          inEntry = false;
          processedEntries++;
          if (processedEntries % PROGRESS_INTERVAL == 0) {
            // TODO: proper logging!
            System.out.println("Processed " + processedEntries + " UniProt/SwissProt entries from " + debugSource);
          }
        } else if (inEntry) {
          // Found some element inside of an <entry></entry> element--just send it to the event stream.
          xw.add(e);
        }
      }

      // On the success path, close inside the try so that a close failure still surfaces as an IOException.
      XMLEventReader finishedReader = xr;
      xr = null;
      finishedReader.close();

      XMLEventWriter lastWriter = xw;
      xw = null;
      lastWriter.close();

      System.out.println("Completed processing " + processedEntries + " UniProt/SwissProt entries from " + debugSource);
    } catch (JSONException je) {
      System.out.println("Failed SwissProt parse: " + je.toString() + " XML file: " + debugSource);
    } catch (XMLStreamException e) {
      // TODO: do better.
      throw new IOException(e);
    } finally {
      // Anything still referenced here was left open by an early exit from the try block.
      closeQuietly(xr, xw, debugSource);
    }
  }

  /**
   * Best-effort release of the XML reader and writer; either argument may be null. Close failures
   * are reported on stdout rather than thrown so they cannot mask an exception already in flight.
   */
  private static void closeQuietly(XMLEventReader reader, XMLEventWriter writer, String debugSource) {
    if (writer != null) {
      try {
        writer.close();
      } catch (XMLStreamException ignored) {
        System.out.println("Failed to close XML event writer for " + debugSource);
      }
    }
    if (reader != null) {
      try {
        reader.close();
      } catch (XMLStreamException ignored) {
        System.out.println("Failed to close XML event reader for " + debugSource);
      }
    }
  }

  /** Wraps the JSON conversion of one {@code <entry>} element's contents. */
  private SwissProtEntry(JSONObject gene_entry) {
    this.data = gene_entry;
  }

  /**
   * Returns the id of the first {@code dbReference} of type {@code "EC"}, or null if there is none.
   */
  String getEc() {
    // data.dbReference.[{id:x.x.x.x, type:"EC"}...]
    return lookup_ref(this.data, "EC");
  }

  /** Returns the whole entry, converted with {@code MongoDBToJSON.conv}. */
  DBObject getMetadata() {
    return MongoDBToJSON.conv(this.data);
  }

  /** Currently always returns a new empty set. */
  Set<Long> getCatalyzedRxns() {
    // optionally add reactions to actfamilies by processing
    // "catalytic activity" annotations and then return those
    // catalyzed reaction ids (Long _id of actfamilies). This
    // function SHOULD NOT infer which actfamilies refer to
    // this object, as that is done in map_seq install.
    return new HashSet<Long>();
  }

  /** Currently always returns a new empty set. */
  Set<Long> getCatalyzedSubstratesDiverse() {
    // see comment in getCatalyzedRxns for the function here
    // when we want to NLP/parse out the "catalysis activity"
    // field, we will return that here.
    return new HashSet<Long>();
  }

  /** Currently always returns a new empty set. */
  Set<Long> getCatalyzedProductsDiverse() {
    // see comment in getCatalyzedRxns for the function here
    // when we want to NLP/parse out the "catalysis activity"
    // field, we will return that here.
    return new HashSet<Long>();
  }

  /** Currently always returns a new empty set. */
  Set<Long> getCatalyzedSubstratesUniform() {
    // see comment in getCatalyzedRxns for the function here
    // when we want to NLP/parse out the "catalysis activity"
    // field, we will return that here.
    return new HashSet<Long>();
  }

  /** Currently always returns a new empty set. */
  Set<Long> getCatalyzedProductsUniform() {
    // see comment in getCatalyzedRxns for the function here
    // when we want to NLP/parse out the "catalysis activity"
    // field, we will return that here.
    return new HashSet<Long>();
  }

  /** Currently always returns a new empty map. */
  HashMap<Long, Set<Long>> getCatalyzedRxnsToSubstrates() {
    // see comment in getCatalyzedRxns for the function here
    // when we want to NLP/parse out the "catalysis activity"
    // field, we will return that here.
    return new HashMap<Long, Set<Long>>();
  }

  /** Currently always returns a new empty map. */
  HashMap<Long, Set<Long>> getCatalyzedRxnsToProducts() {
    // see comment in getCatalyzedRxns for the function here
    // when we want to NLP/parse out the "catalysis activity"
    // field, we will return that here.
    return new HashMap<Long, Set<Long>>();
  }

  /** Currently always returns a new, empty {@code SAR}. */
  SAR getSar() {
    // sar is computed later; using "initdb infer_sar"
    // for now add the empty sar constraint set
    return new SAR();
  }

  /**
   * Collects the PubMed ids of this entry's journal-article citations.
   *
   * @return one {@code {"val": <pmid>, "src": "PMID"}} object per journal-article citation that
   *     carries a {@code PubMed} dbReference, in document order; empty if the entry has no
   *     {@code reference} field
   */
  List<JSONObject> getRefs() {
    // data.reference.[ {citation: {type: "journal article", dbReference.{id:, type:PubMed}, title:XYA } ... } .. ]
    List<JSONObject> pmidReferences = new ArrayList<>();

    // Mirror lookup_ref: an absent field means "nothing found", not an error.
    if (!this.data.has("reference")) {
      return pmidReferences;
    }

    JSONArray refs = possible_list(this.data.get("reference"));
    for (int i = 0; i < refs.length(); i++) {
      JSONObject citation = (JSONObject) ((JSONObject) refs.get(i)).get("citation");
      if (!citation.get("type").equals("journal article")) {
        continue;
      }

      String pmid = lookup_ref(citation, "PubMed");
      if (pmid == null) {
        continue;
      }

      JSONObject obj = new JSONObject();
      obj.put("val", pmid);
      obj.put("src", "PMID");
      pmidReferences.add(obj);
    }

    return pmidReferences;
  }

  /**
   * Returns the organism's {@code "NCBI Taxonomy"} dbReference id parsed as a long, or null if the
   * organism has no such reference.
   */
  Long getOrgId() {
    // data.organism.dbReference.{id: 9606, type: "NCBI Taxonomy"}
    String id = lookup_ref(this.data.get("organism"), "NCBI Taxonomy");
    if (id == null) {
      return null;
    }
    return Long.parseLong(id);
  }

  /** Returns the {@code content} field of the entry's {@code sequence} object. */
  String getSeq() {
    // data.sequence.content: "MSTAGKVIKCKAAV.."
    return (String) ((JSONObject) this.data.get("sequence")).get("content");
  }

  /**
   * Finds the first {@code dbReference} under {@code o} whose {@code type} equals {@code typ}.
   *
   * @param o a {@code JSONObject} that may contain a {@code dbReference} object or array
   * @param typ the reference type to look for
   * @return the matching reference's {@code id} as a string, or null if {@code o} has no
   *     {@code dbReference} or none of them has the requested type
   */
  private String lookup_ref(Object o, String typ) {
    // o.dbReference.{id: 9606, type: typ}
    // o.dbReference.[{id: x.x.x.x, type: typ}]
    JSONObject x = (JSONObject) o;
    if (!x.has("dbReference")) {
      return null;
    }

    JSONArray set = possible_list(x.get("dbReference"));
    for (int i = 0; i < set.length(); i++) {
      JSONObject entry = set.getJSONObject(i);
      if (typ.equals(entry.get("type"))) {
        // The id is stringified because the JSON value is not necessarily a String.
        return entry.get("id").toString();
      }
    }

    return null; // did not find the requested type; not_found indicated by null
  }

  /**
   * Normalizes a value that may be a single object or an array into an array.
   *
   * @param o a {@code JSONObject} or a {@code JSONArray}
   * @return {@code o} itself if it is an array, otherwise a new one-element array holding it
   * @throws IllegalStateException if {@code o} is neither a {@code JSONObject} nor a
   *     {@code JSONArray}
   */
  private JSONArray possible_list(Object o) {
    if (o instanceof JSONObject) {
      JSONArray single = new JSONArray();
      single.put(o);
      return single;
    }
    if (o instanceof JSONArray) {
      return (JSONArray) o;
    }
    // Fail with an exception rather than terminating the JVM, so callers can clean up and the
    // failure carries a stack trace plus the unexpected type.
    throw new IllegalStateException("Json object is neither an JSONObject nor a JSONArray: "
        + (o == null ? "null" : o.getClass().getName()));
  }

  /** Returns the entry's JSON, pretty-printed with a 2-space indent. */
  @Override
  public String toString() {
    return this.data.toString(2); // format it with 2 spaces
  }

  /** Callback invoked once for each entry produced by the streaming parser. */
  private static abstract class SwissProtEntryHandler {
    public abstract void handle(SwissProtEntry entry);
  }
}