/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer; import act.installer.sequence.GenBankEntry; import act.installer.sequence.SequenceEntry; import act.installer.sequence.SwissProtEntry; import act.server.MongoDB; import act.shared.Reaction; import act.shared.Seq; import act.shared.helpers.P; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import org.json.XML; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.net.URLEncoder; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; class AccID { Seq.AccDB db; String acc_num; AccID(Seq.AccDB db, String a) { this.db = db; this.acc_num = a; } @Override public String toString() { return this.db + ":" + this.acc_num; } @Override public int hashCode() { return this.db.hashCode() ^ this.acc_num.hashCode(); } @Override public boolean equals(Object other) { if (!(other instanceof AccID)) return false; AccID o = (AccID)other; return o.db == this.db && o.acc_num.equals(this.acc_num); } } public class SeqIdentMapper { boolean LOOK_FOR_EC_ORG_IN_NCBI_PROTEIN = false; boolean SEQ_RXN_MAP_USING_EC_ORG_PMID_TRIPLE = false; private MongoDB db; private static final int _debug_level = 1; // 0 = no log; 1 = only main stats; 2 = all public SeqIdentMapper(MongoDB db) { this.db = db; } public void map() { /* This one (connect_using_fingerprint), maps db.actfamilies entries to db.seq entries by keying both on * "EC ORG PMID", i.e., ec# organism and pubmed literature ref. * * This might be useful later, in the corner cases where BRENDA people did not put an accession number there, but * the same paper talking about the same ec# and organism is a pretty decent signal that the sequence is the right * one. Later, when we resurrect this, we should see how many links this makes (that weren't already made with the * explicit accession numbers specified by the BRENDA people). And for those extra links this makes, manually * examine how accurate those mappings are." */ if (SEQ_RXN_MAP_USING_EC_ORG_PMID_TRIPLE) { System.out.println("[MAP_SEQ] *** Phase 2: mapping using seq fingerprint"); connect_using_fingerprint(); } else { System.out.println("[MAP_SEQ] SKIPPING: mapping using seq fingerprint"); } /* This one (connect_using_ncbi_protein_ec_org_lookup) is the most broad search for the sequence. It uses the EC# * and Organism as the keys to do a web lookup in NCBI Protein for a protein that matches that. Those might hit * hypothetical inferred proteins, or annotated proteins. NCBI Protein's description says: "The Protein database is * a collection of sequences from several sources, including translations from annotated coding regions in GenBank, * RefSeq and TPA, as well as records from SwissProt, PIR, PRF, and PDB. Protein sequences are the fundamental * determinants of biological structure and function." * * In theory that sounds great, if you want are looking to assign a sequence (and if you are ok with the confidence * in the sequence being low). In practice 1) This function does a web lookup, and does not cache the results across * different installer runs, 2) We already have all of SwissProt in db.seq, and have downloaded genbank to the NAS * (and would integrate that into db.seq soon), and we will integrate PDB entries soon as well; RefSeq, TPA, PIR, * PRF need to be looked into. So once these are local, we should do a local lookup rather than going through the * NCBI web api. */ if (LOOK_FOR_EC_ORG_IN_NCBI_PROTEIN) { System.out.println("[MAP_SEQ] *** Phase 3: mapping using NCBI Protein ec# + org lookup"); connect_using_ncbi_protein_ec_org_lookup(); } else { System.out.println("[MAP_SEQ] SKIPPING: mapping using NCBI Protein ec# + org lookup"); } } private void connect_using_explicit_brenda_accession_annotation() { HashMap<Integer, Set<AccID>> rxnid2accession = new HashMap<Integer, Set<AccID>>(); HashMap<AccID, Integer> accession2seqid = new HashMap<AccID, Integer>(); double done, total; System.out.println("[MAP_SEQ] mapping all reactions to accession numbers"); List<Long> reactionids = db.getAllReactionUUIDs(); done = 0; total = reactionids.size(); for (Long uuid : reactionids) { Reaction r = db.getReactionFromUUID(uuid); Set<AccID> accessions = getAccessionNumbers(r.getReactionName()); if (accessions.size() > 0) rxnid2accession.put(r.getUUID(), accessions); System.out.format("[MAP_SEQ] Done: %.0f%%\r", (100*done++/total)); } System.out.println(); System.out.println("[MAP_SEQ] mapping all sequences to accession numbers"); List<Long> seqids = db.getAllSeqUUIDs(); done = 0; total = seqids.size(); for (Long seqid : seqids) { Seq s = db.getSeqFromID(seqid); for (String acc : s.getUniprotAccession()) accession2seqid.put(new AccID(s.getSrcdb(), acc), s.getUUID()); System.out.format("[MAP_SEQ] Done: %.0f%%\r", (100*done++/total)); } System.out.println(); System.out.println("[MAP_SEQ] resolving unmapped accessions from web api"); HashSet<AccID> from_web_lookup = new HashSet<AccID>(); for (int rxnid : rxnid2accession.keySet()) { System.out.println("Getting accessions in rxn: " + rxnid); Set<AccID> rxnaccessions = rxnid2accession.get(rxnid); System.out.println("Accessions: " + rxnaccessions); for (AccID rxnacc : rxnaccessions) { // first check if db.seq contains the mapping to sequence if (accession2seqid.containsKey(rxnacc)) continue; // ELSE: maybe it is unreviewed, i.e., from TrEMBL/EMBL, // we currently do not have that integrated (that is a 61.800GB) // we only have Swiss-Prot integrated (which was about 0.789GB) // TrEMBL entries: <entry dataset="TrEMBL" ...> // : E.g., http://www.uniprot.org/uniprot/Q7XYH5.xml) // SwissProt : <entry dataset="Swiss-Prot" ...> // : E.g., http://www.uniprot.org/uniprot/Q14DK4.xml) // Later we can keep a local copy of the 61GB TrEMBL, but for // now we just call the web api to retrieve the 2715 accessions // that we cannot locate in SwissProt // System.out.println("Did not find in db.seq. Doing web lookup: " + rxnacc); Set<SequenceEntry> apiget_entries = web_lookup(rxnacc); for (SequenceEntry apiget : apiget_entries) { // insert the newly retrieved data from the web api into db.seq int seqid = apiget.writeToDB(this.db, rxnacc.db); for (String acc_num : db.getSeqFromID(new Long(seqid)).getUniprotAccession()) { AccID ret_acc = new AccID(rxnacc.db, acc_num); // update the map of accession2seqid accession2seqid.put(ret_acc, seqid); from_web_lookup.add(ret_acc); } } } } HashMap<Integer, Set<AccID>> unmapped_rxns = new HashMap<Integer, Set<AccID>>(); for (Integer rid : rxnid2accession.keySet()) { Long rxnid = new Long(rid); for (AccID rxnacc : rxnid2accession.get(rid)) { // check if we have an AA sequence either db.seq if (!accession2seqid.containsKey(rxnacc)) { if (!unmapped_rxns.containsKey(rid)) unmapped_rxns.put(rid, new HashSet<AccID>()); unmapped_rxns.get(rid).add(rxnacc); continue; } Long seqid = new Long(accession2seqid.get(rxnacc)); // insert the mapping rxnid <-> seqid into the db addToDB(rxnid, seqid); } } if (_debug_level > 0) { Set<AccID> extractedAcc = new HashSet<AccID>(); for (Set<AccID> as : rxnid2accession.values()) extractedAcc.addAll(as); System.out.println("SwissProt: " + count_type(Seq.AccDB.swissprot , extractedAcc)); System.out.println("UniProt : " + count_type(Seq.AccDB.uniprot , extractedAcc)); System.out.println("TrEMBL : " + count_type(Seq.AccDB.trembl , extractedAcc)); System.out.println("EMBL : " + count_type(Seq.AccDB.embl , extractedAcc)); System.out.println("GenBank : " + count_type(Seq.AccDB.genbank , extractedAcc)); Set<String> no_map_for = new HashSet<String>(); for (Integer rid : unmapped_rxns.keySet()) no_map_for.add(rid + " -> " + unmapped_rxns.get(rid)); // not located in seq db, so no aa seq System.out.println(" Brenda Accessions that could not be resolved : " + no_map_for); System.out.println("|Breada Reactions that could not be resolved|: " + no_map_for.size()); System.out.println("|Accessions that were found using web lookup |: " + from_web_lookup.size()); Set<AccID> rxnSqs = new HashSet<AccID>(); for (Set<AccID> seqs : rxnid2accession.values()) rxnSqs.addAll(seqs); System.out.format("%d reactions have %d unique sequences\n", rxnid2accession.keySet().size(), rxnSqs.size()); System.out.format("%d swissprot entries\n", accession2seqid.keySet().size()); if (_debug_level > 1) { for (Integer rid: rxnid2accession.keySet()) System.out.format("rxnid(%s) -> %s\n", rid, rxnid2accession.get(rid)); System.out.println("Swissprot accessions: " + accession2seqid.keySet()); } } } private int count_type(Seq.AccDB db, Set<AccID> ids) { int c = 0; for (AccID a : ids) if (db == a.db) c++; return c; } private Set<SequenceEntry> web_lookup(AccID acc) { Set<SequenceEntry> entries = new HashSet<SequenceEntry>(); try { switch (acc.db) { case swissprot: // fallthrough case uniprot: // fallthrough case embl: // fallthrough case trembl: String api_xml = web_uniprot(acc.acc_num); entries = SwissProtEntry.parsePossiblyMany(api_xml); break; case genbank: String try_uniprot = web_uniprot(acc.acc_num); if (!try_uniprot.equals("")) { api_xml = try_uniprot; entries = SwissProtEntry.parsePossiblyMany(api_xml); } else { api_xml = web_genbank(acc.acc_num); entries = GenBankEntry.parsePossiblyMany(api_xml); } break; default: System.out.println("Unrecognized AccDB = " + acc.db); System.exit(-1); return null; } if (entries.size() > 1) { // System.out.println("Multiple entries: " + entries); System.out.println("XML from api call returned > 1 entry"); // System.console().readLine(); } } catch (IOException e) { // TODO: do better (propagate upwards probably). System.err.println("Caught IOException when attempting to look up accession number " + acc.acc_num + " in " + acc.db); e.printStackTrace(System.err); } return entries; } private String web_uniprot(String accession) { String url = "http://www.uniprot.org/uniprot/" + accession + ".xml"; String idtag = accession; String xml = api_get(url, new String[] { idtag }); System.out.println("API GET (UniProt): " + accession + " " + (!xml.equals("")?"success":"fail")); return xml; } private String web_genbank(String accession) { String url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=" + accession + "&rettype=native&retmode=xml"; // retmode can also be json // documentation for eutils: http://www.ncbi.nlm.nih.gov/books/NBK25499/ String idtag = accession; String xml = api_get(url, new String[] { idtag }); System.out.println("API GET (GenBank): " + accession + " " + (!xml.equals("")?"success":"fail")); return xml; } private String web_ncbiprotein(long id) { String url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=" + id + "&rettype=native&retmode=xml"; // documentation for eutils: http://www.ncbi.nlm.nih.gov/books/NBK25499/ String xml = api_get(url, new String[] { id + "" }); System.out.println("API GET (NCBI Protein Genbank): " + id + " " + (!xml.equals("")?"success":"fail")); return xml; } private String web_ncbi(String ec, String organism) { String query = ec + "[EC/RN Number] AND " + organism + "[Primary Organism]"; String url = null; try { url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=" + URLEncoder.encode(query, "UTF-8") + "&rettype=native&retmode=xml"; } catch (Exception e) { System.out.println("[NCBI search] Could not encode query to url: " + query); } // documentation for eutils: http://www.ncbi.nlm.nih.gov/books/NBK25499/ String xml = api_get(url, new String[] { "<Id>" }); System.out.println("API GET (NCBI): " + ec + "/" + organism + " " + (!xml.equals("")?"success":"fail")); return xml; } private String api_get(String url, String[] should_contain) { String response = ""; try { System.out.println("\t GET: " + url); InputStream resp = new URL(url).openStream(); BufferedReader br = new BufferedReader(new InputStreamReader(resp)); String line; int lno = 0; while ((line = br.readLine())!=null) { response += line + "\n"; if (lno++ > 5000) { // receiving more than 5k lines => probably means // the accession is for the entire genome; abandon System.out.println("[MAP_SEQ] >5k lines read. Abondoning fetch. " + url + "Cause: We use rettype=native, instead of rettype=fasta. Use fasta for just the seq. Returned XML is formatted different, so GenBankEntry changes needed. See parsePossiblyMany there."); response = ""; break; } } resp.close(); } catch (Exception e) {} for (String test : should_contain) { if (!response.contains(test)) { // System.out.format("Failed to find [%s] in xml: %s\n", test, response.substring(0, Math.min(400, response.length()))); return ""; // failed test, unexpected response } } return response; } private String word_before(String buffer, int anchor_index) { int end = buffer.lastIndexOf(' ', anchor_index - 1); int start = buffer.lastIndexOf(' ', end - 1); String word = buffer.substring(start, end).trim(); return word; } private void add_words_before(Seq.AccDB suffix, String buffer, int start_at, Set<AccID> accumulator) { // make sure that the suffix is by itself, and not within a word // e.g., we were crashing earlier because we matched // " {Homo sapiens} adenovirus 100K assembly protein + H2O -?> ?" // and were extracting '100K ass' from it because EMBL is a // substring of assEMBLy String pattern = " " + suffix.name().toUpperCase() + " "; int added = 0; int idx = buffer.indexOf(pattern, start_at); if (idx == -1) return; // if no occurance found, return Set<AccID> accs_list = new HashSet<AccID>(); // match of suffix at idx, check the word that appears before it String word = word_before(buffer, idx); accs_list.add(new AccID(suffix, word)); // check if the prefix is a "and" list, e.g., "Kalanchoe pinnata Q33557 and Q43746 and P10797 UniProt" int list_idx = idx; while(true) { list_idx = list_idx - word.length() - 1; String preword = word_before(buffer, list_idx); if (preword.equals("AND")) { list_idx -= 4; // move backwards for the matched "AND " word = word_before(buffer, list_idx); accs_list.add(new AccID(suffix, word)); } else { break; } } // update the cummulative accession list accumulator.addAll(accs_list); if (_debug_level > 1) { System.out.format("Accession refs found: %s: %s\n", suffix, accs_list); // System.out.format("\tFrom sentence: %s\n\tParsed: %s\n", buffer, accs_list); } // recurse to after where the current suffix was found add_words_before(suffix, buffer, idx + pattern.length(), accumulator); return; } private Set<String> extract6LetterWords(String desc) { // six character; last character is 0-9 String regex = " ([A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][0-9]) "; Pattern r = Pattern.compile(regex); Matcher m = r.matcher(desc); Set<String> matches = new HashSet<String>(); while (m.find()) { String extracted = m.group(1); // desc.substring(m.start(), m.end()); matches.add(extracted); } return matches; } private Set<AccID> getAccessionNumbers(String desc) { Set<AccID> accs = new HashSet<AccID>(); // search for strings such as // " Q8TZI9 UniProt" // " P42527 SwissProt" // " Q18NX4 TrEMBL" -- unreviewed // " O70151 GenBank" // " Q9RLV9 EMBL" // add_words_before adds to the set of accessions "accs" and returns the delta count add_words_before(Seq.AccDB.swissprot, desc.toUpperCase(), 0, accs); add_words_before(Seq.AccDB.uniprot , desc.toUpperCase(), 0, accs); add_words_before(Seq.AccDB.trembl , desc.toUpperCase(), 0, accs); add_words_before(Seq.AccDB.embl , desc.toUpperCase(), 0, accs); add_words_before(Seq.AccDB.genbank , desc.toUpperCase(), 0, accs); if (_debug_level > 1) { Set<String> candidates = extract6LetterWords(desc); candidates.removeAll(accs); if (candidates.size() > 0) { System.out.println(); System.out.println("From reaction string : " + desc); System.out.println("Candidates not extracted: " + candidates); } } return accs; } private void connect_using_fingerprint() { // Map of rxn_id -> sequence fingerprint HashMap<Long, Set<SeqFingerPrint>> rxnIdent = new HashMap<Long, Set<SeqFingerPrint>>(); // Map of seq_id -> sequence fingerprint HashMap<Long, Set<SeqFingerPrint>> seqIdent = new HashMap<Long, Set<SeqFingerPrint>>(); double done, total; // take entries from db.actfamilies // map them to (ref_set, org_set, ec) // if (ref, org, ec) matches an entry in db.seq // map that sequence to the actfamilies entry System.out.println("[MAP_SEQ] mapping reactions -> (ec, org, pmid)"); // Populate rxnIdent List<Long> reactionids = db.getAllReactionUUIDs(); done = 0; total = reactionids.size(); for (Long uuid : reactionids) { Reaction r = db.getReactionFromUUID(uuid); Set<SeqFingerPrint> si = SeqFingerPrint.createFrom(r, db); rxnIdent.put(uuid, si); System.out.format("[MAP_SEQ] Done: %.0f%%\r", (100*done++/total)); } System.out.println(); System.out.println("[MAP_SEQ] mapping sequences -> (ec, org, pmid)"); // Populate seqIdent List<Long> seqids = db.getAllSeqUUIDs(); done = 0; total = seqids.size(); for (Long seqid : seqids) { Seq s = db.getSeqFromID(seqid); Set<SeqFingerPrint> si = SeqFingerPrint.createFrom(s); seqIdent.put(seqid, si); System.out.format("[MAP_SEQ] Done: %.0f%%\r", (100*done++/total)); } System.out.println(); // SeqIndent holds the (ref, org, ec) -> inferReln find connections Set<P<Long, Long>> rxn2seq = SeqFingerPrint.inferReln(rxnIdent, seqIdent); // for each pair (rxnid, seqid) in rxn2seq // insert the mapping rxnid <-> seqid into the db for (P<Long, Long> r2s : rxn2seq) addToDB(r2s.fst(), r2s.snd()); System.out.format("Found SwissProt sequences for %d rxns\n", rxn2seq.size()); System.out.format(" using exact matches: ref:%s, org:%s, ec:%s between db.actfamilies and db.seq\n", SeqFingerPrint.track_ref, SeqFingerPrint.track_org, SeqFingerPrint.track_ec); } class SequenceCache { String ec; String org; } private void connect_using_ncbi_protein_ec_org_lookup() { double done, total; System.out.println("[MAP_SEQ] NCBI EC+Org Lookup: installing seq <> rxn map"); // read cache that is map "ec + org" -> Set(SequenceEntry) Map<String, Set<SequenceEntry>> cache = readCachedSeqs(); List<Long> reactionids = db.getAllReactionUUIDs(); done = 0; total = reactionids.size(); for (Long uuid : reactionids) { Reaction r = db.getReactionFromUUID(uuid); Set<String> organisms = organismsForRxn(r); // now lookup the sequence mapping using ec# and these organisms try { // this can throw an exception if the data cannot be serialized // to the DB. in that case just ignore and continue to the next ncbi_protein_ec_org_lookup(uuid, r.getECNum(), organisms, cache); } catch (Exception e) {} System.out.format("[MAP_SEQ] Done: %.0f%% (%.0f/%.0f)\n", (100*done++/total), done, total); } System.out.println(); } Set<String> organismsForRxn(Reaction r) { // OLD way of extracting organisms from Reaction does not work anymore // we have changed the act.shared.Reaction System.err.println("act.installer.SeqIdentMapper: ABORT"); System.err.println("act.shared.Reaction has changed, and"); System.err.println("organisms are not as directly within the Reaction object"); System.exit(-1); return new HashSet<String>(); } Map<String, Set<SequenceEntry>> readCachedSeqs() { Map<String, Set<SequenceEntry>> cache = new HashMap<String, Set<SequenceEntry>>(); // TODO: WE NEED TO READ the cache from disk... // or from a local dump of NCBI protein... System.out.println("Resolving from NCBI protein is not fully implemented. TODO: (a) either read from Protein DB dump, (b) Read from web as code does, but write to disk so that we do not fetch in every run. Aborting."); System.exit(-1); return cache; } private String cacheId(String ec, String org) { return ec + " + " + org; } private void ncbi_protein_ec_org_lookup(Long rxnid, String ec, Set<String> organisms, Map<String, Set<SequenceEntry>> cache) { Seq.AccDB ncbidb = Seq.AccDB.ncbi_protein; Set<SequenceEntry> entries; Set<SequenceEntry> apiget_entries = new HashSet<SequenceEntry>(); for (String org : organisms) { // check if this "ec + org" is already in the cache String cacheid = cacheId(ec, org); if (cache.containsKey(cacheid)) { entries = cache.get(cacheid); } else { String api_xml = web_ncbi(ec, org); if (!api_xml.isEmpty()) { // process the xml and get Set(SequenceEntry) out entries = genbankEntriesFromSearchRslts(api_xml); } else { // no xml or invalid xml returned, send out an empty hashmap entries = new HashSet<SequenceEntry>(); } } apiget_entries.addAll(entries); System.out.println("[NCBI]\t" + ec + "\t" + org + "\t" + entries); } // TODO: WE NEED TO WRITE the fetched entries to disk cache... // See readCachedSeqs ABOVE! System.out.println("Resolving from NCBI protein is not fully implemented. TODO: (a) either read from Protein DB dump, (b) Read from web as code does, but write to disk so that we do not fetch in every run. Aborting."); System.exit(-1); for (SequenceEntry apiget : apiget_entries) { // insert the newly retrieved data from the web api into db.seq long seqid = apiget.writeToDB(this.db, ncbidb); // insert the mapping rxnid <-> seqid into the db addToDB(rxnid, seqid); System.out.println("Mapped rxn<>db.seq: " + rxnid + " <> " + seqid); } } private void addToDB(Long rxnid, Long seqid) { // The function below is deprecated now. // Instead the link between a rxn <> seq goes through // (organism, ec#) which identifies a sequence in db.seq // so a function like below addToDB(rxnid, seqid, orgid) // is more appropriate // db.addSeqRefToReactions(rxnid, seqid); } private void addToDB(Long rxnid, Long seqid, Long orgid) { // Need to call appropriate function in MongoDB } private Set<String> extractOrganisms(String desc) { // You can find all organisms referenced in brenda easy_desc fields using: // mongo localhost/actv01 --eval "rxns=db.actfamilies.find({},{easy_desc:1}); rxns.forEach(function (r) { print(r.easy_desc); });" > all_rxns.txt // cat all_rxns.txt | grep "{" | grep -v BiochemicalReaction | sed 's/^ *{\(.*\)} .*/\1/' | tr ',' '\n' | sort | uniq | cut -f1-2 -d ' ' Set<String> organisms = new HashSet<String>(); if (desc.contains("BiochemicalReaction")) return organisms; int start = desc.indexOf('{'); int end = desc.indexOf('}', start); if (start == -1 || end == -1 || !desc.substring(0, start).trim().isEmpty()) return organisms; String org_str = desc.substring(start + 1, end); String[] orgs = org_str.split(","); for (String org : orgs) { String org_name = genus_species(org.trim().split(" ")); if (org_name != null) organisms.add(org_name); } return organisms; } private String genus_species(String[] org_words) { if (org_words[0].equals("unidentified") || org_words[0].equals("uncultured") || org_words[0].equals("null")) return null; if ( org_words[0].equals("synthetic") && org_words[1].equals("construct") ) return null; if ( org_words[0].equals("soil") && org_words[1].equals("organism") ) return null; if ( org_words[0].equals("soil") && org_words[1].equals("bacterium") ) return null; if ( org_words[0].equals("acetic") && org_words[1].equals("acid") ) return null; if ( org_words.length == 1 && org_words[0].equals("artificial") ) return null; if (org_words[0].equals("yeast")) return "Saccharomyces"; if (org_words.length == 1) // only the genus specified, query just that return org_words[0]; else if (org_words.length == 2 && (org_words[1].equals("sp") || org_words[1].equals("sp."))) // when the second is the generic "species" short form just query the genus return org_words[0]; else // everything looks fine; return 1st word genus and 2nd species return org_words[0] + org_words[1]; } public Set<SequenceEntry> genbankEntriesFromSearchRslts(String ncbi_xml) { Set<SequenceEntry> all_entries = new HashSet<SequenceEntry>(); try { // example structure of this object "jo" is after this fn. JSONObject jo = XML.toJSONObject(ncbi_xml); // System.out.println("RECEIVED\n*******\n" + jo.toString(4) + "\n*******\n"); JSONObject main = jo.getJSONObject("eSearchResult"); int count = main.getInt("Count"); if (count > 0) { // found some hits. their Id are under parsed.IdList.Id Object ids = main.getJSONObject("IdList").get("Id"); JSONArray id_list; // parsed could be an array if more than one hit, or object // so wrap it into an array if required if (ids instanceof JSONArray) id_list = (JSONArray)ids; else id_list = new JSONArray(new Long[] { (Long)ids }); for (int i = 0; i < id_list.length(); i++) { long entry_id = id_list.getLong(i); try { String genbank_xml = web_ncbiprotein(entry_id); // returns in Genbank xml format... parsed through GenbankEntry Set<SequenceEntry> entries = GenBankEntry.parsePossiblyMany(genbank_xml); all_entries.addAll(entries); } catch (JSONException je) { } } } } catch (JSONException je) { System.out.println("Failed NCBI Protein Entry parse: " + je.toString() + " XML: " + ncbi_xml); } return all_entries; } } class SeqFingerPrint { public static boolean track_ref = true; public static boolean track_ec = true; public static boolean track_org = true; String ec, org, ref; SeqFingerPrint(String e, String o, String r) { this.ref = track_ref ? r : ""; this.ec = track_ec ? e : ""; this.org = track_org ? o : ""; } public static Set<SeqFingerPrint> expansion(String ec, List<String> orgs_e, List<String> refs_e) { Set<SeqFingerPrint> ident = new HashSet<SeqFingerPrint>(); // if we are not tracking something (e.g., ref, or org) then that field will be singleton // this way, we wont ignore the rest of the data. e.g., if ref.isEmpty and !track_ref Set<String> filler = new HashSet<String>(); filler.add(""); Set<String> refs = !track_ref ? filler : new HashSet<String>(refs_e); Set<String> orgs = !track_org ? filler : new HashSet<String>(orgs_e); for (String ref : refs) for (String org : orgs) ident.add(new SeqFingerPrint(ec, org, ref)); return ident; } public static Set<SeqFingerPrint> createFrom(Reaction r, MongoDB db) { String ec = r.getECNum(); Long[] orgids = new Long[0]; // r.getOrganismIDs(); System.err.println("act.installer.SeqIdentMapper: ABORT"); System.err.println("act.shared.Reaction has changed, and"); System.err.println("organismIDs are not as directly within the Reaction object"); System.exit(-1); List<String> orgs = new ArrayList<String>(); // translate orgids to org_names for (Long oid : orgids) orgs.add(db.getOrganismNameFromId(oid)); List<String> pmids = new ArrayList<String>(r.getReferences(Reaction.RefDataSource.PMID)); return expansion(ec, orgs, pmids); } public static Set<SeqFingerPrint> createFrom(Seq s) { String ec = s.getEc(); String org = s.getOrgName(); List<String> orgs = new ArrayList<String>(); orgs.add(org); /* The structure of get_references was changed from List<String> to List<JSONObject>. This loop converts PMID JSONObjects back into a List<String> of PMIDs. This allows for the expansion call to occur without error */ List<String> references = new ArrayList<>(); for (JSONObject obj : s.getReferences()) { if (obj.get("src").equals("PMID")) { references.add((String) obj.get("val")); } } return expansion(ec, orgs, references); } public static <I> Set<P<I,I>> inferReln(HashMap<I, Set<SeqFingerPrint>> A, HashMap<I, Set<SeqFingerPrint>> B) { HashSet<P<I,I>> reln = new HashSet<P<I, I>>(); System.out.println("[MAP_SEQ] Intersecting maps of reactions and sequences)"); // inverting the hashmaps gets to a O(n) intersection // algorithm, as opposed to O(n^2) otherwise HashMap<SeqFingerPrint, Set<I>> A_inv = invert_map(A); HashMap<SeqFingerPrint, Set<I>> B_inv = invert_map(B); double total = A_inv.size(), done = 0; for (SeqFingerPrint a : A_inv.keySet()) { System.out.format("Done: %.2f%%\r", 100*(done++/total)); if (!B_inv.containsKey(a)) continue; // shared fingerprint found. means for each of the I a, and I b // that shared this in their original mapped sets, we have a -> b for (I a_key : A_inv.get(a)) for (I b_key : B_inv.get(a)) reln.add(new P<I, I>(a_key, b_key)); } System.out.println(); // System.out.println("Performance bug: This is an O(n^2) older version of the above"); // total = A.size() * B.size(); done = 0; // for (I a_key : A.keySet()) { // for (I b_key : B.keySet()) { // System.out.format("Done: %.2f%%\r", 100*(done++/total)); // if (! intersect(A.get(a_key), B.get(b_key)).isEmpty()) { // reln.add(new P<I, I>(a_key, b_key)); // } // } // } // System.out.println(); return reln; } public static <I, X> HashMap<X, Set<I>> invert_map(HashMap<I, Set<X>> map) { HashMap<X, Set<I>> inverted = new HashMap<X, Set<I>>(); for (I i : map.keySet()) { for (X x : map.get(i)) { if (!inverted.containsKey(x)) inverted.put(x, new HashSet<I>()); inverted.get(x).add(i); } } return inverted; } public static <X> Set<X> intersect(Set<X> set1, Set<X> set2) { boolean set1IsLarger = set1.size() > set2.size(); Set<X> cloneSet = new HashSet<X>(set1IsLarger ? set2 : set1); cloneSet.retainAll(set1IsLarger ? set1 : set2); return cloneSet; } @Override public boolean equals(Object o) { if (!(o instanceof SeqFingerPrint)) return false; SeqFingerPrint that = (SeqFingerPrint)o; // we dont want to assign two fingerprints as equal if one of them is null if (this.ref == null || this.ec == null || this.org == null) return false; return this.ref.equals(that.ref) && this.ec.equals(that.ec) && this.org.equals(that.org); } @Override public int hashCode() { int hash = "magic".hashCode(); if (this.ref != null) hash ^= this.ref.hashCode(); if (this.ec != null) hash ^= this.ec.hashCode(); if (this.org != null) hash ^= this.org.hashCode(); return hash; } @Override public String toString() { List<String> data = new ArrayList<String>(); List<String> not_tracking = new ArrayList<String>(); if (track_ref) data.add(this.ref); else not_tracking.add("ref"); if (track_ec) data.add(this.ec); else not_tracking.add("ec"); if (track_org) data.add(this.org); else not_tracking.add("org"); String mode = ""; if (!track_ref || !track_ec || !track_org) mode = " not tracking" + not_tracking; return data + mode; } } /* --- output of: --- NCBI protein query 1.2.1.50[EC/RN Number] AND Photobacterium leiognathi[Primary Organism] converted to json { "eSearchResult": { "Count": "2", "RetMax": "2", "RetStart": "0", "IdList": { "Id": [ "547874", "126514" ] }, "TranslationStack": { "TermSet": [ { "Term": "1.2.1.50[EC/RN Number]", "Field": "EC/RN Number", "Count": "111", "Explode": "N" }, { "Term": "Photobacterium leiognathi[Primary Organism]", "Field": "Primary Organism", "Count": "16377", "Explode": "Y" } ], "OP": "AND" }, "QueryTranslation": "1.2.1.50[EC/RN Number] AND Photobacterium leiognathi[Primary Organism]" } } */ /* --- output of: --- curl -s "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=547874&rettype=native&retmode=xml" <?xml version="1.0"?> <!DOCTYPE Bioseq-set PUBLIC "-//NCBI//NCBI Seqset/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_Seqset.dtd"> <Bioseq-set> <Bioseq-set_seq-set> <Seq-entry> <Seq-entry_seq> <Bioseq> <Bioseq_id> <Seq-id> <Seq-id_swissprot> <Textseq-id> <Textseq-id_name>LUXC1_PHOLE</Textseq-id_name> <Textseq-id_accession>Q03324</Textseq-id_accession> <Textseq-id_release>reviewed</Textseq-id_release> <Textseq-id_version>1</Textseq-id_version> </Textseq-id> </Seq-id_swissprot> </Seq-id> <Seq-id> <Seq-id_gi>547874</Seq-id_gi> </Seq-id> </Bioseq_id> <Bioseq_descr> <Seq-descr> <Seqdesc> <Seqdesc_title>RecName: Full=Acyl-CoA reductase</Seqdesc_title> </Seqdesc> <Seqdesc> <Seqdesc_source> <BioSource> <BioSource_org> <Org-ref> <Org-ref_taxname>Photobacterium leiognathi</Org-ref_taxname> <Org-ref_db> <Dbtag> <Dbtag_db>taxon</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_id>553611</Object-id_id> </Object-id> </Dbtag_tag> </Dbtag> </Org-ref_db> <Org-ref_orgname> <OrgName> <OrgName_name> <OrgName_name_binomial> <BinomialOrgName> <BinomialOrgName_genus>Photobacterium</BinomialOrgName_genus> <BinomialOrgName_species>leiognathi</BinomialOrgName_species> </BinomialOrgName> </OrgName_name_binomial> </OrgName_name> <OrgName_lineage>Bacteria; Proteobacteria; Gammaproteobacteria; Vibrionales; Vibrionaceae; Photobacterium</OrgName_lineage> <OrgName_gcode>11</OrgName_gcode> <OrgName_div>BCT</OrgName_div> </OrgName> </Org-ref_orgname> </Org-ref> </BioSource_org> </BioSource> </Seqdesc_source> </Seqdesc> <Seqdesc> <Seqdesc_molinfo> <MolInfo> <MolInfo_biomol value="peptide">8</MolInfo_biomol> <MolInfo_completeness value="complete">1</MolInfo_completeness> </MolInfo> </Seqdesc_molinfo> </Seqdesc> <Seqdesc> <Seqdesc_pub> <Pubdesc> <Pubdesc_pub> <Pub-equiv> <Pub> <Pub_gen> <Cit-gen> <Cit-gen_serial-number>1</Cit-gen_serial-number> </Cit-gen> </Pub_gen> </Pub> <Pub> <Pub_pmid> <PubMedId>8447834</PubMedId> </Pub_pmid> </Pub> <Pub> <Pub_article> <Cit-art> <Cit-art_title> <Title> <Title_E> <Title_E_name>Nucleotide sequence of the luxC gene encoding fatty acid reductase of the lux operon from Photobacterium leiognathi.</Title_E_name> </Title_E> </Title> </Cit-art_title> <Cit-art_authors> <Auth-list> <Auth-list_names> <Auth-list_names_std> <Author> <Author_name> <Person-id> <Person-id_name> <Name-std> <Name-std_last>Lin</Name-std_last> <Name-std_initials>J.W.</Name-std_initials> </Name-std> </Person-id_name> </Person-id> </Author_name> <Author_affil> <Affil> <Affil_str>Institute of Molecular Biology and Agricultural Biotechnology Laboratories, National Chung Hsing University, Taichung, Taiwan, R.O.C.</Affil_str> </Affil> </Author_affil> </Author> <Author> <Author_name> <Person-id> <Person-id_name> <Name-std> <Name-std_last>Chao</Name-std_last> <Name-std_initials>Y.F.</Name-std_initials> </Name-std> </Person-id_name> </Person-id> </Author_name> </Author> <Author> <Author_name> <Person-id> <Person-id_name> <Name-std> <Name-std_last>Weng</Name-std_last> <Name-std_initials>S.F.</Name-std_initials> </Name-std> </Person-id_name> </Person-id> </Author_name> </Author> </Auth-list_names_std> </Auth-list_names> </Auth-list> </Cit-art_authors> <Cit-art_from> <Cit-art_from_journal> <Cit-jour> <Cit-jour_title> <Title> <Title_E> <Title_E_iso-jta>Biochem. Biophys. Res. Commun.</Title_E_iso-jta> </Title_E> <Title_E> <Title_E_ml-jta>Biochem Biophys Res Commun</Title_E_ml-jta> </Title_E> <Title_E> <Title_E_issn>0006-291X</Title_E_issn> </Title_E> <Title_E> <Title_E_name>Biochemical and biophysical research communications</Title_E_name> </Title_E> </Title> </Cit-jour_title> <Cit-jour_imp> <Imprint> <Imprint_date> <Date> <Date_std> <Date-std> <Date-std_year>1993</Date-std_year> <Date-std_month>2</Date-std_month> <Date-std_day>26</Date-std_day> </Date-std> </Date_std> </Date> </Imprint_date> <Imprint_volume>191</Imprint_volume> <Imprint_issue>1</Imprint_issue> <Imprint_pages>314-318</Imprint_pages> <Imprint_language>eng</Imprint_language> <Imprint_pubstatus> <PubStatus value="ppublish">4</PubStatus> </Imprint_pubstatus> <Imprint_history> <PubStatusDateSet> <PubStatusDate> <PubStatusDate_pubstatus> <PubStatus value="pubmed">8</PubStatus> </PubStatusDate_pubstatus> <PubStatusDate_date> <Date> <Date_std> <Date-std> <Date-std_year>1993</Date-std_year> <Date-std_month>2</Date-std_month> <Date-std_day>26</Date-std_day> </Date-std> </Date_std> </Date> </PubStatusDate_date> </PubStatusDate> <PubStatusDate> <PubStatusDate_pubstatus> <PubStatus value="medline">12</PubStatus> </PubStatusDate_pubstatus> <PubStatusDate_date> <Date> <Date_std> <Date-std> <Date-std_year>1993</Date-std_year> <Date-std_month>2</Date-std_month> <Date-std_day>26</Date-std_day> <Date-std_hour>0</Date-std_hour> <Date-std_minute>1</Date-std_minute> </Date-std> </Date_std> </Date> </PubStatusDate_date> </PubStatusDate> <PubStatusDate> <PubStatusDate_pubstatus> <PubStatus value="other">255</PubStatus> </PubStatusDate_pubstatus> <PubStatusDate_date> <Date> <Date_std> <Date-std> <Date-std_year>1993</Date-std_year> <Date-std_month>2</Date-std_month> <Date-std_day>26</Date-std_day> <Date-std_hour>0</Date-std_hour> <Date-std_minute>0</Date-std_minute> </Date-std> </Date_std> </Date> </PubStatusDate_date> </PubStatusDate> </PubStatusDateSet> </Imprint_history> </Imprint> </Cit-jour_imp> </Cit-jour> </Cit-art_from_journal> </Cit-art_from> <Cit-art_ids> <ArticleIdSet> <ArticleId> <ArticleId_pubmed> <PubMedId>8447834</PubMedId> </ArticleId_pubmed> </ArticleId> <ArticleId> <ArticleId_pii> <PII>S0006-291X(83)71219-2</PII> </ArticleId_pii> </ArticleId> <ArticleId> <ArticleId_doi> <DOI>10.1006/bbrc.1993.1219</DOI> </ArticleId_doi> </ArticleId> </ArticleIdSet> </Cit-art_ids> </Cit-art> </Pub_article> </Pub> </Pub-equiv> </Pubdesc_pub> <Pubdesc_comment>NUCLEOTIDE SEQUENCE [GENOMIC DNA].;~STRAIN=741</Pubdesc_comment> </Pubdesc> </Seqdesc_pub> </Seqdesc> <Seqdesc> <Seqdesc_comment>[FUNCTION] LuxC is the fatty acid reductase enzyme responsible for synthesis of the aldehyde substrate for the luminescent reaction catalyzed by luciferase.</Seqdesc_comment> </Seqdesc> <Seqdesc> <Seqdesc_comment>[CATALYTIC ACTIVITY] A long-chain aldehyde + CoA + NADP(+) = a long-chain acyl-CoA + NADPH.</Seqdesc_comment> </Seqdesc> <Seqdesc> <Seqdesc_comment>[PATHWAY] Lipid metabolism; fatty acid reduction for biolumincescence.</Seqdesc_comment> </Seqdesc> <Seqdesc> <Seqdesc_comment>[SIMILARITY] Belongs to the LuxC family. {ECO:0000305}.</Seqdesc_comment> </Seqdesc> <Seqdesc> <Seqdesc_sp> <SP-block> <SP-block_class value="standard"/> <SP-block_seqref> <Seq-id> <Seq-id_gi>45566</Seq-id_gi> </Seq-id> <Seq-id> <Seq-id_gi>45567</Seq-id_gi> </Seq-id> <Seq-id> <Seq-id_gi>419592</Seq-id_gi> </Seq-id> </SP-block_seqref> <SP-block_dbref> <Dbtag> <Dbtag_db>ProteinModelPortal</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>Q03324</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> <Dbtag> <Dbtag_db>UniPathway</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>UPA00569</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> <Dbtag> <Dbtag_db>GO</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>GO:0003995</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> <Dbtag> <Dbtag_db>GO</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>GO:0050062</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> <Dbtag> <Dbtag_db>GO</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>GO:0008218</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> <Dbtag> <Dbtag_db>Gene3D</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>3.40.605.10</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> <Dbtag> <Dbtag_db>InterPro</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>IPR008670</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> <Dbtag> <Dbtag_db>InterPro</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>IPR016161</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> <Dbtag> <Dbtag_db>InterPro</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>IPR016162</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> <Dbtag> <Dbtag_db>Pfam</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>PF05893</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> <Dbtag> <Dbtag_db>PIRSF</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>PIRSF009414</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> <Dbtag> <Dbtag_db>SUPFAM</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_str>SSF53720</Object-id_str> </Object-id> </Dbtag_tag> </Dbtag> </SP-block_dbref> <SP-block_keywords> <SP-block_keywords_E>Luminescence</SP-block_keywords_E> <SP-block_keywords_E>NADP</SP-block_keywords_E> <SP-block_keywords_E>Oxidoreductase</SP-block_keywords_E> </SP-block_keywords> <SP-block_created> <Date> <Date_std> <Date-std> <Date-std_year>1994</Date-std_year> <Date-std_month>6</Date-std_month> <Date-std_day>1</Date-std_day> </Date-std> </Date_std> </Date> </SP-block_created> <SP-block_sequpd> <Date> <Date_std> <Date-std> <Date-std_year>1994</Date-std_year> <Date-std_month>6</Date-std_month> <Date-std_day>1</Date-std_day> </Date-std> </Date_std> </Date> </SP-block_sequpd> <SP-block_annotupd> <Date> <Date_std> <Date-std> <Date-std_year>2014</Date-std_year> <Date-std_month>10</Date-std_month> <Date-std_day>1</Date-std_day> </Date-std> </Date_std> </Date> </SP-block_annotupd> </SP-block> </Seqdesc_sp> </Seqdesc> <Seqdesc> <Seqdesc_create-date> <Date> <Date_std> <Date-std> <Date-std_year>1994</Date-std_year> <Date-std_month>6</Date-std_month> <Date-std_day>1</Date-std_day> </Date-std> </Date_std> </Date> </Seqdesc_create-date> </Seqdesc> <Seqdesc> <Seqdesc_update-date> <Date> <Date_std> <Date-std> <Date-std_year>2014</Date-std_year> <Date-std_month>10</Date-std_month> <Date-std_day>1</Date-std_day> </Date-std> </Date_std> </Date> </Seqdesc_update-date> </Seqdesc> </Seq-descr> </Bioseq_descr> <Bioseq_inst> <Seq-inst> <Seq-inst_repr value="raw"/> <Seq-inst_mol value="aa"/> <Seq-inst_length>478</Seq-inst_length> <Seq-inst_seq-data> <Seq-data> <Seq-data_iupacaa> <IUPACaa>MIKKIPLIIGGEVQDTSEHDVRELTLNNNTVNVPIITDKDAESITSLKIENKLNINQIVNFLYTVGQKWKSENYSRRLTYIRDLVKFMGYSPEMAKLEANWISMILCSKSALYDIVENDLSSRHIVDEWLPQGDCYVKALPKGKSIHLLAGNVPLSGVTSILRAILTKNECIIKTSSADPFTATALASSFIDTDANHPITRSMSVMYWSHNEDITIPQKIMNCADVVVAWGGNDAIKWATKHSPAHVDILKFGPKKSISIVDNPTDIKAAAIGVAHDICFYDQQACFSTQDIYYMGDKLDVFFDELTKQLNIYKVILPKGDQSFDEKGAFSLTERECLFAKYKVQKGEEQAWLLTQSPAGTFGNQPLSRSAYIHHVNDISEITPYIQNDITQTVSITPWEASFKYRDTLASHGAERIIESGMNNIFRVGGAHDGMRPLQRLVKYISHERPSTYTTKDVAVKIEQTRYLEEDKFLVFVP</IUPACaa> </Seq-data_iupacaa> </Seq-data> </Seq-inst_seq-data> <Seq-inst_hist> <Seq-hist> <Seq-hist_replaces> <Seq-hist-rec> <Seq-hist-rec_date> <Date> <Date_std> <Date-std> <Date-std_year>2005</Date-std_year> <Date-std_month>7</Date-std_month> <Date-std_day>26</Date-std_day> </Date-std> </Date_std> </Date> </Seq-hist-rec_date> <Seq-hist-rec_ids> <Seq-id> <Seq-id_gi>419592</Seq-id_gi> </Seq-id> </Seq-hist-rec_ids> </Seq-hist-rec> </Seq-hist_replaces> </Seq-hist> </Seq-inst_hist> </Seq-inst> </Bioseq_inst> <Bioseq_annot> <Seq-annot> <Seq-annot_data> <Seq-annot_data_ftable> <Seq-feat> <Seq-feat_data> <SeqFeatData> <SeqFeatData_region>Mature chain</SeqFeatData_region> </SeqFeatData> </Seq-feat_data> <Seq-feat_comment>Acyl-CoA reductase. /FTId=PRO_0000220196.</Seq-feat_comment> <Seq-feat_location> <Seq-loc> <Seq-loc_int> <Seq-interval> <Seq-interval_from>0</Seq-interval_from> <Seq-interval_to>477</Seq-interval_to> <Seq-interval_id> <Seq-id> <Seq-id_gi>547874</Seq-id_gi> </Seq-id> </Seq-interval_id> </Seq-interval> </Seq-loc_int> </Seq-loc> </Seq-feat_location> <Seq-feat_exp-ev value="experimental"/> </Seq-feat> <Seq-feat> <Seq-feat_data> <SeqFeatData> <SeqFeatData_gene> <Gene-ref> <Gene-ref_locus>luxC</Gene-ref_locus> </Gene-ref> </SeqFeatData_gene> </SeqFeatData> </Seq-feat_data> <Seq-feat_location> <Seq-loc> <Seq-loc_int> <Seq-interval> <Seq-interval_from>0</Seq-interval_from> <Seq-interval_to>477</Seq-interval_to> <Seq-interval_id> <Seq-id> <Seq-id_gi>547874</Seq-id_gi> </Seq-id> </Seq-interval_id> </Seq-interval> </Seq-loc_int> </Seq-loc> </Seq-feat_location> </Seq-feat> <Seq-feat> <Seq-feat_data> <SeqFeatData> <SeqFeatData_prot> <Prot-ref> <Prot-ref_name> <Prot-ref_name_E>Acyl-CoA reductase</Prot-ref_name_E> </Prot-ref_name> <Prot-ref_ec> <Prot-ref_ec_E>1.2.1.50</Prot-ref_ec_E> </Prot-ref_ec> </Prot-ref> </SeqFeatData_prot> </SeqFeatData> </Seq-feat_data> <Seq-feat_location> <Seq-loc> <Seq-loc_int> <Seq-interval> <Seq-interval_from>0</Seq-interval_from> <Seq-interval_to>477</Seq-interval_to> <Seq-interval_id> <Seq-id> <Seq-id_gi>547874</Seq-id_gi> </Seq-id> </Seq-interval_id> </Seq-interval> </Seq-loc_int> </Seq-loc> </Seq-feat_location> <Seq-feat_qual> <Gb-qual> <Gb-qual_qual>UniProtKB_evidence</Gb-qual_qual> <Gb-qual_val>Inferred from homology</Gb-qual_val> </Gb-qual> </Seq-feat_qual> </Seq-feat> </Seq-annot_data_ftable> </Seq-annot_data> </Seq-annot> <Seq-annot> <Seq-annot_db value="other">255</Seq-annot_db> <Seq-annot_name>Annot:CDD</Seq-annot_name> <Seq-annot_desc> <Annot-descr> <Annotdesc> <Annotdesc_name>CddSearch</Annotdesc_name> </Annotdesc> <Annotdesc> <Annotdesc_user> <User-object> <User-object_type> <Object-id> <Object-id_str>CddInfo</Object-id_str> </Object-id> </User-object_type> <User-object_data> <User-field> <User-field_label> <Object-id> <Object-id_str>version</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_str>3.13</User-field_data_str> </User-field_data> </User-field> </User-object_data> </User-object> </Annotdesc_user> </Annotdesc> <Annotdesc> <Annotdesc_create-date> <Date> <Date_std> <Date-std> <Date-std_year>2015</Date-std_year> <Date-std_month>1</Date-std_month> <Date-std_day>6</Date-std_day> <Date-std_hour>15</Date-std_hour> <Date-std_minute>43</Date-std_minute> <Date-std_second>14</Date-std_second> </Date-std> </Date_std> </Date> </Annotdesc_create-date> </Annotdesc> </Annot-descr> </Seq-annot_desc> <Seq-annot_data> <Seq-annot_data_ftable> <Seq-feat> <Seq-feat_data> <SeqFeatData> <SeqFeatData_region>ALDH_Acyl-CoA-Red_LuxC</SeqFeatData_region> </SeqFeatData> </Seq-feat_data> <Seq-feat_comment>Acyl-CoA reductase LuxC</Seq-feat_comment> <Seq-feat_location> <Seq-loc> <Seq-loc_int> <Seq-interval> <Seq-interval_from>29</Seq-interval_from> <Seq-interval_to>444</Seq-interval_to> <Seq-interval_id> <Seq-id> <Seq-id_gi>547874</Seq-id_gi> </Seq-id> </Seq-interval_id> </Seq-interval> </Seq-loc_int> </Seq-loc> </Seq-feat_location> <Seq-feat_ext> <User-object> <User-object_type> <Object-id> <Object-id_str>cddScoreData</Object-id_str> </Object-id> </User-object_type> <User-object_data> <User-field> <User-field_label> <Object-id> <Object-id_str>domain_from</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_int>0</User-field_data_int> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>domain_to</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_int>421</User-field_data_int> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>definition</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_str>cd07080</User-field_data_str> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>short_name</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_str>ALDH_Acyl-CoA-Red_LuxC</User-field_data_str> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>score</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_int>1148</User-field_data_int> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>evalue</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_real>2.97592e-153</User-field_data_real> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>bit_score</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_real>445.954</User-field_data_real> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>specific</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_bool value="true"/> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>superfamily</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_str>cl11961</User-field_data_str> </User-field_data> </User-field> </User-object_data> </User-object> </Seq-feat_ext> <Seq-feat_dbxref> <Dbtag> <Dbtag_db>CDD</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_id>143399</Object-id_id> </Object-id> </Dbtag_tag> </Dbtag> </Seq-feat_dbxref> </Seq-feat> <Seq-feat> <Seq-feat_data> <SeqFeatData> <SeqFeatData_site value="active"/> </SeqFeatData> </Seq-feat_data> <Seq-feat_comment>putative catalytic cysteine [active]</Seq-feat_comment> <Seq-feat_location> <Seq-loc> <Seq-loc_mix> <Seq-loc-mix> <Seq-loc> <Seq-loc_pnt> <Seq-point> <Seq-point_point>285</Seq-point_point> <Seq-point_id> <Seq-id> <Seq-id_gi>547874</Seq-id_gi> </Seq-id> </Seq-point_id> </Seq-point> </Seq-loc_pnt> </Seq-loc> </Seq-loc-mix> </Seq-loc_mix> </Seq-loc> </Seq-feat_location> <Seq-feat_ext> <User-object> <User-object_type> <Object-id> <Object-id_str>cddSiteScoreData</Object-id_str> </Object-id> </User-object_type> <User-object_data> <User-field> <User-field_label> <Object-id> <Object-id_str>completeness</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_real>1</User-field_data_real> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>feature-ID</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_int>0</User-field_data_int> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>specific</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_bool value="true"/> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>nonredundant</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_bool value="true"/> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>definition</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_str>cd07080</User-field_data_str> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>short_name</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_str>ALDH_Acyl-CoA-Red_LuxC</User-field_data_str> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>from</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_int>29</User-field_data_int> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>to</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_int>444</User-field_data_int> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>score</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_int>1148</User-field_data_int> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>evalue</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_real>2.97592e-153</User-field_data_real> </User-field_data> </User-field> <User-field> <User-field_label> <Object-id> <Object-id_str>bit_score</Object-id_str> </Object-id> </User-field_label> <User-field_data> <User-field_data_real>445.954</User-field_data_real> </User-field_data> </User-field> </User-object_data> </User-object> </Seq-feat_ext> <Seq-feat_dbxref> <Dbtag> <Dbtag_db>CDD</Dbtag_db> <Dbtag_tag> <Object-id> <Object-id_id>143399</Object-id_id> </Object-id> </Dbtag_tag> </Dbtag> </Seq-feat_dbxref> </Seq-feat> </Seq-annot_data_ftable> </Seq-annot_data> </Seq-annot> </Bioseq_annot> </Bioseq> </Seq-entry_seq> </Seq-entry> </Bioseq-set_seq-set> </Bioseq-set> */