package au.com.acpfg.misc.uniprot; import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintWriter; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; import com.fatdog.xmlEngine.DocItems; import com.fatdog.xmlEngine.ResultList; import com.fatdog.xmlEngine.XQEngine; import com.fatdog.xmlEngine.exceptions.CantParseDocumentException; import com.fatdog.xmlEngine.exceptions.InvalidQueryException; import com.fatdog.xmlEngine.exceptions.MissingOrInvalidSaxParserException; public class UniProtHit { private String m_recommended_name, m_organism, m_primary_gene, m_id, m_sequence, m_exist_evidence; private ArrayList<String> m_xrefs; // cross references to other databases private ArrayList<String> m_comments; // textual comments relating to entry (usually unspecified origin) private ArrayList<String> m_lineage; // NCBI taxonomic lineage private ArrayList<String> m_keywords; private ArrayList<String> m_features; private ArrayList<String> m_citations; public UniProtHit() { m_xrefs = new ArrayList<String>(); m_comments = new ArrayList<String>(); m_lineage = new ArrayList<String>(); m_keywords= new ArrayList<String>(); m_features= new ArrayList<String>(); m_citations= new ArrayList<String>(); } /* public static List<UniProtHit> make_entries(InputStream xml_stream) throws Exception { return make_entries(xml2string(xml_stream, true)); }*/ public static List<UniProtHit> make_entries(String xml) throws Exception { ArrayList<UniProtHit> ret = new ArrayList<UniProtHit>(); XQEngine results_eng = new XQEngine(); XMLReader xml_rdr = XMLReaderFactory.createXMLReader(); results_eng.setXMLReader(xml_rdr); //Logger.getAnonymousLogger().info(xml); if (xml.trim().length() < 1) { // deleted entries have an XML length of 0 eg. http://www.uniprot.org/uniprot/A7PGM7.xml // so we throw and let the caller sort out what it wants to do... throw new DeletedEntryException("UniProt entry appears to have been deleted, no data available."); } int doc_id = results_eng.setExplicitDocument(xml); ResultList results = results_eng.setQuery("for $e in /uniprot/entry return <uniprot>{$e}</uniprot>"); String hits_xml = results.emitXml(); if (hits_xml != null && hits_xml.length() > 0) { // HACK: split the xml based on the end of the root tag and then run the fragments thru XQEngine to extract desired results String[] hits = hits_xml.split("</uniprot>"); for (String hit : hits) { hit += "</uniprot>"; XQEngine hit_eng = new XQEngine(); hit_eng.setXMLReader(xml_rdr); hit_eng.setExplicitDocument(hit); UniProtHit uh = new UniProtHit(); ResultList rl = hit_eng.setQuery("/uniprot/entry/protein/recommendedName/fullName/text()"); uh.setRecommendedName(rl.emitXml()); rl = hit_eng.setQuery("/uniprot/entry/organism/name[@type='scientific']/text()"); uh.setOrganism(rl.asString()); rl = hit_eng.setQuery("/uniprot/entry/name"); uh.setName(rl.asString()); rl = hit_eng.setQuery("/uniprot/entry/sequence"); uh.setSequence(rl.asString()); rl = hit_eng.setQuery("/uniprot/entry/gene/name[@type='primary']/text()"); uh.setPrimaryGene(rl.asString()); rl = hit_eng.setQuery("//uniprot/entry/dbReference"); //Logger.getAnonymousLogger().info(rl.emitXml()); uh.setXrefs(extract_attribute_combined_key(rl.emitXml(), new String[] { "type", "id" })); rl = hit_eng.setQuery("//comment/text"); uh.setComments(extract_elements(rl.emitXml(), "text" )); rl = hit_eng.setQuery("/uniprot/entry/organism/lineage/taxon"); uh.setLineage(extract_elements(rl.emitXml(), "taxon")); rl = hit_eng.setQuery("/uniprot/entry/keyword"); uh.setKeywords(extract_elements(rl.emitXml(), "keyword")); rl = hit_eng.setQuery("/uniprot/entry/proteinExistence"); uh.setExistEvidence(extract_attribute(rl.emitXml(), "type")); rl = hit_eng.setQuery("/uniprot/entry/feature"); uh.extract_features(rl.emitXml()); uh.extract_citations(hit_eng); ret.add(uh); } } return ret; } private void setExistEvidence(String extract_attribute) { m_exist_evidence = extract_attribute; } private void setKeywords(ArrayList<String> extract_elements) { m_keywords = extract_elements; } private void setLineage(ArrayList<String> extract_elements) { m_lineage = extract_elements; } private void setComments(ArrayList<String> extract_elements) { m_comments = extract_elements; } private void setXrefs(ArrayList<String> elements) { m_xrefs = elements; } private void setPrimaryGene(String asString) { m_primary_gene = asString; } private void setName(String asString) { m_id = asString; } private void setSequence(String asString) { m_sequence = asString; } private void setOrganism(String asString) { m_organism = asString; } private void setRecommendedName(String asString) { m_recommended_name = asString; } public static String xml2string(InputStream response_stream, boolean remove_default_ns) throws SAXException, IOException { String xml = ""; Reader rdr = new InputStreamReader(response_stream); int n_chars; char[] buf = new char[10*1024]; boolean first = true; int got = 0; try { while ((n_chars = rdr.read(buf)) >= 0) { //Logger.getAnonymousLogger().info("read "+n_chars); got += n_chars; if (got > 100 * 1024 * 1024) { throw new IOException("Ridiculously large UniProt record (>100MB)... ignoring!"); } if (n_chars > 0) { char[] read = new char[n_chars]; System.arraycopy(buf, 0, read, 0, n_chars); if (first && remove_default_ns) { // remove default namespace (simplifies XQueries) since only one is in results Pattern p = Pattern.compile("\\sxmlns=\"[^\"]*?\""); String s = new String(read); Matcher m = p.matcher(s); xml += m.replaceFirst(" "); } else { xml += new String(read); } } } } finally { // BUG: seems to occassionally socket timeout (trying to exhaust input) so ignore it... try { rdr.close(); } catch (Exception e) { // NO-OP } } return xml; } protected String extract_text(String xml_fragment) { Pattern p = Pattern.compile("<(\\w+)>([^>]+?)(?:</\\1>)?"); Matcher m = p.matcher(xml_fragment); if (m.matches()) { return m.group(2); } else { return ""; } } protected void extract_citations(XQEngine eng) throws Exception { m_citations.clear(); ResultList rl = eng.setQuery("/uniprot/entry/reference/citation"); String[] citations = rl.emitXml().split("</citation>"); for (String citation : citations) { ArrayList<String> title = extract_elements(citation, "title"); ArrayList<String> authors= extract_elements(citation, "person"); String citation_tag = citation.substring(0, citation.indexOf(">")); String type = extract_attribute(citation_tag, "type"); String date = extract_attribute(citation_tag, "date"); String name = extract_attribute(citation_tag, "name"); String start= extract_attribute(citation_tag, "first"); String end = extract_attribute(citation_tag, "last"); String vol = extract_attribute(citation_tag, "volume"); if (type == null || title == null || title.size() < 1) { Logger.getAnonymousLogger().warning("Rejecting incomplete citation"); } String out = type+": "+extract_text(title.get(0))+", "; if (authors != null && authors.size() > 0) { for (String person_xml : authors) { String person = extract_attribute(person_xml, "name"); out += person+" "; } out = out.trim()+", "; } if (name != null) { out += name+", "; } if (date != null) { out += date+", "; } if (vol != null) { if (!vol.toLowerCase().startsWith("vol")) out += "Vol. "; out += vol+" "; if (start != null && end != null) { out += start + "-" + end; } } m_citations.add(out); } } protected void extract_features(String xml) { m_features.clear(); String[] features = xml.split("</feature>"); ArrayList<String> f = new ArrayList<String>(); for (String feature : features) { String type = extract_attribute(feature, "type"); String descr= extract_attribute(feature, "description"); String status=extract_attribute(feature, "status"); String tmp = type + ":"; if (descr != null) { tmp += " "+descr.trim(); if (status != null) { tmp += ":"; } } if (status != null) { tmp += status; } m_features.add(tmp); } } protected static String extract_attribute(String xml, String attrName) { int offset = xml.indexOf(" "+attrName+"=\""); if (offset < 0) return null; int end = offset + 3 + attrName.length(); while (end < xml.length() && xml.charAt(end) != '"') { end++; } return xml.substring(offset+3+attrName.length(), end); } protected static ArrayList<String> extract_attributes(String xml, String elName, String attrName) { ArrayList<String> ret = new ArrayList<String>(); for (String el : xml.split("</"+elName+">")) { if (el.trim().length() > 0) { ret.add(extract_attribute(el, attrName)); } } return ret; } protected static ArrayList<String> extract_elements(String xml_fragment, String elName) { ArrayList<String> ret = new ArrayList<String>(); String[] entries = xml_fragment.split("</"+elName+">"); for (String entry : entries) { // skip start tag int offset = elName.length()+1; char c = ' '; while (offset < entry.length() && (c = entry.charAt(offset)) != '>') { offset++; } if (c == '>') { ret.add(entry.substring(offset+1)); } } return ret; } protected static ArrayList<String> extract_attribute_combined_key(String xml_fragment, String[] attrNames) { ArrayList<String> ret = new ArrayList<String>(); String[] lines = xml_fragment.split("/><"); // only lines which contain ALL the attributes are added to ret for (String line : lines) { String xref = ""; int added = 0; for (String attrName : attrNames) { int offset = line.indexOf(attrName+"=\""); if ( offset >= 0) { int close_quote = offset + attrName.length()+3; boolean found = false; while (close_quote < line.length() && !found ) { found = (line.charAt(close_quote) == '"'); close_quote++; } if (found) { xref += line.substring(offset+attrName.length()+2, close_quote-1); xref += ":"; added++; } } } if (added == attrNames.length) { // remove trailing : ret.add(xref.substring(0, xref.length()-1)); } } return ret; } public String getRecommendedName() { return m_recommended_name; } public String getOrganism() { return m_organism; } public String getID() { return m_id; } public String getSequence() { return m_sequence; } public String getGenePrimary() { return m_primary_gene; } public List<String> getXrefs() { return m_xrefs; } public List<String> getComments() { return m_comments; } public List<String> getLineage() { return m_lineage; } public List<String> getKeywords() { return m_keywords; } public String getExistenceEvidence() { return m_exist_evidence; } public List<String> getFeatures() { return m_features; } public List<String> getCitations() { return m_citations; } }