package org.genedb.db.loading.auxiliary; import org.genedb.db.loading.GoEvidenceCode; import org.genedb.db.loading.GoInstance; import org.genedb.db.loading.ParsingException; import org.genedb.util.TwoKeyMap; import org.apache.log4j.Logger; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * The DomainFile class represents a an output file from a polypeptide domain prediction algorithm, such as pfam_scan or Prosite, * as a collection of @{link DomainRow}s. * The DomainRow class is an abstract class extended by specific classes such as PfamRow and PrositeRow which each represent a row of * the input file. * * @author art * @author rh11 * @author te3 * */ abstract class DomainRow { protected int lineNumber; protected DomainAcc acc = DomainAcc.NULL; protected Boolean comment; protected String key, nativeProg, db, nativeAcc, nativeDesc, score, evalue; protected int fmin, fmax; protected ISOFormatDate date; public String db() { return db; } public DomainAcc acc() { return acc; } public String key() { return key; } public String nativeAcc() { return nativeAcc; } public String nativeDesc() { return nativeDesc; } public String nativeProg() { return nativeProg; } public int lineNumber() { return lineNumber; } public int fmin() { return fmin; } public int fmax() { return fmax; } public String score() { return score; } public String evalue() { return evalue; } public Boolean comment() { return comment; } public String getDate() { return date.withDashes(); } public abstract Set<GoInstance> getGoTerms(); public abstract String getGoTermComment(); } /** * Represents a single row of an Interpro output file. * * @author art * @author rh11 * @author te3 */ class InterProRow extends DomainRow { private static final Logger logger = Logger.getLogger(InterProRow.class); Set<GoInstance> goTerms = new HashSet<GoInstance>(); // The columns we're interested in: private static final int COL_KEY = 0; private static final int COL_NATIVE_PROG = 3; private static final int COL_NATIVE_ACC = 4; private static final int COL_NATIVE_DESC = 5; private static final int COL_FMIN = 6; private static final int COL_FMAX = 7; private static final int COL_SCORE = 8; private static final int COL_DATE = 10; private static final int COL_ACC = 11; private static final int COL_DESC = 12; private static final int COL_GO = 13; private static final HashMap<String, String> dbByProg = new HashMap<String, String>() {{ put("HMMPfam", "Pfam"); put("FPrintScan", "PRINTS"); put("ProfileScan", "Prosite"); put("ScanRegExp", "Prosite"); put("HMMSmart", "SMART"); put("BlastProDom", "ProDom"); put("HMMTigr", "TIGR_TIGRFAMS"); put("HMMPIR", "PIRSF"); put("HMMPanther", "PANTHER"); // These three have not been seen in the P. falciparum output, at least. // Are they still possible? put("Superfamily", "Superfamily"); put("superfamily", "Superfamily"); put("ScanProsite", "Prosite"); }}; /** * Convert a row of an Interpro output file to an InterproRow object. * * @param lineNumber the line number of this line in the input file. * Used to produce more helpful diagnostics if there's a * problem decoding the line. * @param rowFields a line of the input file */ InterProRow(int lineNumber, String row) { this(lineNumber, row.split("\t")); } /** * Convert a row of an Interpro output file to an InterproRow object. * * @param lineNumber the line number of this line in the input file. * Used to produce more helpful diagnostics if there's a * problem decoding the line. * @param rowFields an array containing the fields in the file. * In the actual file, fields are separated by tab characters. */ InterProRow(int lineNumber, String[] rowFields) { this.comment = false; if (rowFields.length == 1 || rowFields[COL_KEY].substring(0, 1).equals("#")) { //blank line or comment this.comment = true; } this.lineNumber = lineNumber; this.key = rowFields[COL_KEY]; this.nativeProg = rowFields[COL_NATIVE_PROG]; this.db = dbByProg.get(nativeProg); this.nativeAcc = rowFields[COL_NATIVE_ACC]; this.nativeDesc = rowFields[COL_NATIVE_DESC]; this.fmin = Integer.parseInt(rowFields[COL_FMIN]) - 1; // -1 because we're converting to interbase this.fmax = Integer.parseInt(rowFields[COL_FMAX]); this.score = rowFields[COL_SCORE]; this.date = new ISOFormatDate(rowFields[COL_DATE]); if (rowFields.length > COL_ACC && !rowFields[COL_ACC].equals("NULL")) { this.acc = new DomainAcc(rowFields[COL_ACC], rowFields[COL_DESC]); } if (rowFields.length > COL_GO) parseGoString(lineNumber, rowFields[COL_GO]); } private static final Pattern goTermPattern = Pattern.compile("\\G(Cellular Component|Biological Process|Molecular Function): (.*?) \\(GO:(\\d{7})\\)(?:, |\\z)"); private void parseGoString(int lineNumber, String goString) { Matcher matcher = goTermPattern.matcher(goString); while (matcher.find()) { String type = matcher.group(1); String description = matcher.group(2); String goId = matcher.group(3); logger.debug(String.format("Parsed GO term: %s/%s/%s", type, description, goId)); GoInstance goTerm = new GoInstance(); try { goTerm.setId(goId); goTerm.setEvidence(GoEvidenceCode.IEA); goTerm.setWithFrom("InterPro:" + this.acc.getId()); goTerm.setRef("GOC:interpro2go"); goTerm.setDate(this.date.withoutDashes()); goTerm.setSubtype(type); } catch (ParsingException e) { throw new RuntimeException(e); } /* * We do not set the <code>geneName</code> field of the GoInstance, * because a) the key is not necessarily a gene name, and b) the * method FeatureUtils#createGoEntries does not use the geneName * field. */ this.goTerms.add(goTerm); } if (!matcher.hitEnd()) logger.error(String.format("Failed to completely parse GO terms '%s' on line %d", goString, lineNumber)); } public Set<GoInstance> getGoTerms() { return this.goTerms; } public String evalue() { return null; } public String getGoTermComment() { return("From iprscan"); } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(String.format("%s/%s: %s (%s), location %d-%d", key, acc.getId(), nativeDesc, nativeProg, fmin, fmax)); for (GoInstance goTerm: goTerms) { sb.append(String.format("\n\t%s (GO:%s)", goTerm.getSubtype(), goTerm.getId())); } return sb.toString(); } } class PfamRow extends DomainRow { Pfam2GoFile pfam2GoFile; private static String today; static { DateFormat dFormat = new SimpleDateFormat("yyyyMMdd"); today = dFormat.format(new Date()); } private static final Logger logger = Logger.getLogger(PfamRow.class); // The columns we're interested in: private static final int COL_KEY = 0; private static final int COL_NATIVE_ACC = 5; private static final int COL_NATIVE_DESC = 6; private static final int COL_FMIN = 1; private static final int COL_FMAX = 2; private static final int COL_SCORE = 11; private static final int COL_EVALUE = 12; private static final int COL_SIG = 13; /** * Convert a row of an Pfam output file to an PfamRow object. * * @param lineNumber the line number of this line in the input file. * Used to produce more helpful diagnostics if there's a * problem decoding the line. * @param pfam2GoFile * @param rowFields a line of the input file */ public PfamRow(int lineNumber, String row, Pfam2GoFile pfam2GoFile) { this(lineNumber, row.split("\\s+"), pfam2GoFile); } /** * Convert a row of an Pfam output file to an PfamRow object. * * @param lineNumber the line number of this line in the input file. * Used to produce more helpful diagnostics if there's a * problem decoding the line. * @param rowFields an array containing the fields in the file. * In the actual file, fields are separated by multiple space characters. * @param pfam2GoFile */ public PfamRow(int lineNumber, String[] rowFields, Pfam2GoFile pfam2GoFile) { this.comment = false; if (rowFields.length == 1 || rowFields[COL_KEY].substring(0, 1).equals("#")) { //blank line or comment this.comment = true; } else if (rowFields.length == 15 && rowFields[COL_NATIVE_ACC].substring(0, 2).equals("PF") && rowFields[COL_SIG].equals("1")) { this.lineNumber = lineNumber; this.key = rowFields[COL_KEY]; this.nativeAcc = rowFields[COL_NATIVE_ACC]; this.nativeDesc = rowFields[COL_NATIVE_DESC]; this.nativeProg = "pfam_scan"; this.db = "Pfam"; this.fmin = Integer.parseInt(rowFields[COL_FMIN]) - 1; // -1 because we're converting to interbase this.fmax = Integer.parseInt(rowFields[COL_FMAX]); this.score = rowFields[COL_SCORE]; this.evalue = rowFields[COL_EVALUE]; if (rowFields.length > COL_NATIVE_DESC && !rowFields[COL_NATIVE_ACC].equals("NULL")) { this.acc = new DomainAcc(rowFields[COL_NATIVE_ACC], rowFields[COL_NATIVE_DESC]); } this.pfam2GoFile = pfam2GoFile; } } public Set<GoInstance> getGoTerms() { Set<GoInstance> goTerms = new HashSet<GoInstance>(); String pfamAccession = this.nativeAcc; Pfam2GoFile pfam2GoFile = this.pfam2GoFile; if (pfam2GoFile.getGoByPfam(pfamAccession) == null) { logger.debug(String.format("The domain '%s' has no mapped GO terms", pfamAccession)); return Collections.emptySet(); } for (String goAccession: pfam2GoFile.getGoByPfam(pfamAccession)) { try { GoInstance goInstance = new GoInstance(); goInstance.setId(goAccession); goInstance.setDate(today); GoEvidenceCode evidenceCode = GoEvidenceCode.parse("IEA"); goInstance.setEvidence(evidenceCode); goTerms.add(goInstance); } catch (ParsingException e) { logger.error(e); } } return goTerms; } public String getGoTermComment() { return("From Pfam2GO mapping"); } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(String.format("%s/%s: %s (%s), location %d-%d", key, acc.getId(), nativeDesc, nativeProg, fmin, fmax)); return sb.toString(); } } class PrositeRow extends DomainRow { String nativeName; // The columns we're interested in: private static final int COL_KEY = 0; private static final int COL_NATIVE_ACC = 1; private static final int COL_NATIVE_NAME = 5; private static final int COL_NATIVE_DESC = 6; private static final int COL_FMIN = 2; private static final int COL_FMAX = 3; /** * Convert a row of a Prosite output file to a PrositeRow object. * * @param lineNumber the line number of this line in the input file. * Used to produce more helpful diagnostics if there's a * problem decoding the line. * @param rowFields a line of the input file */ public PrositeRow(int lineNumber, String row) { this(lineNumber, row.split("\t")); } /** * Convert a row of a Prosite output file to a PrositeRow object. * * @param lineNumber the line number of this line in the input file. * Used to produce more helpful diagnostics if there's a * problem decoding the line. * @param rowFields an array containing the fields in the file. * In the actual file, fields are separated by multiple space characters. */ public PrositeRow(int lineNumber, String[] rowFields) { this.comment = false; if (rowFields.length == 1 || rowFields[COL_KEY].substring(0, 1).equals("#")) { //blank line or comment this.comment = true; } else if (rowFields.length == 7 && rowFields[COL_NATIVE_ACC].substring(0, 2).equals("PS")) { this.lineNumber = lineNumber; this.key = rowFields[COL_KEY]; this.nativeAcc = rowFields[COL_NATIVE_ACC]; this.nativeName = rowFields[COL_NATIVE_NAME]; this.nativeDesc = rowFields[COL_NATIVE_DESC]; this.nativeProg = "prosite"; this.db = "PROSITE"; this.fmin = Integer.parseInt(rowFields[COL_FMIN]) - 1; // -1 because we're converting to interbase this.fmax = Integer.parseInt(rowFields[COL_FMAX]); if (rowFields.length > COL_NATIVE_DESC && !rowFields[COL_NATIVE_ACC].equals("NULL")) { this.acc = new DomainAcc(rowFields[COL_NATIVE_ACC], rowFields[COL_NATIVE_DESC]); } } } public String score() { return null; } public String evalue() { return null; } public Set<GoInstance> getGoTerms() { return Collections.emptySet(); //Prosite currently has no GO mapping } public String getGoTermComment() { return null;//Prosite currently has no GO mapping } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(String.format("%s/%s: %s (%s), location %d-%d", key, acc.getId(), nativeDesc, nativeProg, fmin, fmax)); return sb.toString(); } } /** * Represents an InterPro/Pfam/Prosite accession identifier with description, * as found in the last two columns of an PfamScan raw output file. * * @author rh11 */ class DomainAcc { private String id, description; public static final DomainAcc NULL = new DomainAcc(null, null); public DomainAcc(String id, String description) { this.id = id; this.description = description; } public String getId() { return id; } public String getDescription() { return description; } /* * hashCode() and equals() are auto-generated by Eclipse. * We need them because we want to use DomainAcc objects * as keys in a map. */ @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((description == null) ? 0 : description.hashCode()); result = prime * result + ((id == null) ? 0 : id.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; final DomainAcc other = (DomainAcc) obj; if (description == null) { if (other.description != null) return false; } else if (!description.equals(other.description)) return false; if (id == null) { if (other.id != null) return false; } else if (!id.equals(other.id)) return false; return true; } } /** * Represents a polypeptide domain prediction output file as a collection of {@link DomainRow}s * keyed by gene name (or mangled polypeptide name) and Domain accession number. * * @author rh11 */ class DomainFile { private static final Logger logger = Logger.getLogger(DomainFile.class); private TwoKeyMap<String,DomainAcc,Set<DomainRow>> rowsByKeyAndAcc = new TwoKeyMap<String, DomainAcc, Set<DomainRow>>(); Pfam2GoFile pfam2GoFile; public DomainFile(String analysisProgram, InputStream inputStream) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader( inputStream ) ); String line; int lineNumber = 0; Set<String> unrecognisedProgs = new HashSet<String>(); while (null != (line = br.readLine())) { lineNumber++; DomainRow row; if (analysisProgram.equals("pfam_scan")) { //parse the pfam2go file if not already done if (pfam2GoFile == null) { logger.info(String.format("Creating pfam2go mapping")); pfam2GoFile = new Pfam2GoFile(); } row = new PfamRow(lineNumber, line, pfam2GoFile); } else if (analysisProgram.equals("prosite")) { row = new PrositeRow(lineNumber, line); } else if (analysisProgram.equals("iprscan")) { row = new InterProRow(lineNumber, line); } else { throw new IllegalArgumentException(String.format("Loader for program '%s' has not been implemented", analysisProgram)); } if (row.comment().equals(true)) { //skipping comment lines continue; } if (row.db() == null) { if (!unrecognisedProgs.contains(row.nativeProg())) { logger.warn(String.format("Unrecognised program '%s', first encountered on line %d", row.nativeProg(), lineNumber)); unrecognisedProgs.add(row.nativeProg()); } continue; } //If the peptide and domain don't already exist, add a new entry into the hash if (!rowsByKeyAndAcc.containsKey(row.key(), row.acc())) { rowsByKeyAndAcc.put(row.key(), row.acc(), new HashSet<DomainRow>()); rowsByKeyAndAcc.get(row.key(), row.acc()).add(row); }else{ /* If it already exists (in case of the same domain existing in two different locations), * then just add the dbrow */ rowsByKeyAndAcc.get(row.key(), row.acc()).add(row); } } } public Set<String> keys() { return rowsByKeyAndAcc.firstKeySet(); } public Set<DomainAcc> accsForKey(String key) { if (!rowsByKeyAndAcc.containsFirstKey(key)) throw new IllegalArgumentException(String.format("Key '%s' not found", key)); return rowsByKeyAndAcc.getMap(key).keySet(); } public Set<DomainRow> rows(String key, DomainAcc acc) { if (!rowsByKeyAndAcc.containsKey(key, acc)) throw new IllegalArgumentException( String.format("Accession number '%s' not found for key '%s'", acc, key)); return rowsByKeyAndAcc.get(key, acc); } } /** * Convert dates from the format "24-Sep-1976" into ISO format * (1976-09-24 or 19760924). For example: * <pre> * new ISOFormatDate("24-Sep-1976").withDashes(); // Returns "1976-09-24" * </pre> * @author rh11 */ class ISOFormatDate { private static final Map<String, String> months = new HashMap<String, String>(12) {{ put("Jan", "01"); put("May", "05"); put("Sep", "09"); put("Feb", "02"); put("Jun", "06"); put("Oct", "10"); put("Mar", "03"); put("Jul", "07"); put("Nov", "11"); put("Apr", "04"); put("Aug", "08"); put("Dec", "12"); }}; private static final Pattern datePattern = Pattern.compile("(\\d\\d)-([A-Z][a-z][a-z])-(\\d{4})"); private String year, month, day; /** * Create a converter for the specified date. * * @param date The date in format dd-Mon-yyyy, e.g. 24-Sep-1976 * @throws IllegalArgumentException if the date cannot be parsed */ public ISOFormatDate(String date) { Matcher matcher = datePattern.matcher(date); if (!matcher.matches()) throw new IllegalArgumentException(String.format( "Failed to parse date '%s'", date)); String day = matcher.group(1); String month = matcher.group(2); String year = matcher.group(3); if (!months.containsKey(month)) throw new IllegalArgumentException(String.format( "Unknown month '%s' while parsing date '%s'", month, date)); this.year = year; this.month = months.get(month); this.day = day; } /** * Get the date in the format <code>yyyy-mm-dd</code>. * * @return the date in format <code>yyyy-mm-dd</code> */ public String withDashes() { return String.format("%s-%s-%s", year, month, day); } /** * Get the date in the format <code>yyyymmdd</code>. * * @return the date in format <code>yyyymmdd</code> */ public String withoutDashes() { return String.format("%s%s%s", year, month, day); } } /* * Stores the pfam2go mappings in a Map<String, Set<String>> */ class Pfam2GoFile { Map<String, Set<String>> pfam2go; private static final Logger logger = Logger.getLogger(Pfam2GoFile.class); public Pfam2GoFile() throws IOException { InputStream inputStream = getClass().getResourceAsStream("/pfam2go"); BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); String line; pfam2go = new HashMap<String, Set<String>>(); while (null != (line = reader.readLine())) { //While not end of file if(0 < line.length()){ StringBuilder sb = new StringBuilder(line); sb.append('\n'); //logger.info(sb); Pfam2GoLine pfam2GoLine = new Pfam2GoLine(line); if (!pfam2go.containsKey(pfam2GoLine.pfamAccession)) { pfam2go.put(pfam2GoLine.pfamAccession, new HashSet<String>()); } pfam2go.get(pfam2GoLine.pfamAccession).add(pfam2GoLine.goAccession); logger.debug(String.format("adding pfam %s for go %s", pfam2GoLine.pfamAccession, pfam2GoLine.goAccession)); } } } public Set<String> getGoByPfam(String pfamAccession) { return(pfam2go.get(pfamAccession)); } } /* * Parses a single line of the pfam2go mapping file */ class Pfam2GoLine { String pfamAccession, goAccession; public Pfam2GoLine(String line) { //Sample line //Pfam:PF00001 7tm_1 > GO:G-protein coupled receptor protein signaling pathway ; GO:0007186 final Pattern LINE_PATTERN = Pattern.compile("Pfam:(\\S+)\\s+(.+>.+)\\s+;\\s+GO:(\\d+)"); Matcher matcher = LINE_PATTERN.matcher(line); if (matcher.matches()) { this.pfamAccession = matcher.group(1); this.goAccession = matcher.group(3); } } }