package org.gmod.schema.bulk; import org.gmod.schema.bulk.TranscriptInfo.Polypeptide; import org.gmod.schema.utils.CvTermUtils; import org.apache.log4j.Logger; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; public class BulkProcessor { private static final Logger logger = Logger.getLogger(BulkProcessor.class); // SO accession numbers for important terms private static final String SO_POLYPEPTIDE = "0000104"; private static final String SO_PSEUDOGENIC_TRANSCRIPT = "0000516"; private static final String SO_TRANSCRIPT = "0000673"; private static final String SO_MRNA = "0000234"; private static final String SO_PSEUDOGENE = "0000336"; private static final String SO_GENE = "0000704"; private static final String SO_TRANSCRIPT_REGION = "0000833"; private static final String SO_PSEUDOGENIC_EXON = "0000507"; /* Initialisation */ public BulkProcessor(Connection conn) throws SQLException { this.conn = conn; init(); } private Connection conn; private int fetchSize = 100; private String organismCommonName; private Set<Integer> geneTypeIds; private Set<Integer> pseudogeneTypeIds; private Set<Integer> transcriptTypeIds; private Set<Integer> mRNATypeIds; private Set<Integer> polypeptideTypeIds; private Set<Integer> transcriptRegionTypeIds; private void init() throws SQLException { this.geneTypeIds = typeIds(SO_GENE, SO_PSEUDOGENE); this.pseudogeneTypeIds = typeIds(SO_PSEUDOGENE); this.transcriptTypeIds = typeIds(SO_TRANSCRIPT, SO_PSEUDOGENIC_TRANSCRIPT); this.mRNATypeIds = typeIds(SO_MRNA); this.polypeptideTypeIds = typeIds(SO_POLYPEPTIDE); this.transcriptRegionTypeIds = typeIds(SO_TRANSCRIPT_REGION, SO_PSEUDOGENIC_EXON); conn.createStatement().execute("set local enable_sort = false"); } /* Main processing loop */ public void processFeatures(TranscriptHandler handler) throws SQLException, ProcessingException { TranscriptIterator it = new TranscriptIterator(); while (it.hasNext()) { TranscriptInfo transcriptInfo = it.next(); processFeature(handler, transcriptInfo); } it.close(); } private void processFeature(TranscriptHandler handler, TranscriptInfo ti) throws ProcessingException { try { handler.processTranscript(ti); } catch (Exception e) { throw new ProcessingException(String.format("Error processing transcript '%s' (ID=%d)", ti.transcriptUniqueName, ti.transcriptFeatureId), e); } } /* Data fetching and result correlation */ private class TranscriptIterator extends AbstractIterator<TranscriptInfo> { private PreparedStatement stTranscripts; private PreparedStatement stExons; private ResultSet rsTranscripts; private ResultSet rsExons; private PolypeptideIterator polypeptideIterator; public TranscriptIterator() throws SQLException { stTranscripts = st(sqlTranscripts()); stExons = st(sqlExons()); logger.trace("Executing transcripts query..."); rsTranscripts = stTranscripts.executeQuery(); logger.trace("Executing exons query..."); rsExons = stExons.executeQuery(); logger.trace("Transcript queries executed."); polypeptideIterator = new PolypeptideIterator(); } @Override public void close() throws SQLException { stTranscripts.close(); stExons.close(); polypeptideIterator.close(); } private boolean exonsExhausted = false; private boolean exonsOutOfSync = false; /** * Get the next transcript. * * @return the next element if there is one, or <code>null</code> if not. * @throws DataIntegrityViolation */ @Override public TranscriptInfo getNext() throws SQLException, DataIntegrityViolation { if (!rsTranscripts.next()) { // No more transcripts to process! return null; } TranscriptInfo ti = fromTranscriptResultSet(rsTranscripts); if (exonsExhausted) { throw new DataIntegrityViolation("Transcript %s has no exons", ti); } // Try to incorporate the exons and UTRs for(;;) { if (!exonsOutOfSync) { // If the exons are out of sync, don't call next if (!rsExons.next()) { exonsExhausted = true; break; } } int transcriptIdOfExon = rsExons.getInt("transcript_feature_id"); if (transcriptIdOfExon > ti.transcriptFeatureId) { exonsOutOfSync = true; break; } exonsOutOfSync = false; if (transcriptIdOfExon < ti.transcriptFeatureId) { logger.warn(String.format( "Transcript ID=%d not found while processing exon '%s' (ID=%d)\n" + "(This probably means that the transcript has no gene)", rsExons.getInt("transcript_feature_id"), rsExons.getString("exon_uniquename"), rsExons.getInt("exon_feature_id"))); continue; } assert transcriptIdOfExon == ti.transcriptFeatureId; logger.trace(String.format("\t...adding exon '%s' (ID=%d)", rsExons.getString("exon_uniquename"), rsExons.getInt("exon_feature_id"))); ti.exons.add(fromExonResultSet(rsExons)); } if (ti.exons.isEmpty()) { throw new DataIntegrityViolation("Transcript %s has no exons", ti); } // Now try to incorporate the polypeptide return ti; } private TranscriptInfo fromTranscriptResultSet(ResultSet rs) throws SQLException { TranscriptInfo ti = new TranscriptInfo(); ti.transcriptFeatureId = rs.getInt("transcript_feature_id"); ti.transcriptUniqueName = rs.getString("transcript_uniquename"); ti.transcriptName = rs.getString("transcript_name"); ti.transcriptType = rs.getString("transcript_type"); ti.transcriptSynonyms = synonyms(ti.transcriptFeatureId); ti.geneFeatureId = rs.getInt("gene_feature_id"); ti.geneUniqueName = rs.getString("gene_uniquename"); ti.geneName = rs.getString("gene_name"); ti.geneType = rs.getString("gene_type"); ti.geneSynonyms = synonyms(ti.geneFeatureId); ti.srcFeatureId = rs.getInt("srcfeature_feature_id"); ti.srcFeatureUniqueName = rs.getString("srcfeature_uniquename"); ti.srcFeatureSeqLen = rs.getInt("srcfeature_seqlen"); return ti; } private TranscriptInfo.Exon fromExonResultSet(ResultSet rs) throws SQLException { TranscriptInfo.Exon exon = new TranscriptInfo.Exon(); exon.featureId = rs.getInt("exon_feature_id"); exon.type = rs.getString("exon_type"); exon.fmin = rs.getInt("fmin"); exon.fmax = rs.getInt("fmax"); return exon; } } private class PolypeptideIterator extends AbstractIterator<TranscriptInfo.Polypeptide> { private PreparedStatement stPolypeptides; private PreparedStatement stPolypeptideProps; private PreparedStatement stPolypeptideTerms; private ResultSet rsPolypeptides; private ResultSet rsPolypeptideProps; private ResultSet rsPolypeptideTerms; public PolypeptideIterator() throws SQLException { stPolypeptides = st(sqlPolypeptides()); stPolypeptideProps = st(sqlPolypeptideProps()); stPolypeptideTerms = st(sqlPolypeptideTerms()); logger.trace("Executing polypeptides query..."); rsPolypeptides = stPolypeptides.executeQuery(); logger.trace("Executing polypeptide properties query..."); rsPolypeptideProps = stPolypeptideProps.executeQuery(); logger.trace("Executing polypeptide terms query..."); rsPolypeptideTerms = stPolypeptideTerms.executeQuery(); logger.trace("All polypeptide queries executed."); } @Override public void close() throws SQLException { stPolypeptides.close(); stPolypeptideProps.close(); stPolypeptideTerms.close(); } @Override public Polypeptide getNext() throws SQLException { if (!rsPolypeptides.next()) { // No more polypeptides to process! return null; } Polypeptide p = new Polypeptide(); p.featureId = rsPolypeptides.getInt("feature_id"); p.uniqueName = rsPolypeptides.getString("uniquename"); p.seqLen = rsPolypeptides.getInt("seqlen"); return p; } } /* SQL, and related utility methods */ private int cursorNumber = 1; private PreparedStatement st(String sql) throws SQLException { String cursorName = String.format("curs%03d", cursorNumber++); // Fake bind variable //**sql = sql.replace("?", "'" + organismCommonName.replace("'", "''") + "'"); conn.createStatement().execute( "declare " + cursorName + " no scroll cursor for\n" + sql ); PreparedStatement st = conn.prepareStatement("fetch forward all in " + cursorName); st.setFetchSize(fetchSize); return st; } private String sqlTranscripts() { return "select gene.feature_id as gene_feature_id\n" + " , gene.uniquename as gene_uniquename\n" + " , gene.name as gene_name\n" + " , gene_type.name as gene_type\n" + " , (gene.type_id in (\n" + commaSeparatedList(this.pseudogeneTypeIds) + "\n" + " )) as is_pseudo\n" + " , transcript.feature_id as transcript_feature_id\n" + " , transcript.uniquename as transcript_uniquename\n" + " , transcript.name as transcript_name\n" + " , transcript_type.name as transcript_type\n" + " , (transcript.type_id in (\n" + commaSeparatedList(this.mRNATypeIds) + "\n" + " )) as is_coding\n" + " , transcriptloc.fmin\n" + " , transcriptloc.fmax\n" + " , transcriptloc.strand\n" + " , srcfeature.feature_id as srcfeature_feature_id\n" + " , srcfeature.uniquename as srcfeature_uniquename\n" + " , srcfeature.seqlen as srcfeature_seqlen\n" + " from feature gene\n" + " join cvterm gene_type on gene.type_id = gene_type.cvterm_id\n" + " join feature_relationship transcript_gene on transcript_gene.object_id = gene.feature_id\n" + " join feature transcript on transcript_gene.subject_id = transcript.feature_id\n" + " join cvterm transcript_type on transcript.type_id = transcript_type.cvterm_id\n" + " join featureloc transcriptloc on transcript.feature_id = transcriptloc.feature_id\n" + " join feature srcfeature on transcriptloc.srcfeature_id = srcfeature.feature_id\n" + " where transcript.type_id in (\n" + commaSeparatedList(this.transcriptTypeIds) + "\n)\n" + " and gene.type_id in (\n" + commaSeparatedList(this.geneTypeIds) + "\n)\n" /*+ " and transcript.organism_id = (\n" + " select organism_id from organism\n" + " where common_name = ?\n" + " )\n"*/ + " and transcriptloc.locgroup = 0 and transcriptloc.rank = 0\n" + " and not transcript.is_obsolete\n" + " order by transcript.feature_id"; } private String sqlExons() { return "select exon_transcript.object_id as transcript_feature_id\n" + " , exon.feature_id as exon_feature_id\n" + " , exon.uniquename as exon_uniquename\n" + " , exonloc.fmin\n" + " , exonloc.fmax\n" + " , exon_type.name as exon_type\n" + "from feature exon\n" + "join featureloc exonloc on exon.feature_id = exonloc.feature_id\n" + "join cvterm exon_type on exon.type_id = exon_type.cvterm_id\n" + "join feature_relationship exon_transcript on exon_transcript.subject_id = exon.feature_id\n" + "where exon.type_id in (\n" + commaSeparatedList(this.transcriptRegionTypeIds) + "\n)\n" /*+ "and exon.organism_id = (\n" + " select organism_id from organism\n" + " where common_name = ?\n" + ")\n" */ + "and exonloc.locgroup = 0\n" + "and exonloc.rank = 0\n" + "order by exon_transcript.object_id"; } private String sqlPolypeptides() { return "select polypeptide.feature_id\n" + " , polypeptide.uniquename\n" + " , polypeptide.seqlen\n" + "from feature polypeptide\n" + "where type_id in (\n" + commaSeparatedList(this.polypeptideTypeIds) + "\n" + ")\n" /*+ "and polypeptide.organism_id = (\n" + " select organism_id from organism\n" + " where common_name = ?\n" + ")\n"*/ + "order by polypeptide.feature_id"; } private String sqlPolypeptideProps() { return "select polypeptide.feature_id\n" + " , featureprop_type_cv.name as type_cv\n" + " , featureprop_type.name as type\n" + " , featureprop.value\n" + "from feature polypeptide\n" + "join featureprop on featureprop.feature_id = polypeptide.feature_id\n" + "join cvterm featureprop_type on featureprop.type_id = featureprop_type.cvterm_id\n" + "join cv featureprop_type_cv on featureprop_type.cv_id = featureprop_type.cv_id\n" + "where polypeptide.type_id in (\n" + commaSeparatedList(this.polypeptideTypeIds) + "\n" + ")\n" /*+ "and polypeptide.organism_id = (\n" + " select organism_id from organism\n" + " where common_name = ?\n" + ")\n"*/ + "order by polypeptide.feature_id"; } private String sqlPolypeptideTerms() { return "select polypeptide.feature_id\n" + " , cv.name as cv\n" + " , cvterm.name as term\n" + "from feature polypeptide\n" + "join feature_cvterm on feature_cvterm.feature_id = polypeptide.feature_id\n" + "join cvterm on feature_cvterm.cvterm_id = cvterm.cvterm_id\n" + "join cv on cvterm.cv_id = cv.cv_id\n" + "where polypeptide.type_id in (\n" + commaSeparatedList(this.polypeptideTypeIds) + "\n" + ")\n" /*+ "and polypeptide.organism_id = (\n" + " select organism_id from organism\n" + " where common_name = ?\n" + ")\n"*/ + "order by polypeptide.feature_id"; } /* Type IDs */ private boolean checkedTypeIds = false; private Set<Integer> typeIds(String... accessions) throws SQLException { Set<Integer> typeIds = new HashSet<Integer>(); StringBuilder questionMarks = new StringBuilder(); for (int i = 0; i < accessions.length; i++) { if (questionMarks.length() > 0) { questionMarks.append(", "); } questionMarks.append("?"); } if (!checkedTypeIds) { CvTermUtils.checkCvTermPath(conn); checkedTypeIds = true; } String sql = "select cvtermpath.subject_id" + " from cvtermpath" + " join cvterm type on cvtermpath.type_id = type.cvterm_id" + " join cvterm object on cvtermpath.object_id = object.cvterm_id" + " join dbxref object_dbxref on object.dbxref_id = object_dbxref.dbxref_id" + " join cv object_cv on object.cv_id = object_cv.cv_id" + " where lower(type.name) = 'is_a'" + " and object_cv.name = 'sequence'" + " and object_dbxref.accession in (" + questionMarks + ")"; PreparedStatement st = conn.prepareStatement(sql); try { for (int i = 0; i < accessions.length; i++) { st.setString(1 + i, accessions[i]); } ResultSet rs = st.executeQuery(); while (rs.next()) { typeIds.add(rs.getInt("subject_id")); } } finally { try { st.close(); } catch (SQLException e) { logger.error("Error during close()", e); } } return typeIds; } private static String commaSeparatedList(Iterable<?> list) { StringBuilder s = new StringBuilder(); for (Object item : list) { if (s.length() > 0) { s.append(", "); } s.append(item); } return s.toString(); } /* Synonyms */ private Map<Integer, Collection<TranscriptInfo.Synonym>> synonymsByFeatureId; private void fetchSynonyms() throws SQLException { String sql = "select feature_synonym.feature_id\n" + " , synonym.name as synonym\n" + " , synonym_type.name as type\n" + " from feature\n" + " join feature_synonym on feature.feature_id = feature_synonym.feature_id\n" + " join synonym on feature_synonym.synonym_id = synonym.synonym_id\n" + " join cvterm synonym_type on synonym.type_id = synonym_type.cvterm_id\n" + " where feature.organism_id = (\n" + " select organism_id from organism\n" + " where common_name = ?\n" + " )"; logger.debug("SQL: " + sql); PreparedStatement st = conn.prepareStatement(sql); try { synonymsByFeatureId = new HashMap<Integer, Collection<TranscriptInfo.Synonym>>(); st.setString(1, organismCommonName); ResultSet rs = st.executeQuery(); logger.debug("Fetching synonyms..."); int numberOfSynonyms = 0; while (rs.next()) { int featureId = rs.getInt("feature_id"); TranscriptInfo.Synonym synonym = new TranscriptInfo.Synonym(); synonym.name = rs.getString("synonym"); synonym.type = rs.getString("type"); if (!synonymsByFeatureId.containsKey(featureId)) { synonymsByFeatureId.put(featureId, new HashSet<TranscriptInfo.Synonym>()); } synonymsByFeatureId.get(featureId).add(synonym); numberOfSynonyms++; } logger.debug(String.format("Loaded %d synonyms", numberOfSynonyms)); } finally { try { st.close(); } catch (SQLException e) { logger.error("Error closing prepared statement", e); } } } private Collection<TranscriptInfo.Synonym> synonyms(int... featureIds) { Collection<TranscriptInfo.Synonym> synonyms = new HashSet<TranscriptInfo.Synonym>(); for (int featureId : featureIds) { if (synonymsByFeatureId.containsKey(featureId)) { synonyms.addAll(synonymsByFeatureId.get(featureId)); } } return synonyms; } }