package org.genedb.db.loading; import org.apache.log4j.Logger; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Configurable; import org.springframework.jdbc.core.simple.SimpleJdbcTemplate; import org.springframework.transaction.PlatformTransactionManager; import org.springframework.transaction.TransactionDefinition; import org.springframework.transaction.TransactionStatus; import org.springframework.transaction.support.DefaultTransactionDefinition; import java.io.IOException; import java.util.HashSet; import java.util.Map; import java.util.Set; /** * Load EST matches from an Exonerate Vulgar file. * * @author rh11 */ @Configurable public class VulgarLoader { /* * Implementation note: the first version of this code used Hibernate, but * was heavily CPU-bound and unacceptably slow. Thus it has been rewritten * to use JDBC, and is now thoroughly database-bound and reasonably fast. */ private static final Logger logger = Logger.getLogger(VulgarLoader.class); @Autowired private SimpleJdbcTemplate simpleJdbcTemplate; private PlatformTransactionManager transactionManager; // Configurable parameters private int organismId; /** * Set the organism into which to load data. * * @param organismCommonName the common name of the organism */ public void setOrganismCommonName(String organismCommonName) { this.organismId = getOrganismIdFromCommonName(organismCommonName); } private int getOrganismIdFromCommonName(String organismCommonName) { Map<String,Object> idMap = simpleJdbcTemplate.queryForMap( "select organism_id from organism where common_name = ?", organismCommonName); int organismId = (Integer) idMap.get("organism_id"); logger.trace(String.format("Organism '%s' has ID %d", organismCommonName, organismId)); return organismId; } private String matchType = "EST_match"; private static Set<String> permittedMatchTypes = new HashSet<String>(); static { permittedMatchTypes.add("nucleotide_match"); permittedMatchTypes.add("cDNA_match"); permittedMatchTypes.add("EST_match"); } public void setMatchType(String matchType) { this.matchType = matchType; } private TransactionStatus transactionStatus; private TransactionDefinition transactionDefinition = new DefaultTransactionDefinition(); public synchronized void load(VulgarFile file) throws ParsingException, IOException { /* * This class does not use the declarative @Transactional annotation, * because we wish to split the load into a number of consecutive * transactions. */ transactionStatus = transactionManager.getTransaction(transactionDefinition); init(); try { int i = 0; // The iteration implicit in the for(:) loop might cause a VulgarFileException for (VulgarMapping mapping: file) { loadMapping(mapping); if (++i % 100 == 0) { logger.info(String.format("Loaded %d mappings", i)); logger.debug("Committing transaction."); transactionManager.commit(transactionStatus); transactionStatus = transactionManager.getTransaction(transactionDefinition); } } } catch (VulgarFileException e) { logger.trace("Exception caught. Rolling back."); transactionManager.rollback(transactionStatus); Throwable t = e.getCause(); if (t == null) { throw new RuntimeException("VulgarFileException of null type", e); } if (t instanceof IOException) { throw (IOException) t; } if (t instanceof ParsingException) { throw (ParsingException) t; } throw new RuntimeException("VulgarFileException of unknown type", e); } catch (RuntimeException e) { logger.trace("Exception caught. Rolling back."); transactionManager.rollback(transactionStatus); throw e; } } private int matchTypeId; private int matchPartTypeId; private int partOfTypeId; private void init() { matchTypeId = getCvTermId("sequence", matchType); matchPartTypeId = getCvTermId("sequence", "match_part"); partOfTypeId = getCvTermId("relationship", "part_of"); } private int getCvTermId(String cvName, String cvTermName) { return (Integer) simpleJdbcTemplate.queryForMap( "select cvterm_id "+ "from cvterm "+ "join cv using (cv_id) "+ "where cv.name = ? "+ "and cvterm.name = ?", cvName, cvTermName ).get("cvterm_id"); } private void loadMapping(VulgarMapping mapping) throws VulgarFileException, DataError { String matchUniqueName = insertMatch(mapping); int matchId = currentFeatureId(); insertFeatureLoc(matchId, mapping.getQuery(), mapping.getQMin(), mapping.getQMax(), mapping.getQStrand(), 0); insertFeatureLoc(matchId, mapping.getTarget(), mapping.getTMin(), mapping.getTMax(), mapping.getTStrand(), 1); int sourcePos = 0; int targetPos = 0; int partIndex = 0; for (VulgarMapping.Match match: mapping.getMatches()) { switch (match.getType()) { case MATCH: createMatchPart(matchId, matchUniqueName, partIndex++, sourcePos, match.getQueryLength(), targetPos, match.getTargetLength()); /*FALL THROUGH*/ default: sourcePos += match.getQueryLength(); targetPos += match.getTargetLength(); } } } private int currentFeatureId() { long featureId = (Long) simpleJdbcTemplate.queryForMap( "select currval('feature_feature_id_seq'::regclass) as feature_id") .get("feature_id"); return (int) featureId; } private Set<String> matchUniqueNames = new HashSet<String>(); private String insertMatch(VulgarMapping mapping) { String matchUniqueName; if (mapping.getTStrand() >= 0) { matchUniqueName = String.format("match:%s@%s:%d-%d", mapping.getQuery(), mapping.getTarget(), mapping.getTMin(), mapping.getTMax()); } else { matchUniqueName = String.format("match:%s@%s:(%d-%d)", mapping.getQuery(), mapping.getTarget(), mapping.getTMin(), mapping.getTMax()); } // It's possible to have more than one match at the same location, // so make sure the name is unique; String originalMatchUniqueName = matchUniqueName; int i = 1; while (matchUniqueNames.contains(matchUniqueName)) { matchUniqueName = originalMatchUniqueName + ":" + i++; } matchUniqueNames.add(matchUniqueName); simpleJdbcTemplate.update( "insert into feature ("+ " organism_id, uniquename, type_id, is_analysis"+ ") values ("+ " ?, ?, ?, true"+ ")", organismId, matchUniqueName, matchTypeId); logger.trace(String.format("Inserted ESTMatch feature '%s'", matchUniqueName)); return matchUniqueName; } private void insertFeatureLoc(int featureId, String sourceFeature, int fmin, int fmax, int strand, int rank) throws DataError { int n = simpleJdbcTemplate.update( "insert into featureloc ("+ " feature_id, srcfeature_id, fmin, fmax, strand, locgroup, rank"+ ") ("+ " select ?"+ " , feature.feature_id"+ " , ?, ?, ?, 0, ?"+ " from feature"+ " where feature.uniqueName = ?"+ ")", featureId, fmin, fmax, strand, rank, sourceFeature); if (n != 1) { throw new DataError(String.format("Feature '%s' not found", sourceFeature)); } logger.trace(String.format("Inserted featureLoc (featureId=%d, fmin=%d, fmax=%d, strand=%d, rank=%d) to '%s'", featureId, fmin, fmax, strand, rank, sourceFeature)); n = simpleJdbcTemplate.update( "insert into featureloc ("+ " feature_id, srcfeature_id, fmin, fmax, strand, locgroup, rank"+ ") ("+ " select ? as feature_id"+ " , feature.feature_id as srcfeature_id"+ " , ? + featureloc.fmin as fmin" + " , ? + featureloc.fmin as fmax" + " , ? as strand" + " , featureloc.locgroup + 1 as locgroup" + " , ? as rank"+ " from feature"+ " join featureloc using (feature_id)"+ " where feature.uniqueName = ?"+ " and featureloc.rank = 0"+ ")", featureId, fmin, fmax, strand, rank, sourceFeature); logger.trace(String.format("Inserted %d dependent featureLocs for '%s'", n, sourceFeature)); } private void createMatchPart(int matchId, String matchUniqueName, int partIndex, int sourcePos, int sourceLength, int targetPos, int targetLength) { String partUniqueName = String.format("%s:part%d", matchUniqueName, partIndex); simpleJdbcTemplate.update( "insert into feature ("+ " organism_id, uniquename, type_id, is_analysis"+ ") values ("+ " ?, ?, ?, true"+ ")", organismId, partUniqueName, matchPartTypeId); logger.trace(String.format("Inserted MatchPart '%s'", partUniqueName)); int partId = currentFeatureId(); simpleJdbcTemplate.update( "insert into feature_relationship ("+ " subject_id, type_id, object_id"+ ") values ("+ " ?, ?, ?"+ ")", partId, partOfTypeId, matchId); logger.trace("Inserted FeatureRelationship"); createPartLoc(partId, matchId, sourcePos, sourceLength, 0); createPartLoc(partId, matchId, targetPos, targetLength, 1); } private void createPartLoc(int partId, int matchId, int pos, int length, int rank) { int n = simpleJdbcTemplate.update( "insert into featureloc ("+ " feature_id, srcfeature_id, fmin, fmax, strand, locgroup, rank"+ ") ("+ " select ? as feature_id"+ " , featureloc.srcfeature_id"+ " , featureloc.fmin + ?"+ " , featureloc.fmin + ?"+ " , featureloc.strand"+ " , featureloc.locgroup"+ " , featureloc.rank"+ " from featureloc"+ " where featureloc.feature_id = ?"+ " and featureloc.rank = ?"+ ")", partId, pos, pos + length, matchId, rank ); logger.trace(String.format("Created %d featurelocs for part ID=%d of match ID=%d at %d, length %d with rank %d", n, partId, matchId, pos, length, rank)); } public void setTransactionManager(PlatformTransactionManager transactionManager) { this.transactionManager = transactionManager; } }