package org.genedb.db.loading.auxiliary;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.gmod.schema.feature.Polypeptide;
import org.hibernate.Session;

/**
 * Loads the results of a DGPI run: for every keyed entry in the input it
 * looks up the corresponding polypeptide and persists a "GPI anchored"
 * property and/or a GPI-anchor cleavage site for it.
 */
public class DGPILoader extends Loader {
    private static final Logger logger = Logger.getLogger(DGPILoader.class);

    /** Number of processed entries between two flush-and-clear cycles of the session. */
    private static final int SESSION_CLEAR_INTERVAL = 50;

    /**
     * Parses the whole input as a DGPI result file and loads each result.
     * Entries whose polypeptide cannot be found are logged as errors and skipped.
     *
     * @param inputStream the DGPI output to read; it is not closed by this method
     * @param session     the Hibernate session that is periodically flushed and cleared
     * @throws IOException if the input cannot be read
     */
    @Override
    public void doLoad(InputStream inputStream, Session session) throws IOException {
        DGPIFile file = new DGPIFile(inputStream);
        int total = file.keys().size();

        int n = 1;
        for (String key : file.keys()) {
            logger.info(String.format("[%d/%d] Loading DGPI results for key '%s'", n++, total, key));

            // getPolypeptideByMangledName is not defined in this file
            // (presumably inherited from Loader).
            Polypeptide polypeptide = getPolypeptideByMangledName(key);
            if (polypeptide == null) {
                logger.error(String.format("Could not find polypeptide '%s'", key));
                continue;
            }

            loadResult(polypeptide, file.resultForKey(key));

            /*
             * If the session isn't cleared out every so often, it starts to
             * get pretty slow when loading a large file. No per-result commit
             * is issued here, so nothing flushes the session implicitly:
             * flush explicitly first, because clear() discards any work that
             * is still queued in the session.
             * NOTE(review): assumes sequenceDao persists through this same
             * session and that a transaction is active around doLoad --
             * confirm against Loader.
             */
            if (n % SESSION_CLEAR_INTERVAL == 1) {
                logger.info("Clearing session");
                session.flush();
                session.clear();
            }
        }
    }

    /**
     * Persists whatever the given result reports for the polypeptide: the
     * anchored property if the result is anchored, and the best cleavage
     * site if one was found (site &gt;= 0).
     */
    private void loadResult(Polypeptide polypeptide, DGPIResult result) {
        logger.debug(String.format("Processing result for '%s'", polypeptide.getUniqueName()));

        if (result.isAnchored()) {
            addAnchoredProperty(polypeptide);
        }

        if (result.getBestCleavageSite() >= 0) {
            addCleavageSite(polypeptide, result.getBestCleavageSite(), result.getCleavageSiteScore());
        }
    }

    /** Creates and persists a GPI-anchor cleavage site feature via sequenceDao. */
    private void addCleavageSite(Polypeptide polypeptide, int bestCleavageSite, String cleavageSiteScore) {
        logger.debug(String.format("Adding cleavage site at %d (score=%s)", bestCleavageSite, cleavageSiteScore));
        sequenceDao.persist(sequenceDao.createGPIAnchorCleavageSite(polypeptide, bestCleavageSite, cleavageSiteScore));
    }

    /** Creates and persists the GPI-anchored property via sequenceDao. */
    private void addAnchoredProperty(Polypeptide polypeptide) {
        logger.debug("Setting the 'DGPI anchored' property");
        sequenceDao.persist(sequenceDao.createGPIAnchoredProperty(polypeptide));
    }
}

/**
 * In-memory view of a DGPI output file. The file is read as a sequence of
 * entries, each introduced by a header line starting with '&gt;'; the rest of
 * the header line (trimmed) is the entry's key. Only entries that parse to a
 * non-null {@link DGPIResult} are retained.
 */
class DGPIFile {
    private final Map<String, DGPIResult> resultsByKey = new HashMap<String, DGPIResult>();

    /**
     * Reads and parses the complete stream. Characters are decoded with the
     * platform default charset, and the stream is not closed.
     *
     * @param inputStream the DGPI output to parse
     * @throws IOException if reading fails
     */
    public DGPIFile(InputStream inputStream) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));

        String header = null;
        List<String> resultLines = new ArrayList<String>();

        String line;
        while (null != (line = reader.readLine())) {
            if (line.startsWith(">")) {
                // A new header closes the previous entry, if there was one.
                storeResult(header, resultLines);
                header = line;
                resultLines = new ArrayList<String>();
            } else {
                resultLines.add(line);
            }
        }

        // Don't forget the last entry, which has no following header to close it.
        storeResult(header, resultLines);
    }

    /**
     * Parses the lines collected for one entry and records the result under
     * the key taken from its header. Lines that were not preceded by any
     * header (header == null) have no key and are ignored, rather than
     * causing a NullPointerException.
     */
    private void storeResult(String header, List<String> resultLines) {
        if (header == null) {
            return;
        }
        DGPIResult result = DGPIResult.parseLines(resultLines);
        if (result != null) {
            resultsByKey.put(header.substring(1).trim(), result);
        }
    }

    /** @return the keys of all retained entries, in no particular order */
    public Collection<String> keys() {
        return resultsByKey.keySet();
    }

    /** @return the result stored for the key, or null if there is none */
    public DGPIResult resultForKey(String key) {
        return resultsByKey.get(key);
    }
}

/**
 * The parts of a single DGPI entry that the loader cares about: whether the
 * protein is reported as GPI-anchored, and the best cleavage site with its score.
 */
class DGPIResult {
    private boolean anchored = false;
    private int bestCleavageSite = -1;
    private String cleavageSiteScore;

    /** A candidate site line; group 1 is the location, group 2 the score. */
    private static final Pattern CLEAVAGE_SITE_PATTERN
        = Pattern.compile("\\s*There's a potential cleavage site at (\\d+) \\(score=(\\d+\\.\\d+)\\).*");

    /** The line naming the best site; group 1 is the location. */
    private static final Pattern BEST_CLEAVAGE_SITE_PATTERN
        = Pattern.compile("\\s*The best cleavage site is (\\d+)");

    /** The conclusion line stating that the protein is anchored. */
    private static final Pattern IS_GPI_ANCHORED_PATTERN
        = Pattern.compile("\\s*This protein is GPI-anchored.*");

    /** Which section of the entry the parser is currently in. */
    private enum State { NONE, CLEAVAGE_SITE, CONCLUSION }

    /**
     * Parses the body lines of one entry. A line starting with
     * "Cleavage site" opens the cleavage-site section and a line starting
     * with "Conclusion" opens the conclusion section; lines before either
     * are ignored.
     *
     * @param lines the entry's lines, without the '&gt;' header
     * @return the parsed result, or null if the entry is neither anchored
     *         nor has a best cleavage site
     * @throws RuntimeException if the best cleavage site was not previously
     *         listed as a potential cleavage site
     */
    public static DGPIResult parseLines(List<String> lines) {
        DGPIResult ret = new DGPIResult();
        Map<String, String> cleavageSiteScoreByLocation = new HashMap<String, String>();

        State state = State.NONE;
        for (String line : lines) {
            if (line.startsWith("Cleavage site")) {
                assert state == State.NONE;
                state = State.CLEAVAGE_SITE;
            } else if (line.startsWith("Conclusion")) {
                assert state == State.CLEAVAGE_SITE;
                state = State.CONCLUSION;
            } else {
                switch (state) {
                case CLEAVAGE_SITE:
                    parseCleavageSiteLine(line, ret, cleavageSiteScoreByLocation);
                    break;
                case CONCLUSION:
                    if (IS_GPI_ANCHORED_PATTERN.matcher(line).matches()) {
                        ret.anchored = true;
                    }
                    break;
                case NONE:
                    /* Nothing need be done */
                    break;
                }
            }
        }

        if (ret.anchored || ret.bestCleavageSite > -1) {
            return ret;
        }
        return null;
    }

    /**
     * Handles one line of the cleavage-site section: remembers the score of a
     * candidate site, or, for the "best site" line, records the site on the
     * result together with the score remembered for it.
     */
    private static void parseCleavageSiteLine(String line, DGPIResult ret,
            Map<String, String> cleavageSiteScoreByLocation) {
        Matcher cleavageSiteMatcher = CLEAVAGE_SITE_PATTERN.matcher(line);
        if (cleavageSiteMatcher.matches()) {
            cleavageSiteScoreByLocation.put(cleavageSiteMatcher.group(1), cleavageSiteMatcher.group(2));
        }

        Matcher bestCleavageSiteMatcher = BEST_CLEAVAGE_SITE_PATTERN.matcher(line);
        if (bestCleavageSiteMatcher.matches()) {
            String bestCleavageSiteString = bestCleavageSiteMatcher.group(1);
            ret.bestCleavageSite = Integer.parseInt(bestCleavageSiteString);
            if (!cleavageSiteScoreByLocation.containsKey(bestCleavageSiteString)) {
                throw new RuntimeException("Failed to parse DGPI result: best cleavage site "
                        + bestCleavageSiteString + " was not listed as a potential cleavage site");
            }
            ret.cleavageSiteScore = cleavageSiteScoreByLocation.get(bestCleavageSiteString);
        }
    }

    /** @return true if the conclusion stated that the protein is GPI-anchored */
    public boolean isAnchored() {
        return anchored;
    }

    /** @return the best cleavage site, or -1 if none was reported */
    public int getBestCleavageSite() {
        return bestCleavageSite;
    }

    /** @return the score text of the best cleavage site, or null if there is none */
    public String getCleavageSiteScore() {
        return cleavageSiteScore;
    }

    @Override
    public String toString() {
        return "Cleavage site at " + getBestCleavageSite() + " with a score of " + getCleavageSiteScore()
                + " GPI anchored: " + isAnchored();
    }
}