package org.genedb.db.loading.auxiliary; import org.gmod.schema.feature.MembraneStructure; import org.gmod.schema.feature.Polypeptide; import org.gmod.schema.mapped.Analysis; import org.apache.log4j.Logger; import org.hibernate.Session; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; public class TMHMMLoader extends Loader { private static Logger logger = Logger.getLogger(TMHMMLoader.class); String analysisProgramVersion = "unknown"; Boolean notFoundNotFatal = false; @Override protected Set<String> getOptionNames() { Set<String> options = new HashSet<String>(); Collections.addAll(options, "tmhmm-version", "not-found-not-fatal"); return options; } @Override protected boolean processOption(String optionName, String optionValue) { if (optionName.equals("tmhmm-version")) { analysisProgramVersion = optionValue; return true; } else if (optionName.equals("not-found-not-fatal")) { if (!optionValue.equals("true") && !optionValue.equals("false")) { return false; } notFoundNotFatal = Boolean.valueOf(optionValue); return true; } return false; } @Override public void doLoad(InputStream inputStream, Session session) throws IOException { // Add analysis Analysis analysis = new Analysis(); analysis.setProgram("tmhmm"); analysis.setProgramVersion(analysisProgramVersion); sequenceDao.persist(analysis); TMHMMFile file = new TMHMMFile(inputStream); int n=1; for (String key: file.keys()) { logger.info(String.format("[%d/%d] Loading helices for key '%s'", n++, file.keys().size(), key)); Polypeptide polypeptide = getPolypeptideByMangledName(key); if (polypeptide == null) { if (notFoundNotFatal) { String errorMessage = String.format("Failed to find polypeptide '%s'", key); logger.error(errorMessage); } else { throw new RuntimeException(String.format("Failed to find polypeptide '%s'", key)); } continue; } loadMembraneStructure(polypeptide, file.regionsForKey(key), analysis); /* * If the session isn't cleared out every so often, it * starts to get pretty slow after a while if we're loading * a large file. It's important that this come immediately * after a flush. (Commit will trigger a flush unless you've * set FlushMode.MANUAL, which we assume you haven't.) */ if (n % 50 == 1) { logger.info("Clearing session"); session.clear(); } } } private void loadMembraneStructure(Polypeptide polypeptide, Iterable<TMHMMRegion> regions, Analysis analysis) { logger.debug(String.format("Creating membrane structure region for '%s'", polypeptide.getUniqueName())); MembraneStructure membraneStructure = sequenceDao.createMembraneStructure(polypeptide); // Add analysisfeature if (analysis != null) { membraneStructure.createAnalysisFeature(analysis); } sequenceDao.persist(membraneStructure); for (TMHMMRegion region: regions) { loadRegion(membraneStructure, region); } } private void loadRegion(MembraneStructure membraneStructure, TMHMMRegion region) { logger.debug(String.format("Adding membrane structure subregion (%s) for '%s' at %d-%d", region.getType(), region.getKey(), region.getFmin(), region.getFmax())); switch(region.getType()) { case INSIDE: sequenceDao.persist(sequenceDao.createCytoplasmicRegion(membraneStructure, region.getFmin(), region.getFmax())); break; case TMHELIX: sequenceDao.persist(sequenceDao.createTransmembraneRegion(membraneStructure, region.getFmin(), region.getFmax())); break; case OUTSIDE: sequenceDao.persist(sequenceDao.createNonCytoplasmicRegion(membraneStructure, region.getFmin(), region.getFmax())); break; } } } class TMHMMRegion { public enum Type { INSIDE, TMHELIX, OUTSIDE; public static Type decode(String typeString) { if (typeString.equals("inside")) { return INSIDE; } else if (typeString.equals("TMhelix")) { return TMHELIX; } else if (typeString.equals("outside")) { return OUTSIDE; } else { throw new IllegalArgumentException(String.format("Unrecognised type '%s'", typeString)); } } }; private String key; private Type type; private int fmin, fmax; public TMHMMRegion(String key, int fmin, int fmax, Type type) { this.key = key; this.fmin = fmin; this.fmax = fmax; this.type = type; } public String getKey() { return key; } public int getFmin() { return fmin; } public int getFmax() { return fmax; } public Type getType() { return type; } public boolean isTMHelix() { return type == Type.TMHELIX; } } class TMHMMFile { private Set<String> keysWithHelices = new HashSet<String> (); private Map<String,Collection<TMHMMRegion>> regionsByKey = new HashMap<String,Collection<TMHMMRegion>>(); public TMHMMFile(InputStream inputStream) throws IOException { BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); String line; while (null != (line = reader.readLine())) { if (line.startsWith("#")) { continue; } String[] fields = line.split("\\s+"); String key = fields[0]; TMHMMRegion.Type type = TMHMMRegion.Type.decode(fields[2]); String startString = fields[3]; String stopString = fields[4]; TMHMMRegion region = new TMHMMRegion(key, Integer.parseInt(startString) - 1, Integer.parseInt(stopString), type); if (region.isTMHelix()) { keysWithHelices.add(key); } if (!regionsByKey.containsKey(key)) { regionsByKey.put(key, new ArrayList<TMHMMRegion>()); } regionsByKey.get(key).add(region); } } public Set<String> keys() { return keysWithHelices; } public Collection<TMHMMRegion> regionsForKey(String key) { return regionsByKey.get(key); } }