package org.genedb.db.loading; import org.apache.log4j.Logger; import org.springframework.context.ApplicationContext; import org.springframework.context.support.ClassPathXmlApplicationContext; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.Reader; import java.sql.SQLException; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class LoadVulgar extends FileProcessor { private static final Logger logger = Logger.getLogger(LoadVulgar.class); public static void main(String[] args) throws MissingPropertyException, IOException, ParsingException, SQLException { if (args.length > 0) { logger.warn("Ignoring command-line arguments"); } String organismCommonName = getRequiredProperty("load.organismCommonName"); String inputDirectory = getRequiredProperty("load.inputDirectory"); String fileNamePattern = getPropertyWithDefault("load.fileNamePattern", ".*\\.vulgar(?:\\.gz)?"); String matchType = getPropertyWithDefault("load.matchType", null); logger.info(String.format("Options: organismCommonName=%s, inputDirectory=%s, fileNamePattern=%s", organismCommonName, inputDirectory, fileNamePattern)); LoadVulgar loadVulgar = new LoadVulgar(organismCommonName, matchType); loadVulgar.processFileOrDirectory(inputDirectory, fileNamePattern); } private VulgarLoader loader; private LoadVulgar(String organismCommonName, String matchType) { ApplicationContext applicationContext = new ClassPathXmlApplicationContext(new String[] {"Load.xml"}); this.loader = applicationContext.getBean("vulgarLoader", VulgarLoader.class); loader.setOrganismCommonName(organismCommonName); if (matchType != null) { loader.setMatchType(matchType); } } @Override protected void processFile(File inputFile, Reader reader) throws IOException, ParsingException { loader.load(new VulgarFile(inputFile, reader)); } } class VulgarFileException extends RuntimeException { public VulgarFileException() { super(); } public VulgarFileException(String message, Throwable cause) { super(message, cause); } public VulgarFileException(String message) { super(message); } public VulgarFileException(Throwable cause) { super(cause); } } class VulgarFile implements Iterable<VulgarMapping> { private boolean alreadyGotIterator = false; private File file; private BufferedReader br; VulgarFile(File file, Reader reader) { this.file = file; this.br = new BufferedReader(reader); } public Iterator<VulgarMapping> iterator() { if (alreadyGotIterator) { throw new RuntimeException("You can only get one iterator from a VulgarFile, sorry!"); } return new Iterator<VulgarMapping>() { private String nextLine = null; private int lineNumber = 0; public boolean hasNext() { if (br == null) { return false; } if (nextLine == null) { try { nextLine = br.readLine(); if (nextLine == null) { br.close(); br = null; return false; } } catch (IOException e) { throw new VulgarFileException(e); } } return true; } public VulgarMapping next() { if (br == null) { return null; } try { lineNumber ++; if (nextLine != null) { VulgarMapping vulgarMapping = new VulgarMapping(nextLine); nextLine = null; return vulgarMapping; } String line = br.readLine(); if (line == null) { br.close(); br = null; return null; } return new VulgarMapping(line); } catch (ParsingException e) { e.setInputFile(file); e.setLineNumber(lineNumber); throw new VulgarFileException(e); } catch (IOException e) { throw new VulgarFileException(e); } } public void remove() { throw new UnsupportedOperationException(); } }; } } enum VulgarMatchType { MATCH('M'), CODON('C'), GAP('G'), NON_EQUIVALENCED_REGION('N'), FIVE_PRIME_SPLICE_SITE('5'), THREE_PRIME_SPLICE_SITE('3'), INTRON('I'), SPLIT_CODON('S'), FRAMESHIFT('F'); private VulgarMatchType(@SuppressWarnings("unused") char c) { // empty } static VulgarMatchType fromChar(char c) throws ParsingException { switch(c) { case 'M': return MATCH; case 'C': return CODON; case 'G': return GAP; case 'N': return NON_EQUIVALENCED_REGION; case '5': return FIVE_PRIME_SPLICE_SITE; case '3': return THREE_PRIME_SPLICE_SITE; case 'I': return INTRON; case 'S': return SPLIT_CODON; case 'F': return FRAMESHIFT; default: throw new SyntaxError(String.format("Unknown match type '%c'", c)); } } } /** * Represents an Exonerate mapping, as described by a single line in the Vulgar format. * * @author rh11 * */ class VulgarMapping { private static final Logger logger = Logger.getLogger(VulgarMapping.class); private static final Pattern vulgarPattern = Pattern.compile( "vulgar: (\\S+) (\\d+) (\\d+) ([+-]) (\\S+) (\\d+) (\\d+) " + "([+-]) (\\d+)((?: [MCGN53ISF] \\d+ \\d+)+)?(?:\\tPROM=\\d+)?\\s*"); // Note: we deliberately support the (presumably technically invalid) // case where no match parts are specified, so that we can use the same // code to load matches where the subdivision into parts is unknown. VulgarMapping(String line) throws ParsingException { if (!line.startsWith("vulgar: ")) { throw new SyntaxError("Line does not start with 'vulgar: '"); } Matcher matcher = vulgarPattern.matcher(line); if (!matcher.matches()) { throw new SyntaxError("Could not parse line: " + line); } query = matcher.group(1); qStart = Integer.parseInt(matcher.group(2)); qEnd = Integer.parseInt(matcher.group(3)); qStrand = matcher.group(4).charAt(0); target = matcher.group(5); tStart = Integer.parseInt(matcher.group(6)); tEnd = Integer.parseInt(matcher.group(7)); tStrand = matcher.group(8).charAt(0); score = matcher.group(9); matches = parseMatches(matcher.group(10)); } private List<Match> parseMatches(String string) throws ParsingException { if (string == null) { return Collections.emptyList(); } if (!string.startsWith(" ")) { throw new RuntimeException("The string doesn't start with a space." + "That should be impossible."); } List<Match> matches = new ArrayList<Match>(); String[] fields = string.substring(1).split(" "); if (fields.length % 3 != 0) { throw new RuntimeException("The number of fields is not a multiple of three." + "That should be impossible at this point."); } if (logger.isTraceEnabled()) { StringBuilder fieldsStr = new StringBuilder(); boolean firstTime = true; for (String field: fields) { if (!firstTime) { fieldsStr.append(", "); } fieldsStr.append(field); firstTime = false; } logger.trace(String.format("Fields = [%s]", fieldsStr)); } for(int i=0; i < fields.length; i+=3) { matches.add( new Match(fields[i].charAt(0), Integer.parseInt(fields[i+1]), Integer.parseInt(fields[i+2]) )); } return matches; } /* * Note that Exonerate uses interbase coordinates, so we don't * need to translate them! */ private String query; private int qStart, qEnd; private char qStrand; private String target; private int tStart, tEnd; private char tStrand; private String score; private List<Match> matches; String getQuery() { return query; } int getQMin() { switch (qStrand) { case '+': return qStart; case '-': return qEnd; default: throw new IllegalStateException(String.format("Invalid qStrand '%c'", qStrand)); } } int getQMax() { switch (qStrand) { case '+': return qEnd; case '-': return qStart; default: throw new IllegalStateException(String.format("Invalid qStrand '%c'", qStrand)); } } int getQStrand() { switch(qStrand) { case '+': return +1; case '-': return -1; default: throw new IllegalStateException(String.format("Invalid qStrand '%c'", qStrand)); } } String getTarget() { return target; } int getTMin() { switch (tStrand) { case '+': return tStart; case '-': return tEnd; default: throw new IllegalStateException(String.format("Invalid tStrand '%c'", tStrand)); } } int getTMax() { switch (tStrand) { case '+': return tEnd; case '-': return tStart; default: throw new IllegalStateException(String.format("Invalid tStrand '%c'", tStrand)); } } int getTStrand() { switch(tStrand) { case '+': return +1; case '-': return -1; default: throw new IllegalStateException(String.format("Invalid tStrand '%c'", tStrand)); } } String getScore() { return score; } List<Match> getMatches() { return matches; } class Match { private VulgarMatchType type; private int queryLength; private int targetLength; public Match(char typeChar, int queryLength, int targetLength) throws ParsingException { this.type = VulgarMatchType.fromChar(typeChar); this.queryLength = queryLength; this.targetLength = targetLength; } VulgarMatchType getType() { return type; } int getQueryLength() { return queryLength; } int getTargetLength() { return targetLength; } } }