package org.genedb.db.loading; import org.genedb.db.loading.EmblLoader.OverwriteExisting; import org.apache.log4j.Logger; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.Reader; import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Represents an EMBL file. * * The parser is reasonably forgiving on the macroscopic level - it doesn't * care what order the sections are in, for example - but generally quite strict * about the syntax of individual constructs. * * @author rh11 */ public class EmblFile { private static final Logger logger = Logger.getLogger(EmblFile.class); private boolean continueOnError = false; private String filePath; private OverwriteExisting overwriteExisting; private void dataError(DataError dataError) throws DataError { if (continueOnError) { logger.error("DataError", dataError); } else { throw dataError; } } private void syntaxError(SyntaxError syntaxError) throws SyntaxError { if (continueOnError) { logger.error("SyntaxError", syntaxError); } else { throw syntaxError; } } private void parsingException(ParsingException parsingException) throws ParsingException { if (continueOnError) { logger.error("ParsingException", parsingException); } else { throw parsingException; } } public EmblFile(File inputFile, Reader reader) throws IOException, ParsingException { this(inputFile, reader, false); } public EmblFile(File inputFile, Reader reader, boolean continueOnError, OverwriteExisting overwriteExisting) throws IOException, ParsingException { this (inputFile.toString(), new BufferedReader(reader), continueOnError, overwriteExisting); } public EmblFile(File inputFile, Reader reader, boolean continueOnError) throws IOException, ParsingException { this (inputFile.toString(), new BufferedReader(reader), continueOnError, OverwriteExisting.NO); } public EmblFile(String inputFile, BufferedReader reader, boolean continueOnError, OverwriteExisting overwriteExisting) throws IOException, ParsingException { this.filePath = inputFile; this.continueOnError = continueOnError; this.overwriteExisting = overwriteExisting; String line; while (null != (line = reader.readLine())) { processLine(inputFile, line); } if (idSection == null) { dataError(new DataError(inputFile, "Found no ID line")); // We only get here if we're running in quickAndDirty mode, i.e. continueOnError is set. idSection = new IDSection(); idSection.accession = inputFile; } if (sequenceSection == null && !overwriteExisting.toString().equals("MERGE")) { dataError(new DataError(inputFile, "Found no sequence data")); } if (sequenceSection != null && overwriteExisting.toString().equals("MERGE")) { dataError(new DataError(inputFile, "Found sequence data but running with overwriteExisting=MERGE")); } logger.info(String.format("Loaded '%s' from '%s'", getAccession(), inputFile)); } private static final Pattern linePattern = Pattern.compile("(ID|AC|PR|DT|DE|KW|OS|OC|OG|RN|RC|RP|RX|RG|RA|RT|RL|DR|CC|AH|AS|FH|FT|XX|SQ|CO| |//)(?: (.*))?|(##.*)"); private int lineNumber = 0; private String currentSectionIdentifier = null; private Section currentSection = null; private List<Section> sections = new ArrayList<Section>(); private void processLine(String inputFile, String line) throws ParsingException { ++ lineNumber; Matcher matcher = linePattern.matcher(line); if (!matcher.matches()) { syntaxError(new SyntaxError(inputFile, lineNumber, line)); return; } // The EMBL format doesn't really allow arbitrary comments to be interspersed with the data. // But for sample files (synthetic.embl in particular) it's useful to be able to include remarks // in the feature table. Thus we have invented a compatible extension: any line beginning with // two hash marks is ignored, wherever it appears. if (matcher.group(3) != null) { return; } String identifier = matcher.group(1); String data = matcher.group(2); try { if (!identifier.equals(currentSectionIdentifier)) { if (currentSection != null) { currentSection.finished(); } currentSectionIdentifier = identifier; currentSection = createSection(identifier); sections.add(currentSection); } currentSection.addData(lineNumber, data); } catch (ParsingException e) { e.setLocation(inputFile, lineNumber); parsingException(e); } } private static final Map<String,Class<? extends Section>> sectionTypeByIdentifier = new HashMap<String,Class<? extends Section>>() {{ put("ID", IDSection.class); put("FH", SilentlyIgnoredSection.class); put("FT", FeatureTable.class); put("CO", ContigSection.class); put("SQ", SequenceHeaderSection.class); put(" ", SequenceSection.class); put("XX", SilentlyIgnoredSection.class); put("DE", SilentlyIgnoredSection.class); put("KW", SilentlyIgnoredSection.class); put("//", SilentlyIgnoredSection.class); }}; /* * A factory method to create Sections, so we can have * different subclasses for different types of section. */ private Section createSection(String identifier) throws DataError { Class<? extends Section> sectionType = sectionTypeByIdentifier.get(identifier); if (sectionType == null) { return new UnknownSection(identifier); } Section section = null; try { section = sectionType.getDeclaredConstructor(EmblFile.class).newInstance(this); /* * For other section types, the section constructor leaves a reference to the * section in an instance variable - see IDSection, for example. Unlike the * others, FeatureTable is not an inner class: because it's complicated and * important enough to deserve its own file, and Java doesn't let you define * an inner class in a separate file. Since this is the only such case, it's * no great hardship to deal with it specially here. */ if (section instanceof FeatureTable) { if (featureTable != null) { dataError(new DataError("More than one feature table found")); } featureTable = (FeatureTable) section; } return section; } catch (InvocationTargetException e) { // The invoked constructor threw an exception Throwable targetException = e.getCause(); if (targetException instanceof DataError) { dataError((DataError) targetException); return section; } else { throw new RuntimeException(e); } } catch (NoSuchMethodException e) { throw new RuntimeException(e); } catch (InstantiationException e) { throw new RuntimeException(e); } catch (IllegalAccessException e) { throw new RuntimeException(e); } } private FeatureTable featureTable = null; /* * A minor subtlety here: the abstract superclass Section is a top-level nested * class, though most of the concrete subclasses are inner classes. This allows * us to have the non-inner implementing class {@link FeatureTable}. */ abstract static class Section { public abstract void addData(int lineNumber, String data) throws ParsingException; @SuppressWarnings("unused") public void finished() throws ParsingException {} } /* ID section */ private static final Pattern idPattern = Pattern.compile("([^;]+); SV (\\d+); (circular|linear); ([^;]+);" + " (CON|ANN|PAT|EST|GSS|HTC|HTG|MGA|WGS|TPA|STS|STD);" + " (PHG|ENV|FUN|HUM|INV|MAM|VRT|MUS|PLN|PRO|ROD|SYN|TGN|UNC|VRL); (\\d+) BP\\."); private IDSection idSection = null; private class IDSection extends Section { IDSection() throws DataError { if (idSection != null) { dataError(new DataError("Found more than one ID line")); } idSection = this; } String accession, topology, moleculeType, dataClass, taxonomicDivision; int version, sequenceLength; boolean alreadySeen = false; @Override public void addData(int lineNumber, String data) throws ParsingException { if (data == null) { // Ignore empty ID lines. They're technically illegal, but some of our files have them. return; } if (alreadySeen) { dataError(new DataError("Found more than one ID line")); } Matcher matcher = idPattern.matcher(data); if (!matcher.matches()) { logger.error("Failed to parse ID line: " + data); data = data.trim(); accession = data.substring(0, data.indexOf(' ')); if (accession.endsWith(";")) { accession = accession.substring(0, accession.length() - 1); } logger.warn(String.format("Taking the sequence identifier to be '%s'", accession)); } else { accession = matcher.group(1); version = Integer.parseInt(matcher.group(2)); topology = matcher.group(3); moleculeType = matcher.group(4); dataClass = matcher.group(5); taxonomicDivision = matcher.group(6); sequenceLength = Integer.parseInt(matcher.group(7)); } } } /* SQ line */ private boolean seenSequenceHeader = false;; private class SequenceHeaderSection extends Section { @SuppressWarnings("unused") // used by reflection SequenceHeaderSection() { // empty } @Override public void addData(int lineNumber, String data) throws ParsingException { if (seenSequenceHeader) { dataError(new DataError("Found more than one SQ line")); } seenSequenceHeader = true; } } /* CO section */ private ContigSection contigSection = null; private class ContigSection extends Section { @SuppressWarnings("unused") // used by reflection ContigSection() throws DataError { if (contigSection != null) { dataError(new DataError("More than one CO section found")); } contigSection = this; } private StringBuilder allData = new StringBuilder(); @Override public void addData(int lineNumber, String data) throws ParsingException { allData.append(data); } @Override public void finished() throws ParsingException { EmblLocation locations = EmblLocation.parse(allData.toString()); if (!(locations instanceof EmblLocation.Join)) { dataError(new DataError("The CO section is not a join(...) location")); } this.contigLocations = (EmblLocation.Join) locations; } private EmblLocation.Join contigLocations; } // The FT section is defined in a separate class, {@link FeatureTable} /* SQ section */ private static final Pattern sequencePattern = Pattern.compile("((?:\\w{10} ){0,5}\\w{1,10})\\s+(\\d+)"); private SequenceSection sequenceSection = null; private class SequenceSection extends Section { @SuppressWarnings("unused") // used by reflection SequenceSection() throws DataError { if (sequenceSection != null) { dataError(new DataError("Found more than one sequence data section")); } sequenceSection = this; } StringBuilder sequence = new StringBuilder(); @Override public void addData(int lineNumber, String data) throws ParsingException { Matcher matcher = sequencePattern.matcher(data); if (!matcher.matches()) { syntaxError(new SyntaxError("Failed to parse sequence data: " + data)); } sequence.append(matcher.group(1).replaceAll("\\s", "")); } public String getSequence() { return sequence.toString(); } } /* Other sections */ private class UnknownSection extends Section { String identifier; UnknownSection(String identifier) { this.identifier = identifier; } @Override public void addData(int lineNumber, String data) { logger.warn(String.format("Ignoring: %s %s on line %d", identifier, data, lineNumber)); } } private class SilentlyIgnoredSection extends Section { @SuppressWarnings("unused") // used by reflection SilentlyIgnoredSection() {} @Override public void addData(int lineNumber, String data) {} } /* Accessors */ public String getAccession() { return idSection.accession; } public int getSequenceVersion() { return idSection.version; } public String getTopology() { return idSection.topology; } public String getMoleculeType() { return idSection.moleculeType; } public String getDataClass() { return idSection.dataClass; } public String getTaxonomicDivision() { if (idSection.taxonomicDivision != null) { return idSection.taxonomicDivision; } else { logger.warn("Taxonomic division unspecified (bad ID line). Assuming UNK."); return "UNK"; } } public int getSequenceLength() { return idSection.sequenceLength; } public String getSequence() { if (sequenceSection == null) { // Only if the sequence section is missing and we're in quickAndDirty mode. return ""; } return sequenceSection.getSequence(); } public FeatureTable getFeatureTable() { return featureTable; } public EmblLocation.Join getContigLocations() { if (contigSection == null) { return null; } return contigSection.contigLocations; } public String getFilePath() { return this.filePath; } }