/** * Copyright Copyright 2010-15 Simon Andrews * * This file is part of BamQC. * * BamQC is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * BamQC is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with BamQC; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* * Changelog: * - Piero Dalle Pezze: Code from SeqMonk and removed un-necessary parts (only left extraction of location). * Added progress listeners. */ package uk.ac.babraham.BamQC.AnnotationParsers; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.Enumeration; import uk.ac.babraham.BamQC.BamQCConfig; import uk.ac.babraham.BamQC.BamQCException; import uk.ac.babraham.BamQC.DataTypes.ProgressListener; import uk.ac.babraham.BamQC.DataTypes.Genome.AnnotationSet; import uk.ac.babraham.BamQC.DataTypes.Genome.Chromosome; import uk.ac.babraham.BamQC.DataTypes.Genome.Feature; import uk.ac.babraham.BamQC.DataTypes.Genome.Genome; import uk.ac.babraham.BamQC.DataTypes.Genome.SplitLocation; import uk.ac.babraham.BamQC.Preferences.BamQCPreferences; import uk.ac.babraham.BamQC.Utilities.FileFilters.DatSimpleFileFilter; import uk.ac.babraham.BamQC.Utilities.FileFilters.GFFSimpleFileFilter; /** * The Class can either do a full parse of the original EMBL format files, or parse * included gff / gtf files if present. * @author Simon Andrews * @author Piero Dalle Pezze */ public class GenomeParser extends AnnotationParser { /** The genome. */ private Genome genome = null; /** The base location. */ private File baseLocation; /** The current offset. */ private int currentOffset = 0; /** The prefs. */ private BamQCPreferences prefs = BamQCPreferences.getInstance(); public GenomeParser () { super(); } /** * The parsed genome or null if no genome has been parsed. * @return the parsed genome or null */ public Genome genome() { return genome; } /* (non-Javadoc) * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#requiresFile() */ @Override public boolean requiresFile() { return false; } /* (non-Javadoc) * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#name() */ @Override public String name() { return "Genome Parser"; } /* * (non-Javadoc) * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#parseAnnotation(uk.ac.babraham.BamQC.DataTypes.Genome.AnnotationSet, java.io.File) */ @Override public void parseAnnotation(AnnotationSet annotationSet, File file) throws Exception {} /* * (non-Javadoc) * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#parseGenome(java.io.File) */ @Override public void parseGenome (File baseLocation) throws Exception { this.baseLocation = baseLocation; try { genome = new Genome(baseLocation); } catch (BamQCException ex) { Enumeration<ProgressListener> en = listeners.elements(); while (en.hasMoreElements()) { en.nextElement().progressExceptionReceived(ex); } throw ex; } // Update the listeners Enumeration<ProgressListener> e = listeners.elements(); while (e.hasMoreElements()) { e.nextElement().progressUpdated("Loading files for genome "+baseLocation,0,0); } parseGenomeFiles(genome); } private void parseGenomeFiles (Genome genome) throws Exception { // We need a list of all of the .dat files inside the baseLocation File [] files = baseLocation.listFiles(new DatSimpleFileFilter()); int importedFeatures = 0; int totalFiles = files.length; int filesRead = 0; int previousPercent = 0; Enumeration<ProgressListener> e = null; for (int i=0;i<totalFiles;i++) { // Update the listeners // Enumeration<ProgressListener> e = listeners.elements(); // while (e.hasMoreElements()) { // e.nextElement().progressUpdated("Loading genome file "+files[i].getName(),i,files.length); // } try { importedFeatures += processEMBLFile(files[i]); filesRead = i+1; int percent = Math.round(filesRead * 100.0f / totalFiles); if (previousPercent < percent) { // Update the listeners e = listeners.elements(); while (e.hasMoreElements()) { e.nextElement().progressUpdated("Parsing genome " + BamQCConfig.getInstance().genome.getParentFile().getName() + " [ " + BamQCConfig.getInstance().genome.getName() + " ] (" + percent + "%)", percent, 100); } previousPercent = percent; } } catch (Exception ex) { throw ex; } } // Update the listeners e = listeners.elements(); if(files.length > 0) { while (e.hasMoreElements()) { // Update the listeners e.nextElement().progressComplete("Processed features: "+importedFeatures + "\n" + "Parsed annotation .dat files for genome " + genome.toString(), null); } } // Now do the same thing for gff files. // We need a list of all of the .gff/gtf files inside the baseLocation files = baseLocation.listFiles(new GFFSimpleFileFilter()); totalFiles = files.length; filesRead = 0; previousPercent = 0; GFF3AnnotationParser gffParser = new GFF3AnnotationParser(); for (int i=0;i<totalFiles;i++) { // Update the listeners // e = listeners.elements(); // while (e.hasMoreElements()) { // e.nextElement().progressUpdated("Loading genome file "+files[i].getName(),i,files.length); // } try { AnnotationSet newSet = new AnnotationSet(); gffParser.parseAnnotation(newSet, files[i]); Feature [] features = newSet.getAllFeatures(); for (int f=0;f<features.length;f++) { genome.annotationSet().addFeature(features[f]); } filesRead = i+1; int percent = filesRead * 100 / totalFiles; if (previousPercent < percent && percent%5 == 0){ // Update the listeners e = listeners.elements(); while (e.hasMoreElements()) { e.nextElement().progressUpdated("Parsing annotation file " + files[i].getName() + " (" + percent + "%)", percent, 100); } previousPercent = percent; } } catch (Exception ex) { e = listeners.elements(); while (e.hasMoreElements()) { e.nextElement().progressExceptionReceived(ex); } throw ex; } } if(files.length > 0) { // Update the listeners e = listeners.elements(); while (e.hasMoreElements()) { e.nextElement().progressComplete("Parsed annotation .gff/.gtf files for genome "+ genome.toString(), null); } } } /** * Process EMBL file. * * @param f the f * @param annotation the annotation * @throws Exception the exception * @return the number of imported features */ private int processEMBLFile (File f) throws Exception { // int processedLines = 0; int processedFeatures = 0; BufferedReader br = null; try { br = new BufferedReader(new FileReader(f)); Chromosome c = null; // We need to find and read the accession line to find out // which chromosome and location we're dealing with. // Each physical file can contain more than one EMBL file. We // need to account for this in our processing. while ((c = parseChromosome(br)) != null) { // processedLines++; String line; // We can now skip through to the start of the feature table while ((line=br.readLine())!=null) { // processedLines++; if (line.startsWith("FH") || line.startsWith("SQ")) { break; } } // We can now start reading the features one at a time by // concatenating them and then passing them on for processing StringBuilder currentAttribute = new StringBuilder(); boolean skipping = true; Feature feature = null; while ((line=br.readLine())!=null) { // if (processedLines % 100000 == 0) { // System.err.println ("Processed "+processedLines+" lines currently holding "+processedFeatures+" features"); // } // processedLines++; // System.err.println("Read line '"+line+"'"); if (line.startsWith("XX") || line.startsWith("SQ") || line.startsWith("//")) { skipToEntryEnd(br); break; } if (line.length() < 18) continue; // Just a blank line. String type = line.substring(5,18).trim(); // System.out.println("Type is "+type); if (type.length()>0) { //We're at the start of a new feature. // Check whether we need to process the old feature if (skipping) { // We're either on the first feature, or we've // moving past this one skipping = false; } else { // We need to process the last attribute from the // old feature processAttributeReturnSkip(currentAttribute.toString(), feature); genome.annotationSet().addFeature(feature); processedFeatures++; } // We can check to see if we're bothering to load this type of feature if (prefs.loadAnnotation(type)) { // System.err.println("Creating new feature of type "+type); feature = new Feature(type,c); currentAttribute=new StringBuilder("location="); currentAttribute.append(line.substring(21).trim()); // System.out.println(currentAttribute.toString()); continue; } skipping = true; } if (skipping) continue; String data = line.substring(21).trim(); if (data.startsWith("/")) { // We're at the start of a new attribute //Process the last attribute (extract the location) skipping = processAttributeReturnSkip(currentAttribute.toString(), feature); currentAttribute = new StringBuilder(); } // Our default action is just to append onto the existing information // Descriptions which run on to multiple lines need a space adding // before the next lot of text. if (currentAttribute.indexOf("description=") >= 0) currentAttribute.append(" "); currentAttribute.append(data); } // We've finished, but we need to process the last feature // if there was one if (!skipping) { // We need to process the last attribute from the // old feature processAttributeReturnSkip(currentAttribute.toString(), feature); genome.annotationSet().addFeature(feature); processedFeatures++; } } } catch(Exception ex) { throw ex; } finally { if(br != null) { br.close(); } } return processedFeatures; } /** * Process attribute return skip. * * @param attribute the attribute * @param feature the feature * @return true, if successful * @throws BamQCException the bamqc exception */ private boolean processAttributeReturnSkip (String attribute, Feature feature) throws BamQCException { // System.out.println("Adding feature - current attribute is "+attribute); String [] nameValue = attribute.split("=",2); // We used to insist on key value pairs, but the EMBL spec // allows a key without a value, so one value is OK. // extract the location if (nameValue[0].equals("location")) { // A location has to have a value if (nameValue.length < 2) { throw new BamQCException("Location didn't have an '=' delimiter"); } // TODO just a checkpoint for a print showing that we are collecting the location correctly. // Print the location for this feature type // System.out.println("Location is "+nameValue[1]); //Check to see if this is a location we can support if (nameValue[1].indexOf(":")>=0) { // Some locations are given relative to other sequences // (where a feature splits across more than one sequence). // We can't handle this so we don't try. return true; } feature.setLocation(new SplitLocation(nameValue[1],currentOffset)); } return false; } /** * Parses the chromosome. * * @param br the br * @return the chromosome * @throws BamQCException the seq monk exception * @throws IOException Signals that an I/O exception has occurred. */ private Chromosome parseChromosome (BufferedReader br) throws BamQCException, IOException { String line; while ((line=br.readLine())!=null) { if (line.startsWith("AC")) { String [] sections = line.split(":"); if (sections.length != 6) { // It's not a chromosome file. We probably just want to // skip it and move onto the next entry progressWarningReceived(new BamQCException("AC line didn't have 6 sections '"+line+"'")); skipToEntryEnd(br); continue; } if (line.indexOf("supercontig")>=0) { // It's not a chromosome file. We probably just want to // skip it and move onto the next entry skipToEntryEnd(br); continue; } // Add a new chromosome to the factory if this does not exist. Chromosome c = genome.annotationSet().chromosomeFactory().getChromosome(sections[2]); c.setLength(Integer.parseInt(sections[4])); // Since the positions of all features are given relative // to the current sequence we need to add the current // start position to all locations as an offset. currentOffset = Integer.parseInt(sections[3])-1; return c; } if (line.startsWith("//")) { throw new BamQCException("Couldn't find AC line"); } } return null; } /** * Skip to entry end. * * @param br the br * @throws IOException Signals that an I/O exception has occurred. */ private void skipToEntryEnd (BufferedReader br) throws IOException { String line; try { Thread.sleep(5); } catch (InterruptedException e) { } while ((line=br.readLine())!=null) { if (line.startsWith("//")) return; } } }