// GenomeParser.java example
//
// Explorer
// BamQC-master
/**
 * Copyright 2010-15 Simon Andrews
 *
 *    This file is part of BamQC.
 *
 *    BamQC is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    BamQC is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with BamQC; if not, write to the Free Software
 *    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
/*
 * Changelog: 
 * - Piero Dalle Pezze: Code taken from SeqMonk with the unnecessary parts removed
 *   (only the extraction of feature locations was kept). Added progress listeners.
 */
package uk.ac.babraham.BamQC.AnnotationParsers;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Enumeration;

import uk.ac.babraham.BamQC.BamQCConfig;
import uk.ac.babraham.BamQC.BamQCException;
import uk.ac.babraham.BamQC.DataTypes.ProgressListener;
import uk.ac.babraham.BamQC.DataTypes.Genome.AnnotationSet;
import uk.ac.babraham.BamQC.DataTypes.Genome.Chromosome;
import uk.ac.babraham.BamQC.DataTypes.Genome.Feature;
import uk.ac.babraham.BamQC.DataTypes.Genome.Genome;
import uk.ac.babraham.BamQC.DataTypes.Genome.SplitLocation;
import uk.ac.babraham.BamQC.Preferences.BamQCPreferences;
import uk.ac.babraham.BamQC.Utilities.FileFilters.DatSimpleFileFilter;
import uk.ac.babraham.BamQC.Utilities.FileFilters.GFFSimpleFileFilter;

/**
 * Builds a {@link Genome} from the contents of a genome folder. The EMBL
 * format .dat files found in the folder are parsed first, followed by any
 * .gff / .gtf files; every feature found is added to the annotation set of
 * the genome. Only the location of each EMBL feature is extracted.
 * Progress is reported to the listeners held by the parent class.
 * 
 * @author Simon Andrews
 * @author Piero Dalle Pezze
 */
public class GenomeParser extends AnnotationParser {

	/** The genome being populated, or null until parseGenome has been called. */
	private Genome genome = null;

	/** The folder whose .dat and .gff/.gtf files are parsed. */
	private File baseLocation;

	/**
	 * The offset added to every feature location of the EMBL entry being
	 * read. It is taken from the AC line of that entry (start position - 1).
	 */
	private int currentOffset = 0;

	/** The preferences, used to decide which feature types are loaded. */
	private BamQCPreferences prefs = BamQCPreferences.getInstance();


	public GenomeParser () {
		super();
	}

	/**
	 * The parsed genome or null if no genome has been parsed.
	 * @return the parsed genome or null
	 */
	public Genome genome() {
		return genome;
	}

	/* (non-Javadoc)
	 * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#requiresFile()
	 */
	@Override
	public boolean requiresFile() {
		return false;
	}


	/* (non-Javadoc)
	 * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#name()
	 */
	@Override
	public String name() {
		return "Genome Parser";
	}

	/*
	 * (non-Javadoc)
	 * Intentionally does nothing: this parser only works on a genome folder.
	 * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#parseAnnotation(uk.ac.babraham.BamQC.DataTypes.Genome.AnnotationSet, java.io.File)
	 */
	@Override
	public void parseAnnotation(AnnotationSet annotationSet, File file) throws Exception {}

	/*
	 * (non-Javadoc)
	 * Creates the genome for the supplied folder and then parses every
	 * annotation file found inside it. A failure to create the genome is
	 * reported to the listeners before being rethrown.
	 * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#parseGenome(java.io.File)
	 */
	@Override
	public void parseGenome (File baseLocation) throws Exception {
		this.baseLocation = baseLocation;

		try {
			genome = new Genome(baseLocation);
		}
		catch (BamQCException ex) {
			Enumeration<ProgressListener> en = listeners.elements();
			while (en.hasMoreElements()) {
				en.nextElement().progressExceptionReceived(ex);
			}
			throw ex;
		}

		// Update the listeners
		Enumeration<ProgressListener> e = listeners.elements();
		while (e.hasMoreElements()) {
			e.nextElement().progressUpdated("Loading files for genome "+baseLocation,0,0);
		}
		parseGenomeFiles(genome);
	}


	/**
	 * Parses the .dat files and then the .gff/.gtf files of the base location.
	 * 
	 * @param genome the genome receiving the features
	 * @throws Exception if the folder can't be listed or a file can't be parsed
	 */
	private void parseGenomeFiles (Genome genome) throws Exception {
		importEmblFiles(genome);
		importGffFiles(genome);
	}

	/**
	 * Parses every EMBL format .dat file in the base location, reporting the
	 * percentage of files read to the listeners.
	 * 
	 * NOTE(review): unlike the gff/gtf phase, a failure here is not passed to
	 * the listeners before being thrown - confirm that the caller reports it.
	 * 
	 * @param genome the genome named in the completion message
	 * @throws Exception if the folder can't be listed or a file can't be parsed
	 */
	private void importEmblFiles (Genome genome) throws Exception {

		// We need a list of all of the .dat files inside the baseLocation
		File [] datFiles = baseLocation.listFiles(new DatSimpleFileFilter());
		if (datFiles == null) {
			// listFiles gives null when the path is not a readable folder
			throw new BamQCException("Couldn't list the files in genome folder "+baseLocation);
		}

		int importedFeatures = 0;
		int previousPercent = 0;

		for (int i=0;i<datFiles.length;i++) {
			importedFeatures += processEMBLFile(datFiles[i]);

			int percent = Math.round((i+1) * 100.0f / datFiles.length);
			if (previousPercent < percent) {
				// Update the listeners. The message is built per listener so
				// that the configuration is not touched when nobody listens.
				Enumeration<ProgressListener> en = listeners.elements();
				while (en.hasMoreElements()) {
					en.nextElement().progressUpdated("Parsing genome "
						+ BamQCConfig.getInstance().genome.getParentFile().getName() + " [ "
						+ BamQCConfig.getInstance().genome.getName() + " ] (" + percent + "%)", percent, 100);
				}
				previousPercent = percent;
			}
		}

		if (datFiles.length > 0) {
			// Update the listeners
			Enumeration<ProgressListener> en = listeners.elements();
			while (en.hasMoreElements()) {
				en.nextElement().progressComplete("Processed features: "+importedFeatures + "\n" +
												 "Parsed annotation .dat files for genome " + genome.toString(), null);
			}
		}
	}

	/**
	 * Parses every .gff/.gtf file in the base location with a
	 * GFF3AnnotationParser and copies the features found into the annotation
	 * set of the genome. Progress is reported in steps of 5%. An exception
	 * raised while handling a file is passed to the listeners and rethrown.
	 * 
	 * @param genome the genome receiving the features
	 * @throws Exception if the folder can't be listed or a file can't be parsed
	 */
	private void importGffFiles (Genome genome) throws Exception {

		// We need a list of all of the .gff/gtf files inside the baseLocation
		File [] gffFiles = baseLocation.listFiles(new GFFSimpleFileFilter());
		if (gffFiles == null) {
			// listFiles gives null when the path is not a readable folder
			throw new BamQCException("Couldn't list the files in genome folder "+baseLocation);
		}

		int previousPercent = 0;
		GFF3AnnotationParser gffParser = new GFF3AnnotationParser();

		for (int i=0;i<gffFiles.length;i++) {
			try {
				AnnotationSet newSet = new AnnotationSet();
				gffParser.parseAnnotation(newSet, gffFiles[i]);
				Feature [] features = newSet.getAllFeatures();
				for (int f=0;f<features.length;f++) {
					genome.annotationSet().addFeature(features[f]);
				}

				int percent = (i+1) * 100 / gffFiles.length;
				if (previousPercent < percent && percent%5 == 0) {
					// Update the listeners
					Enumeration<ProgressListener> en = listeners.elements();
					while (en.hasMoreElements()) {
						en.nextElement().progressUpdated("Parsing annotation file " + gffFiles[i].getName() + " (" + percent + "%)", percent, 100);
					}
					previousPercent = percent;
				}
			}
			catch (Exception ex) {
				Enumeration<ProgressListener> en = listeners.elements();
				while (en.hasMoreElements()) {
					en.nextElement().progressExceptionReceived(ex);
				}
				throw ex;
			}
		}

		if (gffFiles.length > 0) {
			// Update the listeners
			Enumeration<ProgressListener> en = listeners.elements();
			while (en.hasMoreElements()) {
				en.nextElement().progressComplete("Parsed annotation .gff/.gtf files for genome "+ genome.toString(), null);
			}
		}
	}


	/**
	 * Processes one physical EMBL file, which can contain more than one EMBL
	 * entry. For each entry the chromosome is read from the AC line and the
	 * features of its feature table are added to the genome.
	 * 
	 * @param f the EMBL format file to read
	 * @throws Exception if the file can't be read or is not valid
	 * @return the number of imported features
	 */
	private int processEMBLFile (File f) throws Exception {

		int processedFeatures = 0;

		BufferedReader br = new BufferedReader(new FileReader(f));
		try {
			// We need to find and read the accession line to find out
			// which chromosome and location we're dealing with.
			Chromosome c;
			while ((c = parseChromosome(br)) != null) {
				if (skipToFeatureTable(br)) {
					processedFeatures += readFeatureTable(br, c);
				}
			}
		}
		finally {
			br.close();
		}
		return processedFeatures;
	}

	/**
	 * Skips the remaining header lines of the current entry.
	 * 
	 * @param br the reader, positioned after the AC line of an entry
	 * @return true if the reader is now just after the feature header (FH)
	 * line, false if the entry has no feature table. In the latter case the
	 * entry (or the file) has been consumed up to its end.
	 * @throws IOException Signals that an I/O exception has occurred.
	 */
	private boolean skipToFeatureTable (BufferedReader br) throws IOException {
		String line;
		while ((line=br.readLine())!=null) {
			if (line.startsWith("FH")) {
				return true;
			}
			if (line.startsWith("SQ")) {
				// The sequence started without a feature table. Sequence
				// lines must not be read as features so we discard them.
				skipToEntryEnd(br);
				return false;
			}
			if (line.startsWith("//")) {
				// The entry is already over; reading on would take us
				// into the header of the next entry.
				return false;
			}
		}
		return false;
	}

	/**
	 * Reads the feature table of one entry by concatenating the lines of each
	 * attribute and passing them on for processing. On return the reader is
	 * positioned after the end of the entry (or at the end of the file).
	 * 
	 * @param br the reader, positioned at the start of the feature table
	 * @param c the chromosome the features belong to
	 * @return the number of features added to the genome
	 * @throws Exception if a line can't be read or a location is not valid
	 */
	private int readFeatureTable (BufferedReader br, Chromosome c) throws Exception {

		int storedFeatures = 0;
		StringBuilder currentAttribute = new StringBuilder();
		// True while there is no feature being collected, either because
		// none has started yet or because the current one is being ignored.
		boolean skipping = true;
		Feature feature = null;

		String line;
		while ((line=br.readLine())!=null) {

			if (line.startsWith("//")) {
				// This line is itself the end of the entry. Skipping to the
				// next terminator would throw away the following entry.
				break;
			}
			if (line.startsWith("XX") || line.startsWith("SQ")) {
				skipToEntryEnd(br);
				break;
			}

			if (line.length() < 18) continue; // Just a blank line.

			// The feature type sits in columns 5-17, the data from column 21
			String type = line.substring(5,18).trim();
			if (type.length()>0) {
				// We're at the start of a new feature, so the previous one
				// (if we were collecting one) is complete.
				if (!skipping && storeFeature(currentAttribute.toString(), feature)) {
					storedFeatures++;
				}

				// We can check to see if we're bothering to load this type of feature
				if (prefs.loadAnnotation(type)) {
					feature = new Feature(type,c);
					currentAttribute = new StringBuilder("location=");
					currentAttribute.append(line.substring(21).trim());
					skipping = false;
					continue;
				}
				skipping = true;
			}

			if (skipping) continue;

			String data = line.substring(21).trim();

			if (data.startsWith("/")) {
				// We're at the start of a new attribute, so we process the
				// last one (extract the location). The feature is abandoned
				// if its location turns out to be unsupported.
				skipping = processAttributeReturnSkip(currentAttribute.toString(), feature);
				currentAttribute = new StringBuilder();
			}

			// Descriptions which run on to multiple lines need a space adding
			// before the next lot of text.
			if (currentAttribute.indexOf("description=") >= 0) currentAttribute.append(" ");

			// Our default action is just to append onto the existing information
			currentAttribute.append(data);
		}

		// We've finished, but we need to process the last feature
		// if there was one
		if (!skipping && storeFeature(currentAttribute.toString(), feature)) {
			storedFeatures++;
		}
		return storedFeatures;
	}

	/**
	 * Processes the last attribute of a completed feature and adds the feature
	 * to the genome, unless that attribute was a location we can't support.
	 * 
	 * @param lastAttribute the attribute which was still being collected
	 * @param feature the completed feature
	 * @return true if the feature was added to the genome
	 * @throws BamQCException if the location is not valid
	 */
	private boolean storeFeature (String lastAttribute, Feature feature) throws BamQCException {
		if (processAttributeReturnSkip(lastAttribute, feature)) {
			// A feature made only of an unsupported location never got a
			// location set, so it must not reach the annotation set.
			return false;
		}
		genome.annotationSet().addFeature(feature);
		return true;
	}


	/**
	 * Processes one attribute of a feature. Only the location attribute is
	 * used: it sets the location of the feature, shifted by the current
	 * offset. Every other attribute is ignored.
	 * 
	 * @param attribute the attribute as name=value (the value is optional for
	 * anything but a location)
	 * @param feature the feature the attribute belongs to
	 * @return true if the feature must be skipped because its location refers
	 * to another sequence, false otherwise
	 * @throws BamQCException if a location has no value
	 */
	private boolean processAttributeReturnSkip (String attribute, Feature feature) throws BamQCException {
		String [] nameValue = attribute.split("=",2);

		// We used to insist on key value pairs, but the EMBL spec
		// allows a key without a value, so one value is OK.
		if (!nameValue[0].equals("location")) {
			return false;
		}

		// A location has to have a value
		if (nameValue.length < 2) {
			throw new BamQCException("Location didn't have an '=' delimiter");
		}

		if (nameValue[1].indexOf(":")>=0) {
			// Some locations are given relative to other sequences
			// (where a feature splits across more than one sequence).
			// We can't handle this so we don't try.
			return true;
		}

		feature.setLocation(new SplitLocation(nameValue[1],currentOffset));
		return false;
	}

	/**
	 * Reads forward to the AC line of the next usable entry and returns the
	 * chromosome it names, after setting its length and the current offset.
	 * Entries whose AC line doesn't have 6 colon separated sections (a
	 * warning is raised for these) or mentions a supercontig are skipped.
	 * 
	 * @param br the reader, positioned at the start of an entry
	 * @return the chromosome, or null if the end of the file was reached
	 * @throws BamQCException if an entry ends before an AC line is found
	 * @throws IOException Signals that an I/O exception has occurred.
	 */
	private Chromosome parseChromosome (BufferedReader br) throws BamQCException, IOException {
		String line;
		while ((line=br.readLine())!=null) {

			if (line.startsWith("AC")) {
				String [] sections = line.split(":");
				if (sections.length != 6) {
					// It's not a chromosome file.  We probably just want to
					// skip it and move onto the next entry
					progressWarningReceived(new BamQCException("AC line didn't have 6 sections '"+line+"'"));
					skipToEntryEnd(br);
					continue;
				}
				if (line.indexOf("supercontig")>=0) {
					// It's not a chromosome file.  We probably just want to
					// skip it and move onto the next entry
					skipToEntryEnd(br);
					continue;
				}

				// Add a new chromosome to the factory if this does not exist.
				Chromosome c = genome.annotationSet().chromosomeFactory().getChromosome(sections[2]);
				c.setLength(Integer.parseInt(sections[4]));

				// Since the positions of all features are given relative
				// to the current sequence we need to add the current
				// start position to all locations as an offset.
				currentOffset = Integer.parseInt(sections[3])-1;
				return c;
			}

			if (line.startsWith("//")) {
				throw new BamQCException("Couldn't find AC line");
			}
		}
		return null;
	}


	/**
	 * Reads and discards lines up to and including the next entry
	 * terminator (//), or up to the end of the file.
	 * 
	 * @param br the reader to advance
	 * @throws IOException Signals that an I/O exception has occurred.
	 */
	private void skipToEntryEnd (BufferedReader br) throws IOException {
		// NOTE(review): the reason for this short pause can't be told from
		// this file - confirm it is still wanted before removing it.
		try {
			Thread.sleep(5);
		}
		catch (InterruptedException ie) {
			// Keep the interrupt visible to whoever is running the parser
			// rather than silently discarding it.
			Thread.currentThread().interrupt();
		}

		String line;
		while ((line=br.readLine())!=null) {
			if (line.startsWith("//")) {
				return;
			}
		}
	}

}