package com.illumina.basespace.igv.gff;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;

import javax.swing.SwingWorker;

import org.broad.igv.Globals;
import org.broad.igv.exceptions.ParserException;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.FeatureDB;
import org.broad.igv.feature.FeatureParser;
import org.broad.igv.feature.GFFParser;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.feature.tribble.GFFCodec;
import org.broad.igv.renderer.GeneTrackRenderer;
import org.broad.igv.renderer.IGVFeatureRenderer;
import org.broad.igv.track.FeatureCollectionSource;
import org.broad.igv.track.FeatureTrack;
import org.broad.igv.track.GFFFeatureSource;
import org.broad.igv.track.TrackLoader;
import org.broad.igv.track.TrackProperties;
import org.broad.igv.ui.IGV;
import org.broad.igv.util.ResourceLocator;
import org.broad.tribble.Feature;

import com.illumina.basespace.ApiClient;
import com.illumina.basespace.igv.BaseSpaceMain;
import com.illumina.basespace.igv.BaseSpaceUtil;
import com.illumina.basespace.igv.gff.GFFLocatorFactory.GFFTrackLoader;
import com.illumina.basespace.igv.ui.ProgressReport;
import com.illumina.basespace.igv.ui.tree.BrowserDialog;

/**
 * Parses GFF2/GFF3 files streamed from BaseSpace and builds IGV feature tracks,
 * reporting decode progress through the browser dialog.
 */
public class BaseSpaceGFFParser implements FeatureParser
{
    static Logger log = Logger.getLogger(GFFParser.class.getName());

    private TrackProperties trackProperties = null;

    public BaseSpaceGFFParser(String path)
    {
    }

    public List<FeatureTrack> loadTracks(ResourceLocator locator, Genome genome)
    {
        BufferedReader reader = null;
        try
        {
            GFFTrackLoader gffLocator = (GFFTrackLoader) locator;
            ApiClient client = BaseSpaceMain.instance().getApiClient(gffLocator.getClientId());
            // this.lineCount = count(gffLocator.getFile().getSize(),client.getFileInputStream(gffLocator.getFile()));
            reader = new BufferedReader(new InputStreamReader(client.getFileInputStream(gffLocator.getFile())));

            GFFCodec.Version version = locator.getPath().endsWith(".gff3")
                    ? GFFCodec.Version.GFF3 : GFFCodec.Version.GFF2;

            List<org.broad.tribble.Feature> features = loadFeatures(reader, genome);
            FeatureTrack track = new FeatureTrack(locator, new FeatureCollectionSource(features, genome));
            track.setName(locator.getTrackName());
            track.setRendererClass(IGVFeatureRenderer.class);
            track.setMinimumHeight(35);
            track.setHeight(45);
            track.setRendererClass(GeneTrackRenderer.class);
            if (trackProperties != null)
            {
                track.setProperties(trackProperties);
                track.setName(trackProperties.getName());
            }

            List<FeatureTrack> tracks = new ArrayList<FeatureTrack>();
            tracks.add(track);
            return tracks;
        }
        catch (Throwable ex)
        {
            log.severe(ex.toString());
            throw new RuntimeException(ex);
        }
        finally
        {
            BaseSpaceUtil.dispose(reader);
        }
    }

    /**
     * Decodes GFF features on a background SwingWorker, publishing progress to the
     * browser dialog, and blocks until all features have been combined.
     */
    public List<org.broad.tribble.Feature> loadFeatures(final BufferedReader reader, final Genome genome)
    {
        BrowserDialog.instance().workInit(1000);
        SwingWorker<List<Feature>, ProgressReport> worker = new SwingWorker<List<Feature>, ProgressReport>()
        {
            @Override
            protected List<Feature> doInBackground() throws Exception
            {
                String line = null;
                int lineNumber = 0;
                int progressLineCount = 0;
                GFFCodec codec = new GFFCodec(genome);
                GFFFeatureSource.GFFCombiner combiner = new GFFFeatureSource.GFFCombiner();
                try
                {
                    while ((line = reader.readLine()) != null)
                    {
                        lineNumber++;
                        progressLineCount++;
                        if (line.startsWith("#"))
                        {
                            codec.readHeaderLine(line);
                        }
                        else
                        {
                            try
                            {
                                Feature f = codec.decode(line);
                                if (f != null)
                                {
                                    combiner.addFeature((BasicFeature) f);
                                }
                                List<ProgressReport> progress = new ArrayList<ProgressReport>();
                                progress.add(new ProgressReport("Decoding GFF Feature at line " + lineNumber, progressLineCount));
                                publish(progress.toArray(new ProgressReport[progress.size()]));
                                // The progress bar is sized to 1000 units, so wrap the counter every 1000 lines
                                if (progressLineCount % 1000 == 0)
                                {
                                    progressLineCount = 0;
                                }
                            }
                            catch (Exception e)
                            {
                                log.severe("Error parsing: " + line);
                            }
                        }
                    }
                }
                catch (IOException ex)
                {
                    log.severe("Error reading GFF file");
                    if (line != null && lineNumber != 0)
                    {
                        throw new ParserException(ex.getMessage(), ex, lineNumber, line);
                    }
                    else
                    {
                        throw new RuntimeException(ex);
                    }
                }

                trackProperties = TrackLoader.getTrackProperties(codec.getHeader());

                // Combine the features
                List<Feature> iFeatures = combiner.combineFeatures();

                if (IGV.hasInstance())
                {
                    FeatureDB.addFeatures(iFeatures);
                }
                return iFeatures;
            }

            @Override
            protected void process(List<ProgressReport> chunks)
            {
                BrowserDialog.instance().workProgress(chunks);
            }

            @Override
            protected void done()
            {
                BrowserDialog.instance().workDone();
            }
        };
        BrowserDialog.instance().workStart();
        worker.execute();
        try
        {
            List<Feature> rtn = worker.get();
            return rtn;
        }
        catch (Throwable t)
        {
            throw new RuntimeException(t);
        }
    }

    /**
     * Counts the newlines in the stream on a background SwingWorker, publishing the
     * number of bytes read as progress.
     */
    public int count(final double total, final InputStream in) throws IOException
    {
        BrowserDialog.instance().workInit(total);
        SwingWorker<Integer, ProgressReport> worker = new SwingWorker<Integer, ProgressReport>()
        {
            @Override
            protected Integer doInBackground() throws Exception
            {
                InputStream is = new BufferedInputStream(in);
                try
                {
                    byte[] c = new byte[1024];
                    int count = 0;
                    int readChars = 0;
                    int bytesRead = 0;
                    boolean empty = true;
                    while ((readChars = is.read(c)) != -1)
                    {
                        bytesRead += readChars;
                        List<ProgressReport> progress = new ArrayList<ProgressReport>();
                        progress.add(new ProgressReport("Read bytes " + bytesRead + " of " + total, bytesRead));
                        publish(progress.toArray(new ProgressReport[progress.size()]));
                        empty = false;
                        for (int i = 0; i < readChars; ++i)
                        {
                            if (c[i] == '\n')
                            {
                                ++count;
                            }
                        }
                    }
                    // A non-empty stream with no newline still counts as one line
                    return (count == 0 && !empty) ? 1 : count;
                }
                finally
                {
                    BaseSpaceUtil.dispose(is);
                    BaseSpaceUtil.dispose(in);
                }
            }

            @Override
            protected void process(List<ProgressReport> chunks)
            {
                BrowserDialog.instance().workProgress(chunks);
            }

            @Override
            protected void done()
            {
                BrowserDialog.instance().workDone();
            }
        };
        BrowserDialog.instance().workStart();
        worker.execute();
        try
        {
            return worker.get();
        }
        catch (Throwable t)
        {
            throw new RuntimeException(t);
        }
    }

    /**
     * Synchronous variant of {@link #loadFeatures(BufferedReader, Genome)} that parses
     * on the calling thread without progress reporting.
     */
    public List<org.broad.tribble.Feature> loadFeaturesBackup(BufferedReader reader, Genome genome)
    {
        String line = null;
        int lineNumber = 0;
        GFFCodec codec = new GFFCodec(genome);
        GFFFeatureSource.GFFCombiner combiner = new GFFFeatureSource.GFFCombiner();
        try
        {
            while ((line = reader.readLine()) != null)
            {
                lineNumber++;
                if (line.startsWith("#"))
                {
                    codec.readHeaderLine(line);
                }
                else
                {
                    try
                    {
                        Feature f = codec.decode(line);
                        if (f != null)
                        {
                            combiner.addFeature((BasicFeature) f);
                        }
                    }
                    catch (Exception e)
                    {
                        log.severe("Error parsing: " + line);
                    }
                }
            }
        }
        catch (IOException ex)
        {
            log.severe("Error reading GFF file");
            if (line != null && lineNumber != 0)
            {
                throw new ParserException(ex.getMessage(), ex, lineNumber, line);
            }
            else
            {
                throw new RuntimeException(ex);
            }
        }

        trackProperties = TrackLoader.getTrackProperties(codec.getHeader());

        // Combine the features
        List<Feature> iFeatures = combiner.combineFeatures();

        if (IGV.hasInstance())
        {
            FeatureDB.addFeatures(iFeatures);
        }
        return iFeatures;
    }

    /**
     * Given a GFF file, creates a new GFF file for each feature type. Any feature type
     * which is part of a "gene" ({@link GFFCodec#geneParts}) is put in the same file;
     * other types are put in separate files. So features of type "gene", "exon", and
     * "mrna" would go in gene.gff, but features of type "myFeature" would go in
     * myFeature.gff.
     *
     * @param gffFile
     * @param outputDirectory
     * @throws IOException
     */
    public static void splitFileByType(String gffFile, String outputDirectory) throws IOException
    {
        // First pass: discover the feature types and open one writer per output file,
        // reusing the input file's extension (e.g. ".gff" or ".gff3")
        BufferedReader br = new BufferedReader(new FileReader(gffFile));
        String nextLine;
        String ext = gffFile.substring(gffFile.lastIndexOf('.'));

        Map<String, PrintWriter> writers = new HashMap<String, PrintWriter>();

        while ((nextLine = br.readLine()) != null)
        {
            nextLine = nextLine.trim();
            if (!nextLine.startsWith("#"))
            {
                String[] tokens = Globals.tabPattern.split(nextLine.trim().replaceAll("\"", ""), -1);
                String type = tokens[2];
                if (GFFCodec.geneParts.contains(type))
                {
                    type = "gene";
                }
                if (!writers.containsKey(type))
                {
                    writers.put(type, new PrintWriter(new FileWriter(new File(outputDirectory, type + ext))));
                }
            }
        }
        br.close();

        // Second pass: copy header lines to every file and each feature line to its type's file
        br = new BufferedReader(new FileReader(gffFile));
        PrintWriter currentWriter = null;
        while ((nextLine = br.readLine()) != null)
        {
            nextLine = nextLine.trim();
            if (nextLine.startsWith("#"))
            {
                for (PrintWriter pw : writers.values())
                {
                    pw.println(nextLine);
                }
            }
            else
            {
                String[] tokens = Globals.tabPattern.split(nextLine.trim().replaceAll("\"", ""), -1);
                String type = tokens[2];
                if (GFFCodec.geneParts.contains(type))
                {
                    type = "gene";
                }
                currentWriter = writers.get(type);
                if (currentWriter != null)
                {
                    currentWriter.println(nextLine);
                }
                else
                {
                    System.out.println("No writer for: " + type);
                }
            }
        }
        br.close();

        for (PrintWriter pw : writers.values())
        {
            pw.close();
        }
    }

    public TrackProperties getTrackProperties()
    {
        return trackProperties;
    }

    public static void main(String[] args) throws IOException
    {
        if (args.length < 2)
        {
            System.out.println("Usage: splitFileByType <gffFile> <outputDirectory>");
            return;
        }
        splitFileByType(args[0], args[1]);
    }
}