package com.github.lindenb.jvarkit.util.bio.gtf; import java.io.IOException; import java.net.URLDecoder; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import com.github.lindenb.jvarkit.lang.JvarkitException; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.IntervalTreeMap; import htsjdk.tribble.readers.LineIterator; public class GTFCodec { private final Pattern tab=Pattern.compile("[\t]"); private static final String GFF_VERSION="##gff-version"; private GTFHeader header=null; public static class GTFHeader { private final List<String> lines = new ArrayList<>(); private boolean is_gff3=false; public boolean isGff3() { return is_gff3; } } public GTFCodec() { } public GTFLine decode(LineIterator r)throws IOException { for(;;) { if(!r.hasNext()) return null; String line=r.next(); if(line.startsWith("#")) continue; return decode(line); } } public GTFHeader readHeader(LineIterator r) throws IOException { if(this.header!=null) throw new IOException("Reader already read"); this.header = new GTFHeader(); while(r.hasNext() && r.peek().startsWith("#")) { final String line=r.next(); if(line.startsWith(GFF_VERSION+" ")) { final String version =line.substring(GFF_VERSION.length()).trim(); if(version.equals("3")) { this.header.is_gff3=true; } } this.header.lines.add(line); } return this.header; } private String unescape(String s) throws IOException { return URLDecoder.decode(s, "UTF-8"); } public IntervalTreeMap<List<GTFGene>> readAllAsIntervalTreeMap(final LineIterator iter) throws IOException { IntervalTreeMap<List<GTFGene>> h=new IntervalTreeMap<>(); for(;;) { final List<GTFGene> L2 = nextGenesInContig(iter); if(L2.isEmpty()) break; for(final GTFGene gene:L2) { final Interval interval =new Interval(gene.getContig(),gene.getStart(),gene.getEnd()); List<GTFGene> x= h.get(interval); if(x==null) { x=new ArrayList<>(); h.put(interval, x); } x.add(gene); } } return h; } public List<GTFGene> readAll(final LineIterator iter) throws IOException { final List<GTFGene> L = new ArrayList<>(); for(;;) { final List<GTFGene> L2 = nextGenesInContig(iter); if(L2.isEmpty()) break; L.addAll(L2); } return L; } public List<GTFGene> nextGenesInContig(final LineIterator iter) throws IOException { if(!iter.hasNext()) return Collections.emptyList(); final Map<String,List<GTFLine>> transcript2map = new HashMap<>(); String prevContig = null; while(iter.hasNext()) { final String line = iter.peek(); final GTFLine record = this.decode(line); if(prevContig!=null && prevContig.equals(record.getContig())) { break; } iter.next();//consumme final String transcript_id = record.getAtts().get("transcript_id"); if( transcript_id == null ) continue; List<GTFLine> lines = transcript2map.get(transcript_id); if( lines ==null ) { lines = new ArrayList<>(); transcript2map.put(transcript_id,lines); } lines.add(record); prevContig = record.getContig(); } final List<GTFGene> list = new ArrayList<>(transcript2map.size()); Collections.sort(list, (A,B)->{ int i=A.getContig().compareTo(B.getContig()); if( i != 0 ) return i; i = A.getStart() - B.getStart(); if( i !=0 ) return i; i = A.getEnd() - B.getEnd(); return i; }); return list; } public GTFLine decode(final String line) throws IOException { if(this.header==null) { throw new IOException("header was not parsed"); } final String tokens[]=this.tab.split(line); if(tokens.length<8) { throw new JvarkitException.TokenErrors("Expected 8 columns",tokens); } final GTFLine L=new GTFLine(); L.contig=tokens[0]; L.source= tokens[1]; L.type = tokens[2]; L.start = Integer.parseInt(tokens[3]); L.end = Integer.parseInt(tokens[4]); if(!tokens[5].equals(".")) L.score = (Double.parseDouble(tokens[5])); L.strand = (tokens[6].charAt(0)); if(!tokens[7].equals(".")) L.phase=Integer.parseInt(tokens[7]); final Map<String, String> attMap=new HashMap<>(); final String mapStr = tokens[8]; int k=0; while( k < mapStr.length()) { if(Character.isWhitespace(mapStr.charAt(k))) { ++k; continue; } char c= mapStr.charAt(k); if(c==';') { ++k; continue;} /* read KEY */ final StringBuilder sbk=new StringBuilder(); while( k < mapStr.length()) { c= mapStr.charAt(k); ++k; if(c=='=' || Character.isWhitespace(c)) { break; } sbk.append(c); } /* SKIP WS */ while( k < mapStr.length() && Character.isWhitespace(mapStr.charAt(k))) { ++k; continue; } /* EQUAL SIGN */ if( k < mapStr.length() && mapStr.charAt(k)=='=') { ++k; } /* SKIP WS */ while( k < mapStr.length() && Character.isWhitespace(mapStr.charAt(k))) { ++k; continue; } /* read VALUE */ final StringBuilder sbv=new StringBuilder(); c=(k < mapStr.length()?mapStr.charAt(k):'\0'); // quoted string if( c == '\"') { ++k; while( k < mapStr.length()) { c= mapStr.charAt(k); ++k; if(c=='\\') { c=(k < mapStr.length()?mapStr.charAt(k):'\0'); ++k; switch(c) { case '"': sbv.append("\"");break; case '\'': sbv.append("\'");break; case 't': sbv.append("\t");break; case 'n': sbv.append("\n");break; default:break; } } else if(c=='\"') { break; } else { sbv.append(c); } } } else { while( k < mapStr.length()) { c= mapStr.charAt(k); ++k; if(c==';' || Character.isWhitespace(c)) { break; } sbv.append(c); } } attMap.put(sbk.toString(),sbv.toString()); } L.atts=attMap; return L; } }