package com.github.lindenb.jvarkit.tools.bam4deseq;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.SequenceUtil;
import com.beust.jcommander.Parameter;
import com.github.lindenb.jvarkit.io.IOUtils;
import com.github.lindenb.jvarkit.util.bio.bin.SamSequenceRecordBinMap;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
@Program(name="deseqcount",description="java version of htseqcount",
deprecatedMsg="never tested, don't use this")
public class HtSeqCount extends Launcher
{
private static final Logger LOG = Logger.build(HtSeqCount.class).make();
/** features we want in the GTF */
@Parameter(names="-F",description=" add this feature. default are : CDS and exon")
private Set<String> features=new HashSet<>();
/** first Dict found, to print chrom names and compare with others */
private SAMSequenceDictionary firstDict=null;
/** mapping position to transcript */
private SamSequenceRecordBinMap<Transcript> pos2transcript=null;
/** all filenames */
private List<String> filenames=new ArrayList<String>();
/** user GTF file */
@Parameter(names="-g",description="path to GTF file",required=true)
private File gtfFile=null;
/* current file scanner in filenames */
private int file_index=0;
/* user feature in the GTF */
@Parameter(names="-T",description="key for clustering info")
private String transcript_id="transcript_id";
/* all transcripts to print at the end */
private HashMap<String, Transcript> name2transcript=new HashMap<String, Transcript>();
/* first ouput line is header */
@Parameter(names="-H",description="first ouput line is header")
private boolean print_header=false;
/* remove lines with 0 coverage */
private boolean removeZero=false;
/* print chrom/start-end on the left*/
@Parameter(names="-c",description="print chrom/start/end")
private boolean print_chromstartend=false;
private static class Transcript
{
String name;
int count[];
boolean bad_flag=false;//multiple chroms
int tid;
int start=Integer.MAX_VALUE;
int end=0;
}
private void parseGTF(BufferedReader in,SAMSequenceDictionary dict) throws IOException
{
this.pos2transcript=new SamSequenceRecordBinMap<Transcript>(dict);
final Pattern tab=Pattern.compile("[\t]");
String line;
while((line=in.readLine())!=null)
{
if(line.isEmpty() || line.startsWith("#")) continue;
String tokens[]=tab.split(line);
if(tokens.length<=8)
{
throw new IOException("Bad GTF line " +line);
}
if(!this.features.contains(tokens[2])) continue;
String chrom=tokens[0];
int tid=dict.getSequenceIndex(chrom);
if(tid<0)
{
LOG.warning("Unknown chromosome "+chrom+" Ignoring "+line);
continue;
}
String transcript_name=null;
String info=tokens[8];
StreamTokenizer scanner=new StreamTokenizer(new StringReader(info));
scanner.wordChars('_', '_');
scanner.wordChars('0', '9');
while(scanner.nextToken()!=StreamTokenizer.TT_EOF)
{
String key=scanner.sval;
//info("K "+key);
if(scanner.nextToken()==StreamTokenizer.TT_EOF)
{
throw new IOException("Bad gtf in "+line+" after "+key);
}
String value=scanner.sval;
//info("v "+value);
if(key.equals(this.transcript_id))
{
transcript_name=value;
break;
}
if(scanner.nextToken()==StreamTokenizer.TT_EOF) break;
if(scanner.ttype!=';')
{
throw new IOException("Bad gtf in "+line+" expect ; ");
}
}
if(transcript_name==null)
{
LOG.info("No "+transcript_id+" in "+line);
continue;
}
Transcript transcript=name2transcript.get(transcript_name);
if(transcript==null)
{
transcript=new Transcript();
transcript.tid=tid;
transcript.name=transcript_name;
transcript.count=new int[filenames.size()];
Arrays.fill(transcript.count,0);
name2transcript.put(transcript_name,transcript);
if(name2transcript.size()%1000==0)
{
LOG.info("Transcripts: "+name2transcript.size()+" "+transcript_name);
}
}
else
{
if(transcript.tid!=tid)
{
LOG.warning("Multiple chromosomes for "+transcript);
transcript.bad_flag=true;
continue;
}
}
int start1=Integer.parseInt(tokens[3]);
int end1=Integer.parseInt(tokens[4]);
transcript.start=Math.min(transcript.start,start1);
transcript.end=Math.max(transcript.end,end1);
this.pos2transcript.put(tid, start1-1, end1,transcript);
/**
boolean ok=false;
Iterator<Transcript>x=this.pos2transcript.overlapping(tid, start1-1, end1);
while(x.hasNext()) if(transcript==x.next()) {ok=true;}
if(!ok) throw new IllegalStateException("boum");*/
}
LOG.info("Done Reading transcripts:"+name2transcript.size());
}
private void touch(int tid,int start1,int end1)
{
Set<String> seen=null;
Iterator<Transcript> iter=this.pos2transcript.overlapping(tid, start1-1, end1);
while(iter.hasNext())
{
Transcript transcript=iter.next();
if(seen==null) seen=new HashSet<String>();
if(!seen.add(transcript.name)) continue;
transcript.count[this.file_index]++;
}
}
private void run(final SamReader sfr) throws IOException
{
SAMFileHeader header=sfr.getFileHeader();
if(this.file_index==0)
{
firstDict=header.getSequenceDictionary();
LOG.info("Reading "+this.gtfFile);
BufferedReader gtfIn=IOUtils.openFileForBufferedReading(this.gtfFile);
parseGTF(gtfIn,header.getSequenceDictionary());
gtfIn.close();
}
else
{
if(!SequenceUtil.areSequenceDictionariesEqual(firstDict, header.getSequenceDictionary()))
{
throw new IOException("not same sequence dictionaires between "+
filenames.get(0)+" && "+
filenames.get(this.file_index)
);
}
}
long nReads=0L;
SAMRecordIterator iter=sfr.iterator();
while(iter.hasNext())
{
SAMRecord rec=iter.next();
if(rec.getReadUnmappedFlag()) continue;
if(rec.getDuplicateReadFlag()) continue;
if(rec.getNotPrimaryAlignmentFlag()) continue;
if(rec.getReadFailsVendorQualityCheckFlag()) continue;
if(nReads++%1E6==0)
{
LOG.info("Read "+nReads+" in "+this.filenames.get(this.file_index));
}
touch(rec.getReferenceIndex(),
rec.getAlignmentStart(),
rec.getAlignmentEnd()
);
}
iter.close();
}
@Override
public int doWork(List<String> args) {
if(features.isEmpty())
{
for(String F:new String[]{"CDS","exon"})
{
LOG.info("Adding "+F+" as default feature.");
features.add(F);
}
}
if( this.gtfFile==null)
{
LOG.error("undefined GTF file");
return -1;
}
SamReader sfr=null;
try
{
this.file_index=0;
if(args.isEmpty())
{
LOG.info("Opening stdin");
filenames.add("stdin");
sfr=super.openSamReader(null);
run(sfr);
sfr.close();
}
else
{
for(String filename:args)
{
filenames.add(filename);
}
for(String filename:filenames)
{
LOG.info("Opening "+filename);
sfr=super.openSamReader(filename);
run(sfr);
sfr.close();
this.file_index++;
}
}
if(this.print_header)
{
stdout().print("#");
if(this.print_chromstartend)
{
stdout().print("chrom\tstart\tend\t");
}
stdout().print("Transcript");
for(String filename:filenames)
{
stdout().print("\t"+filename);
}
stdout().println();
}
List<Transcript> ordered=new ArrayList<Transcript>(this.name2transcript.values());
Collections.sort(ordered,new java.util.Comparator<Transcript>()
{
@Override
public int compare(Transcript a,Transcript b)
{
int i= a.tid-b.tid;
if(i!=0) return i;
i= a.start-b.start;
if(i!=0) return i;
i= a.end-b.end;
if(i!=0) return i;
return a.name.compareTo(b.name);
}
});
for(Transcript tr:ordered)
{
if(tr.bad_flag) continue;
if(this.removeZero)
{
//remove unmapped transcripts
int x=0;
for(x=0;x< tr.count.length;++x)
if(tr.count[x]!=0) break;
if(x==tr.count.length) continue;
}
if(this.print_chromstartend)
{
stdout().print(firstDict.getSequence(tr.tid).getSequenceName());
stdout().print("\t");
stdout().print(tr.start);
stdout().print("\t");
stdout().print(tr.end);
stdout().print("\t");
}
stdout().print(tr.name);
for(int C:tr.count)
{
stdout().print("\t");
stdout().print(C);
}
stdout().println();
}
return 0;
}
catch(Exception err)
{
LOG.error(err);
return -1;
}
finally
{
CloserUtil.close(sfr);
}
}
public static void main(String[] args) {
new HtSeqCount().instanceMainWithExit(args);
}
}