package com.github.lindenb.jvarkit.tools.misc;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.SAMUtils;
import com.beust.jcommander.Parameter;
import com.github.lindenb.jvarkit.io.ArchiveFactory;
import com.github.lindenb.jvarkit.util.Counter;
import com.github.lindenb.jvarkit.util.illumina.FastQName;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
import com.github.lindenb.jvarkit.util.picard.FastqReader;
import com.github.lindenb.jvarkit.util.picard.FourLinesFastqReader;
@Program(name="IlluminaStatsFastq",description="Reads filenames from stdin: Count FASTQs in Illumina Result.")
public class IlluminaStatsFastq
extends Launcher
{
private static final Logger LOG = Logger.build(IlluminaStatsFastq.class).make();
@Parameter(names={"-o","--output"},description="Output zip file.")
private File outputFile = null;
@Parameter(names={"-X"},description="maximum number of DNA indexes to print. memory consuming if not 0. ")
private int COUNT_INDEX=0;
private static class Bases
{
long A=0L;
long T=0L;
long G=0L;
long C=0L;
long N=0L;
}
/* archive factory, where to put the results */
private ArchiveFactory archiveFactory;
private PrintWriter wnames=null;
private PrintWriter wcount=null;
private PrintWriter wquals=null;
private PrintWriter wbadfastq=null;
private PrintWriter whistquals=null;
private PrintWriter wqualperpos=null;
private PrintWriter wbases=null;
private PrintWriter wlength=null;
private PrintWriter wDNAIndexes=null;
private PrintWriter wsqlite=null;
private static void tsv(PrintWriter out,Object...array)
{
for(int i=0;i< array.length;++i)
{
if(i>0) out.print('\t');
out.print(array[i]);
}
out.println();
}
private class Analyzer extends Thread
{
private File fastqFile;
private final Pattern DNARegex=Pattern.compile("[ATGCatcNn]{4,8}");
private String hash;
Analyzer(File fastqFile)
{
this.fastqFile=fastqFile;
try {
MessageDigest md5=MessageDigest.getInstance("MD5");
md5.reset();
md5.update(String.valueOf(fastqFile.getPath()).getBytes());
hash = new BigInteger(1, md5.digest()).toString(16);
if (hash.length() != 32) {
final String zeros = "00000000000000000000000000000000";
hash= zeros.substring(0, 32 - hash.length()) + hash;
}
} catch (NoSuchAlgorithmException e) {
throw new RuntimeException("MD5 algorithm not found",e);
}
}
@Override
public void run()
{
try {
analyze(this.fastqFile);
}
catch (Exception e) {
LOG.error(e);
}
}
IlluminaStatsFastq owner()
{
return IlluminaStatsFastq.this;
}
private void analyze(File f) throws IOException
{
if(f==null) return;
if(!(f.getName().endsWith(".fastq.gz") && f.isFile())) return;
if(!f.canRead())
{
synchronized (IlluminaStatsFastq.class)
{
tsv(owner().wbadfastq,f.getPath(),"Cannot read");
}
return;
}
final int QUALITY_STEP=5;
LOG.info(f.toString());
FastQName fq=FastQName.parse(f);
Counter<Integer> qualityHistogram=new Counter<Integer>();
Counter<Integer> pos2quality=new Counter<Integer>();
List<Bases> pos2bases=new ArrayList<Bases>(300);
Counter<Integer> lengths=new Counter<Integer>();
Counter<Integer> pos2count=new Counter<Integer>();
Counter<String> dnaIndexes=new Counter<String>();
long nReads=0L;
double sum_qualities=0L;
long count_bases=0L;
long count_read_fails_filter=0L;
long count_read_doesnt_fail_filter=0L;
FastqReader r=null;
try
{
r=new FourLinesFastqReader(f);
r.setValidationStringency(ValidationStringency.LENIENT);
while(r.hasNext())
{
FastqRecord record=r.next();
++nReads;
if(record.getReadHeader().contains(":Y:"))
{
count_read_fails_filter++;
continue;
}
else if(record.getReadHeader().contains(":N:"))
{
count_read_doesnt_fail_filter++;
}
if(owner().COUNT_INDEX>0)
{
//index
int last_colon=record.getReadHeader().lastIndexOf(':');
if(last_colon!=-1 && last_colon+1< record.getReadHeader().length())
{
String dnaIndex=record.getReadHeader().substring(last_colon+1).trim().toUpperCase();
if(this.DNARegex.matcher(dnaIndex).matches())
{
dnaIndexes.incr(dnaIndex);
}
}
}
byte phred[]=SAMUtils.fastqToPhred(record.getBaseQualityString());
for(int i=0;i< phred.length ;++i)
{
sum_qualities+=phred[i];
count_bases++;
qualityHistogram.incr(phred[i]/QUALITY_STEP);
pos2quality.incr(i,phred[i]);
pos2count.incr(i);
}
/* get base usage */
while(pos2bases.size() <record.getReadString().length())
{
pos2bases.add(new Bases());
}
for(int i=0;i< record.getReadString().length() ;++i)
{
Bases bases=pos2bases.get(i);
switch(record.getReadString().charAt(i))
{
case 'A': case 'a':bases.A++;break;
case 'T': case 't':bases.T++;break;
case 'G': case 'g':bases.G++;break;
case 'C': case 'c':bases.C++;break;
default: bases.N++;break;
}
}
lengths.incr(record.getBaseQualityString().length());
}
}
catch(Exception err2)
{
LOG.error(err2);
err2.printStackTrace();
synchronized (IlluminaStatsFastq.class)
{
tsv(owner().wbadfastq,f.getPath(),this.hash,err2.getMessage());
}
return;
}
finally
{
if(r!=null) r.close();
r=null;
}
synchronized (IlluminaStatsFastq.class)
{
if(fq.isValid())
{
tsv(owner().wnames,
f.getPath(),
f.getParentFile(),
f.getName(),
this.hash,
(fq.isUndetermined()?"Undetermined":fq.getSample()),
fq.getSeqIndex(),
fq.getLane(),
fq.getSide(),
fq.getSplit(),
fq.getFile().length()
);
}
else
{
tsv(owner().wbadfastq,f.getPath(),this.hash);
}
tsv(owner().wcount,
this.hash,
nReads,
count_read_fails_filter,
count_read_doesnt_fail_filter
);
tsv(owner().wquals,
this.hash,
sum_qualities/count_bases
);
for(Integer step:qualityHistogram.keySet())
{
tsv(owner().whistquals,
this.hash,
step*QUALITY_STEP,
qualityHistogram.count(step)
);
}
for(Integer position:pos2quality.keySet())
{
tsv(owner().wqualperpos,
this.hash,
position+1,
pos2quality.count(position)/(double)pos2count.count(position),
pos2count.count(position)
);
}
for(int i=0;i< pos2bases.size();++i)
{
Bases b=pos2bases.get(i);
tsv(owner().wbases,
this.hash,
i+1,b.A,b.T,b.G,b.C,b.N
);
}
for(Integer L:lengths.keySet())
{
tsv(owner().wlength,
this.hash,
L,
lengths.count(L)
);
}
int count_out=0;
for(String dna:dnaIndexes.keySetDecreasing())
{
if(++count_out>owner().COUNT_INDEX) break;
tsv(owner().wDNAIndexes,this.hash,dna,dnaIndexes.count(dna));
}
}
}
}
@Override
public int doWork(List<String> args) {
if(!args.isEmpty())
{
System.err.println("Expected reads from stdin. Illegal Number of arguments.");
return -1;
}
if(this.outputFile==null)
{
System.err.println("undefined output file.");
return -1;
}
try {
archiveFactory=ArchiveFactory.open(this.outputFile);
this.wnames = archiveFactory.openWriter("names.tsv");
this.wcount = archiveFactory.openWriter("counts.tsv");
this.wquals = archiveFactory.openWriter("quals.tsv");
this.wbadfastq = archiveFactory.openWriter("notfastq.tsv");
this.whistquals = archiveFactory.openWriter("histquals.tsv");
this.wqualperpos = archiveFactory.openWriter("histpos2qual.tsv");
this.wbases = archiveFactory.openWriter("bases.tsv");
this.wlength = archiveFactory.openWriter("lengths.tsv");
this.wDNAIndexes = archiveFactory.openWriter("indexes.tsv");
this.wsqlite = archiveFactory.openWriter("sqlite3.sql");
LOG.info("reading from stdin");
BufferedReader in=new BufferedReader(new InputStreamReader(System.in));
for(;;)
{
String line=in.readLine();
if(line==null) break;
final Analyzer a=new Analyzer(new File(line));
a.run();
}
in.close();
this.wsqlite.println("create table if not exists wnames ( path TEXT, directory PATH, filename TEXT, md5 TEST,sample TEXT, dnaIndex TEXT, lane INT, side TEXT, split INT, fileSize INT );");
this.wsqlite.println("create table if not exists wcount ( md5 TEXT, nReads INT, count_read_fails_filter INT, count_read_doesnt_fail_filter INT );");
this.wsqlite.println("create table if not exists wquals ( md5 TEXT, qual FLOAT );");
this.wsqlite.println("create table if not exists whistquals ( md5 TEXT, qual FLOAT, nqual INT );");
this.wsqlite.println("create table if not exists wqualperpos ( md5 TEXT, position INT, qual FLOAT, nbases INT );");
this.wsqlite.println("create table if not exists wbases ( md5 TEXT, position INT, A INT, T INT, G INT, C INT, N INT );");
this.wsqlite.println("create table if not exists wlength ( md5 TEXT, readLen INT, nReads INT );");
this.wsqlite.println("create table if not exists wDNAIndexes ( md5 TEXT, dnaIndex TEXT, nReads INT );");
this.wsqlite.println(".separator '\t'");
this.wsqlite.println(".import names.tsv wnames\n");
this.wsqlite.println(".import counts.tsv wcount\n");
this.wsqlite.println(".import quals.tsv wquals\n");
this.wsqlite.println(".import histquals.tsv whistquals\n");
this.wsqlite.println(".import histpos2qual.tsv wqualperpos\n");
this.wsqlite.println(".import bases.tsv wbases\n");
this.wsqlite.println(".import lengths.tsv wlength\n");
this.wsqlite.println(".import indexes.tsv wDNAIndexes\n");
for(PrintWriter pw: new PrintWriter[]{
this.wnames,
this.wcount,
this.wquals,
this.wbadfastq,
this.whistquals,
this.wqualperpos,
this.wbases,
this.wlength,
this.wDNAIndexes,
this.wsqlite
})
{
pw.flush();
pw.close();
}
if(archiveFactory!=null)
{
this.archiveFactory.close();
}
LOG.info("Done.");
}
catch (Exception e) {
LOG.error(e);
return -1;
}
finally
{
}
return 0;
}
/**
* @param args
*/
public static void main(String[] args)
{
new IlluminaStatsFastq().instanceMainWithExit(args);
}
}