/*
The MIT License (MIT)
Copyright (c) 2014 Pierre Lindenbaum
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
History:
* 2014 creation
*/
package com.github.lindenb.jvarkit.tools.misc;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.function.Function;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import com.beust.jcommander.Parameter;
import com.github.lindenb.jvarkit.util.illumina.FastQName;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
import com.google.gson.stream.JsonWriter;
import htsjdk.samtools.util.CloserUtil;
/**
BEGIN_DOC
### Motivation
Illuminadir scans folders , search for FASTQs and generate a structured summary of the files (xml or json).
Currently only tested with HiSeq data.
### Examples
```
$ find dir1 dir2 -type f -name "*.fastq.gz" |\
java -jar dist/illuminadir.jar | \
xsltproc xml2script.xslt > script.bash
(...)
```
#### XML output
The XML ouput looks like this:
```
<?xml version="1.0" encoding="UTF-8"?>
<illumina>
<!--com.github.lindenb.jvarkit.tools.misc.IlluminaDirectory IN=[RUN62_XFC2DM8ACXX/data] JSON=false VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false-->
<directory path="RUN62_XFC2DM8ACXX/data">
<samples>
<sample name="SAMPLE1">
<pair md5="cd4b436ce7aff4cf669d282c6d9a7899" lane="8" index="ATCACG" split="2">
<fastq md5filename="3369c3457d6603f06379b654cb78e696" side="1" path="RUN62_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L008_R1_002.fastq.gz" file-size="359046311"/>
<fastq md5filename="832039fa00b5f40108848e48eb437e0b" side="2" path="RUN62_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L008_R2_002.fastq.gz" file-size="359659451"/>
</pair>
<pair md5="b3050fa3307e63ab9790b0e263c5d240" lane="8" index="ATCACG" split="3">
<fastq md5filename="091727bb6b300e463c3d708e157436ab" side="1" path="RUN62_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L008_R1_003.fastq.gz" file-size="206660736"/>
<fastq md5filename="20235ef4ec8845515beb4e13da34b5d3" side="2" path="RUN62_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L008_R2_003.fastq.gz" file-size="206715143"/>
</pair>
<pair md5="9f7ee49e87d01610372c43ab928939f6" lane="8" index="ATCACG" split="1">
<fastq md5filename="54cb2fd33edd5c2e787287ccf1595952" side="1" path="RUN62_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L008_R1_001.fastq.gz" file-size="354530831"/>
<fastq md5filename="e937cbdf32020074e50d3332c67cf6b3" side="2" path="RUN62_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L008_R2_001.fastq.gz" file-size="356908963"/>
</pair>
<pair md5="0697846a504158eef523c0f4ede85288" lane="7" index="ATCACG" split="2">
<fastq md5filename="6fb35d130efae4dcfa79260281504aa3" side="1" path="RUN62_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L007_R1_002.fastq.gz" file-size="357120615"/>
(...)
<pair md5="634cbb29ca64604174963a4fff09f37a" lane="7" split="1">
<fastq md5filename="bc0b283a58946fd75a95b330e0aefdc8" side="1" path="RUN62_XFC2DM8ACXX/data/Undetermined_indices/Sample_lane7/lane7_Undetermined_L007_R1_001.fastq.gz" file-size="371063045"/>
<fastq md5filename="9eab26c5b593d50d642399d172a11835" side="2" path="RUN62_XFC2DM8ACXX/data/Undetermined_indices/Sample_lane7/lane7_Undetermined_L007_R2_001.fastq.gz" file-size="372221753"/>
</pair>
<pair md5="bf31099075d6c3c7ea052b8038cb4a03" lane="8" split="2">
<fastq md5filename="f229389da36a3efc20888bffdec09b80" side="1" path="RUN62_XFC2DM8ACXX/data/Undetermined_indices/Sample_lane8/lane8_Undetermined_L008_R1_002.fastq.gz" file-size="374331268"/>
<fastq md5filename="417fd9f28d24f63ce0d0808d97543315" side="2" path="RUN62_XFC2DM8ACXX/data/Undetermined_indices/Sample_lane8/lane8_Undetermined_L008_R2_002.fastq.gz" file-size="372181102"/>
</pair>
<pair md5="95cab850b0608c53e8c83b25cfdb3b2b" lane="8" split="3">
<fastq md5filename="23f5be8a962697f50e2a271394242e2f" side="1" path="RUN62_XFC2DM8ACXX/data/Undetermined_indices/Sample_lane8/lane8_Undetermined_L008_R1_003.fastq.gz" file-size="60303589"/>
<fastq md5filename="3f39f212c36d0aa884b81649ad56630c" side="2" path="RUN62_XFC2DM8ACXX/data/Undetermined_indices/Sample_lane8/lane8_Undetermined_L008_R2_003.fastq.gz" file-size="59123627"/>
</pair>
<pair md5="ab108b1dda7df86f33f375367b86bfe4" lane="8" split="1">
<fastq md5filename="14f8281cf7d1a53d29cd03cb53a45b4a" side="1" path="RUN62_XFC2DM8ACXX/data/Undetermined_indices/Sample_lane8/lane8_Undetermined_L008_R1_001.fastq.gz" file-size="371255111"/>
<fastq md5filename="977fd388e1b3451dfcdbf9bdcbb89ed4" side="2" path="RUN62_XFC2DM8ACXX/data/Undetermined_indices/Sample_lane8/lane8_Undetermined_L008_R2_001.fastq.gz" file-size="370744530"/>
</pair>
</undetermined>
</directory>
</illumina>
```
How to use that file ? here is a example of **XSLT** stylesheet that can generate a **Makefile** to generate a **LaTex** about the number of reads per Lane/Sample/Index:
```
<?xml version='1.0' encoding="ISO-8859-1"?>
<xsl:stylesheet
xmlns:xsl='http://www.w3.org/1999/XSL/Transform'
version='1.0'
>
<xsl:output method="text"/>
<xsl:template match="/">
.PHONY:all clean
all: report.pdf
report.pdf: report.tex
pdflatex $<
report.tex : all.count
echo 'T<-read.table("$<",head=TRUE,sep="\t");$(foreach FTYPE,Index Sample Lane, T2<-tapply(T$$count,T$$${FTYPE},sum);png("${FTYPE}.png");barplot(T2,las=3);dev.off();)' | R --no-save
echo "\documentclass{report}" > $@
echo "\ usepackage{graphicx}" >> $@
echo "\date{\today}" >> $@
echo "\title{FastQ Report}" >> $@
echo "\begin{document}" >> $@
echo "\maketitle" >> $@
$(foreach FTYPE,Index Sample Lane, echo "\section{By ${FTYPE}}#\begin{center}#\includegraphics{${FTYPE}.png}#\end{center}" | tr "#" "\n" >> $@ ; )
echo "\end{document}" >> $@
all.count : $(addsuffix .count, <xsl:for-each select="//fastq" ><xsl:value-of select="@md5filename"/><xsl:text> </xsl:text></xsl:for-each>)
echo -e "Lane\tsplit\tside\tsize\tcount\tIndex\tSample" > $@ && \
cat $^ >> $@
<xsl:apply-templates select="//fastq" mode="count"/>
clean:
rm -f all.count report.pdf report.tex $(addsuffix .count, <xsl:for-each select="//fastq" ><xsl:value-of select="@md5filename"/><xsl:text> </xsl:text></xsl:for-each>)
</xsl:template>
<xsl:template match="fastq" mode="count">
$(addsuffix .count, <xsl:value-of select="@md5filename"/>): <xsl:value-of select="@path"/>
gunzip -c $< | awk '(NR%4==1)' | wc -l | xargs printf "<xsl:value-of select="../@lane"/>\t<xsl:value-of select="../@split"/>\t<xsl:value-of select="@side"/>\t<xsl:value-of select="@file-size"/>\t%s\t<xsl:choose><xsl:when test="../@index"><xsl:value-of select="../@index"/></xsl:when><xsl:otherwise>Undetermined</xsl:otherwise></xsl:choose>\t<xsl:choose><xsl:when test="../../@name"><xsl:value-of select="../../@name"/></xsl:when><xsl:otherwise>Undetermined</xsl:otherwise></xsl:choose>\n" > $@
</xsl:template>
</xsl:stylesheet>
```
```
$ xsltproc illumina.xml illumina2makefile.xsl > Makefile
```
output:
```
.PHONY:all clean
all: report.pdf
report.pdf: report.tex
pdflatex $<
report.tex : all.count
echo 'T<-read.table("$<",head=TRUE,sep="\t");$(foreach FTYPE,Index Sample Lane, T2<-tapply(T$$count,T$$${FTYPE},sum);png("${FTYPE}.png");barplot(T2,las=3);dev.off();)' | R --no-save
echo "\documentclass{report}" > $@
echo "\ usepackage{graphicx}" >> $@
echo "\date{\today}" >> $@
echo "\title{FastQ Report}" >> $@
echo "\begin{document}" >> $@
echo "\maketitle" >> $@
$(foreach FTYPE,Index Sample Lane, echo "\section{By ${FTYPE}}#\begin{center}#\includegraphics{${FTYPE}.png}#\end{center}" | tr "#" "\n" >> $@ ; )
echo "\end{document}" >> $@
all.count : $(addsuffix .count, 3369c3457d6603f06379b654cb78e696 832039fa00b5f40108848e48eb437e0b 091727bb6b300e463c3d708e157436ab 20235ef4ec88....)
echo -e "Lane\tsplit\tside\tsize\tcount\tIndex\tSample" > $@ && \
cat $^ >> $@
$(addsuffix .count, 3369c3457d6603f06379b654cb78e696): RUN62_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L008_R1_002.fastq.gz
gunzip -c $< | awk '(NR%4==1)' | wc -l | xargs printf "8\t2\t1\t359046311\t%s\tATCACG\tSAMPLE1\n" > $@
$(addsuffix .count, 832039fa00b5f40108848e48eb437e0b): RUN62_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L008_R2_002.fastq.gz
gunzip -c $< | awk '(NR%4==1)' | wc -l | xargs printf "8\t2\t2\t359659451\t%s\tATCACG\tSAMPLE1\n" > $@
(....)
```
#### JSON output
The JSON output looks loke this
```
{"directory":"RUN62_XFC2DM8ACXX/data","samples":[{"sample":"SAMPLE1","files":[{
"md5pair":"cd4b436ce7aff4cf669d282c6d9a7899","lane":8,"index":"ATCACG","split":2
,"forward":{"md5filename":"3369c3457d6603f06379b654cb78e696","path":"20131001_SN
L149_0062_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L008_R1_002.fastq.g
z","side":1,"file-size":359046311},"reverse":{"md5filename":"832039fa00b5f401088
48e48eb437e0b","path":"20131001_SNL149_0062_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/
SAMPLE1_ATCACG_L008_R2_002.fastq.gz","side":2,"file-size":359659451}},{"md5pair"
:"b3050fa3307e63ab9790b0e263c5d240","lane":8,"index":"ATCACG","split":3,"forward
":{"md5filename":"091727bb6b300e463c3d708e157436ab","path":"20131001_SNL149_0062
_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L008_R1_003.fastq.gz","side"
:1,"file-size":206660736},"reverse":{"md5filename":"20235ef4ec8845515beb4e13da34
b5d3","path":"20131001_SNL149_0062_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_A
TCACG_L008_R2_003.fastq.gz","side":2,"file-size":206715143}},{"md5pair":"9f7ee49
e87d01610372c43ab928939f6","lane":8,"index":"ATCACG","split":1,"forward":{"md5fi
lename":"54cb2fd33edd5c2e787287ccf1595952","path":"20131001_SNL149_0062_XFC2DM8A
CXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L008_R1_001.fastq.gz","side":1,"file-
size":354530831},"reverse":{"md5filename":"e937cbdf32020074e50d3332c67cf6b3","pa
th":"20131001_SNL149_0062_XFC2DM8ACXX/data/OUT/Sample_SAMPLE1/SAMPLE1_ATCACG_L00
8_R2_001.fastq.gz","side":2,"file-size":356908963}},{"md5pair":"0697846a504158ee
f523c0f4ede85288","lane":7,"index":"ATCACG","split":2,"forward":{"md5filename":"
```
It can be processed using a tool like [jsvelocity](https://github.com/lindenb/jsvelocity) to generate the same kind of Makefile:
The velocity template for jsvelocity (https://github.com/lindenb/jsvelocity)
```
#macro(maketarget $fastq)
$(addsuffix .count, ${fastq.md5filename}): ${fastq.path}
gunzip -c $< | awk '(NR%4==1)' | wc -l | xargs printf "${fastq.parentNode.lane}\t${fastq.parentNode.split}\t${fastq.side}\t${fastq['file-size']}\t%s\t#if(${fastq.parentNode.containsKey("index")})${fastq.parentNode.index}#{else}Undetermined#{end}\t#if(${fastq.parentNode.parentNode.containsKey("name")})${fastq.parentNode.parentNode.name}#{else}Undetermined#{end}\n" > $@
#end
.PHONY:all clean
all: report.pdf
report.pdf: report.tex
pdflatex $<
report.tex : all.count
echo 'T<-read.table("$<",head=TRUE,sep="\t");$(foreach FTYPE,Index Sample Lane, T2<-tapply(T$$count,T$$${FTYPE},sum);png("${FTYPE}.png");barplot(T2,las=3);dev.off();)' | R --no-save
echo "\documentclass{report}" > $@
echo "\ usepackage{graphicx}" >> $@
echo "\date{\today}" >> $@
echo "\title{FastQ Report}" >> $@
echo "\begin{document}" >> $@
echo "\maketitle" >> $@
$(foreach FTYPE,Index Sample Lane, echo "\section{By ${FTYPE}}#\begin{center}#\includegraphics{${FTYPE}.png}#\end{center}" | tr "#" "\n" >> $@ ; )
echo "\end{document}" >> $@
all.count : $(addsuffix .count, #foreach($dir in $all) #foreach($sample in ${dir.samples})#foreach($pair in ${sample.files}) ${pair.forward.md5filename} ${pair.reverse.md5filename} #end #end #foreach($pair in ${dir.undetermined}) ${pair.forward.md5filename} ${pair.reverse.md5filename} #end #end )
#foreach($dir in $all)
#foreach($sample in ${dir.samples})
#foreach($pair in ${sample.files})
#maketarget($pair.forward)
#maketarget($pair.reverse)
#end
#end
#foreach($pair in ${dir.undetermined})
#maketarget($pair.forward)
#maketarget($pair.reverse)
#end
#end
clean:
rm -f all.count $(addsuffix .count, #foreach($dir in $all)
#foreach($sample in ${dir.samples})
#foreach($pair in ${sample.files}) ${pair.forward.md5filename} ${pair.reverse.md5filename} #end #end
#foreach($pair in ${dir.undetermined}) ${pair.forward.md5filename} ${pair.reverse.md5filename} #end #end )
```
transform using jsvelocity:
```
java -jar dist/jsvelocity.jar \
-d all illumina.json \
illumina.vm > Makefile
```
output: same as above
END_DOC
*/
@Program(name="illuminadir",description="Create a structured (**JSON** or **XML**) representation of a directory containing some Illumina FASTQs.")
public class IlluminaDirectory
extends Launcher
{
private static final Logger LOG = Logger.build(IlluminaDirectory.class).make();
@Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout")
private File outputFile = null;
@Parameter(names={"-J","--json"},description="Generate JSON output.")
private boolean JSON = false;
private final Function<String, String> str2md5 = new StringToMd5();
private int ID_GENERATOR=0;
private class Folder
{
String projectName="Project1";
final SortedMap<String, Sample> sampleMap=new TreeMap<String, IlluminaDirectory.Sample>();
final List<Pair> undetermined=new ArrayList<Pair>();
void scan(final File f)
{
if(f==null) return;
if(!f.canRead()) return;
LOG.info("Scanning "+f);
FastQName fq=FastQName.parse(f);
if(!fq.isValid())
{
LOG.warn("invalid name:"+fq);
return;
}
if(fq.isUndetermined())
{
for(int i=0;i< undetermined.size();++i)
{
Pair p=undetermined.get(i);
if(p.complement(fq)) return;
}
undetermined.add(new Pair(fq));
}
else
{
Sample sample=this.sampleMap.get(fq.getSample());
if(sample==null)
{
sample=new Sample();
sample.name=fq.getSample();
this.sampleMap.put(sample.name,sample);
}
sample.add(fq);
final File sampleDir = f.getParentFile();
if(sampleDir!=null && sampleDir.isDirectory() && sampleDir.getName().startsWith("Sample_"))
{
final File projDir = sampleDir.getParentFile();
if(projDir!=null && projDir.isDirectory() && projDir.getName().startsWith("Project_"))
{
this.projectName = projDir.getName().substring(8).replace(' ', '_');
}
}
}
}
void json(final JsonWriter out) throws IOException
{
out.beginObject();
out.name("samples");
out.beginArray();
for(final Sample S:this.sampleMap.values())
{
S.json(out);
}
out.endArray();
out.name("undetermined");
out.beginArray();
for(final Pair p:undetermined)
{
p.json(out);
}
out.endArray();
out.endObject();
}
void write(final XMLStreamWriter w) throws XMLStreamException
{
w.writeStartElement("project");
w.writeAttribute("name",this.projectName);
w.writeAttribute("center", "CENTER");
w.writeAttribute("haloplex", "false");
w.writeAttribute("wgs", "false");
for(final Sample S:this.sampleMap.values())
{
S.write(w);
}
w.writeStartElement("undetermined");
for(final Pair p:this.undetermined)
{
p.write(w);
}
w.writeEndElement();
w.writeEndElement();
}
}
/**
* A pair of fastq , Forward, reverse
*/
private class Pair
{
int id;
FastQName forward;
FastQName reverse;
Pair(FastQName fq)
{
id=++ID_GENERATOR;
switch(fq.getSide())
{
case Forward:forward=fq; break;
case Reverse:reverse=fq; break;
default:throw new RuntimeException("bad side "+fq);
}
}
boolean complement(final FastQName other)
{
if(forward!=null && reverse!=null) return false;
if(forward!=null && forward.isComplementOf(other))
{
reverse=other;
return true;
}
else if(reverse!=null && reverse.isComplementOf(other))
{
forward=other;
return true;
}
return false;
}
void json(final JsonWriter out) throws IOException
{
if(forward!=null && reverse!=null)
{
out.beginObject();
out.name("id");out.value("p"+this.id);
out.name("md5pair");out.value(str2md5.apply(forward.getFile().getPath()+reverse.getFile().getPath()));
out.name("lane");out.value(""+forward.getLane());
out.name("index");
if(forward.getSeqIndex()!=null)
{
out.value(forward.getSeqIndex());
}
else
{
out.nullValue();
}
out.name("split");out.value(""+forward.getSplit());
out.name("forward");
out.beginObject();
out.name("md5filename");out.value(str2md5.apply(forward.getFile().getPath()));
out.name("path");out.value(forward.getFile().getPath());
out.name("side");out.value(forward.getSide().ordinal());
out.name("file-size");out.value(forward.getFile().length());
out.endObject();
out.name("reverse");
out.beginObject();
out.name("md5filename");out.value(str2md5.apply(reverse.getFile().getPath()));
out.name("path");out.value(reverse.getFile().getPath());
out.name("side");out.value(reverse.getSide().ordinal());
out.name("file-size");out.value(reverse.getFile().length());
out.endObject();
out.endObject();
}
else
{
final FastQName F=(forward==null?reverse:forward);
out.beginObject();
out.name("id");out.value("p"+this.id);
out.name("md5filename");out.value(str2md5.apply(F.getFile().getPath()));
out.name("lane");out.value(""+F.getLane());
out.name("index");
if(F.getSeqIndex()!=null)
{
out.value(F.getSeqIndex());
}
else
{
out.nullValue();
}
out.name("split");out.value(""+F.getSplit());
out.name("path");out.value(F.getFile().getPath());
out.name("side");out.value(F.getSide().ordinal());
out.endObject();
}
}
void write(XMLStreamWriter w,String tagName,FastQName fastqFile) throws XMLStreamException
{
w.writeStartElement(tagName);
w.writeAttribute("md5filename",str2md5.apply(fastqFile.getFile().getPath()));
w.writeAttribute("file-size",String.valueOf( fastqFile.getFile().length()));
w.writeCharacters(fastqFile.getFile().getPath());
w.writeEndElement();
}
void write(XMLStreamWriter w) throws XMLStreamException
{
w.writeStartElement("fastq");
w.writeAttribute("id","p"+this.id);
w.writeAttribute("md5",str2md5.apply(forward.getFile().getPath()+reverse.getFile().getPath()));
w.writeAttribute("lane", String.valueOf(forward.getLane()));
if(forward.getSeqIndex()!=null) w.writeAttribute("index", String.valueOf(forward.getSeqIndex()));
w.writeAttribute("split", String.valueOf(forward.getSplit()));
w.writeAttribute("group-id", getGroupId());
if(forward!=null && reverse!=null)
{
write(w,"for",forward);
write(w,"rev",reverse);
}
else
{
write(w,"single",forward==null?reverse:forward);
}
w.writeEndElement();
}
private String getGroupId()
{
return IlluminaDirectory.this.getGroupId(this.forward);
}
}
private Map<String,String> groupIdMap=new HashMap<>();
private String getGroupId(final FastQName fastq)
{
final String s= str2md5.apply( fastq.getSample()+" "+fastq.getLane()+" "+fastq.getSeqIndex());
String gid = this.groupIdMap.get(s);
if(gid==null)
{
gid = fastq.getSample()+"."+(this.groupIdMap.size()+1);
this.groupIdMap.put(s, gid);
}
return gid;
}
private class Sample
{
String name;
final List<Pair> pairs=new ArrayList<Pair>();
private void add(FastQName fq)
{
for(int i=0;i< pairs.size();++i)
{
Pair p=pairs.get(i);
if(p.complement(fq)) return;
}
pairs.add(new Pair(fq));
}
void write(XMLStreamWriter w) throws XMLStreamException
{
w.writeStartElement("sample");
w.writeAttribute("name",this.name);
w.writeAttribute("father","undefined");
w.writeAttribute("mother","undefined");
w.writeAttribute("sex","undefined");
for(Pair p:this.pairs)
{
p.write(w);
}
w.writeEndElement();
}
void json(final JsonWriter out) throws IOException
{
out.beginObject();
out.name("sample");
out.value(this.name);
out.name("files");
out.beginArray();
for(final Pair p: this.pairs)
{
p.json(out);
}
out.endArray();
out.endObject();
}
}
@Override
public int doWork(List<String> args) {
BufferedReader in=null;
try
{
final String inputName= oneFileOrNull(args);
in= super.openBufferedReader(inputName);
final Folder folder=new Folder();
String line;
while((line=in.readLine())!=null)
{
if(line.isEmpty() || line.startsWith("#")) continue;
if(!line.endsWith(".fastq.gz"))
{
LOG.warn("ignoring "+line+" because it doesn't end with *.fastq.gz");
continue;
}
final File f=new File(line);
if(!f.exists())
{
return wrapException("Doesn't exist:"+f);
}
if(!f.isFile())
{
return wrapException("Not a file:"+f);
}
folder.scan(f);
}
in.close();
final PrintWriter pw = this.openFileOrStdoutAsPrintWriter(outputFile);
if(this.JSON)
{
final JsonWriter js=new JsonWriter(pw);
folder.json(js);
CloserUtil.close(js);
}
else
{
final XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance();
final XMLStreamWriter w= xmlfactory.createXMLStreamWriter(pw);
w.writeStartDocument("UTF-8","1.0");
folder.write(w);
w.writeEndDocument();
w.flush();
CloserUtil.close(w);
}
pw.flush();
CloserUtil.close(pw);
return RETURN_OK;
}
catch(final Exception err)
{
return wrapException(err);
}
finally
{
CloserUtil.close(in);
}
}
/**
* @param args
*/
public static void main(String[] args)
{
new IlluminaDirectory().instanceMainWithExit(args);
}
}