/*
The MIT License (MIT)
Copyright (c) 2014 Pierre Lindenbaum
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
History:
* 2014 creation
*/
package com.github.lindenb.jvarkit.tools.misc;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineType;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.github.lindenb.jvarkit.io.IOUtils;
import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress;
import com.github.lindenb.jvarkit.util.vcf.VCFUtils;
import com.github.lindenb.jvarkit.util.vcf.VcfIterator;
/**
BEGIN_DOC
### Example
```
$ yourtool-mergingvcf 1.vcf 2.vcf 3.vcf > merged.vcf
$ find ./ -name "*.bam" > bams.list
$ java -jar dist/fixvcfmissinggenotypes.jar -f bams.list < merged.vcf > out.vcf
```
```
$ find DIR1 -name "PREFIX_*_variations.gatk.annotations.vcf.gz" |\
grep -E '(S1|S2|S3|S4)' |\
xargs perl vcftools_0.1.12b/perl vcftools_0.1.12b/bin/vcf-merge |\
java -jar dist/fixvcfmissinggenotypes.jar -d 10 -f <( find DIR1 -name "PREFIX_*final.bam" | grep -E '(S1|S2|S3|S4)' ) |\
gzip --best > out.vcf.gz
```
### See also
* https://www.biostars.org/p/119007/
### History
* 2014: Creation
END_DOC
*/
import com.beust.jcommander.Parameter;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
/**
BEGIN_DOC
### Example
```
$ yourtool-mergingvcf 1.vcf 2.vcf 3.vcf > merged.vcf
$ find ./ -name "*.bam" > bams.list
$ java -jar dist/fixvcfmissinggenotypes.jar -f bams.list < merged.vcf > out.vcf
```
```
$ find DIR1 -name "PREFIX_*_variations.gatk.annotations.vcf.gz" |\
grep -E '(S1|S2|S3|S4)' |\
xargs perl vcftools_0.1.12b/perl vcftools_0.1.12b/bin/vcf-merge |\
java -jar dist/fixvcfmissinggenotypes.jar -d 10 -f <( find DIR1 -name "PREFIX_*final.bam" | grep -E '(S1|S2|S3|S4)' ) |\
gzip --best > out.vcf.gz
```
### See also
* https://www.biostars.org/p/119007/
### History
* 2014: Creation
END_DOC
*/
@Program(name="fixvcfmissinggenotypes",description="After a VCF-merge, read a VCF, look back at some BAMS to tells if the missing genotypes were homozygotes-ref or not-called. If the number of reads is greater than min.depth, then the missing genotypes is said hom-ref.")
public class FixVcfMissingGenotypes extends Launcher
{
private static final Logger LOG = Logger.build(FixVcfMissingGenotypes.class).make();
@Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout")
private File outputFile = null;
@Parameter(names={"-d","--depth"},description="min depth")
private int minDepth = 10 ;
@Parameter(names={"-B","--bams"},description=">path of indexed BAM path with read Groups. You can put those paths in a text file having a *.list sufffix")
private List<String> bamList=new ArrayList<>();
@Override
public int doWork(List<String> args) {
VcfIterator in=null;
try {
final Set<String> bamFiles= IOUtils.unrollFiles(bamList);
final Map<String,Set<File>> sample2bam=new HashMap<>(bamFiles.size());
for(final String bamFile: bamFiles)
{
LOG.info("Reading header for "+bamFile);
final SamReader reader=super.openSamReader(bamFile);
final SAMFileHeader header=reader.getFileHeader();
for(final SAMReadGroupRecord g:header.getReadGroups())
{
if(g.getSample()==null) continue;
String sample=g.getSample();
if(sample.isEmpty()) continue;
Set<File> set= sample2bam.get(sample);
if(set==null)
{
set=new HashSet<>();
sample2bam.put(sample,set);
}
set.add(new File(bamFile));
}
reader.close();
}
in = openVcfIterator(oneFileOrNull(args));
final File tmpFile1 = File.createTempFile("fixvcf", ".vcf");
final File tmpFile2 = File.createTempFile("fixvcf", ".vcf");
VCFHeader header=in.getHeader();
VCFHeader h2=new VCFHeader(header);
final String FIXED_TAG="FXG";
h2.addMetaDataLine(new VCFFormatHeaderLine(FIXED_TAG,1,VCFHeaderLineType.Integer,"Genotype was set as homozygous (min depth ="+this.minDepth+")"));
super.addMetaData(h2);
for(int i=0;i< header.getNGenotypeSamples();++i)
{
int countFixed=0;
int countNonFixed=0;
int countTotal=0;
final String sample= header.getSampleNamesInOrder().get(i);
LOG.info("Sample: "+sample);
Set<File> bams = sample2bam.get(sample);
if(bams==null) bams = new HashSet<File>();
if(bams.isEmpty())
{
LOG.warn("No bam to fix sample "+sample);
//don't 'continue' for simplicity
}
final List<SamReader> samReaders= new ArrayList<>(bams.size());
for(File bam:bams)
{
LOG.info("Opening "+bam);
samReaders.add(super.openSamReader(bam.getPath()));
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
final VariantContextWriter w = VCFUtils.createVariantContextWriter(i%2==0?tmpFile1:tmpFile2);
w.writeHeader(h2);
while(in.hasNext())
{
final VariantContext ctx= progress.watch(in.next());
countTotal++;
if(samReaders.isEmpty())
{
w.add(ctx);
continue;
}
final Genotype genotype = ctx.getGenotype(sample);
if(genotype!=null && genotype.isCalled())
{
w.add(ctx);
continue;
}
int depth=0;
//get depth for this position
for(SamReader sr: samReaders)
{
SAMRecordIterator iter=sr.query(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false);
while(iter.hasNext())
{
final SAMRecord rec=iter.next();
if(rec.getReadUnmappedFlag()) continue;
if(rec.getDuplicateReadFlag()) continue;
if(rec.isSecondaryOrSupplementary()) continue;
if(rec.getMappingQuality()==0 ) continue;
if(rec.getReadFailsVendorQualityCheckFlag()) continue;
final SAMReadGroupRecord rg=rec.getReadGroup();
if(!sample.equals(rg.getSample())) continue;
final Cigar cigar=rec.getCigar();
if(cigar==null) continue;
int refPos=rec.getAlignmentStart();
for(final CigarElement ce:cigar.getCigarElements())
{
if( refPos > ctx.getEnd() ) break;
if(!ce.getOperator().consumesReferenceBases()) continue;
if( ce.getOperator().consumesReadBases())
{
for(int n=0;n< ce.getLength();++n )
{
if( refPos+n < ctx.getStart() ) continue;
if( refPos+n > ctx.getEnd()) break;
depth++;
}
}
refPos+= ce.getLength();
}
}
iter.close();
}
depth /= ( 1 + ctx.getEnd() - ctx.getStart() );
if(depth< this.minDepth)
{
countNonFixed++;
w.add(ctx);
continue;
}
final List<Allele> homozygous=new ArrayList<>(2);
homozygous.add(ctx.getReference());
homozygous.add(ctx.getReference());
final GenotypeBuilder gb=new GenotypeBuilder(genotype);
gb.alleles(homozygous);
gb.attribute(FIXED_TAG, 1);
if(header.getFormatHeaderLine("DP")!=null)
{
gb.DP(depth);
}
final GenotypesContext gtx=GenotypesContext.copy(ctx.getGenotypes());
gtx.replace(gb.make());
final VariantContextBuilder vcb=new VariantContextBuilder(ctx);
vcb.genotypes(gtx);
w.add(vcb.make());
countFixed++;
}
progress.finish();
w.close();
in.close();
//closing BAMS
for(final SamReader r:samReaders) CloserUtil.close(r);
LOG.info("done sample "+sample+
" fixed="+countFixed+
" not-fixed="+countNonFixed+
" total="+countTotal+
" genotypes");
//reopen in
in = VCFUtils.createVcfIteratorFromFile(i%2==0?tmpFile1:tmpFile2);
h2= in.getHeader();
}
final VariantContextWriter w = super.openVariantContextWriter(this.outputFile);
w.writeHeader(h2);
while(in.hasNext())
{
w.add(in.next());
}
in.close();
w.close();
tmpFile1.delete();
tmpFile2.delete();
return RETURN_OK;
}
catch(Exception err)
{
LOG.error(err);
return -1;
}
finally
{
CloserUtil.close(in);
}
}
public static void main(String[] args) {
new FixVcfMissingGenotypes().instanceMainWithExit(args);
}
}