/*
The MIT License (MIT)
Copyright (c) 2014 Pierre Lindenbaum
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
History:
* 2015 creation
*/
package com.github.lindenb.jvarkit.tools.xcontamination;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import java.io.File;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.beust.jcommander.Parameter;
import com.github.lindenb.jvarkit.io.IOUtils;
import com.github.lindenb.jvarkit.util.Counter;
import com.github.lindenb.jvarkit.util.illumina.ShortReadName;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress;
import com.github.lindenb.jvarkit.util.vcf.VcfIterator;
/**
BEGIN_DOC
## Example
```bash
$ find . -type f -name "*.bam" > bam.list
$ head -n 10000 variant.vcf | java -jar dist/xcontaminations.jar - bam.list > out.tsv
$ verticalize out.tsv
>>> 2
$1 #Machine:FlowCell:Run:Lane-1 : HISEQ10:C3FBPACXX:0:4
$2 sample1 : B00G5V9
$3 Machine:FlowCell:Run:Lane-2 : HISEQ10:C486PACXX:0:3
$4 sample2 : B00G7LK
$5 same.lane : 0
$6 reads_sample1_supporting_sample1 : 26392
$7 reads_sample1_supporting_sample2 : 70
$8 reads_sample1_supporting_others : 40
$9 reads_sample2_supporting_sample2 : 21473
$10 reads_sample2_supporting_sample1 : 39
$11 reads_sample2_supporting_other : 31
<<< 2
(...)
>>> 9
$1 #Machine:FlowCell:Run:Lane-1 : HISEQ5:C3FV0ACXX:0:7
$2 sample1 : B00G738
$3 Machine:FlowCell:Run:Lane-2 : HISEQ5:C3FV0ACXX:0:7
$4 sample2 : B00G754
$5 same.lane : 1
$6 reads_sample1_supporting_sample1 : 10209
$7 reads_sample1_supporting_sample2 : 23
$8 reads_sample1_supporting_others : 15
$9 reads_sample2_supporting_sample2 : 9054
$10 reads_sample2_supporting_sample1 : 32
$11 reads_sample2_supporting_other : 9
<<< 9
```
generating in parallel:
```make
CHROMS=1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
.PHONY:all
define xcont
$$(addprefix tmp.xcont.,$$(addsuffix .tsv.gz,$(1))) :
bcftools view -r "$(1)" unifiedgenotyper.vcf.gz -Tcapture.bed | java -Xmx1g -jar xcontaminations.jar - bal.list | gzip --best > $$(addsuffix .tmp.gz,$$@) && mv
$$(addsuffix .tmp.gz,$$@) $$@
endef
all: $(foreach C,${CHROMS},$(addprefix tmp.xcont.,$(addsuffix .tsv.gz,${C})))
$(foreach I,0 1, gunzip -c $^ | awk -F ' ' '($$5==$I)' |awk -F ' ' 'BEGIN {T=0;N=0;} {for(i=6;i<=NF;++i) T+=int($$i); N+=int($$7); N+=int($$10);} E
ND { printf("%f\n",N/T);}'; )
$(foreach C,${CHROMS},$(eval $(call xcont,$C)))
```
END_DOC
*/
@Program(name="xcontaminations",description="For @AdrienLeger2 : cross contamination between samples in same lane")
public class XContaminations extends Launcher
{
private static final Logger LOG=Logger.build(XContaminations.class).make();
@Parameter(names={"-o","--out"},description="Output file or stdout")
private File outputFile = null;
private Set<File> bamFiles=new HashSet<File>();
private static class SampleAlleles
{
long reads_sample1_supporting_sample1=0;
long reads_sample1_supporting_sample2=0;
long reads_sample1_supporting_other=0;
long reads_sample2_supporting_sample1=0;
long reads_sample2_supporting_sample2=0;
long reads_sample2_supporting_other=0;
}
private static class SamplePair
{
SequencerFlowCellRunLaneSample sample1;
SequencerFlowCellRunLaneSample sample2;
SamplePair(SequencerFlowCellRunLaneSample s1,SequencerFlowCellRunLaneSample s2)
{
if(s1.sampleName.compareTo(s2.sampleName)<0)
{
sample1=s1;
sample2=s2;
}
else
{
sample1=s2;
sample2=s1;
}
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + sample1.hashCode();
result = prime * result + sample2.hashCode();
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
SamplePair other = (SamplePair) obj;
if (!sample1.equals(other.sample1)) return false;
if (!sample2.equals(other.sample2)) return false;
return true;
}
@Override
public String toString() {
return "("+sample1+"/"+sample2+")";
}
}
private static class SequencerFlowCellRunLaneSample
{
String machine;
String flowCell;
int run;
int lane;
String sampleName;
SequencerFlowCellRunLaneSample(ShortReadName name,String sampleName)
{
this.machine=name.getInstrumentName();
this.flowCell=name.getFlowCellId();
this.run=Math.max(name.getRunId(),0);
this.lane=name.getFlowCellLane();
this.sampleName=sampleName;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + flowCell.hashCode();
result = prime * result + lane;
result = prime * result + machine.hashCode();
result = prime * result + sampleName.hashCode();
result = prime * result + run;
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
SequencerFlowCellRunLaneSample other = (SequencerFlowCellRunLaneSample) obj;
if (lane != other.lane) return false;
if (run != other.run) return false;
if (!flowCell.equals(other.flowCell)) return false;
if (!machine.equals(other.machine)) return false;
if (!sampleName.equals(other.sampleName)) return false;
return true;
}
public String getSequencingLabel()
{
return machine+":"+flowCell+":"+run+":"+lane;
}
@Override
public String toString() {
return getSequencingLabel()+":"+sampleName;
}
}
public void addBamFile(File bamFile)
{
this.bamFiles.add(bamFile);
}
@Override
public int doWork(List<String> args) {
if(args.size()<2)
{
LOG.error("Illegal Number of args");
return -1;
}
for(String f:IOUtils.unrollFiles(args.subList(1, args.size())))
{
addBamFile(new File(f));
}
if(this.bamFiles.isEmpty())
{
LOG.error("Undefined BAM file(s)");
return -1;
}
SAMRecordIterator iter=null;
VcfIterator in=null;
Map<String,SamReader> sample2samReader=new HashMap<>();
try {
SamReaderFactory srf=SamReaderFactory.makeDefault().validationStringency(ValidationStringency.LENIENT);
if(args.get(0).equals("-"))
{
in = super.openVcfIterator(null);
}
else
{
in = super.openVcfIterator(args.get(0));
}
VCFHeader vcfHeader=in.getHeader();
SAMSequenceDictionary dict1=vcfHeader.getSequenceDictionary();
if(dict1==null)
{
LOG.error("VCF is missing a SAM sequence dictionary");
return -1;
}
Set<String> sampleNames= new HashSet<>(vcfHeader.getSampleNamesInOrder());
if( sampleNames.isEmpty())
{
LOG.error("VCF contains no sample");
return -1;
}
for(File bamFile:this.bamFiles)
{
LOG.info("Opening "+bamFile);
SamReader samReader=srf.open(bamFile);
SAMFileHeader samHeader= samReader.getFileHeader();
SAMSequenceDictionary dict2=samHeader.getSequenceDictionary();
if(dict2==null)
{
samReader.close();
LOG.error("BAM is missing a SAM sequence dictionary");
return -1;
}
if(!SequenceUtil.areSequenceDictionariesEqual(dict1, dict2))
{
samReader.close();
LOG.error("VCF/BAM Not the same dictionaries");
return -1;
}
if(!samReader.hasIndex())
{
samReader.close();
LOG.error("sam is not index");
return -1;
}
String sampleName=null;
for(SAMReadGroupRecord rgr:samHeader.getReadGroups())
{
String s=rgr.getSample();
if(s==null ) continue;
if(sampleName==null)
{
sampleName=s;
}
else if(!sampleName.equals(s))
{
samReader.close();
LOG.error("Cannot handle more than one sample/bam");
return -1;
}
}
if(sampleName==null)
{
samReader.close();
LOG.error("No sample in "+bamFile);
continue;//skip this bam
}
if(!sampleNames.contains(sampleName))
{
samReader.close();
LOG.error("Not in VCF header: sample "+sampleName+" "+bamFile);
continue;//skip this bam
}
if(sample2samReader.containsKey(sampleName))
{
samReader.close();
LOG.error("Cannot handle more than one bam/sample");
return -1;
}
sample2samReader.put(sampleName, samReader);
}
if(sample2samReader.size()<2)
{
LOG.error("Not engough BAM/samples. Expected at least two valid BAMs");
return -1;
}
sampleNames.retainAll(sample2samReader.keySet());
Map<SamplePair,SampleAlleles> contaminationTable=new HashMap<>();
SAMSequenceDictionaryProgress progress=new SAMSequenceDictionaryProgress(dict1);
while(in.hasNext())
{
VariantContext ctx= progress.watch(in.next());
if(!ctx.isSNP() || !ctx.isBiallelic() || ctx.isSymbolic()) continue;
boolean isWorthScanning=false;
/* scan Reads for this Sample */
for(String s1: sampleNames)
{
Genotype g1=ctx.getGenotype(s1);
if(g1==null || !g1.isHom()) continue;
/* scan Reads for this Sample */
for(String s2: sampleNames)
{
if(s1.compareTo(s2)>=0) continue;
Genotype g2=ctx.getGenotype(s2);
if(g2==null || !g2.isHom()) continue;
if(g1.sameGenotype(g2)) continue;
isWorthScanning=true;
break;
}
if(isWorthScanning) break;
}
if(!isWorthScanning) continue;
//Set<SequencerFlowCellRunLane> sequencerFlowCellRunLaneInThisContext=new HashSet<>();
Map<SequencerFlowCellRunLaneSample,Counter<Allele>> sample2allelesCount=new HashMap<>();
/* scan Reads for this Sample */
for(String sampleName: sampleNames)
{
//sample name is not in vcf header
if(!sample2samReader.containsKey(sampleName))
continue;
SamReader samReader = sample2samReader.get(sampleName);
if(samReader==null) continue;
Genotype genotype = ctx.getGenotype(sampleName);
if(genotype==null || !genotype.isHom()) continue;
iter = samReader.query(
ctx.getContig(),
ctx.getStart(),
ctx.getEnd(),
false
);
while(iter.hasNext())
{
SAMRecord record= iter.next();
if(record.getReadUnmappedFlag()) continue;
if(record.isSecondaryOrSupplementary()) continue;
if(record.getDuplicateReadFlag()) continue;
if(record.getMappingQuality()==0 || record.getMappingQuality()==255) continue;
if(record.getReadPairedFlag())
{
if(!record.getProperPairFlag()) continue;
}
SAMReadGroupRecord srgr = record.getReadGroup();
//not current sample
if(srgr==null) continue;
if(!sampleName.equals(srgr.getSample())) continue;
ShortReadName readName = ShortReadName.parse(record);
if(!readName.isValid())
{
LOG.info("No a valid read name "+record.getReadName());
continue;
}
SequencerFlowCellRunLaneSample sequencerFlowCellRunLaneSample=
new SequencerFlowCellRunLaneSample(readName, sampleName);
Cigar cigar=record.getCigar();
if(cigar==null) continue;
byte readSeq[]=record.getReadBases();
if(readSeq==null) continue;
int refPos1 = record.getAlignmentStart();
int readPos = 0;
for(CigarElement ce: cigar.getCigarElements())
{
//beyond variant position ?
if(refPos1>ctx.getStart()) break;
CigarOperator op=ce.getOperator();
switch(op)
{
case I: readPos+=ce.getLength(); break;
case N://threw
case D: refPos1+=ce.getLength(); break;
case S: readPos+=ce.getLength(); break;
case H: break;
case P: break;
case M: case EQ: case X:
{
for(int i=0;i< ce.getLength();++i)
{
if( refPos1 == ctx.getStart())
{
byte base = readSeq[readPos];
if(!(base=='N' || base=='n'))
{
Counter<Allele> sampleAlleles= sample2allelesCount.get(sequencerFlowCellRunLaneSample);
if(sampleAlleles==null)
{
sampleAlleles=new Counter<Allele>();
sample2allelesCount.put(sequencerFlowCellRunLaneSample, sampleAlleles);
}
sampleAlleles.incr(
Allele.create(base,false)
);
}
}
refPos1++;
readPos++;
}
break;
}
default: throw new IllegalStateException();
}
}
}
iter.close();
iter=null;
}/* end scan reads for this sample */
/* sum-up data for this SNP */
for(String sample1: sampleNames)
{
Genotype g1= ctx.getGenotype(sample1);
if(g1==null || !g1.isHom()) continue;
Allele a1 = Allele.create(g1.getAllele(0).getBases(),false);
for(String sample2: sampleNames)
{
if(sample1.compareTo(sample2)>=0) continue;
Genotype g2= ctx.getGenotype(sample2);
if(g2==null || !g2.isHom() || g2.sameGenotype(g1)) continue;
Allele a2 = Allele.create(g2.getAllele(0).getBases(),false);
for(SequencerFlowCellRunLaneSample sfcr1:sample2allelesCount.keySet())
{
if(!sfcr1.sampleName.equals(sample1)) continue;
for(SequencerFlowCellRunLaneSample sfcr2:sample2allelesCount.keySet())
{
if(!sfcr2.sampleName.equals(sample2)) continue;
SamplePair samplePair=new SamplePair(sfcr1, sfcr2);
Counter<Allele> counter1 = sample2allelesCount.get(sfcr1);
if(counter1==null) continue;
Counter<Allele> counter2 = sample2allelesCount.get(sfcr2);
if(counter2==null) continue;
SampleAlleles sampleAlleles= contaminationTable.get(samplePair);
if(sampleAlleles==null)
{
sampleAlleles=new SampleAlleles();
contaminationTable.put(samplePair,sampleAlleles);
if( contaminationTable.size()%10000==0) LOG.info("n(pairs)=" + contaminationTable.size() );
}
for(Allele allele: counter1.keySet())
{
long n = counter1.count(allele);
if(allele.equals(a1))
{
sampleAlleles.reads_sample1_supporting_sample1 += n;
}
else if(allele.equals(a2))
{
sampleAlleles.reads_sample1_supporting_sample2 += n;
}
else
{
sampleAlleles.reads_sample1_supporting_other += n;
}
}
for(Allele allele: counter2.keySet())
{
long n = counter2.count(allele);
if(allele.equals(a2))
{
sampleAlleles.reads_sample2_supporting_sample2 += n;
}
else if(allele.equals(a1))
{
sampleAlleles.reads_sample2_supporting_sample1 += n;
}
else
{
sampleAlleles.reads_sample2_supporting_other += n;
}
}
}
}
}
}
}
progress.finish();
boolean somethingPrinted=false;
PrintWriter pw= super.openFileOrStdoutAsPrintWriter(this.outputFile);
/* we're done, print the result */
pw.print("#");
pw.print("Machine:FlowCell:Run:Lane-1\tsample1");
pw.print("\tMachine:FlowCell:Run:Lane-2\tsample2");
pw.print("\tsame.lane");
pw.print("\treads_sample1_supporting_sample1");
pw.print('\t');
pw.print("reads_sample1_supporting_sample2");
pw.print('\t');
pw.print("reads_sample1_supporting_others");
pw.print('\t');
pw.print("reads_sample2_supporting_sample2");
pw.print('\t');
pw.print("reads_sample2_supporting_sample1");
pw.print('\t');
pw.print("reads_sample2_supporting_other");
pw.println();
for(SamplePair pair : contaminationTable.keySet())
{
SampleAlleles sampleAlleles = contaminationTable.get(pair);
if(sampleAlleles==null) continue;
pw.print(pair.sample1.getSequencingLabel());
pw.print('\t');
pw.print(pair.sample1.sampleName);
pw.print('\t');
pw.print(pair.sample2.getSequencingLabel());
pw.print('\t');
pw.print(pair.sample2.sampleName);
pw.print('\t');
pw.print(pair.sample1.getSequencingLabel().equals(pair.sample2.getSequencingLabel())?1:0);
pw.print('\t');
pw.print(sampleAlleles.reads_sample1_supporting_sample1);
pw.print('\t');
pw.print(sampleAlleles.reads_sample1_supporting_sample2);
pw.print('\t');
pw.print(sampleAlleles.reads_sample1_supporting_other);
pw.print('\t');
pw.print(sampleAlleles.reads_sample2_supporting_sample2);
pw.print('\t');
pw.print(sampleAlleles.reads_sample2_supporting_sample1);
pw.print('\t');
pw.print(sampleAlleles.reads_sample2_supporting_other);
pw.println();
somethingPrinted=true;
}
pw.flush();
pw.close();
if(!somethingPrinted)
{
System.err.println("Warning: NO output");
}
return 0;
}
catch (Exception e) {
LOG.error(e);
return -1;
}
finally
{
CloserUtil.close(in);
CloserUtil.close(iter);
for(SamReader samReader:sample2samReader.values())
CloserUtil.close(samReader);
sample2samReader.clear();
}
}
public static void main(String[] args)
{
new XContaminations().instanceMainWithExit(args);
}
}