/**
* Author:
* Pierre Lindenbaum PhD
* Date:
* Fev-2014
* Contact:
* plindenbaum@yahoo.fr
* Motivation:
* Idea from Solena: successive synonymous mutations are a stop codong
*/
package com.github.lindenb.jvarkit.tools.vcfannot;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParametersDelegate;
import com.github.lindenb.jvarkit.io.IOUtils;
import com.github.lindenb.jvarkit.lang.DelegateCharSequence;
import com.github.lindenb.jvarkit.util.bio.AcidNucleics;
import com.github.lindenb.jvarkit.util.bio.GeneticCode;
import com.github.lindenb.jvarkit.util.bio.GranthamScore;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
import com.github.lindenb.jvarkit.util.picard.AbstractDataCodec;
import com.github.lindenb.jvarkit.util.picard.GenomicSequence;
import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress;
import com.github.lindenb.jvarkit.util.ucsc.KnownGene;
import com.github.lindenb.jvarkit.util.vcf.VCFUtils;
import com.github.lindenb.jvarkit.util.vcf.VcfIterator;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalTreeMap;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFFilterHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
/**
* VCFStopCodon
* @SolenaLS 's idea: variant in the same codon give a new Amino acid undetected by annotaion tools.
*
*/
/**
BEGIN_DOC
### Output
#### Example
```
##fileformat=VCFv4.2
##FILTER=<ID=TwoStrands,Description="(number of reads carrying both mutation) < (reads carrying variant 1 + reads carrying variant 2)">
##INFO=<ID=CodonVariant,Number=.,Type=String,Description="Variant affected by two distinct mutation. Format is defined in the INFO column. INFO_AC:Allele count in genotypes, for each ALT allele, in the same order as listed.INFO_AF:Allele Frequency, for each ALT allele, in the same order as listed.INFO_MLEAC:Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed.INFO_MLEAF:Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed.">
##VCFCombineTwoSnvsCmdLine=-k jeter.knownGene.txt -tmpdir tmp/ -R /commun/data/pubdb/broadinstitute.org/bundle/1.5/b37/human_g1k_v37.fasta -B /commun/data/projects/plateforme/NTS-017_HAL_Schott_mitral/20141106/align20141106/Samples/CD13314/BAM/Haloplex20141106_CD13314_final.bam
##VCFCombineTwoSnvsHtsJdkHome=/commun/data/packages/htsjdk/htsjdk-2.0.1
##VCFCombineTwoSnvsHtsJdkVersion=2.0.1
##VCFCombineTwoSnvsVersion=c5af7d1bd367562b3578d427d24ec62856835d38
#CHROM POS ID REF ALT QUAL FILTER INFO
1 120612013 rs200646249 G A . . CodonVariant=CHROM|1|REF|G|TRANSCRIPT|uc001eil.3|cDdnaPos|8|CodonPos|7|CodonWild|GCC|AAPos|3|AAWild|A|POS1|120612013|ID1|rs200646249|PosInCodon1|2|Alt1|A|Codon1|GTC|AA1|V|INFO_MLEAC_1|1|INFO_AC_1|1|INFO_MLEAF_1|0.500|INFO_AF_1|0.500|POS2|120612014|ID2|.|PosInCodon2|1|Alt2|A|Codon2|TCC|AA2|S|INFO_MLEAC_2|1|INFO_AC_2|1|INFO_MLEAF_2|0.500|INFO_AF_2|0.500|CombinedCodon|TTC|CombinedAA|F|CombinedSO|nonsynonymous_variant|CombinedType|combined_is_new|N_READS_BOTH_VARIANTS|168|N_READS_NO_VARIANTS|1045|N_READS_TOTAL|1213|N_READS_ONLY_1|0|N_READS_ONLY_2|0,CHROM|1|REF|G|TRANSCRIPT|uc001eik.3|cDdnaPos|8|CodonPos|7|CodonWild|GCC|AAPos|3|AAWild|A|POS1|120612013|ID1|rs200646249|PosInCodon1|2|Alt1|A|Codon1|GTC|AA1|V|INFO_MLEAC_1|1|INFO_AC_1|1|INFO_MLEAF_1|0.500|INFO_AF_1|0.500|POS2|120612014|ID2|.|PosInCodon2|1|Alt2|A|Codon2|TCC|AA2|S|INFO_MLEAC_2|1|INFO_AC_2|1|INFO_MLEAF_2|0.500|INFO_AF_2|0.500|CombinedCodon|TTC|CombinedAA|F|CombinedSO|nonsynonymous_variant|CombinedType|combined_is_new|N_READS_BOTH_VARIANTS|168|N_READS_NO_VARIANTS|1045|N_READS_TOTAL|1213|N_READS_ONLY_1|0|N_READS_ONLY_2|0;EXAC03_AC_NFE=641;EXAC03_AN_NFE=48948
1 120612014 . C A . . CodonVariant=CHROM|1|REF|C|TRANSCRIPT|uc001eik.3|cDdnaPos|7|CodonPos|7|CodonWild|GCC|AAPos|3|AAWild|A|POS1|120612014|ID1|.|PosInCodon1|1|Alt1|A|Codon1|TCC|AA1|S|INFO_MLEAC_1|1|INFO_AC_1|1|INFO_MLEAF_1|0.500|INFO_AF_1|0.500|POS2|120612013|ID2|rs200646249|PosInCodon2|2|Alt2|A|Codon2|GTC|AA2|V|INFO_MLEAC_2|1|INFO_AC_2|1|INFO_MLEAF_2|0.500|INFO_AF_2|0.500|CombinedCodon|TTC|CombinedAA|F|CombinedSO|nonsynonymous_variant|CombinedType|combined_is_new|N_READS_BOTH_VARIANTS|168|N_READS_NO_VARIANTS|1045|N_READS_TOTAL|1213|N_READS_ONLY_1|0|N_READS_ONLY_2|0,CHROM|1|REF|C|TRANSCRIPT|uc001eil.3|cDdnaPos|7|CodonPos|7|CodonWild|GCC|AAPos|3|AAWild|A|POS1|120612014|ID1|.|PosInCodon1|1|Alt1|A|Codon1|TCC|AA1|S|INFO_MLEAC_1|1|INFO_AC_1|1|INFO_MLEAF_1|0.500|INFO_AF_1|0.500|POS2|120612013|ID2|rs200646249|PosInCodon2|2|Alt2|A|Codon2|GTC|AA2|V|INFO_MLEAC_2|1|INFO_AC_2|1|INFO_MLEAF_2|0.500|INFO_AF_2|0.500|CombinedCodon|TTC|CombinedAA|F|CombinedSO|nonsynonymous_variant|CombinedType|combined_is_new|N_READS_BOTH_VARIANTS|168|N_READS_NO_VARIANTS|1045|N_READS_TOTAL|1213|N_READS_ONLY_1|0|N_READS_ONLY_2|0;EXAC03_AC_NFE=640;EXAC03_AN_NFE=48228
```
#### Fields
KEYEXAMPLEDescription
CHROM1Chromosome for current variant.
REFCReference Allele for current variant
TRANSCRIPTuc001eik.3UCSC knownGene Transcript
cDdnaPos7+1 based position in cDNA
CodonPos7+1 based position of the codon in cNA
CodonWildGCCWild codon
AAPos3+1 based position of amino acid
AAWildAWild amino acid
POS1120612014+1 based position of variant 1
ID1.RS ID of variant 1
PosInCodon11Position in codon (1,2,3) of variant 1
Alt1AAlternate allele of variant 1
Codon1TCC Codon with variant 1 alone
AA1SAmino acid prediction for variant 1
INFO_*_11Data about alternate allele 1 taken out of original VCF
POS2120612013+1 based position of variant 1
ID2rs200646249RS ID of variant 1
PosInCodon22Position in codon (1,2,3) of variant 2
Alt2AAlternate allele of variant 2
Codon2GTC Codon with variant 2alone
AA2VAmino acid prediction for variant 2
INFO_*_21Data about alternate allele 2 taken out of original VCF
CombinedCodonTTCCombined codon with ALT1 and ALT2
CombinedAAFCombined amino acid with ALT1 and ALT2
CombinedSOnonsynonymous_variantSequence Ontology term
CombinedTypecombined_is_newtype of new mutation
N_READS_BOTH_VARIANTS168Number of reads carrying both variants
N_READS_NO_VARIANTS1045Number of reads carrying no variants
N_READS_TOTAL1213Total Number of reads
N_READS_ONLY_10Number of reads carrying onlt variant 1
N_READS_ONLY_20Number of reads carrying onlt variant 2
### See also
http://bmcresnotes.biomedcentral.com/articles/10.1186/1756-0500-5-615
END_DOC
*/
@Program(name="vcfcombinetwosnvs",description="Idea from @SolenaLS and then @AntoineRimbert")
public class VCFCombineTwoSnvs extends Launcher
{
private static final Logger LOG = Logger.build(VCFCombineTwoSnvs.class).make();
@Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout")
private File outputFile = null;
@Parameter(names={"-k","--knownGene"},description="KnownGene data URI/File. Beware chromosome names are formatted the same as your REFERENCE.",required=true)
private String kgURI = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/knownGene.txt.gz";
@Parameter(names={"-B","--bam"},description="Optional indexed BAM file used to get phasing information. This can be a list of bam if the filename ends with '.list'")
private File bamIn = null;
@Parameter(names={"-R","--reference"},description="Indexed fasta Reference",required=true)
private File referenceFile = null;
@ParametersDelegate
private WritingSortingCollection writingSortingCollection=new WritingSortingCollection();
/** known Gene collection */
private final IntervalTreeMap<List<KnownGene>> knownGenes=new IntervalTreeMap<>();
/** reference genome */
private IndexedFastaSequenceFile indexedFastaSequenceFile=null;
/** current genomic sequence */
private GenomicSequence genomicSequence=null;
/** all variants */
private SortingCollection<Variant> variants= null;
/** genetic Code used */
private static final GeneticCode GENETIC_CODE = GeneticCode.getStandard();
/** mutated cdna */
private static class MutedSequence extends DelegateCharSequence
{
private int begin=-1;
private int end=-1;
private String newseq=null;
MutedSequence(CharSequence wild)
{
super(wild);
}
void setMutation(int begin,int end,final String newseq)
{
if(this.newseq!=null) throw new IllegalStateException();
this.newseq = newseq;
this.begin=begin;
this.end=end;
if(this.begin>this.end) throw new IllegalArgumentException();
if(this.end> getDelegate().length()) throw new IndexOutOfBoundsException();
}
@Override
public int length() {
int L = getDelegate().length();
if(this.newseq!=null) {
L-=(this.end-this.begin);
L+=this.newseq.length();
}
return L;
}
@Override
public char charAt(int index)
{
if(this.newseq==null || index < this.begin ) return getDelegate().charAt(index);
int idx2= index-this.begin;
if(idx2 < this.newseq.length())
{
return this.newseq.charAt(idx2);
}
idx2-=this.newseq.length();
return getDelegate().charAt(this.end+idx2);
}
}
private static class ProteinCharSequence extends DelegateCharSequence
{
ProteinCharSequence(final CharSequence cDNA)
{
super(cDNA);
}
@Override
public char charAt(int i)
{
return GENETIC_CODE.translate(
getDelegate().charAt(i*3+0),
getDelegate().charAt(i*3+1),
getDelegate().charAt(i*3+2));
}
@Override
public int length()
{
return getDelegate().length()/3;
}
}
/** load KnownGenes */
private void loadKnownGenesFromUri() throws IOException
{
BufferedReader in = null;
try {
final SAMSequenceDictionary dict=this.indexedFastaSequenceFile.getSequenceDictionary();
if(dict==null) throw new IOException("dictionary missing");
LOG.info("loading genes from "+this.kgURI);
in =IOUtils.openURIForBufferedReading(this.kgURI);
final Pattern tab=Pattern.compile("[\t]");
String line = null;
while((line=in.readLine())!=null)
{
line= line.trim();
if(line.isEmpty()) continue;
String tokens[]=tab.split(line);
final KnownGene g=new KnownGene(tokens);
if(g.isNonCoding()) continue;
if(dict.getSequence(g.getContig())==null)
{
LOG.warn("Unknown chromosome "+g.getContig()+" in dictionary");
continue;
}
//use 1 based interval
final Interval interval=new Interval(
g.getContig(),
g.getTxStart()+1,
g.getTxEnd()
);
List<KnownGene> lkg= this.knownGenes.get(interval);
if(lkg==null) {
lkg=new ArrayList<>(2);
this.knownGenes.put(interval, lkg);
}
lkg.add(g);
}
CloserUtil.close(in);in=null;
LOG.info("genes:"+knownGenes.size());
}
finally
{
CloserUtil.close(in);
}
}
private static int ID_GENERATOR=0;
private static class CoverageInfo
{
int depth1=0;
int depth2=0;
int count_reads_having_both_variants = 0;
int count_reads_having_no_variants = 0;
int count_reads_having_variant1 = 0;
int count_reads_having_variant2 = 0;
}
static private class AbstractContext {
String contig;
int genomicPosition1=0;
String id=VCFConstants.EMPTY_ID_FIELD;
Allele refAllele;
Allele altAllele;
int sorting_id;
/* original vcf line */
String vcfLine=null;
}
static private class Variant extends AbstractContext
{
String transcriptName;
byte strand;
int position_in_cdna=-1;
String wildCodon=null;
String mutCodon=null;
Variant()
{
}
Variant(final VariantContext ctx,final Allele allele,final KnownGene gene) {
this.contig = ctx.getContig();
this.genomicPosition1=ctx.getStart();
this.id = (ctx.hasID()?ctx.getID():VCFConstants.EMPTY_ID_FIELD);
this.transcriptName = gene.getName();
this.strand = (byte)(gene.isNegativeStrand()?'-':'+');
this.refAllele = ctx.getReference();
this.altAllele = allele;
//this.genotypes.addAll(ctx.getGenotypes());
}
int positionInCodon() { return position_in_cdna%3;}
int codonStart() { return this.position_in_cdna - this.positionInCodon();}
/** get specific info for this variant */
private void _getInfo(final Map<String,Object> info,int suffix) {
//data about this
info.put("POS"+suffix,String.valueOf(this.genomicPosition1));
info.put("ID"+suffix,this.id);
info.put("PosInCodon"+suffix,String.valueOf(this.positionInCodon()+1));
info.put("Alt"+suffix,this.altAllele.getBaseString());
info.put("Codon"+suffix,this.mutCodon);
info.put("AA"+suffix,new ProteinCharSequence(this.mutCodon).getString());
}
public Map<String,Object> getInfo(final Variant other) {
final Map<String,Object> info = new LinkedHashMap<>();
info.put("CHROM",this.contig);
info.put("REF",this.refAllele.getBaseString());
info.put("TRANSCRIPT",this.transcriptName);
info.put("STRAND",this.strand==(byte)'+'?"plus":"minus");
info.put("cDdnaPos",String.valueOf(this.position_in_cdna+1));
info.put("CodonPos",String.valueOf(this.codonStart()+1));
info.put("CodonWild",this.wildCodon);
info.put("AAPos",String.valueOf(1+(this.codonStart()/3)));
info.put("AAWild",new ProteinCharSequence(this.wildCodon).getString() );
_getInfo(info, 1);
other._getInfo(info, 2);
return info;
}
@Override
public String toString() {
return contig+"\t"+genomicPosition1+"\t"+refAllele.getBaseString()+"\t"+
altAllele.getBaseString()+"\t"+
transcriptName+"\t"+
position_in_cdna+"\t"+
codonStart()+"\t"+
positionInCodon()+"\t"+
wildCodon+"\t"+
mutCodon
;
}
}
static private class VariantCodec extends AbstractDataCodec<Variant>
{
VariantCodec() {
}
@Override
public Variant decode(final DataInputStream dis) throws IOException {
String contig;
try {
contig = dis.readUTF();
} catch (Exception e) {
return null;
}
final Variant variant = new Variant();
variant.contig = contig;
variant.genomicPosition1 = dis.readInt();
variant.id = dis.readUTF();
variant.transcriptName = dis.readUTF();
variant.strand = dis.readByte();
variant.refAllele = Allele.create(dis.readUTF(), true);
variant.altAllele = Allele.create(dis.readUTF(), false);
variant.position_in_cdna = dis.readInt();
variant.wildCodon = dis.readUTF();
variant.mutCodon = dis.readUTF();
variant.sorting_id = dis.readInt();
variant.vcfLine = readString(dis);
return variant;
}
@Override
public void encode(final DataOutputStream dos, final Variant v) throws IOException {
dos.writeUTF(v.contig);
dos.writeInt(v.genomicPosition1);
dos.writeUTF(v.id);
dos.writeUTF(v.transcriptName);
dos.writeByte(v.strand);
dos.writeUTF(v.refAllele.getBaseString());
dos.writeUTF(v.altAllele.getBaseString());
dos.writeInt(v.position_in_cdna);
dos.writeUTF(v.wildCodon);
dos.writeUTF(v.mutCodon);
dos.writeInt(v.sorting_id);
writeString(dos, v.vcfLine);
}
@Override
public VariantCodec clone() {
return new VariantCodec();
}
}
static private class VariantComparator implements Comparator<Variant>
{
final SAMSequenceDictionary dict;
VariantComparator(final SAMSequenceDictionary dict) {
this.dict = dict;
}
int contig(final Variant v) { return dict.getSequenceIndex(v.contig);}
@Override
public int compare(final Variant o1,final Variant o2) {
int i= contig(o1) - contig(o2);
if(i!=0) return i;
i= o1.transcriptName.compareTo(o2.transcriptName);
if(i!=0) return i;
i= o1.position_in_cdna-o2.position_in_cdna;
if(i!=0) return i;
return o1.sorting_id - o2.sorting_id;
}
}
private static class CombinedMutation extends AbstractContext
{
String info=null;
String filter = VCFConstants.UNFILTERED;
int grantham_score = -1;
}
static private class MutationCodec extends AbstractDataCodec<CombinedMutation>
{
@Override
public CombinedMutation decode(final DataInputStream dis) throws IOException {
String contig;
try {
contig = dis.readUTF();
} catch (Exception e) {
return null;
}
final CombinedMutation mutation = new CombinedMutation();
mutation.contig = contig;
mutation.genomicPosition1 = dis.readInt();
mutation.id = dis.readUTF();
mutation.refAllele = Allele.create(dis.readUTF(), true);
mutation.altAllele = Allele.create(dis.readUTF(), false);
mutation.info = readString(dis);
mutation.sorting_id = dis.readInt();
mutation.filter = dis.readUTF();
mutation.grantham_score = dis.readInt();
mutation.vcfLine = readString(dis);
return mutation;
}
@Override
public void encode(final DataOutputStream dos, final CombinedMutation v) throws IOException {
dos.writeUTF(v.contig);
dos.writeInt(v.genomicPosition1);
dos.writeUTF(v.id);
dos.writeUTF(v.refAllele.getBaseString());
dos.writeUTF(v.altAllele.getBaseString());
writeString(dos,v.info);
dos.writeInt(v.sorting_id);
dos.writeUTF(v.filter);
dos.writeInt(v.grantham_score);
writeString(dos, v.vcfLine);
}
@Override
public MutationCodec clone() {
return new MutationCodec();
}
}
static private class MutationComparator implements Comparator<CombinedMutation>
{
final SAMSequenceDictionary dict;
MutationComparator(final SAMSequenceDictionary dict) {
this.dict = dict;
}
int contig(final CombinedMutation v) { return dict.getSequenceIndex(v.contig);}
@Override
public int compare(final CombinedMutation o1, final CombinedMutation o2) {
int i= contig(o1) - contig(o2);
if(i!=0) return i;
i= o1.genomicPosition1-o2.genomicPosition1;
if(i!=0) return i;
i = o1.refAllele.compareTo(o2.refAllele);
if(i!=0) return i;
return o1.sorting_id - o2.sorting_id;
}
}
/* used to compare a base in a BAM and an VCF allele */
private static boolean same(byte baseInRead,final Allele a)
{
return Character.toUpperCase(a.getBases()[0])==Character.toUpperCase(baseInRead);
}
/* used to serialize a map to string in INFO column */
private static String mapToString(final Map<String,Object> map)
{
final StringBuilder sb = new StringBuilder();
for(final String key: map.keySet()) {
if(sb.length()>0) sb.append("|");
sb.append(key);
sb.append("|");
sb.append(String.valueOf(map.get(key)));
}
return sb.toString();
}
@Override
protected int doVcfToVcf(String inputName, VcfIterator iterin, VariantContextWriter out) {
throw new IllegalStateException("should be never called");
}
@Override
protected int doVcfToVcf(final String inputName,File saveAs)
{
BufferedReader bufferedReader = null;
htsjdk.variant.variantcontext.writer.VariantContextWriter w=null;
SortingCollection<CombinedMutation> mutations = null;
CloseableIterator<Variant> varIter = null;
CloseableIterator<CombinedMutation> mutIter = null;
Map<String,SamReader> sample2samReader = new HashMap<>();
try {
bufferedReader = inputName==null?
IOUtils.openStreamForBufferedReader(stdin()):
IOUtils.openURIForBufferedReading(inputName);
final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(bufferedReader);
/* get VCF header */
final VCFHeader header= cah.header;
final Set<String> sampleNamesInOrder = new HashSet<>(header.getSampleNamesInOrder());
LOG.info("opening REF:"+referenceFile);
this.indexedFastaSequenceFile=new IndexedFastaSequenceFile(this.referenceFile);
final SAMSequenceDictionary dict=this.indexedFastaSequenceFile.getSequenceDictionary();
if(dict==null) throw new IOException("dictionary missing");
if(this.bamIn!=null)
{
/** unroll and open bam file */
for(final File bamFile : IOUtils.unrollFileCollection(Collections.singletonList(this.bamIn)))
{
LOG.info("opening BAM :"+this.bamIn);
final SamReader samReader = SamReaderFactory.makeDefault().
referenceSequence(this.referenceFile).
validationStringency(ValidationStringency.LENIENT).
open(this.bamIn)
;
if(!samReader.hasIndex())
{
throw new IOException("Sam file is NOT indexed: "+bamFile);
}
final SAMFileHeader samHeader = samReader.getFileHeader();
if(samHeader.getSequenceDictionary()==null ||
!SequenceUtil.areSequenceDictionariesEqual(dict, samReader.getFileHeader().getSequenceDictionary())) {
throw new IOException(bamFile+" and REF don't have the same Sequence Dictionary.");
}
/* get sample name */
String sampleName=null;
for(final SAMReadGroupRecord rg:samHeader.getReadGroups()) {
if(rg.getSample()==null) continue;
if(sampleName!=null && !sampleName.equals(rg.getSample())) {
samReader.close();
throw new IOException(bamFile+" Contains two samples "+sampleName+" "+rg.getSample());
}
sampleName= rg.getSample();
}
if(sampleName==null) {
samReader.close();
LOG.warn("no sample in "+ bamFile);
continue;
}
if(!sampleNamesInOrder.contains(sampleName)) {
samReader.close();
LOG.warn("no sample "+sampleName+" in vcf");
continue;
}
sample2samReader.put(sampleName, samReader);
}
}
loadKnownGenesFromUri();
this.variants = SortingCollection.newInstance(Variant.class,
new VariantCodec(),
new VariantComparator(dict),
this.writingSortingCollection.getMaxRecordsInRam(),
this.writingSortingCollection.getTmpDirectories()
);
this.variants.setDestructiveIteration(true);
SAMSequenceDictionaryProgress progress=new SAMSequenceDictionaryProgress(header);
String vcfLine=null;
while((vcfLine=bufferedReader.readLine())!=null)
{
final VariantContext ctx= progress.watch(cah.codec.decode(vcfLine));
/* discard non SNV variant */
if(!ctx.isVariant() || ctx.isIndel())
{
continue;
}
/* find the overlapping genes : extend the interval of the variant to include the stop codon */
final Collection<KnownGene> genes= new ArrayList<>();
for(List<KnownGene> lkg:this.knownGenes.getOverlapping(
new Interval(ctx.getContig(),
Math.max(1,ctx.getStart()-3),
ctx.getEnd()+3
)))
{
genes.addAll(lkg);
}
final List<Allele> alternateAlleles = ctx.getAlternateAlleles();
/* loop over overlapping genes */
for(final KnownGene kg:genes) {
/* loop over available alleles */
for(int allele_idx=0;allele_idx< alternateAlleles.size();++allele_idx) {
final Allele alt = alternateAlleles.get(allele_idx);
challenge(ctx,alt,kg,vcfLine);
}
}
}
progress.finish();
this.variants.doneAdding();
mutations = SortingCollection.newInstance(CombinedMutation.class,
new MutationCodec(),
new MutationComparator(dict),
this.writingSortingCollection.getMaxRecordsInRam(),
this.writingSortingCollection.getTmpDirectories()
);
mutations.setDestructiveIteration(true);
final VCFFilterHeaderLine vcfFilterHeaderLine = new VCFFilterHeaderLine(
"TwoHaplotypes",
"(number of reads carrying both mutation) < (reads carrying variant 1 + reads carrying variant 2) "
);
varIter = this.variants.iterator();
progress=new SAMSequenceDictionaryProgress(header);
final ArrayList<Variant> buffer= new ArrayList<>();
for(;;)
{
Variant variant = null;
if(varIter.hasNext())
{
variant = varIter.next();
progress.watch(variant.contig, variant.genomicPosition1);
}
if(variant==null || !(!buffer.isEmpty() && buffer.get(0).contig.equals(variant.contig) && buffer.get(0).transcriptName.equals(variant.transcriptName)))
{
if(!buffer.isEmpty()) {
for(int i=0;i< buffer.size();++i)
{
final Variant v1 = buffer.get(i);
for(int j=i+1;j< buffer.size();++j)
{
final Variant v2 = buffer.get(j);
if(v1.codonStart() != v2.codonStart()) continue;
if(v1.positionInCodon() == v2.positionInCodon()) continue;
if(!v1.wildCodon.equals(v2.wildCodon))
{
throw new IllegalStateException();
}
final StringBuilder combinedCodon = new StringBuilder(v1.wildCodon);
combinedCodon.setCharAt(v1.positionInCodon(), v1.mutCodon.charAt(v1.positionInCodon()));
combinedCodon.setCharAt(v2.positionInCodon(), v2.mutCodon.charAt(v2.positionInCodon()));
final String pwild = new ProteinCharSequence(v1.wildCodon).getString();
final String p1 = new ProteinCharSequence(v1.mutCodon).getString();
final String p2 = new ProteinCharSequence(v2.mutCodon).getString();
final String pCombined = new ProteinCharSequence(combinedCodon).getString();
final String combinedSO;
final String combinedType;
/* both AA are synonymous, while combined is not */
if(!pCombined.equals(pwild) && p1.equals(pwild) && p2.equals(pwild)) {
combinedType = "combined_is_nonsynonymous";
if(pCombined.equals("*"))
{
/* http://www.sequenceontology.org/browser/current_svn/term/SO:0001587 */
combinedSO="stop_gained";
}
else if(pwild.equals("*"))
{
/* http://www.sequenceontology.org/browser/current_svn/term/SO:0002012 */
combinedSO="stop_lost";
}
else
{
/* http://www.sequenceontology.org/miso/current_svn/term/SO:0001992 */
combinedSO="nonsynonymous_variant";
}
}
else if(!pCombined.equals(p1) && !pCombined.equals(p2) && !pCombined.equals(pwild))
{
combinedType = "combined_is_new";
if(pCombined.equals("*"))
{
/* http://www.sequenceontology.org/browser/current_svn/term/SO:0001587 */
combinedSO="stop_gained";
}
else
{
/* http://www.sequenceontology.org/miso/current_svn/term/SO:0001992 */
combinedSO="nonsynonymous_variant";
}
}
else
{
combinedType = null;
combinedSO = null;
}
/** ok, there is something interesting here ,
* create two new Mutations carrying the
* two variants
*/
if( combinedSO != null)
{
/** grantham score is max found combined vs (p1/p2/wild)*/
int grantham_score= GranthamScore.score(pCombined.charAt(0), pwild.charAt(0));
grantham_score = Math.max(grantham_score,GranthamScore.score(pCombined.charAt(0), p1.charAt(0)));
grantham_score = Math.max(grantham_score,GranthamScore.score(pCombined.charAt(0), p2.charAt(0)));
/** info that will be displayed in the vcf */
final Map<String,Object> info1 = v1.getInfo(v2);
final Map<String,Object> info2 = v2.getInfo(v1);
//filter for this combined: default it fails the filter
String filter = vcfFilterHeaderLine.getID();
final Map<String,Object> combinedMap = new LinkedHashMap<>();
combinedMap.put("CombinedCodon",combinedCodon);
combinedMap.put("CombinedAA",pCombined);
combinedMap.put("CombinedSO",combinedSO);
combinedMap.put("CombinedType",combinedType);
combinedMap.put("GranthamScore", grantham_score);
info1.putAll(combinedMap);
info2.putAll(combinedMap);
final Map<String,CoverageInfo> sample2coverageInfo = new HashMap<>(sample2samReader.size());
final int chromStart = Math.min(v1.genomicPosition1,v2.genomicPosition1);
final int chromEnd = Math.max(v1.genomicPosition1,v2.genomicPosition1);
/* get phasing info for each sample*/
for(final String sampleName : sample2samReader.keySet() ) {
final SamReader samReader = sample2samReader.get(sampleName);
final CoverageInfo covInfo = new CoverageInfo();
sample2coverageInfo.put(sampleName, covInfo);
SAMRecordIterator iter = null;
try {
iter = samReader.query(v1.contig,
chromStart,
chromEnd,
false
);
while(iter.hasNext()) {
final SAMRecord rec = iter.next();
if(rec.getReadUnmappedFlag()) continue;
if(rec.isSecondaryOrSupplementary()) continue;
if(rec.getDuplicateReadFlag()) continue;
if(rec.getReadFailsVendorQualityCheckFlag()) continue;
// get DEPTh for variant 1
if(rec.getAlignmentStart()<= v1.genomicPosition1 && v1.genomicPosition1<=rec.getAlignmentEnd()) {
covInfo.depth1++;
}
// get DEPTh for variant 2
if(rec.getAlignmentStart()<= v2.genomicPosition1 && v2.genomicPosition1<=rec.getAlignmentEnd()) {
covInfo.depth2++;
}
if(rec.getAlignmentEnd()<chromEnd) continue;
if(rec.getAlignmentStart()>chromStart) continue;
final Cigar cigar = rec.getCigar();
if(cigar==null) continue;
final byte bases[] = rec.getReadBases();
if(bases==null) continue;
int refpos1=rec.getAlignmentStart();
int readpos = 0;
boolean found_variant1_on_this_read=false;
boolean found_variant2_on_this_read=false;
/** loop over cigar */
for(final CigarElement ce:cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
switch(op)
{
case P: continue;
case S: case I: readpos+=ce.getLength();break;
case D: case N: refpos1+=ce.getLength(); break;
case H: continue;
case EQ:case M:case X:
for(int x=0;x< ce.getLength();++x)
{
if(refpos1 == v1.genomicPosition1 && same(bases[readpos],v1.altAllele))
{
found_variant1_on_this_read = true;
}
else if(refpos1 == v2.genomicPosition1 && same(bases[readpos],v2.altAllele))
{
found_variant2_on_this_read = true;
}
refpos1++;
readpos++;
}
break;
default: throw new IllegalStateException(op.name());
}
/* skip remaining bases after last variant */
if(refpos1>chromEnd) break;
}/* end of loop over cigar */
/* sum-up what we found */
if( found_variant1_on_this_read && found_variant2_on_this_read) {
covInfo.count_reads_having_both_variants++; }
else if( !found_variant1_on_this_read && !found_variant2_on_this_read) {
covInfo.count_reads_having_no_variants++; }
else if( found_variant1_on_this_read) {
covInfo.count_reads_having_variant1++;
}else if( found_variant2_on_this_read) {
covInfo.count_reads_having_variant2++;
}
}/* end of loop over reads */
} finally {
iter.close();
iter=null;
}
info1.put("N_READS_BOTH_VARIANTS_"+sampleName, covInfo.count_reads_having_both_variants);
info2.put("N_READS_BOTH_VARIANTS_"+sampleName, covInfo.count_reads_having_both_variants);
info1.put("N_READS_NO_VARIANTS_"+sampleName, covInfo.count_reads_having_no_variants);
info2.put("N_READS_NO_VARIANTS_"+sampleName, covInfo.count_reads_having_no_variants);
info1.put("N_READS_TOTAL_"+sampleName,
covInfo.count_reads_having_both_variants +
covInfo.count_reads_having_no_variants +
covInfo.count_reads_having_variant1+
covInfo.count_reads_having_variant2
);
info2.put("N_READS_TOTAL_"+sampleName,
covInfo.count_reads_having_both_variants +
covInfo.count_reads_having_no_variants +
covInfo.count_reads_having_variant1+
covInfo.count_reads_having_variant2
);
//count for variant 1
info1.put("N_READS_ONLY_1_"+sampleName, covInfo.count_reads_having_variant1);
info1.put("N_READS_ONLY_2_"+sampleName, covInfo.count_reads_having_variant2);
info1.put("DEPTH_1_"+sampleName, covInfo.depth1);
//inverse previous count
info2.put("N_READS_ONLY_1_"+sampleName, covInfo.count_reads_having_variant2);
info2.put("N_READS_ONLY_2_"+sampleName, covInfo.count_reads_having_variant1);
info2.put("DEPTH_2_"+sampleName, covInfo.depth2);
/* number of reads with both variant is greater than
* reads carrying only one variant: reset the filter
*/
if(2*covInfo.count_reads_having_both_variants>(covInfo.count_reads_having_variant1+covInfo.count_reads_having_variant2)) {
/* reset filter */
filter = VCFConstants.UNFILTERED;
info1.put("FILTER_1_"+sampleName,".");
info2.put("FILTER_2_"+sampleName,".");
}
else
{
info1.put("FILTER_1_"+sampleName,vcfFilterHeaderLine.getID());
info2.put("FILTER_2_"+sampleName,vcfFilterHeaderLine.getID());
}
}/* end of loop over bams */
final CombinedMutation m1 = new CombinedMutation();
m1.contig = v1.contig;
m1.genomicPosition1 = v1.genomicPosition1;
m1.id = v1.id ;
m1.refAllele = v1.refAllele;
m1.altAllele = v1.altAllele;
m1.vcfLine = v1.vcfLine;
m1.info = mapToString(info1);
m1.filter = filter;
m1.grantham_score = grantham_score;
m1.sorting_id = ID_GENERATOR++;
mutations.add(m1);
final CombinedMutation m2 = new CombinedMutation();
m2.contig = v2.contig;
m2.genomicPosition1 = v2.genomicPosition1;
m2.id = v2.id ;
m2.refAllele = v2.refAllele;
m2.altAllele = v2.altAllele;
m2.vcfLine = v2.vcfLine;
m2.info = mapToString(info2);
m2.filter = filter;
m2.grantham_score = grantham_score;
m2.sorting_id = ID_GENERATOR++;
mutations.add(m2);
}
}
}
}
buffer.clear();
if(variant==null) break;
}
buffer.add(variant);
}
progress.finish();
mutations.doneAdding();
varIter.close();varIter=null;
variants.cleanup();variants=null;
final ArrayList<CombinedMutation> mBuffer= new ArrayList<>();
final VCFHeader header2 = new VCFHeader(header);
header2.addMetaDataLine(new VCFHeaderLine(getProgramName()+"AboutQUAL", "QUAL is filled with Grantham Score http://www.ncbi.nlm.nih.gov/pubmed/4843792"));
final StringBuilder infoDesc =new StringBuilder("Variant affected by two distinct mutation. Format is defined in the INFO column. ");
final VCFInfoHeaderLine infoHeaderLine = new VCFInfoHeaderLine(
"CodonVariant",VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
infoDesc.toString()
);
super.addMetaData(header2);
header2.addMetaDataLine(infoHeaderLine);
if(!sample2samReader.isEmpty())
{
header2.addMetaDataLine(vcfFilterHeaderLine);
}
w = super.openVariantContextWriter(saveAs);
w.writeHeader(header2);
progress=new SAMSequenceDictionaryProgress(header);
mutIter = mutations.iterator();
for(;;)
{
CombinedMutation mutation = null;
if(mutIter.hasNext())
{
mutation = mutIter.next();
progress.watch(mutation.contig, mutation.genomicPosition1);
}
if(mutation==null || !(!mBuffer.isEmpty() &&
mBuffer.get(0).contig.equals(mutation.contig) &&
mBuffer.get(0).genomicPosition1 == mutation.genomicPosition1 &&
mBuffer.get(0).refAllele.equals(mutation.refAllele)))
{
if(!mBuffer.isEmpty())
{
//default grantham score used in QUAL
int grantham_score = -1;
//default filter fails
String filter=vcfFilterHeaderLine.getID();
final CombinedMutation first = mBuffer.get(0);
final Set<String> info = new HashSet<>();
final VariantContext ctx = cah.codec.decode(first.vcfLine);
final VariantContextBuilder vcb=new VariantContextBuilder(ctx);
vcb.chr(first.contig);
vcb.start(first.genomicPosition1);
vcb.stop(first.genomicPosition1 + first.refAllele.length()-1);
if( !first.id.equals(VCFConstants.EMPTY_ID_FIELD)) vcb.id(first.id);
for(final CombinedMutation m:mBuffer){
info.add(m.info);
grantham_score=Math.max(grantham_score, m.grantham_score);
if(VCFConstants.UNFILTERED.equals(m.filter)) {
filter = null; //at least one SNP is ok one this line
}
}
vcb.unfiltered();
if(filter!=null && !sample2samReader.isEmpty())
{
vcb.filter(filter);
}
else
{
vcb.passFilters();
}
vcb.attribute(infoHeaderLine.getID(), new ArrayList<String>(info));
if(grantham_score>0) {
vcb.log10PError(grantham_score/-10.0);
} else
{
vcb.log10PError(VariantContext.NO_LOG10_PERROR);
}
w.add(vcb.make());
}
mBuffer.clear();
if(mutation==null) break;
}
mBuffer.add(mutation);
}
progress.finish();
mutIter.close();
mutations.cleanup();mutations=null;
return RETURN_OK;
}
catch(Exception err)
{
LOG.error(err);
return -1;
}
finally
{
CloserUtil.close(this.indexedFastaSequenceFile);
CloserUtil.close(mutIter);
CloserUtil.close(varIter);
if(this.variants!=null) this.variants.cleanup();
if(mutations!=null) mutations.cleanup();
this.variants=null;
for(SamReader r: sample2samReader.values()) CloserUtil.close(r);
CloserUtil.close(w);
CloserUtil.close(bufferedReader);
}
}
private void challenge(
final VariantContext ctx,
final Allele allele,
final KnownGene gene,
final String vcfLine
) throws IOException
{
if(allele.isSymbolic()) return;
if(allele.isNoCall()) return;
if(!allele.isCalled())return;
if(allele.length()!=1) return;
if(ctx.getReference().length()!=1) return;
if(allele.equals(Allele.SPAN_DEL)) return;
if(genomicSequence==null || !genomicSequence.getChrom().equals(ctx.getContig()))
{
LOG.info("getting genomic Sequence for "+gene.getContig());
genomicSequence=new GenomicSequence(this.indexedFastaSequenceFile, gene.getContig());
}
final int positionContext0 = ctx.getStart() -1;
Variant variant=null;
final StringBuilder wildRNA=new StringBuilder(1000);
final MutedSequence mutRNA=new MutedSequence(wildRNA);
if(gene.isPositiveStrand())
{
for(int exon_index=0;exon_index< gene.getExonCount();++exon_index)
{
final KnownGene.Exon exon= gene.getExon(exon_index);
int genomicPosition = Math.max(gene.getCdsStart() , exon.getStart());
for(;;)
{
// we need to consider the last stop codon
if( genomicPosition>=exon.getEnd() ||
genomicPosition>= gene.getCdsEnd())
{
break;
}
wildRNA.append(genomicSequence.charAt(genomicPosition));
if(variant==null && positionContext0 ==genomicPosition)
{
variant = new Variant(ctx,allele,gene);
variant.sorting_id = ID_GENERATOR++;
variant.position_in_cdna=wildRNA.length()-1;
char mutBase= allele.getBaseString().charAt(0);
mutRNA.setMutation( wildRNA.length()-1, wildRNA.length(),""+mutBase );
}
++genomicPosition;
}
}
}
else
{
int exon_index = gene.getExonCount()-1;
while(exon_index >=0)
{
final KnownGene.Exon exon= gene.getExon(exon_index);
int genomicPosition = Math.min(gene.getCdsEnd()-1, exon.getEnd()-1);
for(;;) {
if( genomicPosition< exon.getStart() ||
genomicPosition<gene.getCdsStart())
{
break;
}
wildRNA.append(AcidNucleics.complement(genomicSequence.charAt(genomicPosition)));
if(variant==null && positionContext0 == genomicPosition )
{
variant = new Variant(ctx,allele,gene);
variant.sorting_id = ID_GENERATOR++;
variant.position_in_cdna=wildRNA.length()-1;
char mutBase=AcidNucleics.complement(allele.getBaseString().charAt(0));
mutRNA.setMutation( wildRNA.length()-1, wildRNA.length(),""+mutBase );
}
--genomicPosition;
}
--exon_index;
}
}
if(variant!=null)
{
variant.wildCodon="";
variant.mutCodon="";
for(int i=0;i< 3;++i)
{
int pos = variant.codonStart()+i;
variant.wildCodon += (pos< wildRNA.length()?wildRNA.charAt(pos):'*');
variant.mutCodon += (pos< mutRNA.length()?mutRNA.charAt(pos):'*');
}
variant.wildCodon = variant.wildCodon.toUpperCase();
variant.mutCodon = variant.mutCodon.toUpperCase();
variant.vcfLine = vcfLine;
if(variant.wildCodon.equals(variant.mutCodon)) {
LOG.info("Uh??????? "+allele+" "+ctx);
return;
}
this.variants.add(variant);
}
}
@Override
public int doWork(List<String> args) {
if(this.referenceFile==null)
{
LOG.error("Undefined REFERENCE");
return -1;
}
if(this.kgURI==null || this.kgURI.trim().isEmpty())
{
LOG.error("Undefined kgURI.");
return -1;
}
return doVcfToVcf(args,outputFile);
}
public static void main(String[] args)
{
new VCFCombineTwoSnvs().instanceMainWithExit(args);
}
}