/* The MIT License (MIT) Copyright (c) 2017 Pierre Lindenbaum Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. History: * 2014 creation * 2015 moving to knime */ package com.github.lindenb.jvarkit.tools.burden; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.RuntimeIOException; import htsjdk.samtools.util.SortingCollection; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import com.beust.jcommander.Parameter; import com.beust.jcommander.ParametersDelegate; import com.github.lindenb.jvarkit.io.NullOuputStream; import com.github.lindenb.jvarkit.util.EqualRangeIterator; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; import com.github.lindenb.jvarkit.util.picard.AbstractDataCodec; import com.github.lindenb.jvarkit.util.vcf.DelegateVariantContextWriter; import com.github.lindenb.jvarkit.util.vcf.VCFUtils; import com.github.lindenb.jvarkit.util.vcf.VcfIterator; import com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser; import com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParserFactory; import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser; import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction; import com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory; import com.github.lindenb.jvarkit.util.vcf.predictions.AnnPredictionParser.AnnPrediction; /** * @author lindenb * BEGIN_DOC END_DOC */ @Program(name="vcfburdensplitter2",description="new version",keywords={"vcf","burden","gene","vep","snpeff","prediction"}) public class VcfBurdenSplitter2 { //public for knime public static final String DEFAULT_VCF_HEADER_SPLITKEY="VCFBurdenSplitName"; private static final Logger LOG = Logger.build(). prefix("splitter2"). make(); @Parameter(names={"-if","--ignorefilter"},description="accept variants having a FILTER column. Default is ignore variants with a FILTER column") private boolean acceptFiltered = false; @Parameter(names="--maxRecordsInRam",description="Max records in RAM") private int maxRecordsInRam=50000; @Parameter(names="--tmpDir",description="Temporary directory") private File tmpDir = new File(System.getProperty("java.io.tmpdir",".")); @Parameter(names={"-m","--manifestFile"},description="Manifest File") private File manifestFile = null; @Parameter(names={"-t","--tag"},description="Split Key") private String splitInfoKey = DEFAULT_VCF_HEADER_SPLITKEY; public VariantContextWriter open(final VariantContextWriter delegate) { final MyWriter w = new MyWriter(delegate); w.manifestFile=this.manifestFile; w.acceptFiltered=this.acceptFiltered; w.tmpDir=this.tmpDir; w.maxRecordsInRam=this.maxRecordsInRam; w.splitInfoKey=this.splitInfoKey; return w; } private class MyWriter extends DelegateVariantContextWriter { private String prev_contig=null; private SortingCollection<Interval> sortingcollection=null; private PrintWriter manifestWriter=null; private File manifestFile = null; private boolean acceptFiltered = false; private File tmpDir=null; private int maxRecordsInRam; private String splitInfoKey; private AnnPredictionParser annPredictionParser = null; private VepPredictionParser vepPredictionParser = null; MyWriter(final VariantContextWriter w) { super(w); } @Override public void writeHeader(final VCFHeader header) { final VCFHeader header2= new VCFHeader(header); this.annPredictionParser = new AnnPredictionParserFactory(header).get(); this.vepPredictionParser = new VepPredictionParserFactory(header).get(); this.prev_contig=null; if(this.manifestFile==null) { LOG.warning("Manifest file is undefined"); this.manifestWriter=new PrintWriter(new NullOuputStream()); } else { try { this.manifestWriter=new PrintWriter(this.manifestFile); } catch (final FileNotFoundException e) { throw new RuntimeIOException(e); } } header2.addMetaDataLine( new VCFInfoHeaderLine( this.splitInfoKey, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Split Names" )); super.writeHeader(header2); } @Override public void add(final VariantContext ctx) { if(prev_contig==null || !ctx.getContig().equals(prev_contig)) { dump(); prev_contig=ctx.getContig(); } if(ctx.isFiltered() && !this.acceptFiltered) { //add to delegate without registering the SPLIT name super.add(ctx); return; } final Set<String> splitNames = getSplitNamesFor(ctx); if(splitNames.isEmpty()) { super.add(ctx); return; } if(this.sortingcollection==null) { /* create sorting collection for new contig */ this.sortingcollection = SortingCollection.newInstance( Interval.class, new IntervalCodec(), new IntervalComparator(), this.maxRecordsInRam, this.tmpDir ); this.sortingcollection.setDestructiveIteration(true); } for(final String spltiName:splitNames) { this.sortingcollection.add( new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(),false,spltiName)); } super.add(new VariantContextBuilder(ctx).attribute( this.splitInfoKey, new ArrayList<>(splitNames)). make()); } @Override public void close() { dump(); if(manifestWriter!=null) { this.manifestWriter.flush(); if(this.manifestWriter.checkError()) { throw new RuntimeIOException("There was a I/O error when writing the manifest files"); } this.manifestWriter.close(); this.manifestWriter=null; } super.close(); } private Set<String> getSplitNamesFor(final VariantContext ctx){ final Set<String> keys = new HashSet<>(); for(final VepPrediction pred: this.vepPredictionParser.getPredictions(ctx)) { keys.addAll(pred.getGeneKeys().stream().map(S->ctx.getContig()+"_"+S).collect(Collectors.toSet())); } for(final AnnPrediction pred: this.annPredictionParser.getPredictions(ctx)) { keys.addAll(pred.getGeneKeys().stream().map(S->ctx.getContig()+"_"+S).collect(Collectors.toSet())); } /* replace . by _ so we don't have problems with regex later */ return keys.stream(). map(S->S.replace('.', '_').replace('-', '_')). collect(Collectors.toSet()); } private void dump() { if(this.sortingcollection==null || this.manifestWriter==null) return; CloseableIterator<Interval> iter; this.sortingcollection.doneAdding(); iter = this.sortingcollection.iterator(); LOG.info("dumping data for CONTIG: \""+prev_contig+"\""); final EqualRangeIterator<Interval> eqiter = new EqualRangeIterator<>(iter, new Comparator<Interval>() { @Override public int compare(final Interval o1, final Interval o2) { return o1.getName().compareTo(o2.getName()); } }); while(eqiter.hasNext()) { final List<Interval> buffer = eqiter.next(); final Interval first = buffer.get(0); this.manifestWriter.print(first.getContig()); this.manifestWriter.print('\t'); this.manifestWriter.print(buffer.stream().map(I->I.getStart()).min((A,B)->A.compareTo(B)).get()); this.manifestWriter.print('\t'); this.manifestWriter.print(buffer.stream().map(I->I.getEnd()).max((A,B)->A.compareTo(B)).get()); this.manifestWriter.print('\t'); this.manifestWriter.print(first.getName()); this.manifestWriter.print('\t'); this.manifestWriter.print(buffer.size()); this.manifestWriter.println(); this.manifestWriter.flush(); } if(this.manifestWriter.checkError()) { LOG.warn("I/O error when writing manifest"); } eqiter.close(); iter.close();iter=null; //dispose sorting collection sortingcollection.cleanup(); sortingcollection=null; } } private static class IntervalComparator implements Comparator<Interval> { @Override public int compare(final Interval o1, final Interval o2) { int i = o1.getName().compareTo(o2.getName()); if(i!=0) return i; if(!o1.getContig().equals(o2.getContig())) { throw new IllegalStateException("not same contig???"); } i =o1.getStart() - o2.getStart(); if(i!=0) return i; return o1.getEnd() - o2.getEnd(); } } private static class IntervalCodec extends AbstractDataCodec<Interval> { @Override public Interval decode(final DataInputStream dis) throws IOException { String k; try { k=dis.readUTF(); } catch(IOException err) { return null;} final String contig = dis.readUTF(); final int beg= dis.readInt(); final int end= dis.readInt(); return new Interval(contig,beg,end,false,k); } @Override public void encode(final DataOutputStream dos, final Interval object) throws IOException { dos.writeUTF(object.getName()); dos.writeUTF(object.getContig()); dos.writeInt(object.getStart()); dos.writeInt(object.getStart()); } @Override public IntervalCodec clone() { return new IntervalCodec(); } } protected boolean isDebuggingVariant(VariantContext ctx) { return false; } protected String shortName(VariantContext ctx) { return ctx.getContig()+":"+ctx.getStart()+":"+ctx.getAlleles(); } public VcfBurdenSplitter2() { } private static class Launcher extends com.github.lindenb.jvarkit.util.jcommander.Launcher { @Parameter(names={"-ls","--listsplitters"},description="List available splitters and exit") private boolean listSplitters = false; @ParametersDelegate private VcfBurdenSplitter2 instance=new VcfBurdenSplitter2(); @Parameter(names={"-o","--out"},description="Vcf output.") private VariantContextWriter output=new com.github.lindenb.jvarkit.util.jcommander.Launcher.VcfWriterOnDemand(); Launcher() { } @Override public int doWork(final List<String> args) { VariantContextWriter w=null; VcfIterator in=null; try { in = VCFUtils.createVcfIterator(super.oneFileOrNull(args)); w= this.instance.open(output); VCFUtils.copyHeaderAndVariantsTo(in, w); w.close(); return 0; } catch(Exception err) { LOG.fatal(err); return -1; } finally { CloserUtil.close(w); CloserUtil.close(in); } } } public static void main(String[] args) { new Launcher().instanceMainWithExit(args); } }