package picard.vcf;
import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.liftover.LiftOver;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFFilterHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.vcf.VCFRecordCodec;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.VcfOrBcf;
import java.io.File;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Tool for lifting over a VCF to another genome build and producing a properly header'd,
* sorted and indexed VCF in one go.
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = LiftoverVcf.USAGE_SUMMARY + LiftoverVcf.USAGE_DETAILS,
usageShort = LiftoverVcf.USAGE_SUMMARY,
programGroup = VcfOrBcf.class
)
public class LiftoverVcf extends CommandLineProgram {
static final String USAGE_SUMMARY = "Lifts over a VCF file from one reference build to another. ";
static final String USAGE_DETAILS = "This tool adjusts the coordinates of variants within a VCF file to match a new reference. The " +
"output file will be sorted and indexed using the target reference build. To be clear, REFERENCE_SEQUENCE should be the " +
"<em>target</em> reference build. The tool is based on the UCSC liftOver tool (see: http://genome.ucsc.edu/cgi-bin/hgLiftOver) " +
"and uses a UCSC chain file to guide its operation. <br /><br />" +
"Note that records may be rejected because they cannot be lifted over or because of sequence incompatibilities between the " +
"source and target reference genomes. Rejected records will be emitted with filters to the REJECT file, using the source " +
"genome coordinates.<br />" +
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar LiftoverVcf \\<br />" +
" I=input.vcf \\<br />" +
" O=lifted_over.vcf \\<br />" +
" CHAIN=b37tohg19.chain \\<br />" +
" REJECT=rejected_variants.vcf \\<br />" +
" R=reference_sequence.fasta" +
"</pre>" +
"For additional information, please see: http://genome.ucsc.edu/cgi-bin/hgLiftOver" +
"<hr />";
@Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc="The input VCF/BCF file to be lifted over.")
public File INPUT;
@Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="The output location to write the lifted over VCF/BCF to.")
public File OUTPUT;
@Option(shortName="C", doc="The liftover chain file. See https://genome.ucsc.edu/goldenPath/help/chain.html for a description" +
" of chain files. See http://hgdownload.soe.ucsc.edu/downloads.html#terms for where to download chain files.")
public File CHAIN;
@Option(doc="File to which to write rejected records.")
public File REJECT;
@Option(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, common=false,
doc = "The reference sequence (fasta) for the TARGET genome build. The fasta file must have an " +
"accompanying sequence dictionary (.dict file).")
public File REFERENCE_SEQUENCE = Defaults.REFERENCE_FASTA;
// Option on whether or not to provide a warning, or error message and exit if a missing contig is encountered
@Option(shortName = "WMC", doc = "Warn on missing contig.", optional = true)
public boolean WARN_ON_MISSING_CONTIG = false;
// Option on whether or not to write the original contig/position of the variant to the INFO field
@Option(doc = "Write the original contig/position for lifted variants to the INFO field.", optional = true)
public boolean WRITE_ORIGINAL_POSITION = false;
@Option(doc = "The minimum percent match required for a variant to be lifted.", optional = true)
public double LIFTOVER_MIN_MATCH = 1.0;
@Option(doc = "Allow INFO and FORMAT in the records that are not found in the header", optional = true)
public boolean ALLOW_MISSING_FIELDS_IN_HEADER = false;
// When a contig used in the chain is not in the reference, exit with this value instead of 0.
protected static int EXIT_CODE_WHEN_CONTIG_NOT_IN_REFERENCE = 1;
/** Filter name to use when a target cannot be lifted over. */
public static final String FILTER_CANNOT_LIFTOVER_INDEL = "ReverseComplementedIndel";
/** Filter name to use when a target cannot be lifted over. */
public static final String FILTER_NO_TARGET = "NoTarget";
/** Filter name to use when a target is lifted over, but the reference allele doens't match the new reference. */
public static final String FILTER_MISMATCHING_REF_ALLELE = "MismatchedRefAllele";
/** Filters to be added to the REJECT file. */
private static final List<VCFFilterHeaderLine> FILTERS = CollectionUtil.makeList(
new VCFFilterHeaderLine(FILTER_CANNOT_LIFTOVER_INDEL, "Indel falls into a reverse complemented region in the target genome."),
new VCFFilterHeaderLine(FILTER_NO_TARGET, "Variant could not be lifted between genome builds."),
new VCFFilterHeaderLine(FILTER_MISMATCHING_REF_ALLELE, "Reference allele does not match reference genome sequence after liftover.")
);
/** Attribute used to store the name of the source contig/chromosome prior to liftover. */
public static final String ORIGINAL_CONTIG = "OriginalContig";
/** Attribute used to store the position of the variant on the source contig prior to liftover. */
public static final String ORIGINAL_START = "OriginalStart";
/** Metadata to be added to the Passing file. */
private static final List<VCFInfoHeaderLine> ATTRS = CollectionUtil.makeList(
new VCFInfoHeaderLine(ORIGINAL_CONTIG, 1, VCFHeaderLineType.String, "The name of the source contig/chromosome prior to liftover."),
new VCFInfoHeaderLine(ORIGINAL_START, 1, VCFHeaderLineType.String, "The position of the variant on the source contig prior to liftover.")
);
private final Log log = Log.getInstance(LiftoverVcf.class);
// Stock main method
public static void main(final String[] args) {
new LiftoverVcf().instanceMainWithExit(args);
}
@Override protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
IOUtil.assertFileIsReadable(CHAIN);
IOUtil.assertFileIsWritable(OUTPUT);
IOUtil.assertFileIsWritable(REJECT);
////////////////////////////////////////////////////////////////////////
// Setup the inputs
////////////////////////////////////////////////////////////////////////
final LiftOver liftOver = new LiftOver(CHAIN);
final VCFFileReader in = new VCFFileReader(INPUT, false);
log.info("Loading up the target reference genome.");
final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
final Map<String,byte[]> refSeqs = new HashMap<String,byte[]>();
for (final SAMSequenceRecord rec: walker.getSequenceDictionary().getSequences()) {
refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
}
CloserUtil.close(walker);
////////////////////////////////////////////////////////////////////////
// Setup the outputs
////////////////////////////////////////////////////////////////////////
final VCFHeader inHeader = in.getFileHeader();
final VCFHeader outHeader = new VCFHeader(inHeader);
outHeader.setSequenceDictionary(walker.getSequenceDictionary());
if (WRITE_ORIGINAL_POSITION) {
for (final VCFInfoHeaderLine line : ATTRS) outHeader.addMetaDataLine(line);
}
final VariantContextWriter out = new VariantContextWriterBuilder().setOption(Options.INDEX_ON_THE_FLY)
.modifyOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER, ALLOW_MISSING_FIELDS_IN_HEADER)
.setOutputFile(OUTPUT).setReferenceDictionary(walker.getSequenceDictionary()).build();
out.writeHeader(outHeader);
final VariantContextWriter rejects = new VariantContextWriterBuilder().setOutputFile(REJECT).unsetOption(Options.INDEX_ON_THE_FLY)
.modifyOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER, ALLOW_MISSING_FIELDS_IN_HEADER)
.build();
final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
if (WRITE_ORIGINAL_POSITION) {
for (final VCFInfoHeaderLine line : ATTRS) rejectHeader.addMetaDataLine(line);
}
rejects.writeHeader(rejectHeader);
////////////////////////////////////////////////////////////////////////
// Read the input VCF, lift the records over and write to the sorting
// collection.
////////////////////////////////////////////////////////////////////////
long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
log.info("Lifting variants over and sorting.");
final SortingCollection<VariantContext> sorter = SortingCollection.newInstance(VariantContext.class,
new VCFRecordCodec(outHeader, ALLOW_MISSING_FIELDS_IN_HEADER || VALIDATION_STRINGENCY != ValidationStringency.STRICT),
outHeader.getVCFRecordComparator(),
MAX_RECORDS_IN_RAM,
TMP_DIR);
ProgressLogger progress = new ProgressLogger(log, 1000000, "read");
// a mapping from original allele to reverse complemented allele
final Map<Allele, Allele> reverseComplementAlleleMap = new HashMap<Allele, Allele>(10);
for (final VariantContext ctx : in) {
++total;
final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
final Interval target = liftOver.liftOver(source, LIFTOVER_MIN_MATCH);
// if the target is null OR (the target is reverse complemented AND the variant is an indel or mixed), then we cannot lift it over
if (target == null || (target.isNegativeStrand() && (ctx.isMixed() || ctx.isIndel()))) {
final String reason = (target == null) ? FILTER_NO_TARGET : FILTER_CANNOT_LIFTOVER_INDEL;
rejects.add(new VariantContextBuilder(ctx).filter(reason).make());
failedLiftover++;
} else if (!refSeqs.containsKey(target.getContig())) {
rejects.add(new VariantContextBuilder(ctx).filter(FILTER_NO_TARGET).make());
failedLiftover++;
String missingContigMessage = "Encountered a contig, " + target.getContig() + " that is not part of the target reference.";
if(WARN_ON_MISSING_CONTIG) {
log.warn(missingContigMessage);
} else {
log.error(missingContigMessage);
return EXIT_CODE_WHEN_CONTIG_NOT_IN_REFERENCE;
}
} else {
// Fix the alleles if we went from positive to negative strand
reverseComplementAlleleMap.clear();
final List<Allele> alleles = new ArrayList<Allele>();
for (final Allele oldAllele : ctx.getAlleles()) {
if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
alleles.add(oldAllele);
}
else {
final Allele fixedAllele = Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference());
alleles.add(fixedAllele);
reverseComplementAlleleMap.put(oldAllele, fixedAllele);
}
}
// Build the new variant context
final VariantContextBuilder builder = new VariantContextBuilder(
ctx.getSource(),
target.getContig(),
target.getStart(),
target.getEnd(),
alleles);
builder.id(ctx.getID());
builder.attributes(ctx.getAttributes());
if (WRITE_ORIGINAL_POSITION) {
builder.attribute(ORIGINAL_CONTIG, source.getContig());
builder.attribute(ORIGINAL_START, source.getStart());
}
builder.genotypes(fixGenotypes(ctx.getGenotypes(), reverseComplementAlleleMap));
builder.filters(ctx.getFilters());
builder.log10PError(ctx.getLog10PError());
// Check that the reference allele still agrees with the reference sequence
boolean mismatchesReference = false;
for (final Allele allele : builder.getAlleles()) {
if (allele.isReference()) {
final byte[] ref = refSeqs.get(target.getContig());
final String refString = StringUtil.bytesToString(ref, target.getStart()-1, target.length());
if (!refString.equalsIgnoreCase(allele.getBaseString())) {
mismatchesReference = true;
}
break;
}
}
if (mismatchesReference) {
rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
failedAlleleCheck++;
}
else {
sorter.add(builder.make());
}
}
progress.record(ctx.getContig(), ctx.getStart());
}
final NumberFormat pfmt = new DecimalFormat("0.0000%");
final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
log.info("Processed ", total, " variants.");
log.info(failedLiftover, " variants failed to liftover.");
log.info(failedAlleleCheck, " variants lifted over but had mismatching reference alleles after lift over.");
log.info(pct, " of variants were not successfully lifted over and written to the output.");
rejects.close();
in.close();
////////////////////////////////////////////////////////////////////////
// Write the sorted outputs to the final output file
////////////////////////////////////////////////////////////////////////
sorter.doneAdding();
progress = new ProgressLogger(log, 1000000, "written");
log.info("Writing out sorted records to final VCF.");
for (final VariantContext ctx : sorter) {
out.add(ctx);
progress.record(ctx.getContig(), ctx.getStart());
}
out.close();
sorter.cleanup();
return 0;
}
protected static GenotypesContext fixGenotypes(final GenotypesContext originals, final Map<Allele, Allele> reverseComplementAlleleMap) {
// optimization: if nothing needs to be fixed then don't bother
if ( reverseComplementAlleleMap.isEmpty() ) {
return originals;
}
final GenotypesContext fixedGenotypes = GenotypesContext.create(originals.size());
for ( final Genotype genotype : originals ) {
final List<Allele> fixedAlleles = new ArrayList<Allele>();
for ( final Allele allele : genotype.getAlleles() ) {
final Allele fixedAllele = reverseComplementAlleleMap.containsKey(allele) ? reverseComplementAlleleMap.get(allele) : allele;
fixedAlleles.add(fixedAllele);
}
fixedGenotypes.add(new GenotypeBuilder(genotype).alleles(fixedAlleles).make());
}
return fixedGenotypes;
}
}