/* * Copyright 2015 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * */ package org.opencb.hpg.bigdata.core.converters.variation; import htsjdk.tribble.AbstractFeatureReader; import htsjdk.tribble.FeatureReader; import htsjdk.tribble.TribbleException; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import org.apache.avro.file.CodecFactory; import org.apache.commons.lang3.NotImplementedException; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.StopWatch; import org.ga4gh.models.*; import org.opencb.biodata.formats.variant.vcf4.FullVcfCodec; import org.opencb.hpg.bigdata.core.converters.Converter; import org.opencb.hpg.bigdata.core.io.avro.AvroWriter; import org.opencb.hpg.bigdata.core.utils.AvroUtils; import java.io.*; import java.util.*; import java.util.Map.Entry; import java.util.concurrent.atomic.AtomicReference; /** * Chr20 of 1000 genomes -> 855,196 rows = 30 header, 855,166 rows. * * avro.snappy: 3GB -> 12.0 GB (58 min) * avro.deflate: 3GB -> 7.2 GB (82 min) * * * @author mh719 * */ public class VariantContext2VariantConverter implements Converter<VariantContext, Variant> { private final AtomicReference<VariantConverterContext> ctx = new AtomicReference<>(); public static final String VCF_FILTER_COLUMN = "FILTER"; public static final String VCF_QUAL_COLUMN = "QUAL"; private String variantSetId = ""; public static void main(String[] args) { convert(args[0], args[1]); } public static void convert(String input, String output) { StopWatch sw = new StopWatch(); sw.start(); File file = new File(input); long cnt = 0; File outFile = new File(output); try (OutputStream os = new FileOutputStream(outFile); OutputStream callOs = new FileOutputStream(new File(output + ".Call")); OutputStream vsOs = new FileOutputStream(new File(output + ".variantSet"))) { CodecFactory codec = AvroUtils.getCodec("deflate"); // CompressionUtils.getCodec("snappy"); AvroWriter<Variant> writer = new AvroWriter<>(Variant.getClassSchema(), codec, os); AvroWriter<CallSet> callWriter = new AvroWriter<>(CallSet.getClassSchema(), codec, callOs); AvroWriter<VariantSet> vsWriter = new AvroWriter<>(VariantSet.getClassSchema(), codec, vsOs); Converter<String, CallSet> gtConverter = new Genotype2CallSet(); Converter<VCFHeaderLine, VariantSetMetadata> infoConverter = new VcfHeaderLine2VariantSetMetadataConverter(); VariantContext2VariantConverter varConverter = new VariantContext2VariantConverter(); try (FeatureReader<VariantContext> freader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new FullVcfCodec(), false)) { VCFHeader header = (VCFHeader) freader.getHeader(); int gtSize = header.getGenotypeSamples().size(); VariantConverterContext ctx = new VariantConverterContext(); VariantSet vs = new VariantSet(); vs.setId(file.getName()); vs.setDatasetId(file.getName()); vs.setReferenceSetId("test"); List<String> genotypeSamples = header.getGenotypeSamples(); for (int gtPos = 0; gtPos < gtSize; ++gtPos) { CallSet cs = gtConverter.forward(genotypeSamples.get(gtPos)); cs.getVariantSetIds().add(vs.getId()); ctx.getCallSetMap().put(cs.getName(), cs); callWriter.write(cs); } vs.setMetadata(new LinkedList<>()); convertHeaders(infoConverter, header, vs); vsWriter.write(vs); varConverter.setContext(ctx); for (VariantContext c : freader.iterator()) { if (cnt % 1000 == 0) { System.err.println(String.format("Processing variant %s ...", cnt++)); } cnt++; Variant v = varConverter.forward(c); writer.write(v); } } catch (TribbleException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } writer.close(); vsWriter.close(); callWriter.close(); } catch (FileNotFoundException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } sw.stop(); System.err.println(sw.toString()); } /** * Converts INFO, FORMAT, FILTER. * @param headerConverter * @param header * @param vs */ private static void convertHeaders(Converter<VCFHeaderLine, VariantSetMetadata> headerConverter, VCFHeader header, VariantSet vs) { Collection[] coll = new Collection[] { header.getInfoHeaderLines(), header.getFilterLines(), header.getFormatHeaderLines(), // header.getContigLines(), // TODO other formats }; for (Collection<? extends VCFHeaderLine> c : coll) { if (null != c) { for (VCFHeaderLine hl : c) { vs.getMetadata().add(headerConverter.forward(hl)); } } } } public static void print(Variant v) { System.out.println(String.format("%s:%s-%s %s %s %s", v.getReferenceName(), v.getStart(), v.getEnd(), v.getId(), v.getReferenceBases(), v.getAlternateBases())); } @Deprecated public VariantConverterContext getContext() { return ctx.get(); } @Deprecated public void setContext(VariantConverterContext ctx) { this.ctx.set(ctx); } public String getVariantSetId() { return variantSetId; } public VariantContext2VariantConverter setVariantSetId(String variantSetId) { this.variantSetId = variantSetId; return this; } /* * * VariantSetMetadata represents VCF header information. * * `Variant` and `CallSet` both belong to a `VariantSet`. * `VariantSet` belongs to a `Dataset`. * The variant set is equivalent to a VCF file. * * A `CallSet` is a collection of variant calls for a particular sample. It belongs to a `VariantSet`. * This is equivalent to one column in VCF. * A `Call` represents the determination of genotype with respect to a particular `Variant`. * An `AlleleCall` represents the determination of the copy number of a particular `Allele`, * possibly within a certain `Variant`. * A `Variant` represents a change in DNA sequence relative to some reference. For example, * a variant could represent a SNP or an insertion. * Variants belong to a `VariantSet`. This is equivalent to a row in VCF. * * SUMMARY: * VCF File -> VariantSet * Header -> VariantSetMetadata (separate converter) * Row -> Variant * * Whole Column (sample) -> CallSet * One Column entry(sample) -> Call * */ @Override public Variant forward(VariantContext variantContext) { Variant gaVariant = new Variant(); gaVariant.setVariantSetId(variantSetId); gaVariant.setId(variantContext.getContig() + ":" + translateStartPosition(variantContext) + ":" + variantContext.getReference().getBaseString() + ":" + StringUtils.join(variantContext.getAlternateAlleles(), ",")); // Assumed to be related to the avro record long currTime = System.currentTimeMillis(); gaVariant.setCreated(currTime); gaVariant.setUpdated(currTime); /* VCF spec (1-Based): * -> POS - position: The reference position, with the 1st base having position 1. * -> TODO: Telomeres are indicated by using positions 0 or N+1 * GA4GH spec * -> (0-Based) */ gaVariant.setReferenceName(variantContext.getContig()); gaVariant.setStart(translateStartPosition(variantContext)); /* 0-based, exclusive end position [start,end) */ /* 1-based, inclusive end position [start,end] */ gaVariant.setEnd((long) variantContext.getEnd()); List<String> nameList = Collections.emptyList(); if (StringUtils.isNotBlank(variantContext.getID())) { nameList = Arrays.asList(variantContext.getID().split(VCFConstants.ID_FIELD_SEPARATOR)); } gaVariant.setNames(nameList); /* Classic mode */ // Reference Allele refAllele = variantContext.getReference(); gaVariant.setReferenceBases(refAllele.getBaseString()); assert Long.compare(gaVariant.getEnd() - gaVariant.getStart(), gaVariant.getReferenceBases().length()) == 0; // Alt List<Allele> altAllele = variantContext.getAlternateAlleles(); List<String> altList = new ArrayList<>(altAllele.size()); for (Allele allele : altAllele) { altList.add(allele.getBaseString()); } gaVariant.setAlternateBases(altList); /* Graph mode */ // NOT supported // gaVariant.setAlleleIds(value); // INFO Map<String, List<String>> info = convertInfo(variantContext); gaVariant.setInfo(info); // QUAL info.put(VCF_QUAL_COLUMN, Collections.singletonList(Double.toString(variantContext.getPhredScaledQual()))); // FILTER Set<String> filter = variantContext.getFilters(); if (filter.isEmpty()) { info.put(VCF_FILTER_COLUMN, Collections.singletonList(VCFConstants.PASSES_FILTERS_v4)); } else { info.put(VCF_FILTER_COLUMN, new ArrayList<>(filter)); } // Genotypes (Call's) gaVariant.setCalls(convertCalls(variantContext)); return gaVariant; } /** * Convert HTS Genotypes to GA4GH {@link Call}. * * @param context {@link VariantContext} * @return {@link List} of {@link Call} */ @SuppressWarnings("deprecation") private List<Call> convertCalls(VariantContext context) { GenotypesContext gtc = context.getGenotypes(); // Create Allele mapping Map<String, Integer> alleleMap = new HashMap<>(); int i = 0; alleleMap.put(context.getReference().getBaseString(), i++); for (Allele a : context.getAlternateAlleles()) { alleleMap.put(a.getBaseString(), i++); } int size = gtc.size(); List<Call> callList = new ArrayList<>(size); for (Genotype gt : gtc) { String name = gt.getSampleName(); // CallSet cs = getContext().getCallSetMap().get(name); Call c = new Call(); c.setCallSetId(name); // c.setCallSetId(cs.getId()); // c.setCallSetName(cs.getName()); // TODO is this really needed 2x c.setCallSetName(StringUtils.EMPTY); List<Integer> cgt = new ArrayList<>(gt.getPloidy()); boolean unknownGT = false; // TODO issue with ./. -> propose ignore call for (Allele a : gt.getAlleles()) { String baseString = a.getBaseString(); Integer e = alleleMap.get(baseString); if (null == e) { unknownGT = true; break; } cgt.add(e); } if (unknownGT) { continue; // ignore Call -> Genotype not known (e.g. "./.") } c.setGenotype(cgt); c.setPhaseset(Boolean.valueOf(gt.isPhased()).toString()); // TODO decide what's the correct string Map<String, List<String>> infoMap = new HashMap<>(); if (gt.hasAD()) { int[] ad = gt.getAD(); List<String> adl = new ArrayList<>(ad.length); for (int val : ad) { adl.add(Integer.toString(val)); } infoMap.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, adl); } if (gt.hasDP()) { infoMap.put(VCFConstants.DEPTH_KEY, Collections.singletonList(Integer.toString(gt.getDP()))); } if (gt.hasGQ()) { infoMap.put(VCFConstants.GENOTYPE_QUALITY_KEY, Collections.singletonList(Integer.toString(gt.getGQ()))); } // Add Filter information to the genotype data // infoMap.put(VFC_FILTER_COLUMN, filters); Map<String, Object> extAttr = gt.getExtendedAttributes(); List<Double> lhList = Collections.emptyList(); if (extAttr != null) { for (Entry<String, Object> extEntry : extAttr.entrySet()) { Object value = extEntry.getValue(); // requires FullVCFCodec implementation to add GL as extra field instead of converting if (StringUtils.equals(extEntry.getKey(), VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { if (null != value) { GenotypeLikelihoods glField = GenotypeLikelihoods.fromGLField((String) value); double[] glv = glField.getAsVector(); lhList = new ArrayList<>(glv.length); for (double d : glv) { lhList.add(d); } } } else { if (value instanceof List) { // TODO Check if this works List vlist = (List) value; List<String> lst = new ArrayList<>(vlist.size()); for (Object o : vlist) { lst.add(o.toString()); } infoMap.put(extEntry.getKey(), lst); } else { infoMap.put(extEntry.getKey(), Collections.singletonList(value.toString())); } } } } c.setGenotypeLikelihood(lhList); c.setInfo(infoMap); callList.add(c); } return callList; } /** * Convert into from HTS to GA4GH. * @param c {@link VariantContext} * @return Map of info information */ private Map<String, List<String>> convertInfo(VariantContext c) { Map<String, List<String>> infoMap = new HashMap<>(); Map<String, Object> attributes = c.getAttributes(); for (Entry<String, Object> a : attributes.entrySet()) { String key = a.getKey(); Object value = a.getValue(); final List<String> valList; if (value == null) { // don't do anything valList = Collections.emptyList(); } else if (value instanceof List) { List l = (List) value; valList = new ArrayList<>(l.size()); for (Object o : l) { valList.add(o.toString()); } } else { String o = value.toString(); valList = Arrays.asList(o); } infoMap.put(key, valList); } return infoMap; } /** * Translate from 1-based to 0-based. * @param c VariantContext object * @return 0-based start position */ private Long translateStartPosition(VariantContext c) { int val = c.getStart(); if (val == 0) { return 0L; // TODO: test if this is correct for Telomeres } return Long.valueOf(val - 1); } @Override public VariantContext backward(Variant obj) { throw new NotImplementedException(""); } }