/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.core.variant.transform; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectWriter; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderVersion; import org.apache.avro.generic.GenericRecord; import org.opencb.biodata.formats.variant.vcf4.FullVcfCodec; import org.opencb.biodata.models.variant.*; import org.opencb.biodata.models.variant.avro.FileEntry; import org.opencb.biodata.models.variant.exceptions.NotAVariantException; import org.opencb.biodata.tools.variant.converters.avro.VariantContextToVariantConverter; import org.opencb.biodata.tools.variant.stats.VariantGlobalStatsCalculator; import org.opencb.commons.run.ParallelTaskRunner; import org.opencb.opencga.storage.core.io.plain.StringDataWriter; import org.opencb.opencga.storage.core.variant.io.json.mixin.GenericRecordAvroJsonMixin; import org.opencb.opencga.storage.core.variant.io.json.mixin.VariantSourceJsonMixin; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.file.Path; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.concurrent.atomic.AtomicLong; import java.util.function.BiConsumer; /** * Created on 25/02/16. * * @author Jacobo Coll <jacobo167@gmail.com> */ public abstract class VariantTransformTask<T> implements ParallelTaskRunner.Task<String, T> { protected final VariantFactory factory; protected final VariantSource source; protected boolean includeSrc = false; protected final Logger logger = LoggerFactory.getLogger(VariantAvroTransformTask.class); protected final VCFCodec vcfCodec; protected final VariantContextToVariantConverter converter; protected final VariantNormalizer normalizer; protected final Path outputFileJsonFile; protected final VariantGlobalStatsCalculator variantStatsTask; protected final AtomicLong htsConvertTime = new AtomicLong(0); protected final AtomicLong biodataConvertTime = new AtomicLong(0); protected final AtomicLong normTime = new AtomicLong(0); protected final List<BiConsumer<String, RuntimeException>> errorHandlers = new ArrayList<>(); protected boolean failOnError = false; public VariantTransformTask(VariantFactory factory, VariantSource source, Path outputFileJsonFile, VariantGlobalStatsCalculator variantStatsTask, boolean includesrc) { this.factory = factory; this.source = source; this.outputFileJsonFile = outputFileJsonFile; this.variantStatsTask = variantStatsTask; this.includeSrc = includesrc; this.vcfCodec = null; this.converter = null; this.normalizer = null; } public VariantTransformTask(VCFHeader header, VCFHeaderVersion version, VariantSource source, Path outputFileJsonFile, VariantGlobalStatsCalculator variantStatsTask, boolean includeSrc, boolean generateReferenceBlocks) { this.variantStatsTask = variantStatsTask; this.factory = null; this.source = source; this.outputFileJsonFile = outputFileJsonFile; this.includeSrc = includeSrc; this.vcfCodec = new FullVcfCodec(); this.vcfCodec.setVCFHeader(header, version); this.converter = new VariantContextToVariantConverter(source.getStudyId(), source.getFileId(), source.getSamples()); this.normalizer = new VariantNormalizer(); normalizer.setGenerateReferenceBlocks(generateReferenceBlocks); } @Override public void pre() { synchronized (variantStatsTask) { variantStatsTask.pre(); } } @Override public List<T> apply(List<String> batch) { List<Variant> transformedVariants = new ArrayList<>(batch.size()); logger.debug("Transforming {} lines", batch.size()); long curr; if (factory != null) { for (String line : batch) { if (line.startsWith("#") || line.trim().isEmpty()) { continue; } List<Variant> variants; try { curr = System.currentTimeMillis(); variants = factory.create(source, line); this.biodataConvertTime.addAndGet(System.currentTimeMillis() - curr); for (Variant variant : variants) { if (!includeSrc) { for (StudyEntry studyEntry : variant.getStudies()) { for (FileEntry fileEntry : studyEntry.getFiles()) { if (fileEntry.getAttributes().containsKey(VariantVcfFactory.SRC)) { fileEntry.getAttributes().remove(VariantVcfFactory.SRC); } } } } transformedVariants.add(variant); } variantStatsTask.apply(variants); } catch (NotAVariantException ignore) { variants = Collections.emptyList(); } catch (RuntimeException e) { onError(e, line); } } } else { List<VariantContext> variantContexts = new ArrayList<>(batch.size()); curr = System.currentTimeMillis(); for (String line : batch) { if (line.startsWith("#") || line.trim().isEmpty()) { continue; } try { variantContexts.add(vcfCodec.decode(line)); } catch (RuntimeException e) { onError(e, line); } } this.htsConvertTime.addAndGet(System.currentTimeMillis() - curr); curr = System.currentTimeMillis(); List<Variant> variants = converter.apply(variantContexts); this.biodataConvertTime.addAndGet(System.currentTimeMillis() - curr); curr = System.currentTimeMillis(); List<Variant> normalizedVariants = new ArrayList<>((int) (variants.size() * 1.1)); for (Variant variant : variants) { try { normalizedVariants.addAll(normalizer.normalize(Collections.singletonList(variant), true)); } catch (Exception e) { logger.error("Error parsing variant " + variant); throw new IllegalStateException(e); } } this.normTime.addAndGet(System.currentTimeMillis() - curr); variantStatsTask.apply(normalizedVariants); transformedVariants.addAll(normalizedVariants); } return encodeVariants(transformedVariants); } private void onError(RuntimeException e, String line) { logger.error("Error parsing line: {}", line); for (BiConsumer<String, RuntimeException> handler : errorHandlers) { handler.accept(line, e); } if (failOnError) { throw e; } } @Override public void post() { synchronized (variantStatsTask) { variantStatsTask.post(); } ObjectMapper jsonObjectMapper = new ObjectMapper(); jsonObjectMapper.addMixIn(VariantSource.class, VariantSourceJsonMixin.class); jsonObjectMapper.addMixIn(GenericRecord.class, GenericRecordAvroJsonMixin.class); ObjectWriter variantSourceObjectWriter = jsonObjectMapper.writerFor(VariantSource.class); try { String sourceJsonString = variantSourceObjectWriter.writeValueAsString(source); StringDataWriter.write(outputFileJsonFile, Collections.singletonList(sourceJsonString)); } catch (JsonProcessingException e) { e.printStackTrace(); } logger.debug("Time txt2hts: " + this.htsConvertTime.get()); logger.debug("Time hts2biodata: " + this.biodataConvertTime.get()); logger.debug("Time normalization: " + this.normTime.get()); } public boolean isIncludeSrc() { return includeSrc; } public VariantTransformTask setIncludeSrc(boolean includeSrc) { this.includeSrc = includeSrc; return this; } public VariantTransformTask addMalformedErrorHandler(BiConsumer<String, RuntimeException> handler) { errorHandlers.add(handler); return this; } public VariantTransformTask setFailOnError(boolean failOnError) { this.failOnError = failOnError; return this; } protected abstract List<T> encodeVariants(List<Variant> variants); }