/*
* Copyright 2015 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.hpg.bigdata.core.converters.variation;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderVersion;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.opencb.biodata.formats.variant.vcf4.FullVcfCodec;
import org.opencb.biodata.tools.variant.converters.Converter;
import org.opencb.commons.run.ParallelTaskRunner;
import org.opencb.hpg.bigdata.core.io.VariantContextBlockIterator;
import org.opencb.hpg.bigdata.core.io.avro.AvroEncoder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
/**
* Created by hpccoll1 on 10/04/15.
*/
/**
 * Parallel task that parses VCF lines (as {@link CharBuffer}s) into HTSJDK
 * {@link VariantContext}s, converts them to Avro records and encodes them into
 * {@link ByteBuffer}s, for use with {@link ParallelTaskRunner}.
 * <p>
 * Timing/progress counters are {@code static} on purpose: they aggregate across
 * all task instances (one per worker thread) of a single run; {@link #pre()}
 * resets them and {@link #post()} reports them exactly once.
 *
 * @param <T> concrete Avro record type produced by the converter
 */
public class VariantAvroEncoderTask<T extends GenericRecord> implements ParallelTaskRunner.Task<CharBuffer, ByteBuffer> {

    /** Number of variants between two progress log lines. */
    public static final int BATCH_SIZE = 1000;

    // Pass the Class itself: getLogger(getClass().toString()) would create a logger
    // named "class org.opencb...", breaking logger-hierarchy configuration.
    protected Logger logger = LoggerFactory.getLogger(this.getClass());

//    private final VariantConverterContext variantConverterContext;

    private final VCFHeader header;
    private final AvroEncoder<T> encoder;
    private final Converter<VariantContext, T> converter;
    private final VariantContextBlockIterator variantContextBlockIterator;
    private final FullVcfCodec codec;

    // Shared across all task instances of a run (one instance per worker thread).
    private static AtomicLong numConverts = new AtomicLong(0);
    private static AtomicLong parseTime = new AtomicLong(0);
    private static AtomicLong convertTime = new AtomicLong(0);
    private static AtomicLong encodeTime = new AtomicLong(0);
    private static AtomicBoolean postDone = new AtomicBoolean(false);

    // Per-instance count of variants that failed conversion; each instance is
    // assumed to be used by a single worker thread, so a plain int is sufficient.
    private int failConvert = 0;

    /**
     * @param header    VCF header used to configure the codec
     * @param version   VCF specification version of the input
     * @param converter converter from HTSJDK variant contexts to Avro records
     * @param schema    Avro schema of the produced records
     */
    public VariantAvroEncoderTask(VCFHeader header, VCFHeaderVersion version, Converter<VariantContext, T> converter, Schema schema) {
        this.header = header;
        codec = new FullVcfCodec();
        codec.setVCFHeader(this.header, version);
        this.converter = converter;
        encoder = new AvroEncoder<>(schema, true);
        variantContextBlockIterator = new VariantContextBlockIterator(codec);
        // Genotypes are decoded lazily elsewhere; skip eager decoding here.
        variantContextBlockIterator.setDecodeGenotypes(false);
    }

    /** Resets the shared counters so a new run starts from zero. */
    @Override
    public void pre() {
        numConverts.set(0);
        parseTime.set(0);
        convertTime.set(0);
        encodeTime.set(0);
        postDone.set(false);
    }

    /**
     * Parses, converts and Avro-encodes one batch of VCF lines.
     * Variants that fail conversion are logged and skipped; an encoding failure
     * drops the whole batch and returns an empty list (best-effort behavior).
     *
     * @param charBufferList batch of raw VCF data lines
     * @return Avro-encoded variants, possibly fewer than the input size
     */
    @Override
    public List<ByteBuffer> apply(List<CharBuffer> charBufferList) {
        List<T> convertedList = new ArrayList<>(charBufferList.size());
        List<ByteBuffer> encoded;
        logProgress(charBufferList.size());

        //Parse from CharBuffer to VariantContext
        long start = System.nanoTime();
        List<VariantContext> variantContexts = variantContextBlockIterator.convert(charBufferList);
        parseTime.addAndGet(System.nanoTime() - start);

        // Convert to Variants
        start = System.nanoTime();
        for (VariantContext variantContext : variantContexts) {
            try {
                convertedList.add(converter.convert(variantContext));
            } catch (Exception e) {
                // Skip the offending variant but keep the cause in the log.
                logger.error("Failed to convert variant context", e);
                failConvert++;
            }
        }
        convertTime.addAndGet(System.nanoTime() - start);
        logger.debug("[{}] Processed {} variants into {} avro variants",
                Thread.currentThread().getName(), variantContexts.size(), convertedList.size());

        //Encode with Avro
        try {
            start = System.nanoTime();
            encoded = encoder.encode(convertedList);
            encodeTime.addAndGet(System.nanoTime() - start);
            logger.debug("[{}] Processed {} avro variants into {} encoded variants",
                    Thread.currentThread().getName(), convertedList.size(), encoded.size());
            return encoded;
        } catch (IOException e) {
            // Best-effort: drop this batch rather than aborting the whole run.
            logger.error("Failed to encode " + convertedList.size() + " avro variants", e);
            return Collections.emptyList();
        }
    }

    /**
     * Emits an INFO line each time the shared counter crosses a BATCH_SIZE boundary.
     *
     * @param size number of variants about to be processed by this call
     */
    private void logProgress(int size) {
        long num = numConverts.getAndAdd(size);
        long batch = num / BATCH_SIZE;
        long newBatch = (num + size) / BATCH_SIZE;
        if (batch != newBatch) {
            logger.info("Num processed variants: {}", newBatch * BATCH_SIZE);
        }
    }

    /** Logs the aggregated timings once, no matter how many task instances finish. */
    @Override
    public void post() {
        if (!postDone.getAndSet(true)) {
            logger.debug("parseTime = " + parseTime.get() / 1000000000.0 + "s");
            logger.debug("convertTime = " + convertTime.get() / 1000000000.0 + "s");
            logger.debug("encodeTime = " + encodeTime.get() / 1000000000.0 + "s");
        }
    }
}