package org.opencb.opencga.storage.hadoop.variant.exporters;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.variant.adaptors.VariantSourceDBAdaptor;
import org.opencb.opencga.storage.core.variant.io.VariantVcfDataWriter;
import org.opencb.opencga.storage.hadoop.variant.adaptors.HadoopVariantSourceDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableHelper;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;

/**
 * Hadoop {@link FileOutputFormat} that writes {@link Variant} records as VCF through a
 * {@link VariantVcfDataWriter}, optionally compressing the output with the configured codec.
 *
 * Created by mh719 on 21/12/2016.
 * @author Matthias Haimel
 */
public class HadoopVcfOutputFormat extends FileOutputFormat<Variant, NullWritable> {

    public HadoopVcfOutputFormat() {
        // do nothing
    }

    @Override
    public RecordWriter<Variant, NullWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        boolean isCompressed = getCompressOutput(job);
        CompressionCodec codec = null;
        String extension = "";
        if (isCompressed) {
            // Resolve the configured compression codec (GzipCodec by default).
            Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
            codec = ReflectionUtils.newInstance(codecClass, conf);
            extension = codec.getDefaultExtension();
        }
        Path file = getDefaultWorkFile(job, extension);
        FileSystem fs = file.getFileSystem(conf);
        FSDataOutputStream fileOut = fs.create(file, false);
        if (!isCompressed) {
            return new VcfRecordWriter(configureWriter(job, fileOut));
        } else {
            DataOutputStream out = new DataOutputStream(codec.createOutputStream(fileOut));
            return new VcfRecordWriter(configureWriter(job, out));
        }
    }

    private VariantVcfDataWriter configureWriter(final TaskAttemptContext job, OutputStream fileOut) {
        // Initialise the "failed" counter so it is available even when no conversion errors occur.
        job.getCounter(VariantVcfDataWriter.class.getName(), "failed").increment(0);
        final Configuration conf = job.getConfiguration();
        boolean withGenotype = conf.getBoolean(VariantTableExportDriver.CONFIG_VARIANT_TABLE_EXPORT_GENOTYPE, false);
        try (VariantTableHelper helper = new VariantTableHelper(conf)) {
            // Study metadata and variant sources are loaded through the variant table helper.
            StudyConfiguration sc = helper.loadMeta();
            VariantSourceDBAdaptor source = new HadoopVariantSourceDBAdaptor(helper);
            QueryOptions options = new QueryOptions();
            VariantVcfDataWriter exporter = new VariantVcfDataWriter(sc, source, fileOut, new Query(), options);
            exporter.setExportGenotype(withGenotype);
            exporter.setConverterErrorListener((v, e) ->
                    job.getCounter(VariantVcfDataWriter.class.getName(), "failed").increment(1));
            exporter.open();
            exporter.pre();
            return exporter;
        } catch (IOException e) {
            throw new IllegalStateException("Problem initializing VariantTableHelper", e);
        }
    }

    protected static class VcfRecordWriter extends RecordWriter<Variant, NullWritable> {
        private final VariantVcfDataWriter writer;

        public VcfRecordWriter(VariantVcfDataWriter writer) {
            this.writer = writer;
        }

        @Override
        public void write(Variant variant, NullWritable nullWritable) throws IOException, InterruptedException {
            writer.write(variant);
        }

        @Override
        public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
            writer.post();
            writer.close();
        }
    }
}
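
// A minimal usage sketch (not part of this class) of how the output format could be attached to a
// MapReduce Job. The job name, output directory and genotype flag value are placeholders; only the
// Hadoop Job/FileOutputFormat calls and the constants referenced above are taken from real API.
//
//     Configuration conf = ...;                        // HBase/cluster configuration (placeholder)
//     conf.setBoolean(VariantTableExportDriver.CONFIG_VARIANT_TABLE_EXPORT_GENOTYPE, true);
//     Job job = Job.getInstance(conf, "export-vcf");   // hypothetical job name
//     job.setOutputFormatClass(HadoopVcfOutputFormat.class);
//     job.setOutputKeyClass(Variant.class);
//     job.setOutputValueClass(NullWritable.class);
//     FileOutputFormat.setOutputPath(job, new Path(outputDir));         // outputDir is a placeholder
//     FileOutputFormat.setCompressOutput(job, true);                    // enables the codec branch in getRecordWriter
//     FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);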