package org.opencb.hpg.bigdata.core.avro;

import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.MapperFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.avro.VariantAvro;
import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader;
import org.opencb.biodata.tools.variant.stats.VariantGlobalStatsCalculator;
import org.opencb.hpg.bigdata.core.io.avro.AvroFileWriter;

import java.io.*;
import java.util.List;

/**
 * Serializes variants read through a {@link VariantVcfHtsjdkReader} into an Avro
 * container of {@link VariantAvro} records, and writes the accompanying
 * {@link VariantSource} metadata as a JSON file.
 *
 * Created by jtarraga on 03/08/16.
 */
public class VariantAvroSerializer extends AvroSerializer<VariantAvro> {

    /** Number of variants requested from the reader on each read call. */
    private static final int BATCH_SIZE = 1000;

    /** Output name that selects standard output instead of a file. */
    private static final String STDOUT = "STDOUT";

    /**
     * Creates a serializer that uses the "deflate" compression codec.
     */
    public VariantAvroSerializer() {
        super("deflate");
    }

    /**
     * Creates a serializer that uses the given compression codec.
     *
     * @param compression codec name, passed through to the parent serializer
     */
    public VariantAvroSerializer(String compression) {
        super(compression);
    }

    /**
     * Reads every variant from {@code inputStream}, writes those accepted by the
     * configured filters as Avro records and finally stores the source metadata
     * in {@code outputFilename + ".meta.json"}.
     *
     * <p>When {@code outputFilename} is null, empty or {@code "STDOUT"} the Avro
     * data goes to {@link System#out}, which is flushed but left open. The
     * metadata file name is still built by plain string concatenation in that
     * case (e.g. {@code "STDOUT.meta.json"}).
     *
     * <p>The reader, the Avro writer and the output stream are released even if
     * the conversion fails part-way through.
     *
     * @param inputStream    stream the variant reader consumes
     * @param outputFilename target Avro file, or null/empty/"STDOUT" for standard output
     * @throws IOException if the output cannot be opened or written
     */
    public void toAvro(InputStream inputStream, String outputFilename) throws IOException {
        // reader
        String metaFilename = outputFilename + ".meta";
        VariantSource variantSource = new VariantSource(metaFilename, "0", "0", "s");
        VariantVcfHtsjdkReader vcfReader = new VariantVcfHtsjdkReader(inputStream, variantSource, null);
        vcfReader.open();
        try {
            vcfReader.pre();

            // writer
            try (OutputStream outputStream = openOutputStream(outputFilename)) {
                AvroFileWriter<VariantAvro> avroFileWriter =
                        new AvroFileWriter<>(VariantAvro.SCHEMA$, compression, outputStream);
                avroFileWriter.open();
                try {
                    VariantGlobalStatsCalculator statsCalculator =
                            new VariantGlobalStatsCalculator(vcfReader.getSource());
                    statsCalculator.pre();

                    // main loop: an empty batch signals the end of the input
                    List<Variant> variants = vcfReader.read(BATCH_SIZE);
                    while (!variants.isEmpty()) {
                        // write variants and update stats
                        for (Variant variant : variants) {
                            if (filter(variant.getImpl())) {
                                avroFileWriter.writeDatum(variant.getImpl());
                                statsCalculator.updateGlobalStats(variant);
                            }
                        }
                        variants = vcfReader.read(BATCH_SIZE);
                    }

                    writeSourceMetadata(variantSource, metaFilename + ".json");
                    vcfReader.post();
                } finally {
                    avroFileWriter.close();
                }
            }
        } finally {
            vcfReader.close();
        }
    }

    /**
     * Adds a filter that keeps only variants on the region's chromosome whose
     * [start, end] interval overlaps the region's [start, end] interval.
     *
     * @param region region the variants must overlap
     * @return this serializer, to allow call chaining
     */
    public VariantAvroSerializer addRegionFilter(Region region) {
        getFilters().add(v -> v.getChromosome().equals(region.getChromosome())
                && v.getEnd() >= region.getStart()
                && v.getStart() <= region.getEnd());
        return this;
    }

    /**
     * Opens the destination for the Avro data.
     *
     * @param outputFilename target file, or null/empty/"STDOUT" for standard output
     * @return a file stream, or a wrapper around {@link System#out} whose
     *         {@code close()} only flushes so the shared stream stays usable
     * @throws FileNotFoundException if the target file cannot be created or opened
     */
    private static OutputStream openOutputStream(String outputFilename) throws FileNotFoundException {
        if (StringUtils.isEmpty(outputFilename) || STDOUT.equals(outputFilename)) {
            return new FilterOutputStream(System.out) {
                @Override
                public void write(byte[] b, int off, int len) throws IOException {
                    // Delegate in bulk; the inherited implementation writes byte by byte.
                    out.write(b, off, len);
                }

                @Override
                public void close() throws IOException {
                    // Never close the process-wide standard output.
                    flush();
                }
            };
        }
        return new FileOutputStream(outputFilename);
    }

    /**
     * Writes the source metadata as pretty-printed JSON, omitting null fields.
     * Jackson opens, encodes (UTF-8) and closes the file itself.
     *
     * @param variantSource source whose underlying model is serialized
     * @param jsonFilename  path of the JSON file to create
     * @throws IOException if the file cannot be written
     */
    private static void writeSourceMetadata(VariantSource variantSource, String jsonFilename)
            throws IOException {
        ObjectMapper mapper = new ObjectMapper();
        mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
        mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true);
        ObjectWriter writer = mapper.writer().withDefaultPrettyPrinter();
        writer.writeValue(new File(jsonFilename), variantSource.getImpl());
    }
}