package org.opencb.opencga.storage.core.variant.io; import com.fasterxml.jackson.databind.ObjectMapper; import org.opencb.biodata.models.variant.Variant; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.io.DataWriter; import org.opencb.commons.run.ParallelTaskRunner; import org.opencb.opencga.core.common.ProgressLogger; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.metadata.StudyConfiguration; import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager; import org.opencb.opencga.storage.core.metadata.ExportMetadata; import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat; import org.opencb.opencga.storage.core.variant.io.db.VariantDBReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.net.URI; import java.nio.file.Paths; import java.util.*; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.zip.GZIPOutputStream; /** * Prints the result of a given query in the selected output format, and the associated metadata. * * This class is intended to be extended by other exporters. * * Created on 06/12/16. * @author Jacobo Coll <jacobo167@gmail.com> */ public class VariantExporter { public static final String METADATA_FILE_EXTENSION = ".meta.json.gz"; private final VariantDBAdaptor dbAdaptor; private final VariantWriterFactory variantWriterFactory; private final Logger logger = LoggerFactory.getLogger(VariantExporter.class); public VariantExporter(VariantDBAdaptor dbAdaptor) { this.dbAdaptor = dbAdaptor; variantWriterFactory = new VariantWriterFactory(dbAdaptor); } /** * Exports the result of the given query and the associated metadata. * @param outputFileUri Optional output file. If null or empty, will print into the Standard output. Won't export any metadata. * @param outputFormat Variant Output format. * @param query Query with the variants to export * @param queryOptions Query options * @throws IOException If there is any IO error * @throws StorageEngineException If there is any error exporting variants */ public void export(@Nullable URI outputFileUri, VariantOutputFormat outputFormat, Query query, QueryOptions queryOptions) throws IOException, StorageEngineException { String outputFile = null; if (outputFileUri != null) { outputFile = outputFileUri.getPath(); } outputFile = VariantWriterFactory.checkOutput(outputFile, outputFormat); List<Integer> studyIds = dbAdaptor.getReturnedStudies(query, QueryOptions.empty()); try (OutputStream os = VariantWriterFactory.getOutputStream(outputFile, outputFormat)) { boolean logProgress = !VariantWriterFactory.isStandardOutput(outputFile); exportData(os, outputFormat, query, queryOptions, logProgress); } if (!VariantWriterFactory.isStandardOutput(outputFile)) { exportMetaData(query, queryOptions, studyIds, outputFile + METADATA_FILE_EXTENSION); } } protected void exportData(OutputStream outputStream, VariantOutputFormat outputFormat, Query query, QueryOptions queryOptions, boolean logProgress) throws StorageEngineException, IOException { if (query == null) { query = new Query(); } if (queryOptions == null) { queryOptions = new QueryOptions(); } // DataReader VariantDBReader variantDBReader = new VariantDBReader(dbAdaptor, query, queryOptions); // Task<Variant, Variant> ParallelTaskRunner.TaskWithException<Variant, Variant, Exception> progressTask; if (logProgress) { progressTask = batch -> batch; } else { final Query finalQuery = query; final QueryOptions finalQueryOptions = queryOptions; ProgressLogger progressLogger = new ProgressLogger("Export variants", () -> { Long count = dbAdaptor.count(finalQuery).first(); long limit = finalQueryOptions.getLong(QueryOptions.LIMIT, Long.MAX_VALUE); long skip = finalQueryOptions.getLong(QueryOptions.SKIP, 0); count = Math.min(limit, count - skip); return count; }, 200); progressTask = batch -> { progressLogger.increment(batch.size(), () -> "up to position " + batch.get(batch.size() - 1).toString()); return batch; }; } // DataWriter DataWriter<Variant> variantDataWriter = variantWriterFactory.newDataWriter(outputFormat, outputStream, query, queryOptions); ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder().setNumTasks(1).setBatchSize(10).build(); ParallelTaskRunner<Variant, Variant> ptr = new ParallelTaskRunner<>(variantDBReader, progressTask, variantDataWriter, config); try { ptr.run(); } catch (ExecutionException e) { throw new StorageEngineException("Error exporting variants", e); } logger.info("Time fetching data: " + variantDBReader.getTimeFetching(TimeUnit.MILLISECONDS) / 1000.0 + 's'); logger.info("Time converting data: " + variantDBReader.getTimeConverting(TimeUnit.MILLISECONDS) / 1000.0 + 's'); } protected void exportMetaData(Query query, QueryOptions queryOptions, List studies, String output) throws IOException { StudyConfigurationManager scm = dbAdaptor.getStudyConfigurationManager(); Map<Integer, List<Integer>> returnedSamples = dbAdaptor.getDBAdaptorUtils().getReturnedSamples(query, queryOptions); List<StudyConfiguration> studyConfigurations = new ArrayList<>(returnedSamples.size()); returnedSamples.forEach((studyId, samplesList) -> { StudyConfiguration sc = scm.getStudyConfiguration(studyId, QueryOptions.empty()).first(); List<Integer> samplesToRemove = new ArrayList<>(); for (Integer sampleId : sc.getSampleIds().values()) { if (!samplesList.contains(sampleId)) { samplesToRemove.add(sampleId); } } for (Integer sampleToRemove : samplesToRemove) { sc.getSampleIds().inverse().remove(sampleToRemove); for (LinkedHashSet<Integer> samplesInFile : sc.getSamplesInFiles().values()) { samplesInFile.remove(sampleToRemove); } sc.getCohorts().values().forEach(samplesInCohort -> samplesInCohort.remove(sampleToRemove)); } sc.setBatches(Collections.emptyList()); studyConfigurations.add(sc); }); ExportMetadata exportMetadata = new ExportMetadata(studyConfigurations, query, queryOptions); writeMetadata(exportMetadata, output); } protected void writeMetadata(ExportMetadata exportMetadata, String output) throws IOException { ObjectMapper objectMapper = new ObjectMapper(); File file = Paths.get(output).toFile(); try (OutputStream os = new GZIPOutputStream(new FileOutputStream(file))) { objectMapper.writeValue(os, exportMetadata); } } }