/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.core.variant.annotation; import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.formats.feature.bed.Bed; import org.opencb.biodata.formats.feature.bed.io.BedReader; import org.opencb.biodata.formats.feature.gff.Gff; import org.opencb.biodata.formats.feature.gff.io.GffReader; import org.opencb.biodata.formats.io.FormatReaderWrapper; import org.opencb.biodata.models.core.Region; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.VariantSource; import org.opencb.biodata.models.variant.avro.AdditionalAttribute; import org.opencb.biodata.models.variant.avro.VariantAnnotation; import org.opencb.biodata.tools.variant.VariantVcfHtsjdkReader; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.io.DataReader; import org.opencb.commons.io.DataWriter; import org.opencb.commons.run.ParallelTaskRunner; import org.opencb.opencga.core.common.ProgressLogger; import org.opencb.opencga.core.common.TimeUtils; import org.opencb.opencga.core.common.UriUtils; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.io.avro.AvroDataReader; import org.opencb.opencga.storage.core.io.avro.AvroDataWriter; import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor; import org.opencb.opencga.storage.core.variant.annotation.annotators.VariantAnnotator; import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils; import org.opencb.opencga.storage.core.variant.io.db.VariantAnnotationDBWriter; import org.opencb.opencga.storage.core.variant.io.db.VariantDBReader; import org.opencb.opencga.storage.core.variant.io.json.VariantAnnotationJsonDataReader; import org.opencb.opencga.storage.core.variant.io.json.VariantAnnotationJsonDataWriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.concurrent.ExecutionException; import java.util.zip.GZIPInputStream; /** * Two steps annotation pipeline. * Defines the steps create and load. * * Created by jacobo on 9/01/15. * * @author Javier Lopez <fjlopez@ebi.ac.uk> */ public class DefaultVariantAnnotationManager implements VariantAnnotationManager { public static final String FILE_NAME = "fileName"; public static final String OUT_DIR = "outDir"; public static final String BATCH_SIZE = "batchSize"; public static final String NUM_WRITERS = "numWriters"; public static final String NUM_THREADS = "numThreads"; private VariantDBAdaptor dbAdaptor; private VariantAnnotator variantAnnotator; protected static Logger logger = LoggerFactory.getLogger(DefaultVariantAnnotationManager.class); public DefaultVariantAnnotationManager(VariantAnnotator variantAnnotator, VariantDBAdaptor dbAdaptor) { if (dbAdaptor == null || variantAnnotator == null) { throw new NullPointerException(); } this.dbAdaptor = dbAdaptor; this.variantAnnotator = variantAnnotator; } @Override public void annotate(Query query, ObjectMap params) throws VariantAnnotatorException, IOException, StorageEngineException { String annotationFileStr = params.getString(LOAD_FILE); boolean doCreate = params.getBoolean(CREATE); boolean doLoad = StringUtils.isNotEmpty(annotationFileStr); if (!doCreate && !doLoad) { doCreate = true; doLoad = true; } URI annotationFile; if (doCreate) { long start = System.currentTimeMillis(); logger.info("Starting annotation creation"); logger.info("Query : {} ", query.toJson()); annotationFile = createAnnotation( Paths.get(params.getString(OUT_DIR, "/tmp")), params.getString(FILE_NAME, "annotation_" + TimeUtils.getTime()), query, params); logger.info("Finished annotation creation {}ms, generated file {}", System.currentTimeMillis() - start, annotationFile); } else { try { annotationFile = UriUtils.createUri(annotationFileStr); } catch (URISyntaxException e) { throw new IllegalArgumentException(e); } } if (doLoad) { long start = System.currentTimeMillis(); logger.info("Starting annotation load"); loadAnnotation(annotationFile, params); logger.info("Finished annotation load {}ms", System.currentTimeMillis() - start); } } /** * Creates a variant annotation file from an specific source based on the content of a Variant DataBase. * * @param outDir File outdir. * @param fileName Generated file name. * @param query Query for those variants to annotate. * @param params Specific params. * @return URI of the generated file. * @throws VariantAnnotatorException IOException thrown */ public URI createAnnotation(Path outDir, String fileName, Query query, ObjectMap params) throws VariantAnnotatorException { boolean gzip = params == null || params.getBoolean("gzip", true); boolean avro = params == null || params.getBoolean("annotation.file.avro", false); Path path = Paths.get(outDir != null ? outDir.toString() : "/tmp", fileName + ".annot" + (avro ? ".avro" : ".json") + (gzip ? ".gz" : "")); URI fileUri = path.toUri(); /** Getting iterator from OpenCGA Variant database. **/ QueryOptions iteratorQueryOptions; if (params == null) { iteratorQueryOptions = new QueryOptions(); } else { iteratorQueryOptions = new QueryOptions(params); } List<String> include = Arrays.asList("chromosome", "start", "end", "alternate", "reference"); iteratorQueryOptions.add("include", include); int batchSize = 200; int numThreads = 8; if (params != null) { //Parse query options batchSize = params.getInt(BATCH_SIZE, batchSize); numThreads = params.getInt(NUM_THREADS, numThreads); } try { DataReader<Variant> variantDataReader = new VariantDBReader(dbAdaptor, query, iteratorQueryOptions); ProgressLogger progressLogger = new ProgressLogger("Annotated variants:", () -> dbAdaptor.count(query).first(), 200); ParallelTaskRunner.TaskWithException<Variant, VariantAnnotation, VariantAnnotatorException> annotationTask = variantList -> { List<VariantAnnotation> variantAnnotationList; long start = System.currentTimeMillis(); logger.debug("Annotating batch of {} genomic variants.", variantList.size()); variantAnnotationList = variantAnnotator.annotate(variantList); progressLogger.increment(variantList.size(), () -> ", up to position " + variantList.get(variantList.size() - 1).toString()); logger.debug("Annotated batch of {} genomic variants. Time: {}s", variantList.size(), (System.currentTimeMillis() - start) / 1000.0); return variantAnnotationList; }; final DataWriter<VariantAnnotation> variantAnnotationDataWriter; if (avro) { variantAnnotationDataWriter = new AvroDataWriter<>(path, gzip, VariantAnnotation.getClassSchema()); } else { variantAnnotationDataWriter = new VariantAnnotationJsonDataWriter(path, gzip); } ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder() .setNumTasks(numThreads) .setBatchSize(batchSize) .setAbortOnFail(true) .setSorted(false).build(); ParallelTaskRunner<Variant, VariantAnnotation> parallelTaskRunner = new ParallelTaskRunner<>(variantDataReader, annotationTask, variantAnnotationDataWriter, config); parallelTaskRunner.run(); } catch (ExecutionException e) { throw new VariantAnnotatorException("Error creating annotations", e); } return fileUri; } public void loadAnnotation(URI uri, ObjectMap params) throws IOException, StorageEngineException { Path path = Paths.get(uri); String fileName = path.getFileName().toString().toLowerCase(); if (VariantReaderUtils.isAvro(fileName) || VariantReaderUtils.isJson(fileName)) { loadVariantAnnotation(uri, params); } else { loadCustomAnnotation(uri, params); } } /** * Loads variant annotations from an specified file into the selected Variant DataBase. * * @param uri URI of the annotation file * @param params Specific params. * @throws IOException IOException thrown * @throws StorageEngineException if there is a problem creating or running the {@link ParallelTaskRunner} */ public void loadVariantAnnotation(URI uri, ObjectMap params) throws IOException, StorageEngineException { final int batchSize = params.getInt(DefaultVariantAnnotationManager.BATCH_SIZE, 100); final int numConsumers = params.getInt(DefaultVariantAnnotationManager.NUM_WRITERS, 6); ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder() .setNumTasks(numConsumers) .setBatchSize(batchSize) .setAbortOnFail(true) .setSorted(false).build(); DataReader<VariantAnnotation> reader; reader = newVariantAnnotationDataReader(uri); try { ProgressLogger progressLogger = new ProgressLogger("Loaded annotations: "); ParallelTaskRunner<VariantAnnotation, Object> ptr = new ParallelTaskRunner<>(reader, () -> newVariantAnnotationDBWriter(dbAdaptor, new QueryOptions(params)) .setProgressLogger(progressLogger), null, config); ptr.run(); } catch (ExecutionException e) { throw new StorageEngineException("Error loading variant annotation"); } } protected DataReader<VariantAnnotation> newVariantAnnotationDataReader(URI uri) { DataReader<VariantAnnotation> reader; if (VariantReaderUtils.isAvro(uri.toString())) { reader = new AvroDataReader<>(Paths.get(uri).toFile(), VariantAnnotation.class); } else if (VariantReaderUtils.isJson(uri.toString())) { reader = new VariantAnnotationJsonDataReader(Paths.get(uri).toFile()); // } else if (VariantReaderUtils.isVcf(uri.toString())) { // //TODO: Read from VEP file // reader = new VepFormatReader(Paths.get(uri).toString()); } else { throw new IllegalArgumentException("Unable to load annotations from file " + uri); } return reader; } protected VariantAnnotationDBWriter newVariantAnnotationDBWriter(VariantDBAdaptor dbAdaptor, QueryOptions options) { return new VariantAnnotationDBWriter(dbAdaptor, options, null); } /** * Loads custom variant annotations from an specified file into the selected Variant DataBase. * * @param uri URI of the annotation file * @param params Specific params. * @throws IOException IOException thrown * @throws StorageEngineException if there is a problem creating or running the {@link ParallelTaskRunner} */ public void loadCustomAnnotation(URI uri, ObjectMap params) throws IOException, StorageEngineException { final int batchSize = params.getInt(BATCH_SIZE, 100); final int numConsumers = params.getInt(NUM_WRITERS, 6); final String key = params.getString(CUSTOM_ANNOTATION_KEY, "default"); ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder() .setNumTasks(numConsumers) .setBatchSize(batchSize) .setAbortOnFail(true) .setSorted(false) .build(); Path path = Paths.get(uri); String fileName = path.getFileName().toString().toLowerCase(); if (fileName.endsWith(".gff") || fileName.endsWith(".gff.gz")) { try { GffReader gffReader = new GffReader(path); ParallelTaskRunner<Gff, Void> ptr = new ParallelTaskRunner<>( new FormatReaderWrapper<>(gffReader), gffList -> { for (Gff gff : gffList) { Region region = new Region(normalizeChromosome(gff.getSequenceName()), gff.getStart(), gff.getEnd()); Query query = new Query(VariantDBAdaptor.VariantQueryParams.REGION.key(), region); dbAdaptor.updateCustomAnnotations( query, key, new AdditionalAttribute(Collections.singletonMap("feature", gff.getFeature())), QueryOptions.empty()); } return Collections.emptyList(); }, null, config); try { ptr.run(); } catch (ExecutionException e) { throw new StorageEngineException("Error executing ParallelTaskRunner", e); } } catch (NoSuchMethodException e) { throw new RuntimeException(e); // This should never happen! } } else if (fileName.endsWith(".bed") || fileName.endsWith(".bed.gz")) { try { BedReader bedReader = new BedReader(path); ParallelTaskRunner<Bed, Void> ptr = new ParallelTaskRunner<>( new FormatReaderWrapper<>(bedReader), bedList -> { for (Bed bed: bedList) { Region region = new Region(normalizeChromosome(bed.getChromosome()), bed.getStart(), bed.getEnd()); Query query = new Query(VariantDBAdaptor.VariantQueryParams.REGION.key(), region); Map<String, String> annotation = new HashMap<>(3); annotation.put("name", bed.getName()); annotation.put(("score"), String.valueOf(bed.getScore())); annotation.put(("strand"), bed.getStrand()); dbAdaptor.updateCustomAnnotations(query, key, new AdditionalAttribute(annotation), QueryOptions.empty()); } return Collections.emptyList(); }, null, config); try { ptr.run(); } catch (ExecutionException e) { throw new StorageEngineException("Error executing ParallelTaskRunner", e); } } catch (NoSuchMethodException e) { throw new RuntimeException(e); // This should never happen! } } else if (fileName.endsWith(".vcf") || fileName.endsWith(".vcf.gz")) { InputStream is = new FileInputStream(path.toFile()); if (fileName.endsWith(".gz")) { is = new GZIPInputStream(is); } VariantSource source = new VariantSource(fileName, "f", "s", "s"); ParallelTaskRunner<Variant, Void> ptr = new ParallelTaskRunner<>( new VariantVcfHtsjdkReader(is, source), variantList -> { for (Variant variant : variantList) { Region region = new Region(normalizeChromosome(variant.getChromosome()), variant.getStart(), variant.getEnd()); Query query = new Query(VariantDBAdaptor.VariantQueryParams.REGION.key(), region); Map<String, String> info = variant.getStudies().get(0).getFiles().get(0).getAttributes(); AdditionalAttribute attribute = new AdditionalAttribute(info); dbAdaptor.updateCustomAnnotations(query, key, attribute, new QueryOptions()); } return Collections.emptyList(); }, null, config); try { ptr.run(); } catch (ExecutionException e) { throw new StorageEngineException("Error executing ParallelTaskRunner", e); } } else { throw new StorageEngineException("Unknown format file : " + path); } } private String normalizeChromosome(String chromosome) { return chromosome.replace("chrom", "").replace("chr", ""); } }