/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.core.variant.io; import com.fasterxml.jackson.databind.ObjectMapper; import org.opencb.biodata.formats.variant.io.VariantReader; import org.opencb.biodata.formats.variant.vcf4.io.VariantVcfReader; import org.opencb.biodata.models.variant.VariantSource; import org.opencb.biodata.tools.variant.VariantFileUtils; import org.opencb.commons.utils.FileUtils; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.variant.io.avro.VariantAvroReader; import org.opencb.opencga.storage.core.variant.io.json.VariantJsonReader; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.nio.file.Path; import java.nio.file.Paths; import java.util.regex.Pattern; /** * Created on 31/03/16. * * @author Jacobo Coll <jacobo167@gmail.com> */ public class VariantReaderUtils { public static final String MALFORMED_FILE = "malformed"; public static final String VARIANTS_FILE = "variants"; public static final String METADATA_FILE = "file"; public static final String METADATA_FORMAT = "json"; public static final String METADATA_FILE_FORMAT_GZ = METADATA_FILE + "." + METADATA_FORMAT + ".gz"; private static final Pattern VALID_META = Pattern.compile("^.+\\." + METADATA_FILE + "\\." + METADATA_FORMAT + "\\.gz$"); private static final Pattern VALID_VARIANTS = Pattern.compile("^.+\\." + VARIANTS_FILE + "\\.(avro|json|proto)(\\.(gz|snappy))?$"); /** * Get a variant data reader depending on the type of the input file. * * @param input Stream Input variant file (avro, json, vcf) * @param source Optional VariantSource * @return VariantReader * @throws StorageEngineException if the format is not valid or there is an error reading */ public static VariantReader getVariantReader(Path input, VariantSource source) throws StorageEngineException { String fileName = input.getFileName().toString(); if (isJson(fileName)) { return getVariantJsonReader(input, source); } else if (isAvro(fileName)) { return getVariantAvroReader(input, source); } else if (isVcf(fileName)) { return new VariantVcfReader(source, input.toAbsolutePath().toString()); } else { throw variantInputNotSupported(input); } } public static StorageEngineException variantInputNotSupported(Path input) { return new StorageEngineException("Variants input file format not supported for file: " + input); } protected static VariantJsonReader getVariantJsonReader(Path input, VariantSource source) throws StorageEngineException { VariantJsonReader variantJsonReader; if (isJson(input.toString())) { String sourceFile = getMetaFromTransformedFile(input.toAbsolutePath().toString()); variantJsonReader = new VariantJsonReader(source, input.toAbsolutePath().toString(), sourceFile); } else { throw variantInputNotSupported(input); } return variantJsonReader; } protected static VariantAvroReader getVariantAvroReader(Path input, VariantSource source) throws StorageEngineException { VariantAvroReader variantAvroReader; if (isAvro(input.toString())) { String sourceFile = getMetaFromTransformedFile(input.toAbsolutePath().toString()); variantAvroReader = new VariantAvroReader(input.toAbsolutePath().toFile(), new File(sourceFile), source); } else { throw variantInputNotSupported(input); } return variantAvroReader; } public static Path getMetaFromTransformedFile(Path variantsFile) { return Paths.get(getMetaFromTransformedFile(variantsFile.toString())); } public static String getMetaFromTransformedFile(String variantsFile) { checkTransformedVariants(variantsFile); int idx = variantsFile.indexOf(VARIANTS_FILE); return new StringBuilder().append(variantsFile, 0, idx).append(METADATA_FILE_FORMAT_GZ).toString(); } public static String getFileName(URI input) { return Paths.get(input.getPath()).getFileName().toString(); } public static String getOriginalFromTransformedFile(URI input) { return getOriginalFromTransformedFile(getFileName(input)); } public static String getOriginalFromTransformedFile(String variantsFile) { if (isTransformedVariants(variantsFile)) { int idx = variantsFile.lastIndexOf(VARIANTS_FILE); return variantsFile.substring(0, idx - 1); } else if (isMetaFile(variantsFile)) { int idx = variantsFile.lastIndexOf(METADATA_FILE); return variantsFile.substring(0, idx - 1); } else { return variantsFile; } } /** * Read the VariantSource from an InputStream. * * InputStream must point to a json object. * * @param inputStream Input variant source file * @return Read VariantSource * @throws IOException if there is an error reading */ public static VariantSource readVariantSource(InputStream inputStream) throws IOException { VariantSource source; source = new ObjectMapper().readValue(inputStream, VariantSource.class); return source; } public VariantSource readVariantSource(URI input) throws StorageEngineException { if (input.getScheme() == null || input.getScheme().startsWith("file")) { return readVariantSource(Paths.get(input.getPath()), null); } else { throw new StorageEngineException("Can not read files from " + input.getScheme()); } } /** * Read the VariantSource from a variant file. * * Accepted formats: Avro, Json and VCF * * @param input Input variant file (avro, json, vcf) * @param source VariantSource to fill. Can be null * @return Read VariantSource * @throws StorageEngineException if the format is not valid or there is an error reading */ public static VariantSource readVariantSource(Path input, VariantSource source) throws StorageEngineException { if (source == null) { source = new VariantSource(input.getFileName().toString(), "", "", ""); } // If it's a sourceFile if (input.toString().endsWith(METADATA_FILE_FORMAT_GZ)) { try (InputStream inputStream = FileUtils.newInputStream(input)) { return VariantReaderUtils.readVariantSource(inputStream); } catch (IOException | RuntimeException e) { throw new StorageEngineException("Unable to read VariantSource", e); } } VariantReader reader = getVariantReader(input, source); try { source = VariantFileUtils.readVariantSource(reader, source); } catch (IOException e) { throw new StorageEngineException("Unable to read VariantSource", e); } return source; } public static boolean isAvro(String fileName) { return hasFormat(fileName, "avro"); } public static boolean isProto(String fileName) { return hasFormat(fileName, "proto"); } public static boolean isJson(String fileName) { return hasFormat(fileName, "json"); } public static boolean isVcf(String fileName) { return hasFormat(fileName, "vcf"); } public static boolean hasFormat(String fileName, String format) { if (fileName.endsWith("." + format)) { return true; } else if (fileName.contains(".")) { return fileName.substring(0, fileName.lastIndexOf('.')).endsWith("." + format); } return false; } public static void checkTransformedVariants(String file) { if (!isTransformedVariants(file)) { throw new IllegalArgumentException("Not a valid transformed variants file : " + file); } } public static boolean isTransformedVariants(String file) { return VALID_VARIANTS.matcher(file).find(); } public static void checkMetaFile(String file) { if (!isMetaFile(file)) { throw new IllegalArgumentException("Not a valid transformed variants metadata file : " + file); } } public static boolean isMetaFile(String file) { return VALID_META.matcher(file).find(); } }