package org.opencb.opencga.storage.core.variant.io;
import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.VariantAvro;
import org.opencb.biodata.tools.variant.stats.writer.VariantStatsPopulationFrequencyExporter;
import org.opencb.biodata.tools.variant.stats.writer.VariantStatsTsvExporter;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.io.DataWriter;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.adaptors.VariantSourceDBAdaptor;
import org.opencb.opencga.storage.core.variant.io.avro.VariantAvroWriter;
import org.opencb.opencga.storage.core.variant.io.json.VariantJsonWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.*;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPOutputStream;
import static org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor.VariantQueryParams.RETURNED_SAMPLES;
import static org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor.VariantQueryParams.RETURNED_STUDIES;
import static org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat.*;
/**
* Created on 06/12/16.
*
* @author Jacobo Coll <jacobo167@gmail.com>
*/
public class VariantWriterFactory {
private static Logger logger = LoggerFactory.getLogger(VariantWriterFactory.class);
private final VariantDBAdaptor dbAdaptor;
public VariantWriterFactory(VariantDBAdaptor dbAdaptor) {
this.dbAdaptor = dbAdaptor;
}
public enum VariantOutputFormat {
VCF("vcf", false),
VCF_GZ("vcf.gz", false),
JSON("json"),
JSON_GZ("json.gz"),
AVRO("avro"),
AVRO_GZ("avro.gz"),
AVRO_SNAPPY("avro.snappy"),
STATS("stats.tsv", false),
STATS_GZ("stats.tsv.gz", false),
CELLBASE("frequencies.json"),
CELLBASE_GZ("frequencies.json.gz");
private final boolean multiStudy;
private final String extension;
VariantOutputFormat(String extension) {
this.extension = extension;
this.multiStudy = true;
}
VariantOutputFormat(String extension, boolean multiStudy) {
this.multiStudy = multiStudy;
this.extension = extension;
}
public String getExtension() {
return extension;
}
public boolean isMultiStudyOutput() {
return multiStudy;
}
boolean isGzip() {
return extension.endsWith(".gz");
}
boolean isSnappy() {
return extension.endsWith(".snappy");
}
}
/**
* Transform the string to a valid output format.
* If none, VCF by default.
*
* @param outputFormatStr Output format as String
* @param output Output file
* @return Valid VariantOutputFormat
* @throws IllegalArgumentException if the outputFormatStr is not valid
*/
public static VariantOutputFormat toOutputFormat(String outputFormatStr, String output) {
if (!StringUtils.isEmpty(outputFormatStr)) {
outputFormatStr = outputFormatStr.replace('.', '_');
return VariantOutputFormat.valueOf(outputFormatStr.toUpperCase());
} else if (isStandardOutput(output)) {
return VCF;
} else {
return VCF_GZ;
}
}
public static String checkOutput(@Nullable String output, VariantOutputFormat outputFormat) throws IOException {
if (isStandardOutput(output)) {
// Standard output
return null;
}
if (output.endsWith("/")) {
throw new IllegalArgumentException("Invalid directory as output file name");
}
if (output.endsWith(".")) {
output = output.substring(0, output.length() - 1);
}
if (!output.endsWith(outputFormat.getExtension())) {
String[] split = outputFormat.getExtension().split("\\.");
int idx = 0;
for (int i = 0; i < split.length; i++) {
String s = split[i];
if (output.endsWith('.' + s)) {
idx = i + 1;
}
}
for (int i = idx; i < split.length; i++) {
String s = split[i];
if (!output.endsWith(s)) {
output = output + '.' + s;
}
}
}
Path path = Paths.get(output);
File file = path.toFile();
if (file.isDirectory()) {
throw new IOException("{}: Is a directory");
} else {
if (file.canWrite()) {
throw new IOException("{}: Permission denied");
}
}
return output;
}
public static OutputStream getOutputStream(String output, String outputFormatStr) throws IOException {
VariantOutputFormat outputFormat = toOutputFormat(outputFormatStr, output);
return getOutputStream(output, outputFormat);
}
public static OutputStream getOutputStream(String output, VariantOutputFormat outputFormat) throws IOException {
boolean gzip = outputFormat.isGzip();
// output format has priority over output name
OutputStream outputStream;
if (isStandardOutput(output)) {
// Unclosable OutputStream
outputStream = new VariantVcfDataWriter.UnclosableOutputStream(System.out);
} else {
outputStream = new FileOutputStream(output);
logger.debug("writing to %s", output);
}
// If compressed a GZip output stream is used
if (gzip && outputFormat != VariantOutputFormat.AVRO_GZ) {
outputStream = new GZIPOutputStream(outputStream);
} else {
outputStream = new BufferedOutputStream(outputStream);
}
logger.debug("using {} output stream", gzip ? "gzipped" : "plain");
return outputStream;
}
protected DataWriter<Variant> newDataWriter(VariantOutputFormat outputFormat, final OutputStream outputStream,
Query query, QueryOptions queryOptions) throws IOException {
final DataWriter<Variant> exporter;
switch (outputFormat) {
case VCF_GZ:
case VCF:
StudyConfiguration studyConfiguration = getStudyConfiguration(query, dbAdaptor, true);
if (studyConfiguration != null) {
// Samples to be returned
if (query.containsKey(RETURNED_SAMPLES.key())) {
queryOptions.put(RETURNED_SAMPLES.key(), query.get(RETURNED_SAMPLES.key()));
}
VariantSourceDBAdaptor sourceDBAdaptor = dbAdaptor.getVariantSourceDBAdaptor();
exporter = new VariantVcfDataWriter(studyConfiguration, sourceDBAdaptor, outputStream, query, queryOptions);
} else {
throw new IllegalArgumentException("No study found named " + query.getAsStringList(RETURNED_STUDIES.key()).get(0));
}
break;
case JSON_GZ:
case JSON:
exporter = new VariantJsonWriter(outputStream);
break;
case AVRO:
case AVRO_GZ:
case AVRO_SNAPPY:
String codecName = "";
if (outputFormat.isGzip()) {
codecName = "gzip";
} else if (outputFormat.isSnappy()) {
codecName = "snappy";
}
exporter = new VariantAvroWriter(VariantAvro.getClassSchema(), codecName, outputStream);
break;
case STATS_GZ:
case STATS:
StudyConfiguration sc = getStudyConfiguration(query, dbAdaptor, true);
List<String> cohorts = new ArrayList<>(sc.getCohortIds().keySet());
exporter = new VariantStatsTsvExporter(outputStream, sc.getStudyName(), cohorts);
break;
case CELLBASE_GZ:
case CELLBASE:
exporter = new VariantStatsPopulationFrequencyExporter(outputStream);
break;
default:
throw variantFormatNotSupported(outputFormat.toString());
}
return exporter;
}
protected static IllegalArgumentException variantFormatNotSupported(String outputFormatStr) {
return new IllegalArgumentException("Unknown output format " + outputFormatStr);
}
public static boolean isStandardOutput(String output) {
return StringUtils.isEmpty(output);
}
protected StudyConfiguration getStudyConfiguration(Query query, VariantDBAdaptor dbAdaptor, boolean singleStudy) {
List<Integer> studyIds = dbAdaptor.getReturnedStudies(query, QueryOptions.empty());
StudyConfigurationManager scm = dbAdaptor.getStudyConfigurationManager();
if (studyIds.isEmpty()) {
studyIds = scm.getStudyIds(null);
if (studyIds == null) {
throw new IllegalArgumentException();
}
}
if (singleStudy) {
if (studyIds.size() > 1) {
throw new IllegalArgumentException();
}
}
return scm.getStudyConfiguration(studyIds.get(0), null).first();
}
}