package org.opencb.opencga.storage.hadoop.variant.exporters;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.opencb.biodata.models.variant.avro.VariantAvro;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.hadoop.variant.AbstractAnalysisTableDriver;
import java.io.IOException;
import java.util.Objects;
/**
 * MapReduce driver that exports the contents of the variant analysis table to files,
 * either as Avro ({@link VariantAvro} schema) or as VCF, and writes the study
 * configuration as JSON next to the exported data.
 *
 * Created by mh719 on 21/11/2016.
 * @author Matthias Haimel
 */
public class VariantTableExportDriver extends AbstractAnalysisTableDriver {

    /**
     * Legacy key for the output path.
     *
     * @deprecated use {@link #CONFIG_VARIANT_TABLE_EXPORT_PATH}; this key is only read as a
     *             fallback when the new key is not set.
     */
    @Deprecated
    public static final String CONFIG_VARIANT_TABLE_EXPORT_AVRO_PATH = "opencga.variant.table.export.avro.path";

    /**
     * Legacy genotype key. Not read by this driver.
     *
     * @deprecated presumably superseded by {@link #CONFIG_VARIANT_TABLE_EXPORT_GENOTYPE} -- confirm against the mapper.
     */
    @Deprecated
    public static final String CONFIG_VARIANT_TABLE_EXPORT_AVRO_GENOTYPE = "opencga.variant.table.export.avro.genotype";

    /** Configuration key holding the output path of the export. Mandatory (directly or through the legacy key). */
    public static final String CONFIG_VARIANT_TABLE_EXPORT_PATH = "opencga.variant.table.export.path";

    /** Configuration key holding the {@link ExportType} name. Defaults to {@link ExportType#AVRO}. */
    public static final String CONFIG_VARIANT_TABLE_EXPORT_TYPE = "opencga.variant.table.export.type";

    /** Genotype configuration key. Not read by this driver itself. */
    public static final String CONFIG_VARIANT_TABLE_EXPORT_GENOTYPE = "opencga.variant.table.export.genotype";

    /** Output path of the export, resolved in {@link #parseAndValidateParameters()}. */
    private String outFile;

    /** Output format of the export, resolved in {@link #parseAndValidateParameters()}. */
    private ExportType type;

    /** Supported output formats. */
    public enum ExportType {AVRO, VCF}

    public VariantTableExportDriver() { /* nothing */ }

    public VariantTableExportDriver(Configuration conf) {
        super(conf);
    }

    /**
     * Reads the export type and the output path from the job configuration.
     *
     * The output path is taken from {@link #CONFIG_VARIANT_TABLE_EXPORT_PATH}, falling back to the
     * deprecated {@link #CONFIG_VARIANT_TABLE_EXPORT_AVRO_PATH} when the former is not set.
     *
     * @throws IllegalArgumentException if the export type is not a known {@link ExportType}
     *                                  or no output path is configured.
     */
    @Override
    protected void parseAndValidateParameters() {
        Configuration conf = getConf();

        this.type = parseExportType(conf.get(CONFIG_VARIANT_TABLE_EXPORT_TYPE, ExportType.AVRO.name()));

        // The current key takes precedence; the deprecated key is only a fallback.
        String legacyPath = conf.get(CONFIG_VARIANT_TABLE_EXPORT_AVRO_PATH);
        String configuredPath = conf.get(CONFIG_VARIANT_TABLE_EXPORT_PATH);
        this.outFile = Objects.nonNull(configuredPath) ? configuredPath : legacyPath;

        if (StringUtils.isEmpty(outFile)) {
            throw new IllegalArgumentException("No output file specified!!!");
        }
    }

    /**
     * Resolves an {@link ExportType} from its name, ignoring case and surrounding whitespace.
     *
     * @param value configured type name.
     * @return the matching export type.
     * @throws IllegalArgumentException if no export type matches; the message lists the supported values.
     */
    private static ExportType parseExportType(String value) {
        String name = StringUtils.trim(value);
        for (ExportType candidate : ExportType.values()) {
            if (candidate.name().equalsIgnoreCase(name)) {
                return candidate;
            }
        }
        throw new IllegalArgumentException("Unknown export type '" + value + "'. Supported types: "
                + StringUtils.join(ExportType.values(), ", "));
    }

    @Override
    protected Class<? extends TableMapper> getMapperClass() {
        return AnalysisToFileMapper.class;
    }

    /**
     * Configures the job as a map-only export: gzip-compressed file output written to the
     * configured output path, using the output format that matches the selected {@link ExportType}.
     *
     * @param inTable          input table name, passed through to the parent implementation.
     * @param job              job to configure.
     * @param scan             scan passed through to the parent implementation.
     * @param addDependencyJar passed through to the parent implementation.
     * @throws IOException           if the parent initialisation or the Avro schema setup fails.
     * @throws IllegalStateException if the export type is not handled.
     */
    @Override
    protected void initMapReduceJob(String inTable, Job job, Scan scan, boolean addDependencyJar) throws IOException {
        super.initMapReduceJob(inTable, job, scan, addDependencyJar);
        FileOutputFormat.setOutputPath(job, new Path(this.outFile)); // set Path
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); // compression
        switch (this.type) {
            case AVRO:
                job.setOutputFormatClass(AvroKeyOutputFormat.class);
                AvroJob.setOutputKeySchema(job, VariantAvro.getClassSchema()); // Set schema
                break;
            case VCF:
                job.setOutputFormatClass(HadoopVcfOutputFormat.class);
                break;
            default:
                throw new IllegalStateException("Type not known: " + this.type);
        }
        // Mappers write the output files directly; no reduce phase.
        job.setNumReduceTasks(0);
    }

    /**
     * Runs the parent post-execution step and then writes the study configuration to
     * {@code <outFile>.studyConfiguration}.
     *
     * NOTE(review): the metadata file is written regardless of {@code succeed}; confirm whether
     * it should be skipped for failed jobs.
     *
     * @param succeed whether the job finished successfully.
     * @throws IOException            if the metadata file cannot be written.
     * @throws StorageEngineException propagated from the parent step or from loading the study configuration.
     */
    @Override
    protected void postExecution(boolean succeed) throws IOException, StorageEngineException {
        super.postExecution(succeed);
        StudyConfiguration studyConfiguration = loadStudyConfiguration();
        writeMetadata(studyConfiguration, this.outFile + ".studyConfiguration");
    }

    /**
     * Serialises the study configuration with Jackson into the given file.
     *
     * @param studyConfiguration study configuration to serialise.
     * @param output             destination path; may be qualified with a filesystem scheme.
     * @throws IOException if the filesystem cannot be resolved or the file cannot be written.
     */
    protected void writeMetadata(StudyConfiguration studyConfiguration, String output) throws IOException {
        ObjectMapper objectMapper = new ObjectMapper();
        Path path = new Path(output);
        // Resolve the filesystem from the path itself instead of using the default filesystem,
        // so that fully qualified output paths on a non-default filesystem work as well.
        // The returned instance is not closed here, as this method did not create it.
        FileSystem fs = path.getFileSystem(getConf());
        try (FSDataOutputStream fos = fs.create(path)) {
            objectMapper.writeValue(fos, studyConfiguration);
        }
    }

    /**
     * Command line entry point. Exits the JVM with the code returned by {@code privateMain},
     * or with 1 if an exception is thrown.
     *
     * @param args command line arguments.
     * @throws Exception declared for compatibility; exceptions are caught and reported.
     */
    public static void main(String[] args) throws Exception {
        try {
            System.exit(privateMain(args, null, new VariantTableExportDriver()));
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}