/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.hadoop.variant.archive;

import com.fasterxml.jackson.databind.MapperFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.io.compress.Compression.Algorithm;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.opencb.biodata.models.variant.VariantSource;
import org.opencb.biodata.models.variant.avro.VariantAvro;
import org.opencb.biodata.models.variant.avro.VariantFileMetadata;
import org.opencb.biodata.models.variant.protobuf.VcfMeta;
import org.opencb.opencga.storage.hadoop.utils.HBaseManager;
import org.opencb.opencga.storage.hadoop.variant.GenomeHelper;
import org.opencb.opencga.storage.hadoop.variant.adaptors.HadoopVariantSourceDBAdaptor;
import org.opencb.opencga.storage.hadoop.variant.archive.mr.VariantToVcfSliceMapper;
import org.opencb.opencga.storage.hadoop.variant.archive.mr.VcfSliceCombiner;
import org.opencb.opencga.storage.hadoop.variant.archive.mr.VcfSliceReducer;
import org.opencb.opencga.storage.hadoop.variant.archive.mr.VcfSliceWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.*;
import java.util.zip.GZIPInputStream;

/**
 * MapReduce {@link Tool} that loads a variant file (Avro) and its metadata into the HBase archive table.
 *
 * @author mh719
 */
public class ArchiveDriver extends Configured implements Tool {
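    // Configuration keys for this driver; the values are read from the Hadoop Configuration at runtime.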
    public static final String CONFIG_ARCHIVE_FILE_ID = "opencga.archive.file.id";
    public static final String CONFIG_ARCHIVE_INPUT_FILE_VCF = "opencga.archive.input.file.vcf";
    public static final String CONFIG_ARCHIVE_INPUT_FILE_VCF_META = "opencga.archive.input.file.vcf.meta";
    public static final String CONFIG_ARCHIVE_TABLE_NAME = "opencga.archive.table.name";
    public static final String CONFIG_ARCHIVE_TABLE_COMPRESSION = "opencga.archive.table.compression";
    public static final String CONFIG_ARCHIVE_TABLE_PRESPLIT_SIZE = "opencga.archive.table.presplit.size";
    public static final String CONFIG_ARCHIVE_CHUNK_SIZE = "opencga.archive.chunk_size";
    public static final String CONFIG_ARCHIVE_ROW_KEY_SEPARATOR = "opencga.archive.row_key_sep";
    public static final int DEFAULT_CHUNK_SIZE = 1000;

    private static final Logger LOGGER = LoggerFactory.getLogger(ArchiveDriver.class);

    public ArchiveDriver() {
    }

    public ArchiveDriver(Configuration conf) {
        super(conf);
    }

    /**
     * Creates the archive table if needed, stores the file metadata and runs the MapReduce job
     * that loads the VCF slices into HBase.
     */
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        HBaseConfiguration.addHbaseResources(conf);
        URI inputFile = URI.create(conf.get(CONFIG_ARCHIVE_INPUT_FILE_VCF));
        URI inputMetaFile = URI.create(conf.get(CONFIG_ARCHIVE_INPUT_FILE_VCF_META));
        String tableName = conf.get(CONFIG_ARCHIVE_TABLE_NAME);
        int studyId = conf.getInt(GenomeHelper.CONFIG_STUDY_ID, -1);
        int fileId = conf.getInt(CONFIG_ARCHIVE_FILE_ID, -1);
        GenomeHelper genomeHelper = new GenomeHelper(conf);

        /* SERVER details */
        if (createArchiveTableIfNeeded(genomeHelper, tableName)) {
            LOGGER.info(String.format("Created table '%s' in HBase", tableName));
        } else {
            LOGGER.info(String.format("Table '%s' already exists in HBase", tableName));
        }

        // Add the metadata to the configuration as a string.
        VcfMeta meta = readMetaData(conf, inputMetaFile);
        // The study and file IDs in the metadata may not be correct: overwrite them with the values given through the CLI.
        meta.getVariantSource().setStudyId(Integer.toString(studyId));
        meta.getVariantSource().setFileId(Integer.toString(fileId));
        storeMetaData(meta, conf);

        /* JOB setup */
        final Job job = Job.getInstance(conf, "opencga: Load file [" + fileId + "] to ArchiveTable '" + tableName + "'");
        job.setJarByClass(getClass());
        conf = job.getConfiguration();
        conf.set("mapreduce.job.user.classpath.first", "true");

        // input
        FileInputFormat.addInputPath(job, new Path(inputFile));
        AvroJob.setInputKeySchema(job, VariantAvro.getClassSchema());
        job.setInputFormatClass(AvroKeyInputFormat.class);

        // mapper
        job.setMapperClass(VariantToVcfSliceMapper.class);

        // combiner
        job.setCombinerClass(VcfSliceCombiner.class);

        // reducer, writing directly to the archive table
        TableMapReduceUtil.initTableReducerJob(tableName, VcfSliceReducer.class, job, null, null, null, null,
                conf.getBoolean(GenomeHelper.CONFIG_HBASE_ADD_DEPENDENCY_JARS, true));
        job.setMapOutputValueClass(VcfSliceWritable.class);

        // Kill the job if the JVM is shut down before the job completes.
        Thread hook = new Thread(() -> {
            try {
                if (!job.isComplete()) {
                    job.killJob();
                }
            } catch (IOException e) {
                LOGGER.error("Error killing the job on shutdown", e);
            }
        });
        Runtime.getRuntime().addShutdownHook(hook);
        boolean succeed = job.waitForCompletion(true);
        Runtime.getRuntime().removeShutdownHook(hook);

        try (HadoopVariantSourceDBAdaptor manager = new HadoopVariantSourceDBAdaptor(conf)) {
            manager.updateLoadedFilesSummary(studyId, Collections.singletonList(fileId));
        }
        return succeed ? 0 : 1;
    }
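    /**
     * Creates the archive table, pre-split and with the configured compression, if it does not already exist.
     *
     * @return {@code true} if the table was created, {@code false} if it already existed
     */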
    public static boolean createArchiveTableIfNeeded(GenomeHelper genomeHelper, String tableName) throws IOException {
        try (Connection con = ConnectionFactory.createConnection(genomeHelper.getConf())) {
            return createArchiveTableIfNeeded(genomeHelper, tableName, con);
        }
    }

    public static boolean createArchiveTableIfNeeded(GenomeHelper genomeHelper, String tableName, Connection con)
            throws IOException {
        Algorithm compression = Compression.getCompressionAlgorithmByName(
                genomeHelper.getConf().get(CONFIG_ARCHIVE_TABLE_COMPRESSION, Compression.Algorithm.SNAPPY.getName()));
        int nSplits = genomeHelper.getConf().getInt(CONFIG_ARCHIVE_TABLE_PRESPLIT_SIZE, 100);
        List<byte[]> preSplits = GenomeHelper.generateBootPreSplitsHuman(nSplits,
                (chr, pos) -> genomeHelper.generateBlockIdAsBytes(chr, pos));
        return HBaseManager.createTableIfNeeded(con, tableName, genomeHelper.getColumnFamily(), preSplits, compression);
    }

    private void storeMetaData(VcfMeta meta, Configuration conf) throws IOException {
        try (HadoopVariantSourceDBAdaptor manager = new HadoopVariantSourceDBAdaptor(conf)) {
            manager.updateVcfMetaData(meta);
        }
    }

    /**
     * Reads the file metadata, either from a JSON file (optionally gzipped) or from an Avro file.
     */
    private VcfMeta readMetaData(Configuration conf, URI inputMetaFile) throws IOException {
        Path from = new Path(inputMetaFile);
        // Resolve the FileSystem against the metadata URI, so that non-default schemes also work.
        FileSystem fs = FileSystem.get(inputMetaFile, conf);
        DatumReader<VariantFileMetadata> userDatumReader = new SpecificDatumReader<>(VariantFileMetadata.class);
        VariantFileMetadata variantFileMetadata;
        if (inputMetaFile.toString().endsWith("json") || inputMetaFile.toString().endsWith("json.gz")) {
            ObjectMapper objectMapper = new ObjectMapper();
            objectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true);
            try (InputStream ids = inputMetaFile.toString().endsWith("json.gz")
                    ? new GZIPInputStream(fs.open(from))
                    : fs.open(from)) {
                variantFileMetadata = objectMapper.readValue(ids, VariantSource.class).getImpl();
            }
        } else {
            try (FSDataInputStream ids = fs.open(from);
                 DataFileStream<VariantFileMetadata> dataFileReader = new DataFileStream<>(ids, userDatumReader)) {
                Iterator<VariantFileMetadata> iter = dataFileReader.iterator();
                if (!iter.hasNext()) {
                    throw new IllegalStateException(String.format("No metadata object found in %s!", inputMetaFile));
                }
                variantFileMetadata = iter.next();
                if (iter.hasNext()) {
                    LOGGER.warn(String.format("More than one entry found in metadata file %s", inputMetaFile));
                }
            }
        }
        return new VcfMeta(new VariantSource(variantFileMetadata));
    }

    public static String buildCommandLineArgs(URI input, URI inputMeta, String server, String outputTable, int studyId,
                                              int fileId, Map<String, Object> other) {
        StringBuilder stringBuilder = new StringBuilder()
                .append(input).append(' ')
                .append(inputMeta).append(' ')
                .append(server).append(' ')
                .append(outputTable).append(' ')
                .append(studyId).append(' ')
                .append(fileId);
        addOtherParams(other, stringBuilder);
        return stringBuilder.toString();
    }

    public static void addOtherParams(Map<String, Object> other, StringBuilder stringBuilder) {
        for (Map.Entry<String, Object> entry : other.entrySet()) {
            Object value = entry.getValue();
            // Only simple values that contain no spaces can be passed as "<key> <value>" pairs.
            if (value != null && (value instanceof Number
                    || value instanceof Boolean
                    || value instanceof String && !((String) value).contains(" ") && !((String) value).isEmpty())) {
                stringBuilder.append(' ').append(entry.getKey()).append(' ').append(value);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        try {
            System.exit(privateMain(args, null));
        } catch (Exception e) {
            LOGGER.error("Error: ", e);
            System.exit(1);
        }
    }
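    /**
     * Parses the generic Hadoop options and the tool arguments, fills the {@link Configuration} and runs the driver.
     * The six fixed arguments are {@code <avro> <avro-meta> <server> <output-table> <study-id> <file-id>}, optionally
     * followed by {@code <key> <value>} pairs that are copied verbatim into the Configuration. A hypothetical
     * invocation (file names and server are placeholders):
     * <pre>{@code
     * int exitCode = ArchiveDriver.privateMain(new String[]{
     *         "variants.avro", "variants.meta.json.gz", "zk-host", "archive_table", "1", "5"}, null);
     * }</pre>
     */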
    public static int privateMain(String[] args, Configuration conf) throws Exception {
        if (conf == null) {
            conf = new Configuration();
        }
        ArchiveDriver driver = new ArchiveDriver();
        GenericOptionsParser parser = new GenericOptionsParser(conf, args);

        // Get the tool arguments without the generic Hadoop arguments.
        String[] toolArgs = parser.getRemainingArgs();
        int fixedSizeArgs = 6;
        if (toolArgs.length < fixedSizeArgs || (toolArgs.length - fixedSizeArgs) % 2 != 0) {
            System.err.printf("Usage: %s [generic options] <avro> <avro-meta> <server> <output-table> <study-id> <file-id>"
                    + " [<key> <value>]*\n", ArchiveDriver.class.getSimpleName());
            System.err.println("Found argc: " + toolArgs.length + ", argv: " + Arrays.toString(toolArgs));
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }

        conf.set(CONFIG_ARCHIVE_INPUT_FILE_VCF, toolArgs[0]);
        conf.set(CONFIG_ARCHIVE_INPUT_FILE_VCF_META, toolArgs[1]);
        conf = HBaseManager.addHBaseSettings(conf, toolArgs[2]);
        conf.set(CONFIG_ARCHIVE_TABLE_NAME, toolArgs[3]);
        conf.set(GenomeHelper.CONFIG_STUDY_ID, toolArgs[4]);
        conf.set(CONFIG_ARCHIVE_FILE_ID, toolArgs[5]);
        for (int i = fixedSizeArgs; i < toolArgs.length; i = i + 2) {
            conf.set(toolArgs[i], toolArgs[i + 1]);
        }

        // Set the configuration back, so that the Tool can configure itself.
        driver.setConf(conf);

        /* Alternative to using the tool runner */
        // int exitCode = ToolRunner.run(conf, new GenomeVariantDriver(), args);

        return driver.run(toolArgs);
    }
}