/*
 * Copyright 2015 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.hpg.bigdata.tools.variant;

import java.io.IOException;
import java.net.URI;
import java.util.List;
import java.util.Map;

import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.ga4gh.models.Call;
import org.ga4gh.models.Variant;
import org.opencb.hpg.bigdata.tools.utils.HBaseUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Map-only MapReduce job that loads GA4GH Avro {@link Variant} records into an HBase table:
 * one row per variant (or per position of an expanded non-variant region), with one column
 * per {@link Call} in the column family {@code d}.
 *
 * @author Matthias Haimel mh719+git@cam.ac.uk
 */
public class Variant2HbaseMR extends Mapper<AvroKey<Variant>, NullWritable, ImmutableBytesWritable, Put> {

    private static final String VARIANT_2_HBASE_EXPAND_REGIONS = "VARIANT_2_HBASE.EXPAND_REGIONS";
    private static final String VARIANT_2_HBASE_NON_VAR = "VARIANT_2_HBASE.NON_VARIANT";

    public static final byte[] COLUMN_FAMILY = Bytes.toBytes("d");

    private static final Logger LOG = LoggerFactory.getLogger(Variant2HbaseMR.class);

    private Configuration config;
    private boolean expandRegions = false;
    private boolean nonVariant = false;

    public Variant2HbaseMR() {
        super();
    }

    public static Logger getLog() {
        return LOG;
    }

    public void setExpandRegions(boolean expandRegions) {
        this.expandRegions = expandRegions;
    }

    public boolean isExpandRegions() {
        return expandRegions;
    }

    public boolean isNonVariant() {
        return nonVariant;
    }

    public void setNonVariant(boolean nonVariant) {
        this.nonVariant = nonVariant;
    }

    @Override
    protected void setup(Mapper<AvroKey<Variant>, NullWritable, ImmutableBytesWritable, Put>.Context context)
            throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        setExpandRegions(conf.getBoolean(VARIANT_2_HBASE_EXPAND_REGIONS, isExpandRegions()));
        setNonVariant(conf.getBoolean(VARIANT_2_HBASE_NON_VAR, isNonVariant()));
        super.setup(context);
    }

    @Override
    protected void map(AvroKey<Variant> key, NullWritable value,
            Mapper<AvroKey<Variant>, NullWritable, ImmutableBytesWritable, Put>.Context context)
            throws IOException, InterruptedException {
        Variant variant = key.datum();
        if (isReference(variant) && this.isNonVariant()) { // is just reference or non-call
            String refplaceholder = "#"; // TODO require lookup service to expand
            Long start = variant.getStart();
            Long endPos = start + 1;
            List<Call> calls = variant.getCalls();
            boolean nocall = calls.isEmpty();
            if (isExpandRegions()) {
"_NOCALL" : "")).increment(1); Map<String, List<String>> info = variant.getInfo(); List<String> endLst = info.get("END"); // Get End position if (null == endLst || endLst.isEmpty()) { // Region of size 1 context.getCounter("VCF", "REF_END_EMPTY" + (nocall ? "_NOCALL" : "")).increment(1); } else { String endStr = endLst.get(0).toString(); endPos = Long.valueOf(endStr); } } String counterName = "REG_EXPAND_CNT" + (nocall ? "_NOCALL" : ""); context.getCounter("VCF", counterName).increment((endPos - start)); if (!nocall) { // only if calls for (long pos = start; pos < endPos; ++pos) { // For each position -> store String idStr = HBaseUtils.buildRefernceStorageId(variant.getReferenceName(), pos, refplaceholder); store(context, calls, idStr); } } } else { // is a variant (not just coverage info) int altCnt = variant.getAlternateBases().size(); if (altCnt > 1) { context.getCounter("VCF", "biallelic_COUNT").increment(1); return; // skip biallelic cases } List<Call> calls = variant.getCalls(); if (null == calls || calls.isEmpty()) { context.getCounter("VCF", "NO_CALL_COUNT").increment(1); return; // skip SV } int altIdx = 0; String altBases = "-"; if (altCnt > 0) { altBases = variant.getAlternateBases().get(altIdx); } String refBases = variant.getReferenceBases(); if (altBases.length() >= HBaseUtils.SV_THRESHOLD || refBases.length() >= HBaseUtils.SV_THRESHOLD) { context.getCounter("VCF", "SV_COUNT").increment(1); return; // skip SV } String idStr = HBaseUtils.buildStorageId(variant.getReferenceName(), variant.getStart(), refBases, altBases); store(context, calls, idStr); /* Ignore fields */ // List<String> ids = v.getAlleleIds(); // graph mode -> not supported /* TODO fields - fine for first implementation*/ // v.getInfo() // v.getNames() // v.getEnd(); } } private void store( Mapper<AvroKey<Variant>, NullWritable, ImmutableBytesWritable, Put>.Context context, List<Call> calls, String idStr) throws IOException, InterruptedException { byte[] id = Bytes.toBytes(idStr); Put put = new Put(id); for (Call call : calls) { addEntry(put, call); } ImmutableBytesWritable rowKey = new ImmutableBytesWritable(id); /* Submit data to HBase */ context.write(rowKey, put); } private boolean isReference(Variant variant) { return null == variant.getAlternateBases() || variant.getAlternateBases().isEmpty(); } private void addEntry(Put put, Call call) { String id = call.getCallSetId(); String idStr = id.toString(); /* other possibility * id = call.getCallSetName() */ // TODO check what happens in case of > 1 alt base put.addColumn( COLUMN_FAMILY, Bytes.toBytes(idStr), Bytes.toBytes(call.toString()) ); // json } public void setConf(Configuration conf) { this.config = conf; } public Configuration getConf() { return this.config; } public static class Builder { private URI uri; private String inputfile; private boolean expand = false; private boolean non_var = false; public Builder(String inputfile, URI uri) { this.inputfile = inputfile; this.uri = uri; } public Builder setUri(URI uri) { this.uri = uri; return this; } public Builder setInputfile(String inputfile) { this.inputfile = inputfile; return this; } public Builder setExpand(boolean expand) { this.expand = expand; return this; } public Builder setNonVar(boolean nonVar) { this.non_var = nonVar; return this; } public Job build(boolean createTableIfNeeded) throws IOException { /* INPUT file */ String inputfile = this.inputfile; /* SERVER details */ String server = null; Integer port = 60000; String tablename = null; if (null == uri) { throw new 
IllegalArgumentException("No Server output specified!"); } server = uri.getHost(); if (StringUtils.isBlank(server)) { throw new IllegalArgumentException("No Server host name specified in URI: " + uri); } if (uri.getPort() > 0) { // if port is specified port = uri.getPort(); } String master = String.join(":", server, port.toString()); /* TABLE details */ if (StringUtils.isBlank(uri.getPath()) || StringUtils.equals(uri.getPath().trim(), "/")) { throw new IllegalArgumentException("No Table name specified in URI: " + uri); } // Extract table name from Path tablename = uri.getPath(); tablename = tablename.startsWith("/") ? tablename.substring(1) : tablename; // Remove leading / getLog().info(String.format("Loading data into server '%s' using table '%s' ", master, tablename)); /* CONFIG */ Configuration conf = new Configuration(); conf.set("hbase.zookeeper.quorum", server); conf.set("hbase.master", master); // SET additional parameters conf.setBoolean(VARIANT_2_HBASE_EXPAND_REGIONS, this.expand); conf.setBoolean(VARIANT_2_HBASE_NON_VAR, this.non_var); // HBase conf = HBaseConfiguration.addHbaseResources(conf); /* JOB setup */ Class<Variant2HbaseMR> clazz = Variant2HbaseMR.class; Job job = Job.getInstance(conf, clazz.getName()); job.setJarByClass(clazz); // input AvroJob.setInputKeySchema(job, Variant.getClassSchema()); FileInputFormat.setInputPaths(job, new Path(inputfile)); job.setInputFormatClass(AvroKeyInputFormat.class); job.setNumReduceTasks(0); // output TableMapReduceUtil.initTableReducerJob(tablename, null, job); // mapper job.setMapperClass(Variant2HbaseMR.class); /* TABLE check */ if (createTableIfNeeded) { // create table if needed createTableIfNeeded(tablename, conf); } return job; } } /** * Create HBase table if needed. * * @param tablename HBase table name * @param conf Configuration * @throws IOException throws {@link IOException} from creating a connection / table */ public static void createTableIfNeeded(String tablename, Configuration conf) throws IOException { if (HBaseUtils.createTableIfNeeded(tablename, COLUMN_FAMILY, conf)) { getLog().info(String.format("Create table '%s' in hbase!", tablename)); } } }