/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.hbase.wd;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Convert HBase tabular data into a format that is consumable by Map/Reduce with respect to
 * row key distribution strategy.
 */
public class WdTableInputFormat extends TableInputFormat {

  /**
   * Configuration key holding the fully qualified class name of the
   * {@link AbstractRowKeyDistributor} implementation to instantiate.
   */
  public static final String ROW_KEY_DISTRIBUTOR_CLASS = "hbase.mapreduce.scan.wd.distributor.class";

  /**
   * Configuration key holding the parameter string that is handed to the distributor's
   * {@code init(...)} method after it has been instantiated.
   */
  public static final String ROW_KEY_DISTRIBUTOR_PARAMS = "hbase.mapreduce.scan.wd.distributor.params";

  /** Distributor created in {@link #setConf(Configuration)}; stays {@code null} if none is configured. */
  private AbstractRowKeyDistributor rowKeyDistributor;

  /**
   * Applies the configuration to the parent input format and, when
   * {@link #ROW_KEY_DISTRIBUTOR_CLASS} is set, instantiates the row key distributor and
   * initializes it with the value of {@link #ROW_KEY_DISTRIBUTOR_PARAMS} (if present).
   *
   * @param conf the job configuration
   * @throws RuntimeException if the configured distributor cannot be created or initialized
   */
  @Override
  public void setConf(Configuration conf) {
    super.setConf(conf);

    String clazz = conf.get(ROW_KEY_DISTRIBUTOR_CLASS);
    if (clazz == null) {
      // No distributor configured; nothing more to set up.
      return;
    }

    try {
      // Class.newInstance() is deprecated; going through the no-arg constructor is the
      // supported equivalent. Any failure is reported through the RuntimeException below.
      rowKeyDistributor = (AbstractRowKeyDistributor) Class.forName(clazz)
        .getDeclaredConstructor().newInstance();
      String params = conf.get(ROW_KEY_DISTRIBUTOR_PARAMS);
      if (params != null) {
        rowKeyDistributor.init(params);
      }
    } catch (Exception e) {
      throw new RuntimeException("Cannot create row key distributor, " + ROW_KEY_DISTRIBUTOR_CLASS + ": " + clazz, e);
    }
  }

  /**
   * Computes the input splits for every scan produced by the row key distributor from the
   * configured scan and returns them as one combined list.
   *
   * <p>If no row key distributor has been configured, the parent implementation is used
   * as is.
   *
   * @param context the job context passed through to the parent implementation
   * @return the splits of all distributed scans, in the order the scans were returned
   * @throws IOException if the parent implementation fails to compute splits
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    if (rowKeyDistributor == null) {
      // setConf(...) only creates a distributor when one is configured; without it there
      // is nothing to fan out, so behave like the plain parent input format instead of
      // failing with a NullPointerException.
      return super.getSplits(context);
    }

    List<InputSplit> allSplits = new ArrayList<>();
    Scan originalScan = getScan();
    Scan[] scans = rowKeyDistributor.getDistributedScans(originalScan);

    try {
      for (Scan scan : scans) {
        // Internally super.getSplits(...) uses the scan object stored in a private variable;
        // to re-use the code of the super class we switch that scan object with each
        // distributed scan in turn.
        setScan(scan);
        allSplits.addAll(super.getSplits(context));
      }
    } finally {
      // Setting original scan back, even when computing splits failed part-way through.
      setScan(originalScan);
    }

    return allSplits;
  }
}