// TableInputFormat.java example
//
// Explorer
// stratosphere-master
/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/

package eu.stratosphere.addons.hbase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableRecordReader;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.util.StringUtils;

import eu.stratosphere.addons.hbase.common.HBaseKey;
import eu.stratosphere.addons.hbase.common.HBaseResult;
import eu.stratosphere.addons.hbase.common.HBaseUtil;
import eu.stratosphere.api.common.io.InputFormat;
import eu.stratosphere.api.common.io.statistics.BaseStatistics;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.types.Record;
import eu.stratosphere.util.OperatingSystem;

/**
 * {@link InputFormat} implementation that reads the rows of an HBase table ({@link HTable}) as {@link Record}s.
 * <p>
 * {@link #configure(Configuration)} creates the table handle and the {@link Scan} template from the job parameters,
 * {@link #createInputSplits(int)} creates {@link TableInputSplit}s for the table regions that overlap the row range
 * of the scan, and {@link #open(TableInputSplit)}, {@link #nextRecord(Record)} and {@link #close()} iterate over the
 * rows of a single split.
 */
public class TableInputFormat implements InputFormat<Record, TableInputSplit> {

	private static final long serialVersionUID = 1L;

	private static final Log LOG = LogFactory.getLog(TableInputFormat.class);

	/** Job parameter that specifies the input table. */
	public static final String INPUT_TABLE = "hbase.inputtable";

	/**
	 * Job parameter that specifies the location of the hbase-site.xml. If set, the file is added as a resource to the
	 * Hadoop configuration from which the HBase configuration of this format is created.
	 */
	public static final String CONFIG_LOCATION = "hbase.config.location";

	/**
	 * Base-64 encoded scanner. All other SCAN_ confs are ignored if this is specified.
	 * The value is decoded with {@link HBaseUtil#convertStringToScan(String)}; see HBase's
	 * {@code TableMapReduceUtil.convertScanToString(Scan)} for more details on the encoding.
	 */
	public static final String SCAN = "hbase.scan";

	/** Column Family to Scan */
	public static final String SCAN_COLUMN_FAMILY = "hbase.scan.column.family";

	/**
	 * Space delimited list of columns to scan.
	 * <p>
	 * NOTE: this parameter is currently not evaluated by {@link #createScanner(Configuration)}.
	 */
	public static final String SCAN_COLUMNS = "hbase.scan.columns";

	/** The timestamp used to filter columns with a specific timestamp. */
	public static final String SCAN_TIMESTAMP = "hbase.scan.timestamp";

	/**
	 * The starting timestamp used to filter columns with a specific range of versions.
	 * Only evaluated if {@link #SCAN_TIMERANGE_END} is set as well.
	 */
	public static final String SCAN_TIMERANGE_START = "hbase.scan.timerange.start";

	/**
	 * The ending timestamp used to filter columns with a specific range of versions.
	 * Only evaluated if {@link #SCAN_TIMERANGE_START} is set as well.
	 */
	public static final String SCAN_TIMERANGE_END = "hbase.scan.timerange.end";

	/** The maximum number of version to return. */
	public static final String SCAN_MAXVERSIONS = "hbase.scan.maxversions";

	/** Set to false to disable server-side caching of blocks for this scan. Defaults to false in this format. */
	public static final String SCAN_CACHEBLOCKS = "hbase.scan.cacheblocks";

	/** The number of rows for caching that will be passed to scanners. */
	public static final String SCAN_CACHEDROWS = "hbase.scan.cachedrows";

	/** A handle on an HBase table */
	private HTable table;

	/** The HBase scan definition that is used as template for the scan of every split. */
	private Scan scan;

	/** HBase's iterator wrapper; only set while a split is open. */
	private TableRecordReader tableRecordReader;

	/** helper variable to decide whether the input is exhausted or not */
	private boolean endReached = false;

	/** mutable objects that are used to avoid recreation of wrapper objects */
	protected HBaseKey hbaseKey;

	protected HBaseResult hbaseResult;

	/** The HBase configuration that is used to create the table handle. */
	private org.apache.hadoop.conf.Configuration hConf;

	/**
	 * Creates the table handle and the scan from the given parameters and stores both in this format.
	 * 
	 * @param parameters
	 *        the job parameters, see the constants of this class for the evaluated keys
	 */
	@Override
	public void configure(Configuration parameters) {
		setTable(createTable(parameters));
		setScan(createScanner(parameters));
	}

	/**
	 * Reads the configuration and creates a {@link Scan} object.
	 * <p>
	 * If {@link #SCAN} is set, the scan is decoded from that value and all other scan parameters are ignored.
	 * Otherwise a new scan is built from the individual SCAN_ parameters. A parameter that cannot be applied is
	 * logged, and the scan as configured up to that point is returned.
	 * 
	 * @param parameters
	 *        the job parameters
	 * @return the created scan, or <code>null</code> if the scan could not be created
	 */
	protected Scan createScanner(Configuration parameters) {
		final String encodedScan = parameters.getString(SCAN, null);
		if (encodedScan != null) {
			try {
				return HBaseUtil.convertStringToScan(encodedScan);
			} catch (IOException e) {
				LOG.error("An error occurred.", e);
				return null;
			}
		}

		Scan scan = null;
		try {
			scan = new Scan();

			final String columnFamily = parameters.getString(SCAN_COLUMN_FAMILY, null);
			if (columnFamily != null) {
				scan.addFamily(Bytes.toBytes(columnFamily));
			}

			final String timestamp = parameters.getString(SCAN_TIMESTAMP, null);
			if (timestamp != null) {
				scan.setTimeStamp(Long.parseLong(timestamp));
			}

			// a time range is only applied if both of its bounds are configured
			final String rangeStart = parameters.getString(SCAN_TIMERANGE_START, null);
			final String rangeEnd = parameters.getString(SCAN_TIMERANGE_END, null);
			if (rangeStart != null && rangeEnd != null) {
				scan.setTimeRange(Long.parseLong(rangeStart), Long.parseLong(rangeEnd));
			}

			final String maxVersions = parameters.getString(SCAN_MAXVERSIONS, null);
			if (maxVersions != null) {
				scan.setMaxVersions(Integer.parseInt(maxVersions));
			}

			final String cachedRows = parameters.getString(SCAN_CACHEDROWS, null);
			if (cachedRows != null) {
				scan.setCaching(Integer.parseInt(cachedRows));
			}

			// false by default, full table scans generate too much BC churn
			scan.setCacheBlocks(parameters.getBoolean(SCAN_CACHEBLOCKS, false));
		} catch (Exception e) {
			LOG.error(StringUtils.stringifyException(e));
		}
		return scan;
	}

	/**
	 * Creates an {@link HTable} instance for the table named by {@link #INPUT_TABLE}.
	 * <p>
	 * If {@link #CONFIG_LOCATION} is set, the HBase configuration is created from the file at that location. If no
	 * HBase configuration is available otherwise, the default configuration as created by
	 * {@link HBaseConfiguration#create()} is used.
	 * 
	 * @param parameters
	 *        a {@link Configuration} that holds at least the table name.
	 * @return the table handle, or <code>null</code> if the table could not be accessed (the cause is logged)
	 */
	protected HTable createTable(Configuration parameters) {
		final String configLocation = parameters.getString(TableInputFormat.CONFIG_LOCATION, null);
		LOG.info("Got config location: " + configLocation);
		if (configLocation != null) {
			final org.apache.hadoop.conf.Configuration dummyConf = new org.apache.hadoop.conf.Configuration();
			final String scheme = OperatingSystem.isWindows() ? "file:/" : "file://";
			dummyConf.addResource(new Path(scheme + configLocation));
			this.hConf = HBaseConfiguration.create(dummyConf);
			LOG.info("hbase master: " + this.hConf.get("hbase.master"));
			LOG.info("zookeeper quorum: " + this.hConf.get("hbase.zookeeper.quorum"));
		}

		if (this.hConf == null) {
			// without an explicit config location there is no configuration yet; passing null to HTable would fail
			this.hConf = HBaseConfiguration.create();
		}

		final String tableName = parameters.getString(INPUT_TABLE, "");
		try {
			return new HTable(this.hConf, tableName);
		} catch (Exception e) {
			LOG.error(StringUtils.stringifyException(e));
		}
		return null;
	}

	/**
	 * This format does not provide any statistics.
	 * 
	 * @param cachedStatistics
	 *        previously cached statistics, ignored
	 * @return always <code>null</code>
	 */
	@Override
	public BaseStatistics getStatistics(BaseStatistics cachedStatistics) {
		return null;
	}

	/**
	 * @return true, once {@link #nextResult()} found no further row in the currently opened split
	 */
	@Override
	public boolean reachedEnd() throws IOException {
		return this.endReached;
	}

	/**
	 * Advances the record reader by one row and stores the row's key and result in {@link #hbaseKey} and
	 * {@link #hbaseResult}.
	 * 
	 * @return true, if a row was read; false, if the split is exhausted
	 * @throws IOException
	 *         if no split has been opened, or if the reader was interrupted or failed while reading
	 */
	protected boolean nextResult() throws IOException {
		if (this.tableRecordReader == null) {
			throw new IOException("No table record reader provided!");
		}

		final boolean hasNext;
		try {
			hasNext = this.tableRecordReader.nextKeyValue();
		} catch (InterruptedException e) {
			LOG.error("Table reader has been interrupted", e);
			// restore the interrupt flag so that callers further up the stack can still observe the interruption
			Thread.currentThread().interrupt();
			throw new IOException(e);
		}

		if (!hasNext) {
			this.endReached = true;
			return false;
		}

		try {
			final ImmutableBytesWritable currentKey = this.tableRecordReader.getCurrentKey();
			final Result currentValue = this.tableRecordReader.getCurrentValue();

			hbaseKey.setWritable(currentKey);
			hbaseResult.setResult(currentValue);
		} catch (InterruptedException e) {
			LOG.error("Table reader has been interrupted", e);
			Thread.currentThread().interrupt();
			throw new IOException(e);
		}

		return true;
	}

	/**
	 * Reads the next row of the opened split into the given record.
	 * 
	 * @param record
	 *        the record to fill
	 * @return the filled record, or <code>null</code> if the split is exhausted
	 */
	@Override
	public Record nextRecord(Record record) throws IOException {
		if (!nextResult()) {
			return null;
		}
		mapResultToRecord(record, hbaseKey, hbaseResult);
		return record;
	}

	/**
	 * Maps the current HBase Result into a Record.
	 * This implementation simply stores the HBaseKey at position 0, and the HBase Result object at position 1.
	 * 
	 * @param record
	 *        the record to fill
	 * @param key
	 *        the key of the current row
	 * @param result
	 *        the result of the current row
	 */
	public void mapResultToRecord(Record record, HBaseKey key, HBaseResult result) {
		record.setField(0, key);
		record.setField(1, result);
	}

	/**
	 * Closes the record reader of the currently opened split. Does nothing if no split is open.
	 */
	@Override
	public void close() throws IOException {
		if (this.tableRecordReader != null) {
			this.tableRecordReader.close();
			this.tableRecordReader = null;
		}
	}

	/**
	 * Opens the given split: creates a record reader on the table whose scan is a copy of the configured scan,
	 * restricted to the row range of the split.
	 * 
	 * @param split
	 *        the split to read
	 * @throws IOException
	 *         if the split is null, or if this format has no table or no scan
	 */
	@Override
	public void open(TableInputSplit split) throws IOException {
		if (split == null) {
			throw new IOException("Input split is null!");
		}

		if (this.table == null) {
			throw new IOException("No HTable provided!");
		}

		if (this.scan == null) {
			throw new IOException("No Scan instance provided");
		}

		final byte[] startRow = split.getStartRow();
		final byte[] endRow = split.getEndRow();

		// copy the scan template so that the row range of this split does not leak into other splits
		final Scan sc = new Scan(this.scan);
		sc.setStartRow(startRow);
		// row keys are arbitrary bytes, so print them in HBase's binary-safe string form
		LOG.info("split start row: " + Bytes.toStringBinary(startRow));
		sc.setStopRow(endRow);
		LOG.info("split end row: " + Bytes.toStringBinary(endRow));

		this.tableRecordReader = new TableRecordReader();
		this.tableRecordReader.setHTable(this.table);
		this.tableRecordReader.setScan(sc);
		this.tableRecordReader.restart(startRow);

		this.hbaseKey = new HBaseKey();
		this.hbaseResult = new HBaseResult();

		this.endReached = false;
	}


	/**
	 * Creates one input split for every region of the table that is accepted by
	 * {@link #includeRegionInSplit(byte[], byte[])} and that overlaps the row range of the configured scan. The row
	 * range of a split is the intersection of the region's key range and the scan's row range.
	 * 
	 * @param minNumSplits
	 *        the requested minimum number of splits; not evaluated by this implementation
	 * @return the created splits
	 * @throws IOException
	 *         if this format has no table or no scan, or if the table reports no region
	 */
	@Override
	public TableInputSplit[] createInputSplits(final int minNumSplits) throws IOException {

		if (this.table == null) {
			throw new IOException("No table was provided.");
		}

		if (this.scan == null) {
			throw new IOException("No Scan instance provided");
		}

		final Pair<byte[][], byte[][]> keys = this.table.getStartEndKeys();

		if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
			throw new IOException("Expecting at least one region.");
		}

		final byte[][] regionStartKeys = keys.getFirst();
		final byte[][] regionEndKeys = keys.getSecond();
		final byte[] startRow = this.scan.getStartRow();
		final byte[] stopRow = this.scan.getStopRow();

		final List<TableInputSplit> splits = new ArrayList<TableInputSplit>(regionStartKeys.length);
		for (int i = 0; i < regionStartKeys.length; i++) {
			final byte[] regionStart = regionStartKeys[i];
			final byte[] regionEnd = regionEndKeys[i];

			if (!includeRegionInSplit(regionStart, regionEnd)) {
				continue;
			}

			// determine if the given start and stop key fall into the region; an empty key means "unbounded"
			final boolean scanStartsBeforeRegionEnd = startRow.length == 0 || regionEnd.length == 0
				|| Bytes.compareTo(startRow, regionEnd) < 0;
			final boolean scanStopsAfterRegionStart = stopRow.length == 0
				|| Bytes.compareTo(stopRow, regionStart) > 0;
			if (!scanStartsBeforeRegionEnd || !scanStopsAfterRegionStart) {
				continue;
			}

			// the split starts at the later of region start and scan start ...
			final byte[] splitStart = (startRow.length == 0 || Bytes.compareTo(regionStart, startRow) >= 0)
				? regionStart : startRow;
			// ... and stops at the earlier of region end and scan stop
			final byte[] splitStop = ((stopRow.length == 0 || Bytes.compareTo(regionEnd, stopRow) <= 0)
				&& regionEnd.length > 0)
				? regionEnd : stopRow;

			// the location is only needed for regions that actually contribute a split
			final String regionLocation = this.table.getRegionLocation(regionStart, false).getHostnamePort();

			final TableInputSplit split = new TableInputSplit(splits.size(), new String[] { regionLocation },
				this.table.getTableName(), splitStart, splitStop);
			splits.add(split);
			if (LOG.isDebugEnabled()) {
				LOG.debug("getSplits: split -> " + (splits.size() - 1) + " -> " + split);
			}
		}

		return splits.toArray(new TableInputSplit[splits.size()]);
	}

	/**
	 * Test if the given region is to be included in the InputSplit while splitting
	 * the regions of a table.
	 * <p>
	 * This optimization is effective when there is a specific reasoning to exclude an entire region from the M-R job,
	 * (and hence, not contributing to the InputSplit), given the start and end keys of the same. <br>
	 * Useful when we need to remember the last-processed top record and revisit the [last, current) interval for M-R
	 * processing, continuously. In addition to reducing InputSplits, reduces the load on the region server as well, due
	 * to the ordering of the keys. <br>
	 * <br>
	 * Note: It is possible that <code>endKey.length == 0</code>, for the last (recent) region. <br>
	 * Override this method, if you want to bulk exclude regions altogether from M-R. By default, no region is excluded
	 * (i.e. all regions are included).
	 * 
	 * @param startKey
	 *        Start key of the region
	 * @param endKey
	 *        End key of the region
	 * @return true, if this region needs to be included as part of the input (default).
	 */
	protected boolean includeRegionInSplit(final byte[] startKey, final byte[] endKey) {
		return true;
	}


	@Override
	public Class<TableInputSplit> getInputSplitType() {

		return TableInputSplit.class;
	}

	/**
	 * Sets the table handle that is used to create the splits and to read the rows.
	 * 
	 * @param table
	 *        the table handle
	 */
	public void setTable(HTable table) {
		this.table = table;
	}

	/**
	 * @return the table handle of this format, may be <code>null</code>
	 */
	public HTable getTable() {
		return table;
	}

	/**
	 * Sets the scan that is used as template for the scan of every split.
	 * 
	 * @param scan
	 *        the scan
	 */
	public void setScan(Scan scan) {
		this.scan = scan;
	}

	/**
	 * @return the scan of this format, may be <code>null</code>
	 */
	public Scan getScan() {
		return scan;
	}
}