Main.java example

Explorer

hbase-secondary-index-master
- src
  - main
    - java
      - net
        hbase
        secondaryindex
        coprocessor
        IndexerDemo.java
        mapred
        Combination.java
        DeprecatedIndexJsonMapper.java
        IndexJsonMapper.java
        IndexMapper.java
        IndexRowkeyMapper.java
        LoadMapper.java
        Main.java
        MapperWrapper.java
        util
        ConfigFactory.java
        ConfigProperties.java
        Const.java
        DateFormatUtil.java
        JsonUtil.java
        PreRegionsCreation.java
      - org
        apache
        hadoop
        hbase
        client
        tableindexed
        IndexKeyGenerator.java
        IndexNotFoundException.java
        IndexSpecification.java
        IndexSpecificationArray.java
        IndexedTable.java
        IndexedTableAdmin.java
        IndexedTableDescriptor.java
        SimpleIndexKeyGenerator.java
        transactional
        CommitUnsuccessfulException.java
        HBaseBackedTransactionLogger.java
        JtaXAResource.java
        LocalTransactionLogger.java
        TransactionLogger.java
        TransactionManager.java
        TransactionScannerCallable.java
        TransactionState.java
        TransactionalTable.java
        UnknownTransactionException.java
        ipc
        HBaseRPCProtocolVersion.java
        IndexedRegionInterface.java
        TransactionalRegionInterface.java
        regionserver
        tableindexed
        IndexMaintenanceUtils.java
        IndexedRegion.java
        IndexedRegionServer.java
        transactional
        CleanOldTransactionsChore.java
        KeyValueListScanner.java
        SingleVersionDeleteNotSupported.java
        THLog.java
        THLogKey.java
        THLogRecoveryManager.java
        THLogSplitter.java
        TransactionState.java
        TransactionalRegion.java
        TransactionalRegionServer.java
        wal
        WALObserver.java
        util
        GetUtil.java
        ScanUtil.java
  - test
    - java
      - net
        hbase
        secondaryindex
        HBaseManager.java
        Test.java

package net.hbase.secondaryindex.mapred;

import net.hbase.secondaryindex.util.ConfigFactory;
import net.hbase.secondaryindex.util.ConfigProperties;
import net.hbase.secondaryindex.util.Const;
import net.hbase.secondaryindex.util.DateFormatUtil;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Command-line driver that builds a secondary index for an HBase table.
 *
 * <p>
 * The work is done by two consecutive map-only jobs:
 * <ol>
 * <li>JOB-1 scans the input table with the mapper selected by
 * {@link MapperWrapper#wrap} and writes the index records as text files under
 * {@link #TEMP_INDEX_PATH};</li>
 * <li>JOB-2 reads those files with {@link LoadMapper} and writes them to the
 * output table through {@link TableOutputFormat}.</li>
 * </ol>
 * JOB-2 only runs when JOB-1 succeeded. The process exit code is 0 on success
 * and 1 when a job failed.
 */
public class Main {
	static final Log LOG = LogFactory.getLog(Main.class);

	public static final String NAME = "Build-Secondary-Index";
	public static final String TEMP_INDEX_PATH = "/tmp/hbase-secondary-index";

	static ConfigProperties config = ConfigFactory.getInstance()
			.getConfigProperties(ConfigFactory.INDEX_CONFIG_PATH);

	/**
	 * Parses the command line, validates the option combination, configures
	 * the scan and runs the two index-building jobs. Always terminates the JVM
	 * through {@link System#exit(int)} once the jobs have been attempted.
	 *
	 * @param args
	 *            generic Hadoop options followed by the tool options (see
	 *            {@link #parseArgs(String[])})
	 * @throws Exception
	 *             {@link IllegalArgumentException} for invalid option values;
	 *             otherwise whatever the Hadoop/HBase calls raise
	 */
	public static void main(String[] args) throws Exception {
		Configuration conf = HBaseConfiguration.create();
		String[] otherArgs = new GenericOptionsParser(conf, args)
				.getRemainingArgs();
		CommandLine cmd = parseArgs(otherArgs);
		Scan scan = new Scan();
		String outputTable = cmd.getOptionValue("o");
		String inputTable = cmd.getOptionValue("i");
		// "c" is a required option, so column is never null here.
		String column = cmd.getOptionValue("c");
		String[] arr = column.split(",", -1);
		String mapperType = null;
		conf.set(Const.HBASE_CONF_COLUMN_NAME, column);
		if (column.indexOf(":") < 0 && column.indexOf(",") < 0
				&& !column.equals(Const.MAPPER_TYPE_ROWKEY))
			throw new IllegalArgumentException(
					"Column is invalid! such as: family1:qualifier1,family2:qualifier2");

		// Lower bound defaults to 0 (the epoch, i.e. the documented 19700101
		// default) rather than -1. NOTE(review): newer HBase TimeRange
		// implementations appear to reject negative bounds - confirm against
		// the deployed HBase version.
		long startDate = parseDateOption(cmd.getOptionValue("s"),
				"start-date", 0L);
		long endDate = parseDateOption(cmd.getOptionValue("e"), "end-date",
				System.currentTimeMillis());
		int versions = parseVersions(cmd.getOptionValue("v"));

		String singleIndex = cmd.getOptionValue("si");
		boolean isBuildSingleIndex = true;
		if (hasText(singleIndex)) {
			isBuildSingleIndex = Boolean.parseBoolean(singleIndex);
		}
		conf.setBoolean(Const.HBASE_CONF_ISBUILDSINGLEINDEX_NAME,
				isBuildSingleIndex);

		String json = cmd.getOptionValue("j");
		String rowkey = cmd.getOptionValue("r");

		/*
		 * Must be only one mapper type once. Checked before either option
		 * touches the configuration or the scan.
		 */
		if (null != json && null != rowkey) {
			throw new IllegalArgumentException(
					"Must be only one mapper type. -r or -j option appear only once a time.");
		}

		if (hasText(json)) {
			if (arr.length > 1) {
				throw new IllegalArgumentException(
						"You are using the '-j' or '--json' option for building index for json field, so the '-c' or '--column' must contain only 1 column(json column)!");
			}
			String[] jsonFields = json.split(",", -1);
			if (jsonFields.length < 1 || jsonFields.length > 3) {
				throw new IllegalArgumentException(
						"The input json field is between [1,3].");
			}

			mapperType = Const.MAPPER_TYPE_JSON;
			conf.set(Const.HBASE_CONF_JSON_NAME, json);
		}

		if (hasText(rowkey)) {
			if (column.indexOf(Const.MAPPER_TYPE_ROWKEY) < 0) {
				throw new IllegalArgumentException(
						"You are using the '-r' or '--rowkey' option for building index for rowkey, so the '-c' or '--column' must contain rowkey");
			}
			// Two rowkey fields plus the mandatory isrowkey marker.
			String[] rowkeyFields = rowkey.split(",", -1);
			if (rowkeyFields.length != 3
					|| rowkey.indexOf(Const.PARAMETER_ISROWKEY) < 0) {
				throw new IllegalArgumentException(
						"The input rowkey field must be 2. You must be point the new rowkey field. use 'isrowkey:2'");
			}

			mapperType = Const.MAPPER_TYPE_ROWKEY;
			conf.set(Const.HBASE_CONF_ROWKEY_NAME, rowkey);

			/* batch and caching */
			scan.setBatch(0);
			scan.setCaching(10000);
		}

		/* Max columns is 3 to build combined index! */
		if (!isBuildSingleIndex) {
			if ((null == json || json.trim().length() == 0)
					&& (arr.length > 3 || arr.length < 2)) {
				throw new IllegalArgumentException(
						"The max number of column for building 'combined index' is 3 and the min number is 2! [2,3]");
			}
		}

		/* configure scan */
		addColumnsToScan(scan, arr);
		scan.setTimeRange(startDate, endDate);
		scan.setMaxVersions(versions);

		LOG.info("Build hbase secondary index. From table{" + inputTable
				+ "} to table{" + outputTable + "} with condition: \ncolumns="
				+ column + "\nstartdate=" + startDate + "\nendate=" + endDate
				+ "\nversions=" + versions + "\ninBuildSingleIndex="
				+ isBuildSingleIndex + "\njson=" + json + "\nrowkey="
				+ rowkey);

		// hbase master
		conf.set(ConfigProperties.CONFIG_NAME_HBASE_MASTER,
				config.getProperty(ConfigProperties.CONFIG_NAME_HBASE_MASTER));
		// zookeeper quorum
		conf.set(
				ConfigProperties.CONFIG_NAME_HBASE_ZOOKEEPER_QUORUM,
				config.getProperty(ConfigProperties.CONFIG_NAME_HBASE_ZOOKEEPER_QUORUM));

		// set hadoop speculative execution to false
		conf.setBoolean(Const.HADOOP_MAP_SPECULATIVE_EXECUTION, false);
		conf.setBoolean(Const.HADOOP_REDUCE_SPECULATIVE_EXECUTION, false);

		// JOB-1 writes to a fixed path, so remove leftovers of earlier runs.
		Path tempIndexPath = new Path(TEMP_INDEX_PATH);
		FileSystem fs = FileSystem.get(conf);
		if (fs.exists(tempIndexPath)) {
			fs.delete(tempIndexPath, true);
		}

		/* JOB-1: generate secondary index */
		Job job = new Job(conf, "Build hbase secodary index in " + inputTable
				+ ", write to " + outputTable + " JOB-1");
		job.setJarByClass(Main.class);

		TableMapReduceUtil.initTableMapperJob(inputTable, scan,
				MapperWrapper.wrap(mapperType), Text.class, Text.class, job);
		job.setNumReduceTasks(0);
		job.setOutputFormatClass(TextOutputFormat.class);
		FileOutputFormat.setOutputPath(job, tempIndexPath);

		int success = job.waitForCompletion(true) ? 0 : 1;

		/* JOB-2: load index data into hbase */
		if (success == 0) {
			job = new Job(conf, "Build hbase secodary index in " + inputTable
					+ ", write to " + outputTable + " JOB-2");
			job.setJarByClass(Main.class);
			job.setMapperClass(LoadMapper.class);
			job.setNumReduceTasks(0);
			job.setOutputFormatClass(TableOutputFormat.class);
			job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE,
					outputTable);

			FileInputFormat.addInputPath(job, tempIndexPath);
			success = job.waitForCompletion(true) ? 0 : 1;
		}

		System.exit(success);
	}

	/** Returns true when the option value is present and not empty. */
	private static boolean hasText(String value) {
		return null != value && value.length() > 0;
	}

	/**
	 * Converts an 8-character date option (for example 20130102) into the
	 * long value produced by {@link DateFormatUtil#formatStringTimeToLong2}.
	 *
	 * @param value
	 *            raw option value, may be null or empty
	 * @param label
	 *            option label used in the error message ("start-date" or
	 *            "end-date")
	 * @param defaultValue
	 *            value returned when the option was not given
	 * @throws Exception
	 *             {@link IllegalArgumentException} when the value is not 8
	 *             characters long, or whatever the date utility raises
	 */
	private static long parseDateOption(String value, String label,
			long defaultValue) throws Exception {
		if (!hasText(value)) {
			return defaultValue;
		}
		if (value.length() != 8) {
			throw new IllegalArgumentException(label
					+ " format is invalid. must be 20130102");
		}
		return DateFormatUtil.formatStringTimeToLong2(value);
	}

	/**
	 * Parses the "versions" option.
	 *
	 * @param value
	 *            raw option value, may be null or empty
	 * @return the parsed count, or Integer.MAX_VALUE when the option was not
	 *         given
	 * @throws IllegalArgumentException
	 *             when the value is not a positive integer
	 */
	private static int parseVersions(String value) {
		if (!hasText(value)) {
			return Integer.MAX_VALUE;
		}
		int versions;
		try {
			versions = Integer.parseInt(value);
		} catch (NumberFormatException e) {
			throw new IllegalArgumentException(
					"versions must be a positive integer, but was: " + value,
					e);
		}
		if (versions < 1) {
			throw new IllegalArgumentException(
					"versions must be a positive integer, but was: " + value);
		}
		return versions;
	}

	/**
	 * Restricts the scan to the requested columns. The pseudo column
	 * Const.MAPPER_TYPE_ROWKEY is skipped; an entry that parses into family
	 * and qualifier adds that column, any other entry adds the whole family.
	 */
	private static void addColumnsToScan(Scan scan, String[] columns) {
		for (String col : columns) {
			if (col.equals(Const.MAPPER_TYPE_ROWKEY))
				continue;
			byte[][] colkey = KeyValue.parseColumn(Bytes.toBytes(col));
			if (colkey.length > 1) {
				scan.addColumn(colkey[0], colkey[1]);
			} else {
				scan.addFamily(colkey[0]);
			}
		}
	}

	/** Registers one option that takes an argument. */
	private static void addArgOption(Options options, String opt,
			String longOpt, String argName, boolean required,
			String description) {
		Option o = new Option(opt, longOpt, true, description);
		o.setArgName(argName);
		o.setRequired(required);
		options.addOption(o);
	}

	/**
	 * Declares the tool options and parses the given arguments. On a parse
	 * error the message and the usage help are printed and the JVM exits with
	 * status -1.
	 *
	 * @param args
	 *            arguments left over after the generic Hadoop options
	 * @return the parsed command line
	 * @throws ParseException
	 *             kept in the signature for compatibility; parse errors are
	 *             handled inside this method
	 */
	private static CommandLine parseArgs(String[] args) throws ParseException {
		Options options = new Options();
		addArgOption(options, "i", "input", "input-table-name", true,
				"table to read from (must exist)");
		addArgOption(options, "o", "output", "output-table-name", true,
				"table to import into (must exist)");
		addArgOption(
				options,
				"c",
				"column",
				"family:qualifier",
				true,
				"column(s) to build index from (must exist). Such as: cf1:age,cf2:tag,cf2:msg  or rowkey or rowkey,cf1:age. The last two usage are for 'rowkey' index building.");
		addArgOption(options, "s", "sdate", "start-date", false,
				"the start date of data to build index(default is 19700101), such as: 20130101");
		addArgOption(options, "e", "edate", "end-date", false,
				"the end date of data to build index(default is today), such as: 20130120");
		addArgOption(options, "v", "versions", "versions", false,
				"the versions of each cell to build index(default is Integer.MAX_VALUE)");
		addArgOption(
				options,
				"si",
				"sindex",
				"single-index",
				false,
				"if use single index. true means 'single index', false means 'combined index'(default is true). If build combined index, the max number of columns is 3.");
		addArgOption(
				options,
				"j",
				"json",
				"json fields",
				false,
				"json fields to build index. The max number of fields is 3! This kind of data uses IndexJsonMapper.class.");
		addArgOption(
				options,
				"r",
				"rowkey",
				"rowkey fields",
				false,
				"rowkey fields to build index. The max number of fields is 2! This kind of data uses IndexRowkeyMapper.class. The format is: "
						+ "uid:1,msgid:2,isrowkey:1 \n uid and msgid are the field name, 1 and 2 is the order in the rowkey(like: uid_msgid_ts). isrowkey is the "
						+ "label to define which field is the new rowkey. The separator in rowkey is _ . You can use validate column to build incremental index. "
						+ "If use validate column, you need to add a column to -c parameter, the -c should be 'rowkey,cf1:age'");

		// NOTE(review): the debug flag is registered here, but nothing in this
		// class reads it.
		options.addOption("d", "debug", false, "switch on DEBUG log level");

		CommandLineParser parser = new PosixParser();
		try {
			return parser.parse(options, args);
		} catch (ParseException e) {
			System.err.println("ERROR: " + e.getMessage() + "\n");
			HelpFormatter formatter = new HelpFormatter();
			formatter.printHelp(NAME + " ", options, true);
			System.exit(-1);
			// Not reached; only here because the compiler cannot know that
			// System.exit does not return.
			return null;
		}
	}

}