package net.hbase.secondaryindex.mapred; import net.hbase.secondaryindex.util.ConfigFactory; import net.hbase.secondaryindex.util.ConfigProperties; import net.hbase.secondaryindex.util.Const; import net.hbase.secondaryindex.util.DateFormatUtil; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil; import org.apache.hadoop.hbase.mapreduce.TableOutputFormat; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; public class Main { static final Log LOG = LogFactory.getLog(Main.class); public static final String NAME = "Build-Secondary-Index"; public static final String TEMP_INDEX_PATH = "/tmp/hbase-secondary-index"; static ConfigProperties config = ConfigFactory.getInstance() .getConfigProperties(ConfigFactory.INDEX_CONFIG_PATH); public static void main(String[] args) throws Exception { Configuration conf = HBaseConfiguration.create(); String[] otherArgs = new GenericOptionsParser(conf, args) .getRemainingArgs(); CommandLine cmd = parseArgs(otherArgs); Scan scan = new Scan(); String outputTable = cmd.getOptionValue("o"); String inputTable = cmd.getOptionValue("i"); String column = cmd.getOptionValue("c"); String[] arr = column.split(",", -1); String mapperType = null; conf.set(Const.HBASE_CONF_COLUMN_NAME, column); if (column.indexOf(":") < 0 && column.indexOf(",") < 0 && !column.equals(Const.MAPPER_TYPE_ROWKEY)) throw new Exception( "Column is not invalid! such as: family1:qualifier1,family2:qualifier2"); String startDateStr = cmd.getOptionValue("s"); long startDate = -1L; if (null != startDateStr && startDateStr.length() > 0) { if (startDateStr.length() == 8) { startDate = DateFormatUtil .formatStringTimeToLong2(startDateStr); } else throw new Exception( "start-date format is invalid. must be 20130102"); } String endDateStr = cmd.getOptionValue("e"); long endDate = System.currentTimeMillis(); if (null != endDateStr && endDateStr.length() > 0) { if (endDateStr.length() == 8) { endDate = DateFormatUtil.formatStringTimeToLong2(endDateStr); } else throw new Exception( "end-date format is invalid. must be 20130102"); } String versionStr = cmd.getOptionValue("v"); int versions = Integer.MAX_VALUE; if (null != versionStr && versionStr.length() > 0) { versions = Integer.parseInt(versionStr); } String singleIndex = cmd.getOptionValue("si"); boolean isBuildSingleIndex = true; if (null != singleIndex && singleIndex.length() > 0) { isBuildSingleIndex = Boolean.parseBoolean(singleIndex); } conf.setBoolean(Const.HBASE_CONF_ISBUILDSINGLEINDEX_NAME, isBuildSingleIndex); String json = cmd.getOptionValue("j"); if (null != json && json.length() > 0) { if (arr.length > 1) { throw new Exception( "You are using the '-j' or '--json' option for building index for json field, so the '-c' or '--column' must contain only 1 column(json column)!"); } String[] jarr = json.split(",", -1); if (jarr.length < 1 || jarr.length > 3) { throw new Exception("The input json field is between [1,3]."); } mapperType = Const.MAPPER_TYPE_JSON; conf.set(Const.HBASE_CONF_JSON_NAME, json); } String rowkey = cmd.getOptionValue("r"); if (null != rowkey && rowkey.length() > 0) { if (column.indexOf(Const.MAPPER_TYPE_ROWKEY) < 0) { throw new Exception( "You are using the '-r' or '--rowkey' option for building index for rowkey, so the '-c' or '--column' must contain rowkey"); } String[] jarr = rowkey.split(",", -1); if (jarr.length != 3 || rowkey.indexOf(Const.PARAMETER_ISROWKEY) < 0) { throw new Exception( "The input rowkey field must be 2. You must be point the new rowkey field. use 'isrowkey:2'"); } mapperType = Const.MAPPER_TYPE_ROWKEY; conf.set(Const.HBASE_CONF_ROWKEY_NAME, rowkey); /* batch and caching */ scan.setBatch(0); scan.setCaching(10000); } /* Must be only one mapper type once */ if (null != json && null != rowkey) { throw new Exception( "Must be only one mapper type. -r or -j option appear only once a time."); } /* Max columns is 3 to build combined index! */ if (!isBuildSingleIndex) { if ((null == json || json.trim().length() == 0) && (arr.length > 3 || arr.length < 2)) { throw new Exception( "The max number of column for building 'combined index' is 3 and the min number is 2! [2,3]"); } } /* configure scan */ if (column != null) { if (null != arr && arr.length > 0) { for (String col : arr) { if (col.equals(Const.MAPPER_TYPE_ROWKEY)) continue; byte[][] colkey = KeyValue.parseColumn(Bytes.toBytes(col)); if (colkey.length > 1) { scan.addColumn(colkey[0], colkey[1]); } else { scan.addFamily(colkey[0]); } } } } scan.setTimeRange(startDate, endDate); scan.setMaxVersions(versions); LOG.info("Build hbase secondary index. From table{" + inputTable + "} to table{" + outputTable + "} with condition: \ncolumns=" + column + "\nstartdate=" + startDate + "\nendate=" + endDate + "\nversions=" + versions + "\ninBuildSingleIndex=" + isBuildSingleIndex + "\njson=" + json + "\nrowkey="); // hbase master conf.set(ConfigProperties.CONFIG_NAME_HBASE_MASTER, config.getProperty(ConfigProperties.CONFIG_NAME_HBASE_MASTER)); // zookeeper quorum conf.set( ConfigProperties.CONFIG_NAME_HBASE_ZOOKEEPER_QUORUM, config.getProperty(ConfigProperties.CONFIG_NAME_HBASE_ZOOKEEPER_QUORUM)); // set hadoop speculative execution to false conf.setBoolean(Const.HADOOP_MAP_SPECULATIVE_EXECUTION, false); conf.setBoolean(Const.HADOOP_REDUCE_SPECULATIVE_EXECUTION, false); Path tempIndexPath = new Path(TEMP_INDEX_PATH); FileSystem fs = FileSystem.get(conf); if (fs.exists(tempIndexPath)) { fs.delete(tempIndexPath, true); } /* JOB-1: generate secondary index */ Job job = new Job(conf, "Build hbase secodary index in " + inputTable + ", write to " + outputTable + " JOB-1"); job.setJarByClass(Main.class); TableMapReduceUtil.initTableMapperJob(inputTable, scan, MapperWrapper.wrap(mapperType), Text.class, Text.class, job); job.setNumReduceTasks(0); job.setOutputFormatClass(TextOutputFormat.class); FileOutputFormat.setOutputPath(job, tempIndexPath); int success = job.waitForCompletion(true) ? 0 : 1; /* JOB-2: load index data into hbase */ if (success == 0) { job = new Job(conf, "Build hbase secodary index in " + inputTable + ", write to " + outputTable + " JOB-2"); job.setJarByClass(Main.class); job.setMapperClass(LoadMapper.class); job.setNumReduceTasks(0); job.setOutputFormatClass(TableOutputFormat.class); job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, outputTable); FileInputFormat.addInputPath(job, tempIndexPath); success = job.waitForCompletion(true) ? 0 : 1; } System.exit(success); } private static CommandLine parseArgs(String[] args) throws ParseException { Options options = new Options(); Option o = new Option("i", "input", true, "the directory or file to read from (must exist)"); o.setArgName("input-table-name"); o.setRequired(true); options.addOption(o); o = new Option("o", "output", true, "table to import into (must exist)"); o.setArgName("output-table-name"); o.setRequired(true); options.addOption(o); o = new Option( "c", "column", true, "column to store row data into (must exist). Such as: cf1:age,cf2:tag,cf2:msg or rowkey or rowkey,cf1:age. The last two usage are for 'rowkey' index building."); o.setArgName("family:qualifier"); o.setRequired(true); options.addOption(o); o = new Option("s", "sdate", true, "the start date of data to build index(default is 19700101), such as: 20130101"); o.setArgName("start-date"); o.setRequired(false); options.addOption(o); o = new Option("e", "edate", true, "the end date of data to build index(default is today), such as: 20130120"); o.setArgName("end-date"); o.setRequired(false); options.addOption(o); o = new Option("v", "versions", true, "the versions of each cell to build index(default is Integer.MAX_VALUE)"); o.setArgName("versions"); o.setRequired(false); options.addOption(o); o = new Option( "si", "sindex", true, "if use single index. true means 'single index', false means 'combined index'(default is true). If build combined index, the max number of columns is 3."); o.setArgName("single-index"); o.setRequired(false); options.addOption(o); o = new Option( "j", "json", true, "json fields to build index. The max number of fields is 3! This kind of data uses IndexJsonMapper.class."); o.setArgName("json fields"); o.setRequired(false); options.addOption(o); o = new Option( "r", "rowkey", true, "rowkey fields to build index. The max number of fields is 2! This kind of data uses IndexRowkeyMapper.class. The format is: " + "uid:1,msgid:2,isrowkey:1 \n uid and msgid are the field name, 1 and 2 is the order in the rowkey(like: uid_msgid_ts). isrowkey is the " + "label to define which field is the new rowkey. The separator in rowkey is _ . You can use validate column to build incremental index. " + "If use validate column, you need to add a column to -c parameter, the -c should be 'rowkey,cf1:age'"); o.setArgName("rowkey fields"); o.setRequired(false); options.addOption(o); options.addOption("d", "debug", false, "switch on DEBUG log level"); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (Exception e) { System.err.println("ERROR: " + e.getMessage() + "\n"); HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(NAME + " ", options, true); System.exit(-1); } return cmd; } }