package com.inin.analytics.elasticsearch.example;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;

import com.inin.analytics.elasticsearch.BaseESReducer;
import com.inin.analytics.elasticsearch.index.rotation.ElasticSearchIndexMetadata;
import com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategy;
import com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1;

/**
 * Sample Hadoop job that takes the data produced by GenerateData.java and writes it
 * out in a format suitable for ExampleIndexingJob.java.
 *
 * <p>The job is map-only (zero reduce tasks): each comma-separated input record is
 * converted into a small JSON document and emitted under a key made of the index type
 * and the routing hash computed for the record.
 */
public class ExampleJobPrep implements Tool {
    /** Configuration shared between {@link #setConf}, {@link #getConf} and {@link #main}. */
    private static Configuration conf;

    private static final String INDEX_TYPE = "conversation";
    private static final String INDEX_SUFFIX_CONFIG = "indexSuffixConfigKey";
    private static final String NUM_SHARDS_PER_CUSTOMER = "numShardsPerCustomer";
    private static final String NUM_SHARDS = "numShards";

    /** Number of positional arguments {@link #main} requires. */
    private static final int EXPECTED_ARG_COUNT = 5;

    /**
     * Mapper that turns one CSV record (customerId, docId, color, description) into a
     * routed key / JSON value pair.
     */
    public static class DocMapper extends Mapper<LongWritable, Text, Text, Text> {
        private ElasticsearchRoutingStrategy elasticsearchRoutingStrategy;

        /**
         * Builds the routing strategy from the shard counts stored in the job
         * configuration; both counts default to 1 when absent.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            int numShardsPerOrg = context.getConfiguration().getInt(NUM_SHARDS_PER_CUSTOMER, 1);
            int numShards = context.getConfiguration().getInt(NUM_SHARDS, 1);

            ElasticSearchIndexMetadata indexMetadata = new ElasticSearchIndexMetadata();
            indexMetadata.setNumShards(numShards);
            indexMetadata.setNumShardsPerOrg(numShardsPerOrg);

            elasticsearchRoutingStrategy = new ElasticsearchRoutingStrategyV1();
            elasticsearchRoutingStrategy.configure(indexMetadata);
        }

        /**
         * Emits one record per input line.
         *
         * <p>Output key: {@code INDEX_TYPE + TUPLE_SEPARATOR + routingHash}.
         * Output value: {@code INDEX_TYPE + TUPLE_SEPARATOR + customerId + TUPLE_SEPARATOR + json}.
         *
         * <p>The input line must contain at least four comma-separated fields; a shorter
         * line fails with {@link ArrayIndexOutOfBoundsException}.
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] csv = StringUtils.split(value.toString(), ",");
            String docId = csv[1];
            String customerId = csv[0];

            // NOTE(review): fields are concatenated without JSON escaping, so a value
            // containing a double quote or backslash would yield malformed JSON.
            String json = "{\"customerId\":\"" + customerId
                    + "\",\"color\":\"" + csv[2]
                    + "\",\"id\":\"" + docId
                    + "\",\"description\":\"" + csv[3] + "\"}";

            String routingHash = elasticsearchRoutingStrategy.getRoutingHash(customerId, docId);

            Text outputKey = new Text(INDEX_TYPE + BaseESReducer.TUPLE_SEPARATOR + routingHash);
            Text outputValue = new Text(INDEX_TYPE + BaseESReducer.TUPLE_SEPARATOR + customerId
                    + BaseESReducer.TUPLE_SEPARATOR + json);
            context.write(outputKey, outputValue);
        }
    }

    /**
     * Configures and runs the prep job, blocking until it completes.
     *
     * <p>Expected arguments, in order: pipe-separated input paths, output location,
     * index name suffix, number of shards per index, maximum number of shards per
     * customer (used for routing).
     *
     * <p>NOTE(review): this method returns {@code boolean} rather than {@code void}, so it
     * does not have the signature the JVM launcher requires of an entry point; it is
     * invoked from {@link #run}.
     *
     * @param args the five positional arguments described above
     * @return {@code true} if the job completed successfully; {@code false} if it failed
     *         or the wrong number of arguments was supplied
     * @throws NumberFormatException if either shard count is not a valid integer
     * @throws Exception if job setup or execution fails
     */
    public static boolean main(String[] args) throws Exception {
        if (args.length != EXPECTED_ARG_COUNT) {
            System.err.println("Invalid # arguments. "
                    + "EG: loadES [pipe separated paths to source files containing segments & properties] [output location] [index name suffix] [numShardsPerIndex] [maxNumShardsPerCustomer (for routing)]");
            // Stop here: continuing would index past the end of args.
            return false;
        }

        String inputs = args[0];
        String output = args[1];
        String indexSuffix = args[2];
        int numShards = Integer.parseInt(args[3]);
        int numShardsPerCustomer = Integer.parseInt(args[4]);

        // Keep a configuration injected through setConf (e.g. by ToolRunner) instead of
        // discarding it; only create a fresh one when none was provided.
        if (conf == null) {
            conf = new Configuration();
        }

        Job job = Job.getInstance(conf, "Prep example");
        job.setJarByClass(ExampleJobPrep.class);
        job.setMapperClass(DocMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setNumReduceTasks(0); // map-only job

        job.getConfiguration().set(INDEX_SUFFIX_CONFIG, indexSuffix);
        job.getConfiguration().setInt(NUM_SHARDS_PER_CUSTOMER, numShardsPerCustomer);
        job.getConfiguration().setInt(NUM_SHARDS, numShards);

        FileOutputFormat.setOutputPath(job, new Path(output));

        // Set up inputs
        String[] inputFolders = StringUtils.split(inputs, "|");
        for (String input : inputFolders) {
            FileInputFormat.addInputPath(job, new Path(input));
        }

        return job.waitForCompletion(true);
    }

    @Override
    public void setConf(Configuration conf) {
        // The field is static, so assign it through the class rather than through `this`.
        ExampleJobPrep.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
     * Runs the job through {@link #main}.
     *
     * @param args the arguments forwarded to {@link #main}
     * @return 0 if the job succeeded, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {
        return ExampleJobPrep.main(args) ? 0 : 1;
    }
}