ExampleJobPrep.java example

Explorer

elasticsearch-lambda-master
- src
  - main
    - java
      - com
        inin
        analytics
        elasticsearch
        BaseESMapper.java
        BaseESReducer.java
        ConfigParams.java
        ESEmbededContainer.java
        IndexingPostProcessor.java
        ShardConfig.java
        driver
        Driver.java
        example
        ExampleIndexingJob.java
        ExampleIndexingReducerImpl.java
        ExampleJobPrep.java
        GenerateData.java
        index
        rotation
        ElasticSearchIndexMetadata.java
        ElasticsearchIndexRotationManager.java
        ElasticsearchIndexRotationManagerNoop.java
        ElasticsearchIndexRotationManagerZookeeper.java
        ExampleElasticsearchIndexRotationStrategyZookeeper.java
        RebuildPipelineState.java
        routing
        ElasticsearchRoutingStrategy.java
        ElasticsearchRoutingStrategyV1.java
        selector
        RealtimeIndexSelectionStrategy.java
        RealtimeIndexSelectionStrategyLagged.java
        transport
        BaseTransport.java
        HDFSSnapshotTransport.java
        LocalFSSnapshotTransport.java
        S3SnapshotTransport.java
        SnapshotTransportStrategy.java
        util
        DateTimeDeserializer.java
        DateTimeSerializer.java
        GsonFactory.java
        LocalDateDeserializer.java
        LocalDateSerializer.java
        MurmurHash.java
  - test
    - java
      - com
        inin
        analytics
        DateSerializerTest.java
        ElasticsearchRoutingStrategyV1Test.java
        IndexRotationStrategyZookeeperTest.java

package com.inin.analytics.elasticsearch.example;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;

import com.inin.analytics.elasticsearch.BaseESReducer;
import com.inin.analytics.elasticsearch.index.rotation.ElasticSearchIndexMetadata;
import com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategy;
import com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1;

/**
 * Sample Hadoop job that takes the data from GenerateData.java and writes it
 * out in a format suitable for ExampleIndexingJob.java.
 *
 * <p>The job is map-only (zero reduce tasks): every comma separated input row is
 * converted into a JSON document and emitted under a key made of the index type
 * and the routing hash computed for the row's customer id and document id.
 */
public class ExampleJobPrep  implements Tool {
	// Static because the static main() entry point reads it; set via setConf() or lazily in main()
	private static Configuration conf;
	private static final String INDEX_TYPE = "conversation";
	private static final String INDEX_SUFFIX_CONFIG = "indexSuffixConfigKey";

	private static final String NUM_SHARDS_PER_CUSTOMER = "numShardsPerCustomer";
	private static final String NUM_SHARDS = "numShards";

	// Number of positional arguments main() requires
	private static final int EXPECTED_ARG_COUNT = 5;

	/**
	 * Turns one CSV row (customerId, docId, color, description) into a
	 * (indexType + routingHash, indexType + customerId + json) pair.
	 */
	public static class DocMapper extends Mapper <LongWritable, Text, Text, Text> {
		// map() reads columns 0..3 of every row
		private static final int EXPECTED_COLUMNS = 4;

		private ElasticsearchRoutingStrategy elasticsearchRoutingStrategy;

		/**
		 * Builds the routing strategy from the shard counts stored in the job
		 * configuration. Both counts default to 1 when the key is absent.
		 */
		@Override
		protected void setup(Context context) throws IOException, InterruptedException {
			Configuration jobConf = context.getConfiguration();
			int numShardsPerOrg = jobConf.getInt(NUM_SHARDS_PER_CUSTOMER, 1);
			int numShards = jobConf.getInt(NUM_SHARDS, 1);

			ElasticSearchIndexMetadata indexMetadata = new ElasticSearchIndexMetadata();
			indexMetadata.setNumShards(numShards);
			indexMetadata.setNumShardsPerOrg(numShardsPerOrg);
			elasticsearchRoutingStrategy = new ElasticsearchRoutingStrategyV1();
			elasticsearchRoutingStrategy.configure(indexMetadata);
		}

		/**
		 * Converts a single input row into a JSON document and writes it to the context.
		 *
		 * @param key     input key; not used
		 * @param value   comma separated row: customerId, docId, color, description
		 * @param context task context the output pair is written to
		 * @throws IOException if the row has fewer than four fields, or on write failure
		 */
		@Override
		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String[] csv = StringUtils.split(value.toString(), ",");
			if (csv.length < EXPECTED_COLUMNS) {
				// Fail with the offending row instead of an opaque ArrayIndexOutOfBoundsException
				throw new IOException("Malformed input row, expected at least " + EXPECTED_COLUMNS
						+ " comma separated fields but found " + csv.length + ": " + value);
			}

			String customerId = csv[0];
			String docId = csv[1];
			// Field values are escaped so quotes/backslashes/control characters cannot break the document
			String json = "{\"customerId\":\"" + escapeJson(customerId)
					+ "\",\"color\":\"" + escapeJson(csv[2])
					+ "\",\"id\":\"" + escapeJson(docId)
					+ "\",\"description\":\"" + escapeJson(csv[3]) + "\"}";
			String routingHash = elasticsearchRoutingStrategy.getRoutingHash(customerId, docId);

			Text outputKey = new Text(INDEX_TYPE + BaseESReducer.TUPLE_SEPARATOR + routingHash);
			Text outputValue = new Text(INDEX_TYPE + BaseESReducer.TUPLE_SEPARATOR + customerId + BaseESReducer.TUPLE_SEPARATOR + json);
			context.write(outputKey, outputValue);
		}

		/**
		 * Escapes a raw value for use inside a double-quoted JSON string.
		 * Values without quotes, backslashes or control characters are returned unchanged.
		 */
		private static String escapeJson(String raw) {
			StringBuilder escaped = new StringBuilder(raw.length() + 8);
			for (int i = 0; i < raw.length(); i++) {
				char c = raw.charAt(i);
				switch (c) {
				case '"':
					escaped.append("\\\"");
					break;
				case '\\':
					escaped.append("\\\\");
					break;
				case '\n':
					escaped.append("\\n");
					break;
				case '\r':
					escaped.append("\\r");
					break;
				case '\t':
					escaped.append("\\t");
					break;
				default:
					if (c < 0x20) {
						// Remaining control characters must be written as \\uXXXX
						escaped.append(String.format("\\u%04x", (int) c));
					} else {
						escaped.append(c);
					}
				}
			}
			return escaped.toString();
		}
	}

	/**
	 * Configures and runs the prep job, blocking until it finishes.
	 *
	 * @param args exactly five values: pipe separated input paths, output location,
	 *             index name suffix, number of shards per index, max number of
	 *             shards per customer
	 * @return true if the job completed successfully; false if it failed or the
	 *         argument count was wrong
	 * @throws NumberFormatException if either shard count is not an integer
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static boolean main(String[] args) throws Exception {
		if(args == null || args.length != EXPECTED_ARG_COUNT) {
			System.err.println("Invalid # arguments. EG: loadES [pipe separated paths to source files containing segments & properties] [output location] [index name suffix] [numShardsPerIndex] [maxNumShardsPerCustomer (for routing)]");
			// Stop here: continuing would index past the end of args
			return false;
		}

		String inputs = args[0];
		String output = args[1];
		String indexSuffix = args[2];
		int numShards = Integer.parseInt(args[3]);
		int numShardsPerCustomer = Integer.parseInt(args[4]);

		// Keep a configuration supplied through setConf(); only create one when none was set
		if (conf == null) {
			conf = new Configuration();
		}
		Job job = Job.getInstance(conf, "Prep example");
		job.setJarByClass(ExampleJobPrep.class);
		job.setMapperClass(DocMapper.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		job.setInputFormatClass(SequenceFileInputFormat.class);
		job.setOutputFormatClass(SequenceFileOutputFormat.class);

		// Map-only job
		job.setNumReduceTasks(0);

		Configuration jobConf = job.getConfiguration();
		jobConf.set(INDEX_SUFFIX_CONFIG, indexSuffix);
		jobConf.setInt(NUM_SHARDS_PER_CUSTOMER, numShardsPerCustomer);
		jobConf.setInt(NUM_SHARDS, numShards);

		FileOutputFormat.setOutputPath(job, new Path(output));

		// Set up inputs
		String[] inputFolders = StringUtils.split(inputs, "|");
		for(String input : inputFolders) {
			FileInputFormat.addInputPath(job, new Path(input));
		}

		return job.waitForCompletion(true);
	}

	/** Stores the configuration that main() will use to create the job. */
	@Override
	public void setConf(Configuration conf) {
		// The field is static, so qualify with the class name (the parameter shadows it)
		ExampleJobPrep.conf = conf;
	}

	/** Returns the configuration last set via setConf() or created by main(). */
	@Override
	public Configuration getConf() {
		return conf;
	}

	/**
	 * Tool entry point; delegates to {@link #main(String[])}.
	 *
	 * @return 0 when the job succeeded, 1 otherwise
	 */
	@Override
	public int run(String[] args) throws Exception {
		return ExampleJobPrep.main(args) ? 0 : 1;
	}

}