package com.inin.analytics.elasticsearch;

import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.elasticsearch.action.ActionFuture;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexResponse;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.cluster.metadata.IndexMetaData;

import com.inin.analytics.elasticsearch.transport.SnapshotTransportStrategy;

/**
 * Reducer that bulk-loads documents into an embedded Elasticsearch instance, snapshots the
 * resulting index, and moves the snapshot to a final destination (e.g. S3) via
 * {@link SnapshotTransportStrategy}.
 *
 * <p>Each reduce key is expected to be {@code index|routing}; each value is
 * {@code indexType|docId|json}. Subclasses supply the index template and shard configuration.
 */
public abstract class BaseESReducer implements Reducer<Text, Text, NullWritable, Text> {

    /** Separator between fields in both the reduce key and the document payload lines. */
    public static final char TUPLE_SEPARATOR = '|';

    /** Filesystem path separator used when composing working directories. */
    public static final char DIR_SEPARATOR = '/';

    /** Hadoop job counters reported by this reducer. */
    public enum JOB_COUNTER {
        TIME_SPENT_INDEXING_MS,
        TIME_SPENT_FLUSHING_MS,
        TIME_SPENT_MERGING_MS,
        TIME_SPENT_SNAPSHOTTING_MS,
        TIME_SPENT_TRANSPORTING_SNAPSHOT_MS,
        INDEXING_DOC_FAIL,
        INDEX_DOC_CREATED,
        INDEX_DOC_NOT_CREATED
    }

    // We prefix all snapshots with the word snapshot
    public static final String SNAPSHOT_NAME = "snapshot";

    // The local filesystem location that ES will write the snapshot out to
    private String snapshotWorkingLocation;

    // Where the snapshot will be moved to. Typical use case would be to throw it onto S3
    private String snapshotFinalDestination;

    // The name of a snapshot repo. We'll enumerate that on each job run so that the repo names
    // are unique across rebuilds
    private String snapshotRepoName;

    // Local filesystem location where index data is built
    private String esWorkingDir;

    // The partition of data this reducer is serving. Useful for making directories unique if
    // running multiple reducers on a task tracker
    private String partition;

    // The container handles spinning up our embedded elasticsearch instance
    private ESEmbededContainer esEmbededContainer;

    private ShardConfig shardConfig;

    // Hold onto some frequently generated objects to cut down on GC overhead
    private String indexType;
    private String docId;
    private String pre;
    private String json;

    /**
     * Reads job configuration and derives per-task working directories. If running multiple
     * reducers on a node, the node needs a unique name and data directory, hence the partition
     * and attempt id appended to each path.
     *
     * @param job the Hadoop job configuration
     */
    @Override
    public void configure(JobConf job) {
        partition = job.get("mapred.task.partition");
        String attemptId = job.get("mapred.task.id");
        snapshotWorkingLocation =
                job.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString())
                        + partition + attemptId + DIR_SEPARATOR;
        snapshotFinalDestination = job.get(ConfigParams.SNAPSHOT_FINAL_DESTINATION.toString());
        snapshotRepoName = job.get(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString());
        esWorkingDir = job.get(ConfigParams.ES_WORKING_DIR.toString())
                + partition + attemptId + DIR_SEPARATOR;
        if (shardConfig == null) {
            shardConfig = getShardConfig(job);
        }
    }

    /**
     * Overrides the shard configuration (primarily useful for tests).
     *
     * @param shardConfig the shard configuration to use
     */
    public void setShardConfig(ShardConfig shardConfig) {
        this.shardConfig = shardConfig;
    }

    /**
     * Lazily builds the embedded Elasticsearch container (first call only) and creates the
     * given index with zero replicas and the shard count supplied by {@link ShardConfig}.
     *
     * @param index the name of the index to create
     */
    private void init(String index) {
        String templateName = getTemplateName();
        String templateJson = getTemplate();
        ESEmbededContainer.Builder builder = new ESEmbededContainer.Builder()
                .withNodeName("embededESTempLoaderNode" + partition)
                .withWorkingDir(esWorkingDir)
                .withClusterName("bulkLoadPartition:" + partition)
                .withSnapshotWorkingLocation(snapshotWorkingLocation)
                .withSnapshotRepoName(snapshotRepoName);

        if (templateName != null && templateJson != null) {
            builder.withTemplate(templateName, templateJson);
        }

        if (esEmbededContainer == null) {
            esEmbededContainer = builder.build();
        }

        // Create index. Replicas are pointless here: this is a single embedded node whose
        // output is snapshotted, not served.
        esEmbededContainer.getNode().client().admin().indices().prepareCreate(index)
                .setSettings(settingsBuilder()
                        .put("index.number_of_replicas", 0)
                        .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS,
                                shardConfig.getShardsForIndex(index)))
                .get();
    }

    /**
     * Provide the JSON contents of the index template. This is your hook for configuring
     * ElasticSearch.
     *
     * http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/indices-templates.html
     *
     * @return String
     */
    public abstract String getTemplate();

    /**
     * Provide a ShardConfig which provides the number of shards per index and the number of
     * shards to split organizations across. The number can be uniform across indices or a mapping
     * can be provided to enable per-index configuration values.
     *
     * @param job
     * @return ShardConfig
     */
    public abstract ShardConfig getShardConfig(JobConf job);

    /**
     * Provide an all lower case template name
     *
     * @return String
     */
    public abstract String getTemplateName();

    /**
     * Indexes every document payload for one {@code index|routing} key into the embedded node,
     * then snapshots and ships the index.
     *
     * @param docMetaData      key of the form {@code indexName|routing}
     * @param documentPayloads values of the form {@code indexType|docId|json}
     * @param output           collector; receives the index name on completion
     * @param reporter         Hadoop progress/counter reporter
     * @throws IOException if snapshotting or transporting the snapshot fails
     */
    @Override
    public void reduce(Text docMetaData, Iterator<Text> documentPayloads,
            OutputCollector<NullWritable, Text> output, final Reporter reporter)
            throws IOException {
        String[] pieces = StringUtils.split(docMetaData.toString(), TUPLE_SEPARATOR);
        String indexName = pieces[0];
        String routing = pieces[1];
        init(indexName);

        long start = System.currentTimeMillis();
        while (documentPayloads.hasNext()) {
            Text line = documentPayloads.next();
            if (line == null) {
                continue;
            }
            pieces = StringUtils.split(line.toString(), TUPLE_SEPARATOR);
            indexType = pieces[0];
            docId = pieces[1];
            // The JSON body is everything after "indexType|docId|"; computed by offset rather
            // than split so that the JSON itself may contain the separator character.
            pre = indexType + TUPLE_SEPARATOR + docId + TUPLE_SEPARATOR;
            json = line.toString().substring(pre.length());

            IndexResponse response = esEmbededContainer.getNode().client()
                    .prepareIndex(indexName, indexType)
                    .setId(docId)
                    .setRouting(routing)
                    .setSource(json)
                    .execute()
                    .actionGet();
            if (response.isCreated()) {
                reporter.incrCounter(JOB_COUNTER.INDEX_DOC_CREATED, 1L);
            } else {
                reporter.incrCounter(JOB_COUNTER.INDEX_DOC_NOT_CREATED, 1L);
            }
        }
        reporter.incrCounter(JOB_COUNTER.TIME_SPENT_INDEXING_MS,
                System.currentTimeMillis() - start);

        snapshot(indexName, reporter);
        output.collect(NullWritable.get(), new Text(indexName));
    }

    /**
     * Shuts down the embedded node and removes the local snapshot working directory.
     *
     * @throws IOException if the working directory cannot be deleted
     */
    @Override
    public void close() throws IOException {
        if (esEmbededContainer != null) {
            esEmbededContainer.getNode().close();
            // Wait for the node to finish closing before deleting its files. Poll with a short
            // sleep rather than busy-spinning so we don't peg a core.
            while (!esEmbededContainer.getNode().isClosed()) {
                if (!sleepQuietly(50)) {
                    break; // interrupted; stop waiting and clean up what we can
                }
            }
            FileUtils.deleteDirectory(new File(snapshotWorkingLocation));
        }
    }

    /**
     * Snapshots the given index, deletes it to free local disk, transports the snapshot to its
     * final destination, and finally removes the snapshot from the local repo.
     *
     * @param index    the index to snapshot
     * @param reporter Hadoop reporter used for timing counters
     * @throws IOException if transporting the snapshot fails
     */
    public void snapshot(String index, Reporter reporter) throws IOException {
        esEmbededContainer.snapshot(Arrays.asList(index), SNAPSHOT_NAME, snapshotRepoName,
                reporter);

        // Delete the index to free up that space
        ActionFuture<DeleteIndexResponse> response = esEmbededContainer.getNode().client()
                .admin().indices().delete(new DeleteIndexRequest(index));
        // Poll with a short sleep rather than busy-spinning so we don't peg a core.
        while (!response.isDone()) {
            if (!sleepQuietly(10)) {
                break; // interrupted; proceed rather than hang
            }
        }

        // Move the shard snapshot to the destination
        long start = System.currentTimeMillis();
        SnapshotTransportStrategy.get(snapshotWorkingLocation, snapshotFinalDestination)
                .execute(SNAPSHOT_NAME, index);
        reporter.incrCounter(JOB_COUNTER.TIME_SPENT_TRANSPORTING_SNAPSHOT_MS,
                System.currentTimeMillis() - start);

        esEmbededContainer.deleteSnapshot(SNAPSHOT_NAME, snapshotRepoName);
    }

    /**
     * Sleeps for the given number of milliseconds, preserving the interrupt flag if
     * interrupted.
     *
     * @param millis how long to sleep
     * @return true if the sleep completed; false if the thread was interrupted
     */
    private static boolean sleepQuietly(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }
}