package com.inin.analytics.elasticsearch; import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.inin.analytics.elasticsearch.transport.BaseTransport; import com.inin.analytics.elasticsearch.transport.SnapshotTransportStrategy; public class IndexingPostProcessor { private static transient Logger logger = LoggerFactory.getLogger(IndexingPostProcessor.class); /** * The job output in HDFS is just a manifest of indicies generated by the Job. Why? S3 is eventually consistent in some * zones. That means if you try to list the indicies you just generated by this job, you might miss some. Instead, we * have the job spit out tiny manifests. This method merges them together, de-dupes them, and if there's any shards that * didn't get generated because they have no data it puts a placeholder empty shard in it's place to satisfy ElasticSearch. * * @param jobOutput * @param manifestFile * @param scratchDir * @param shardConfig * @param conf * @param reducerClass * @throws IOException * @throws InstantiationException * @throws IllegalAccessException */ public void execute(Path jobOutput, Path manifestFile, String scratchDir, ShardConfig shardConfig, Configuration conf, Class<? extends BaseESReducer> reducerClass) throws IOException, InstantiationException, IllegalAccessException { FileSystem fs = FileSystem.get(conf); ESEmbededContainer esEmbededContainer = null; boolean rootManifestUploaded = false; try{ Map<String, Integer> numShardsGenerated = new HashMap<String, Integer>(); // Each reducer spits out it's own manifest file, merge em all together into 1 file FileUtil.copyMerge(fs, jobOutput, fs, manifestFile, false, conf, ""); // Read the merged file, de-duping entries as it reads BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(manifestFile))); String line; line=br.readLine(); Set<String> indicies = new HashSet<>(); while (line != null){ indicies.add(line); int count = numShardsGenerated.containsKey(line) ? numShardsGenerated.get(line) : 0; numShardsGenerated.put(line, count + 1); line=br.readLine(); } File scratch = new File(scratchDir); if(!scratch.exists()) { // Make the dir if it doesn't exist scratch.mkdirs(); } else { FileUtils.deleteDirectory(scratch); scratch.mkdirs(); } esEmbededContainer = getESEmbededContainer(conf, reducerClass); String scratchFile = scratchDir + "manifest"; PrintWriter writer = new PrintWriter(scratchFile, "UTF-8"); // Create all the indexes for(String index : indicies) { esEmbededContainer.getNode().client().admin().indices().prepareCreate(index).setSettings(settingsBuilder() .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, shardConfig.getShardsForIndex(index))).get(); } // Snapshot it List<String> indexesToSnapshot = new ArrayList<>(); indexesToSnapshot.addAll(indicies); esEmbededContainer.snapshot(indexesToSnapshot, BaseESReducer.SNAPSHOT_NAME, conf.get(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString()), null); for(String index : indicies) { try{ placeMissingIndexes(BaseESReducer.SNAPSHOT_NAME, esEmbededContainer, conf, index, shardConfig, !rootManifestUploaded); // The root level manifests are the same on each one, so it need only be uploaded once rootManifestUploaded = true; } catch (FileNotFoundException e) { logger.error("Unable to include index " + index + " in the manifest because missing shards could not be generated", e); continue; } // Re-write the manifest to local disk writer.println(index); } // Clean up index from embedded instance for(String index : indicies) { esEmbededContainer.getNode().client().admin().indices().prepareDelete(index).execute(); } writer.close(); // Move the manifest onto HDFS fs.copyFromLocalFile(new Path(scratchFile), manifestFile); } finally { if(esEmbededContainer != null) { esEmbededContainer.getNode().close(); while(!esEmbededContainer.getNode().isClosed()); } FileUtils.deleteDirectory(new File(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString()))); } } /** * * @param snapshotName * @param esEmbededContainer * @param conf * @param index * @param shardConfig * @param includeRootManifest * @throws IOException */ public void placeMissingIndexes(String snapshotName, ESEmbededContainer esEmbededContainer, Configuration conf, String index, ShardConfig shardConfig, boolean includeRootManifest) throws IOException { BaseTransport transport = SnapshotTransportStrategy.get(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString()), conf.get(ConfigParams.SNAPSHOT_FINAL_DESTINATION.toString())); transport.placeMissingShards(snapshotName, index, shardConfig, includeRootManifest); } /** * Returns a ESEmbededContainer configured for some local indexing * * @param conf * @param reducerClass * @return * @throws IOException * @throws InstantiationException * @throws IllegalAccessException */ private ESEmbededContainer getESEmbededContainer(Configuration conf, Class<? extends BaseESReducer> reducerClass) throws IOException, InstantiationException, IllegalAccessException { ESEmbededContainer esEmbededContainer = null; BaseESReducer red = reducerClass.newInstance(); String templateName = red.getTemplateName(); String templateJson = red.getTemplate(); red.close(); ESEmbededContainer.Builder builder = new ESEmbededContainer.Builder() .withNodeName("embededESTempLoaderNode") .withInMemoryBackedIndexes(true) .withWorkingDir(conf.get(ConfigParams.ES_WORKING_DIR.toString())) .withClusterName("bulkLoadPartition") .withSnapshotWorkingLocation(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString())) .withSnapshotRepoName(conf.get(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString())); if(templateName != null && templateJson != null) { builder.withTemplate(templateName, templateJson); } esEmbededContainer = builder.build(); return esEmbededContainer; } }