/**
 * This software is licensed to you under the Apache License, Version 2.0 (the
 * "Apache License").
 *
 * LinkedIn's contributions are made under the Apache License. If you contribute
 * to the Software, the contributions will be deemed to have been made under the
 * Apache License, unless you expressly indicate otherwise. Please do not make any
 * contributions that would be inconsistent with the Apache License.
 *
 * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, this software
 * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
 * License for the specific language governing permissions and limitations for the
 * software governed under the Apache License.
 *
 * © 2012 LinkedIn Corp. All Rights Reserved.
 */
package com.senseidb.indexing.hadoop.job;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.NumberFormat;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;

import com.senseidb.indexing.hadoop.keyvalueformat.IntermediateForm;
import com.senseidb.indexing.hadoop.keyvalueformat.Shard;
import com.senseidb.indexing.hadoop.map.SenseiMapper;
import com.senseidb.indexing.hadoop.reduce.FileSystemDirectory;
import com.senseidb.indexing.hadoop.reduce.IndexUpdateOutputFormat;
import com.senseidb.indexing.hadoop.reduce.SenseiCombiner;
import com.senseidb.indexing.hadoop.reduce.SenseiReducer;
import com.senseidb.indexing.hadoop.util.LuceneUtil;
import com.senseidb.indexing.hadoop.util.MRConfig;
import com.senseidb.indexing.hadoop.util.MRJobConfig;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;

public class MapReduceJob extends Configured {

  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  private static final Logger logger = Logger.getLogger(MapReduceJob.class);

  public JobConf createJob(Class MRClass) throws IOException, URISyntaxException {

    Configuration conf = getConf();
    Path[] inputPaths;
    Path outputPath;
    Shard[] shards = null;

    int numMapTasks = conf.getInt(MRJobConfig.NUM_MAPS, 2);
    int numShards = conf.getInt(SenseiJobConfig.NUM_SHARDS, 2);

    // inputPaths = FileInputFormat.getInputPaths(jobConf);
    String dirs = conf.get(SenseiJobConfig.INPUT_DIRS, null);
    logger.info("dirs:" + dirs);
    String[] list = StringUtils.split(dirs);
    logger.info("length after split:" + list.length);
    inputPaths = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
      inputPaths[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    logger.info("path[0] is:" + inputPaths[0]);

    outputPath = new Path(conf.get(SenseiJobConfig.OUTPUT_DIR));

    String indexPath = conf.get(SenseiJobConfig.INDEX_PATH);
    String indexSubDirPrefix = conf.get(SenseiJobConfig.INDEX_SUBDIR_PREFIX, "");
    shards = createShards(indexPath, numShards, conf, indexSubDirPrefix);
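
    // Each Shard points at one Lucene index directory under indexPath: createShards()
    // reuses any shard directories that already exist, and the shard count also becomes
    // the number of reduce tasks configured further down.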

    FileSystem fs = FileSystem.get(conf);
    String username = conf.get("hadoop.job.ugi");

    if (fs.exists(outputPath) && conf.getBoolean(SenseiJobConfig.FORCE_OUTPUT_OVERWRITE, false))
      fs.delete(outputPath, true);

    if (fs.exists(new Path(indexPath)) && conf.getBoolean(SenseiJobConfig.FORCE_OUTPUT_OVERWRITE, false))
      fs.delete(new Path(indexPath), true);

    // set the starting generation for each shard;
    // when a reduce task fails, a new reduce task
    // has to know where to re-start
    setShardGeneration(conf, shards);

    Shard.setIndexShards(conf, shards);

    // MapTask.MapOutputBuffer uses JobContext.IO_SORT_MB to decide its max buffer size
    // (max buffer size = 1/2 * JobContext.IO_SORT_MB).
    // Here we halve JobContext.IO_SORT_MB because we use the other half of the memory to
    // build an intermediate form/index in the Combiner.
    conf.setInt(MRJobConfig.IO_SORT_MB, conf.getInt(MRJobConfig.IO_SORT_MB, 100) / 2);

    // set the temp dir for the job
    conf.set(MRConfig.TEMP_DIR, "${mapred.child.tmp}/hindex/");
    if (fs.exists(new Path(conf.get(MRConfig.TEMP_DIR))))
      fs.delete(new Path(conf.get(MRConfig.TEMP_DIR)), true);
    if (fs.exists(new Path("./tmp")))
      fs.delete(new Path("./tmp"), true);
    (new Trash(conf)).expunge(); // empty the trash

    // always use the compound file format to speed things up
    conf.setBoolean(SenseiJobConfig.USE_COMPOUND_FILE, true);

    String schemaFile = conf.get(SenseiJobConfig.SCHEMA_FILE_URL);
    if (schemaFile == null) {
      throw new IOException("no schema file is found");
    } else {
      logger.info("Adding schema file: " + conf.get(SenseiJobConfig.SCHEMA_FILE_URL));
      DistributedCache.addCacheFile(new URI(schemaFile), conf);
    }

    // create the job configuration
    JobConf jobConf = new JobConf(conf, MRClass);
    if (jobConf.getJobName().length() < 1)
      jobConf.setJobName(MRClass.getName() + "_" + System.currentTimeMillis());

    // provided by the application
    FileInputFormat.setInputPaths(jobConf, inputPaths);
    FileOutputFormat.setOutputPath(jobConf, outputPath);
    jobConf.setNumMapTasks(numMapTasks);

    // shards have already been set up
    jobConf.setNumReduceTasks(shards.length);
    jobConf.setInputFormat(
        conf.getClass(SenseiJobConfig.INPUT_FORMAT, TextInputFormat.class, InputFormat.class));

    Path[] inputs = FileInputFormat.getInputPaths(jobConf);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
      buffer.append(",");
      buffer.append(inputs[i].toString());
    }
    logger.info("mapred.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.fileoutputformat.outputdir = "
        + FileOutputFormat.getOutputPath(jobConf).toString());
    logger.info("mapreduce.job.maps = " + jobConf.getNumMapTasks());
    logger.info("mapreduce.job.reduces = " + jobConf.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(SenseiJobConfig.INDEX_SHARDS));
    logger.info("mapred.input.format.class = " + jobConf.getInputFormat().getClass().getName());
    logger.info("mapreduce.cluster.temp.dir = " + jobConf.get(MRConfig.TEMP_DIR));

    // set by the system
    jobConf.setMapOutputKeyClass(Shard.class);
    jobConf.setMapOutputValueClass(IntermediateForm.class);
    jobConf.setOutputKeyClass(Shard.class);
    jobConf.setOutputValueClass(Text.class);

    jobConf.setMapperClass(SenseiMapper.class);
    // no partitioner class is needed
    jobConf.setCombinerClass(SenseiCombiner.class);
    jobConf.setReducerClass(SenseiReducer.class);

    jobConf.setOutputFormat(IndexUpdateOutputFormat.class);
    jobConf.setReduceSpeculativeExecution(false);

    return jobConf;
  }
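
  /*
   * Minimal usage sketch (assumed driver code, not taken from this project): the
   * returned JobConf is meant to be submitted through the classic mapred API. The
   * class name MyIndexingJob and the pre-populated Configuration "conf" below are
   * hypothetical; the Configuration must already carry the SenseiJobConfig.* settings
   * (INPUT_DIRS, OUTPUT_DIR, INDEX_PATH, SCHEMA_FILE_URL, ...).
   *
   *   MapReduceJob job = new MapReduceJob();
   *   job.setConf(conf);                                    // conf holds the Sensei job settings
   *   JobConf jobConf = job.createJob(MyIndexingJob.class); // MRClass is used to locate the job jar
   *   JobClient.runJob(jobConf);                            // org.apache.hadoop.mapred.JobClient
   */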

  private static FileSystem getFileSystem(String user) {
    Configuration conf = new Configuration();
    conf.set("hadoop.job.ugi", user);
    try {
      return FileSystem.get(conf);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  private static Shard[] createShards(String indexPath, int numShards,
      org.apache.hadoop.conf.Configuration conf, String indexSubDirPrefix) throws IOException {

    String parent = Shard.normalizePath(indexPath) + Path.SEPARATOR;
    long versionNumber = -1;
    long generation = -1;

    FileSystem fs = FileSystem.get(conf);
    Path path = new Path(indexPath);

    if (fs.exists(path)) {
      FileStatus[] fileStatus = fs.listStatus(path);
      String[] shardNames = new String[fileStatus.length];
      int count = 0;
      for (int i = 0; i < fileStatus.length; i++) {
        if (fileStatus[i].isDir()) {
          shardNames[count] = fileStatus[i].getPath().getName();
          count++;
        }
      }
      Arrays.sort(shardNames, 0, count);

      Shard[] shards = new Shard[count >= numShards ? count : numShards];
      for (int i = 0; i < count; i++) {
        shards[i] = new Shard(versionNumber, parent + shardNames[i], generation);
      }

      int number = count;
      for (int i = count; i < numShards; i++) {
        String shardPath;
        while (true) {
          shardPath = parent + indexSubDirPrefix + NUMBER_FORMAT.format(number++);
          if (!fs.exists(new Path(shardPath))) {
            break;
          }
        }
        shards[i] = new Shard(versionNumber, shardPath, generation);
      }
      return shards;
    } else {
      Shard[] shards = new Shard[numShards];
      for (int i = 0; i < shards.length; i++) {
        shards[i] = new Shard(versionNumber,
            parent + indexSubDirPrefix + NUMBER_FORMAT.format(i), generation);
      }
      return shards;
    }
  }

  void setShardGeneration(Configuration conf, Shard[] shards) throws IOException {
    FileSystem fs = FileSystem.get(conf);

    for (int i = 0; i < shards.length; i++) {
      Path path = new Path(shards[i].getDirectory());
      long generation = -1;

      if (fs.exists(path)) {
        FileSystemDirectory dir = null;
        try {
          dir = new FileSystemDirectory(fs, path, false, conf);
          generation = LuceneUtil.getCurrentSegmentGeneration(dir);
        } finally {
          if (dir != null) {
            dir.close();
          }
        }
      }

      if (generation != shards[i].getGeneration()) {
        // set the starting generation for the shard
        shards[i] = new Shard(shards[i].getVersion(), shards[i].getDirectory(), generation);
      }
    }
  }
}