/**
 * This software is licensed to you under the Apache License, Version 2.0 (the
 * "Apache License").
 *
 * LinkedIn's contributions are made under the Apache License. If you contribute
 * to the Software, the contributions will be deemed to have been made under the
 * Apache License, unless you expressly indicate otherwise. Please do not make any
 * contributions that would be inconsistent with the Apache License.
 *
 * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, this software
 * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
 * License for the specific language governing permissions and limitations for the
 * software governed under the Apache License.
 *
 * © 2012 LinkedIn Corp. All Rights Reserved.
 */
package com.senseidb.indexing.hadoop.job;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.NumberFormat;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;

import com.senseidb.indexing.hadoop.keyvalueformat.IntermediateForm;
import com.senseidb.indexing.hadoop.keyvalueformat.Shard;
import com.senseidb.indexing.hadoop.map.SenseiMapper;
import com.senseidb.indexing.hadoop.reduce.FileSystemDirectory;
import com.senseidb.indexing.hadoop.reduce.IndexUpdateOutputFormat;
import com.senseidb.indexing.hadoop.reduce.SenseiCombiner;
import com.senseidb.indexing.hadoop.reduce.SenseiReducer;
import com.senseidb.indexing.hadoop.util.LuceneUtil;
import com.senseidb.indexing.hadoop.util.MRConfig;
import com.senseidb.indexing.hadoop.util.MRJobConfig;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;

public class MapReduceJob extends Configured {

  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  private static final Logger logger = Logger.getLogger(MapReduceJob.class);

  public JobConf createJob(Class MRClass) throws IOException, URISyntaxException {

    Configuration conf = getConf();
    Path[] inputPaths;
    Path outputPath;
    Shard[] shards = null;

    int numMapTasks = conf.getInt(MRJobConfig.NUM_MAPS, 2);
    int numShards = conf.getInt(SenseiJobConfig.NUM_SHARDS, 2);

    // inputPaths = FileInputFormat.getInputPaths(jobConf);
    String dirs = conf.get(SenseiJobConfig.INPUT_DIRS, null);
    logger.info("dirs:" + dirs);
    String[] list = StringUtils.split(dirs);
    logger.info("length after split:" + list.length);
    inputPaths = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
      inputPaths[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    logger.info("path[0] is:" + inputPaths[0]);

    outputPath = new Path(conf.get(SenseiJobConfig.OUTPUT_DIR));

    String indexPath = conf.get(SenseiJobConfig.INDEX_PATH);
    String indexSubDirPrefix = conf.get(SenseiJobConfig.INDEX_SUBDIR_PREFIX, "");
    shards = createShards(indexPath, numShards, conf, indexSubDirPrefix);
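
    // Each Shard points at one Lucene index directory under indexPath: createShards()
    // reuses any shard directories that already exist, and the shard count also becomes
    // the number of reduce tasks configured further down.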

    FileSystem fs = FileSystem.get(conf);
    String username = conf.get("hadoop.job.ugi");

    if (fs.exists(outputPath) && conf.getBoolean(SenseiJobConfig.FORCE_OUTPUT_OVERWRITE, false))
      fs.delete(outputPath, true);

    if (fs.exists(new Path(indexPath)) && conf.getBoolean(SenseiJobConfig.FORCE_OUTPUT_OVERWRITE, false))
      fs.delete(new Path(indexPath), true);

    // set the starting generation for each shard;
    // when a reduce task fails, a new reduce task
    // has to know where to re-start
    setShardGeneration(conf, shards);

    Shard.setIndexShards(conf, shards);

    // MapTask.MapOutputBuffer uses JobContext.IO_SORT_MB to decide its max buffer size
    // (max buffer size = 1/2 * JobContext.IO_SORT_MB).
    // Here we halve JobContext.IO_SORT_MB because we use the other half of the memory to
    // build an intermediate form/index in the Combiner.
    conf.setInt(MRJobConfig.IO_SORT_MB, conf.getInt(MRJobConfig.IO_SORT_MB, 100) / 2);

    // set the temp dir for the job
    conf.set(MRConfig.TEMP_DIR, "${mapred.child.tmp}/hindex/");
    if (fs.exists(new Path(conf.get(MRConfig.TEMP_DIR))))
      fs.delete(new Path(conf.get(MRConfig.TEMP_DIR)), true);
    if (fs.exists(new Path("./tmp")))
      fs.delete(new Path("./tmp"), true);
    (new Trash(conf)).expunge(); // empty the trash

    // always use the compound file format to speed things up
    conf.setBoolean(SenseiJobConfig.USE_COMPOUND_FILE, true);

    String schemaFile = conf.get(SenseiJobConfig.SCHEMA_FILE_URL);
    if (schemaFile == null) {
      throw new IOException("no schema file is found");
    } else {
      logger.info("Adding schema file: " + conf.get(SenseiJobConfig.SCHEMA_FILE_URL));
      DistributedCache.addCacheFile(new URI(schemaFile), conf);
    }

    // create the job configuration
    JobConf jobConf = new JobConf(conf, MRClass);
    if (jobConf.getJobName().length() < 1)
      jobConf.setJobName(MRClass.getName() + "_" + System.currentTimeMillis());

    // provided by the application
    FileInputFormat.setInputPaths(jobConf, inputPaths);
    FileOutputFormat.setOutputPath(jobConf, outputPath);
    jobConf.setNumMapTasks(numMapTasks);

    // shards have already been set up
    jobConf.setNumReduceTasks(shards.length);
    jobConf.setInputFormat(
        conf.getClass(SenseiJobConfig.INPUT_FORMAT, TextInputFormat.class, InputFormat.class));

    Path[] inputs = FileInputFormat.getInputPaths(jobConf);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
      buffer.append(",");
      buffer.append(inputs[i].toString());
    }
    logger.info("mapred.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.fileoutputformat.outputdir = "
        + FileOutputFormat.getOutputPath(jobConf).toString());
    logger.info("mapreduce.job.maps = " + jobConf.getNumMapTasks());
    logger.info("mapreduce.job.reduces = " + jobConf.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(SenseiJobConfig.INDEX_SHARDS));
    logger.info("mapred.input.format.class = " + jobConf.getInputFormat().getClass().getName());
    logger.info("mapreduce.cluster.temp.dir = " + jobConf.get(MRConfig.TEMP_DIR));

    // set by the system
    jobConf.setMapOutputKeyClass(Shard.class);
    jobConf.setMapOutputValueClass(IntermediateForm.class);
    jobConf.setOutputKeyClass(Shard.class);
    jobConf.setOutputValueClass(Text.class);

    jobConf.setMapperClass(SenseiMapper.class);
    // no partitioner class is needed
    jobConf.setCombinerClass(SenseiCombiner.class);
    jobConf.setReducerClass(SenseiReducer.class);

    jobConf.setOutputFormat(IndexUpdateOutputFormat.class);
    jobConf.setReduceSpeculativeExecution(false);

    return jobConf;
  }
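
  /*
   * Minimal usage sketch (assumed driver code, not taken from this project): the
   * returned JobConf is meant to be submitted through the classic mapred API. The
   * class name MyIndexingJob and the pre-populated Configuration "conf" below are
   * hypothetical; the Configuration must already carry the SenseiJobConfig.* settings
   * (INPUT_DIRS, OUTPUT_DIR, INDEX_PATH, SCHEMA_FILE_URL, ...).
   *
   *   MapReduceJob job = new MapReduceJob();
   *   job.setConf(conf);                                    // conf holds the Sensei job settings
   *   JobConf jobConf = job.createJob(MyIndexingJob.class); // MRClass is used to locate the job jar
   *   JobClient.runJob(jobConf);                            // org.apache.hadoop.mapred.JobClient
   */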

  private static FileSystem getFileSystem(String user) {
    Configuration conf = new Configuration();
    conf.set("hadoop.job.ugi", user);
    try {
      return FileSystem.get(conf);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  private static Shard[] createShards(String indexPath, int numShards,
      org.apache.hadoop.conf.Configuration conf, String indexSubDirPrefix) throws IOException {

    String parent = Shard.normalizePath(indexPath) + Path.SEPARATOR;
    long versionNumber = -1;
    long generation = -1;

    FileSystem fs = FileSystem.get(conf);
    Path path = new Path(indexPath);

    if (fs.exists(path)) {
      FileStatus[] fileStatus = fs.listStatus(path);
      String[] shardNames = new String[fileStatus.length];
      int count = 0;
      for (int i = 0; i < fileStatus.length; i++) {
        if (fileStatus[i].isDir()) {
          shardNames[count] = fileStatus[i].getPath().getName();
          count++;
        }
      }
      Arrays.sort(shardNames, 0, count);

      Shard[] shards = new Shard[count >= numShards ? count : numShards];
      for (int i = 0; i < count; i++) {
        shards[i] = new Shard(versionNumber, parent + shardNames[i], generation);
      }

      int number = count;
      for (int i = count; i < numShards; i++) {
        String shardPath;
        while (true) {
          shardPath = parent + indexSubDirPrefix + NUMBER_FORMAT.format(number++);
          if (!fs.exists(new Path(shardPath))) {
            break;
          }
        }
        shards[i] = new Shard(versionNumber, shardPath, generation);
      }
      return shards;
    } else {
      Shard[] shards = new Shard[numShards];
      for (int i = 0; i < shards.length; i++) {
        shards[i] = new Shard(versionNumber,
            parent + indexSubDirPrefix + NUMBER_FORMAT.format(i), generation);
      }
      return shards;
    }
  }

  void setShardGeneration(Configuration conf, Shard[] shards) throws IOException {
    FileSystem fs = FileSystem.get(conf);

    for (int i = 0; i < shards.length; i++) {
      Path path = new Path(shards[i].getDirectory());
      long generation = -1;

      if (fs.exists(path)) {
        FileSystemDirectory dir = null;
        try {
          dir = new FileSystemDirectory(fs, path, false, conf);
          generation = LuceneUtil.getCurrentSegmentGeneration(dir);
        } finally {
          if (dir != null) {
            dir.close();
          }
        }
      }

      if (generation != shards[i].getGeneration()) {
        // set the starting generation for the shard
        shards[i] = new Shard(shards[i].getVersion(), shards[i].getDirectory(), generation);
      }
    }
  }
}