/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.contrib.index.mapred;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.contrib.index.lucene.FileSystemDirectory;
import org.apache.hadoop.contrib.index.lucene.LuceneUtil;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

/**
 * An implementation of the index updater interface which creates a Map/Reduce
 * job configuration and runs the Map/Reduce job to analyze documents and
 * update Lucene instances in parallel.
 */
public class IndexUpdater implements IIndexUpdater {

  public static final Log LOG = LogFactory.getLog(IndexUpdater.class);

  public IndexUpdater() {
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.contrib.index.mapred.IIndexUpdater#run(org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.Path[], org.apache.hadoop.fs.Path, int, org.apache.hadoop.contrib.index.mapred.Shard[])
   */
  public void run(Configuration conf, Path[] inputPaths, Path outputPath,
      int numMapTasks, Shard[] shards) throws IOException {
    JobConf jobConf =
        createJob(conf, inputPaths, outputPath, numMapTasks, shards);
    JobClient.runJob(jobConf);
  }

  JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
      int numMapTasks, Shard[] shards) throws IOException {
    // set the starting generation for each shard
    // when a reduce task fails, a new reduce task
    // has to know where to re-start
    setShardGeneration(conf, shards);

    // iconf.set sets properties in conf
    IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
    Shard.setIndexShards(iconf, shards);

    // MapTask.MapOutputBuffer uses JobContext.IO_SORT_MB to decide its max
    // buffer size (max buffer size = 1/2 * JobContext.IO_SORT_MB).
    // Here we halve JobContext.IO_SORT_MB because we use the other half of
    // the memory to build an intermediate form/index in the combiner.
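    // Illustrative arithmetic (the 100 MB figure is a hypothetical example,
    // not a value read by this code): if io.sort.mb starts at 100 MB, the
    // call below lowers it to 50 MB; by the 1/2 rule noted above, the
    // map-side sort buffer is then capped at 25 MB, and the memory freed up
    // is what the combiner uses for its intermediate index.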
    iconf.setIOSortMB(iconf.getIOSortMB() / 2);

    // create the job configuration
    JobConf jobConf = new JobConf(conf, IndexUpdater.class);
    jobConf.setJobName(this.getClass().getName() + "_"
        + System.currentTimeMillis());

    // provided by application
    FileInputFormat.setInputPaths(jobConf, inputPaths);
    FileOutputFormat.setOutputPath(jobConf, outputPath);

    jobConf.setNumMapTasks(numMapTasks);

    // already set shards
    jobConf.setNumReduceTasks(shards.length);

    jobConf.setInputFormat(iconf.getIndexInputFormatClass());

    Path[] inputs = FileInputFormat.getInputPaths(jobConf);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
      buffer.append(",");
      buffer.append(inputs[i].toString());
    }
    LOG.info("mapred.input.dir = " + buffer.toString());
    LOG.info("mapreduce.output.fileoutputformat.outputdir = "
        + FileOutputFormat.getOutputPath(jobConf).toString());
    LOG.info("mapreduce.job.maps = " + jobConf.getNumMapTasks());
    LOG.info("mapreduce.job.reduces = " + jobConf.getNumReduceTasks());
    LOG.info(shards.length + " shards = " + iconf.getIndexShards());
    // better if we don't create the input format instance
    LOG.info("mapred.input.format.class = "
        + jobConf.getInputFormat().getClass().getName());

    // set by the system
    jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
    jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
    jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
    jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

    jobConf.setMapperClass(IndexUpdateMapper.class);
    jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
    jobConf.setCombinerClass(IndexUpdateCombiner.class);
    jobConf.setReducerClass(IndexUpdateReducer.class);

    jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

    return jobConf;
  }

  void setShardGeneration(Configuration conf, Shard[] shards)
      throws IOException {
    FileSystem fs = FileSystem.get(conf);

    for (int i = 0; i < shards.length; i++) {
      Path path = new Path(shards[i].getDirectory());
      long generation = -1;

      if (fs.exists(path)) {
        FileSystemDirectory dir = null;

        try {
          dir = new FileSystemDirectory(fs, path, false, conf);
          generation = LuceneUtil.getCurrentSegmentGeneration(dir);
        } finally {
          if (dir != null) {
            dir.close();
          }
        }
      }

      if (generation != shards[i].getGeneration()) {
        // set the starting generation for the shard
        shards[i] =
            new Shard(shards[i].getVersion(), shards[i].getDirectory(),
                generation);
      }
    }
  }
}
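// A minimal usage sketch (the paths, map task count, and initial shard
// parameters below are hypothetical; the Shard constructor arguments follow
// the (version, directory, generation) pattern used in setShardGeneration):
//
//   Configuration conf = new Configuration();
//   Shard[] shards = new Shard[] {
//       new Shard(-1, "/myindex/shard0", -1),
//       new Shard(-1, "/myindex/shard1", -1) };
//   new IndexUpdater().run(conf, new Path[] { new Path("/myinput") },
//       new Path("/myoutput"), 2, shards);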