/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.blur.indexer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

import org.apache.blur.indexer.mapreduce.ExistingDataIndexLookupMapper;
import org.apache.blur.indexer.mapreduce.ExistingDataMapper;
import org.apache.blur.indexer.mapreduce.LookupBuilderMapper;
import org.apache.blur.indexer.mapreduce.LookupBuilderReducer;
import org.apache.blur.indexer.mapreduce.NewDataMapper;
import org.apache.blur.indexer.mapreduce.PrunedBlurInputFormat;
import org.apache.blur.indexer.mapreduce.PrunedSequenceFileInputFormat;
import org.apache.blur.log.Log;
import org.apache.blur.log.LogFactory;
import org.apache.blur.mapreduce.lib.BlurInputFormat;
import org.apache.blur.mapreduce.lib.BlurOutputFormat;
import org.apache.blur.mapreduce.lib.update.IndexKey;
import org.apache.blur.mapreduce.lib.update.IndexKeyPartitioner;
import org.apache.blur.mapreduce.lib.update.IndexKeyWritableComparator;
import org.apache.blur.mapreduce.lib.update.IndexValue;
import org.apache.blur.mapreduce.lib.update.UpdateReducer;
import org.apache.blur.thirdparty.thrift_0_9_0.TException;
import org.apache.blur.thrift.BlurClient;
import org.apache.blur.thrift.generated.Blur.Iface;
import org.apache.blur.thrift.generated.BlurException;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.blur.thrift.generated.TableStats;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskReport;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class IndexerJobDriver extends Configured implements Tool {

  public static final String BLUR_UPDATE_ID = "blur.update.id";
  private static final String BLUR_EXEC_TYPE = "blur.exec.type";
  public static final String TMP = "tmp";

  public enum EXEC {
    MR_ONLY, MR_WITH_LOOKUP, AUTOMATIC
  }
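  // EXEC picks how existing rows reach the update job: MR_ONLY streams every
  // existing row through the mappers, MR_WITH_LOOKUP always builds the
  // partitioned row id lookup, and AUTOMATIC builds the lookup and lets the
  // pruned input formats decide per shard.  The mode is read from the
  // "blur.exec.type" key and defaults to AUTOMATIC.
  //
  // The constants below name the working subdirectories: inputs arrive in
  // "new", sit in "inprogress" while a run is active, and land in "complete"
  // on success; "cache" holds the reusable row id lookup files.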
  public static final String MRUPDATE_SNAPSHOT = "mrupdate-snapshot";
  public static final String CACHE = "cache";
  public static final String COMPLETE = "complete";
  public static final String INPROGRESS = "inprogress";
  public static final String NEW = "new";

  private static final Log LOG = LogFactory.getLog(IndexerJobDriver.class);

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new IndexerJobDriver(), args);
    System.exit(res);
  }

  /**
   * Holds the partitioned lookup data path plus the per-shard counter values
   * (indexed by reduce task id) gathered from the partitioning job's task
   * reports.
   */
  static class PartitionedInputResult {
    final Path _partitionedInputData;
    final Counters _counters;
    final long[] _rowIdsFromNewData;
    final long[] _rowIdsToUpdateFromNewData;
    final long[] _rowIdsFromIndex;

    PartitionedInputResult(Path partitionedInputData, Counters counters, int shards, TaskReport[] taskReports) {
      _partitionedInputData = partitionedInputData;
      _counters = counters;
      _rowIdsFromNewData = new long[shards];
      _rowIdsToUpdateFromNewData = new long[shards];
      _rowIdsFromIndex = new long[shards];
      for (TaskReport tr : taskReports) {
        int id = tr.getTaskID().getId();
        Counters taskCounters = tr.getTaskCounters();
        Counter total = taskCounters.findCounter(BlurIndexCounter.ROW_IDS_FROM_NEW_DATA);
        _rowIdsFromNewData[id] = total.getValue();
        Counter update = taskCounters.findCounter(BlurIndexCounter.ROW_IDS_TO_UPDATE_FROM_NEW_DATA);
        _rowIdsToUpdateFromNewData[id] = update.getValue();
        Counter index = taskCounters.findCounter(BlurIndexCounter.ROW_IDS_FROM_INDEX);
        _rowIdsFromIndex[id] = index.getValue();
      }
    }
  }
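  /**
   * One incremental update pass: snapshot the table, move the input files from
   * "new" to "inprogress", run the selected update strategy, and on success
   * load the job output into the table and move the inputs to "complete".  On
   * failure the inputs are moved back to "new" so the next run retries them;
   * the tmp dir and the snapshot are cleaned up either way.
   */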
  @Override
  public int run(String[] args) throws Exception {
    int c = 0;
    if (args.length < 5) {
      System.err.println("Usage: IndexerJobDriver <table> <mr inc working path> <output path> <zk connection> "
          + "<reducer multiplier> <extra config files...>");
      return 1;
    }
    String table = args[c++];
    String mrIncWorkingPathStr = args[c++];
    String outputPathStr = args[c++];
    String blurZkConnection = args[c++];
    int reducerMultiplier = Integer.parseInt(args[c++]);
    for (; c < args.length; c++) {
      String externalConfigFileToAdd = args[c];
      getConf().addResource(new Path(externalConfigFileToAdd));
    }

    Path outputPath = new Path(outputPathStr);
    Path mrIncWorkingPath = new Path(mrIncWorkingPathStr);
    FileSystem fileSystem = mrIncWorkingPath.getFileSystem(getConf());

    Path newData = new Path(mrIncWorkingPath, NEW);
    Path inprogressData = new Path(mrIncWorkingPath, INPROGRESS);
    Path completeData = new Path(mrIncWorkingPath, COMPLETE);
    Path fileCache = new Path(mrIncWorkingPath, CACHE);
    Path tmpPathDontDelete = new Path(mrIncWorkingPath, TMP);
    Path tmpPath = new Path(tmpPathDontDelete, UUID.randomUUID().toString());

    fileSystem.mkdirs(newData);
    fileSystem.mkdirs(inprogressData);
    fileSystem.mkdirs(completeData);
    fileSystem.mkdirs(fileCache);

    List<Path> srcPathList = new ArrayList<Path>();
    for (FileStatus fileStatus : fileSystem.listStatus(newData)) {
      srcPathList.add(fileStatus.getPath());
    }
    if (srcPathList.isEmpty()) {
      return 0;
    }

    List<Path> inprogressPathList = new ArrayList<Path>();
    boolean success = false;
    Iface client = null;
    EXEC exec = EXEC.valueOf(getConf().get(BLUR_EXEC_TYPE, EXEC.AUTOMATIC.name()).toUpperCase());
    String uuid = UUID.randomUUID().toString();
    try {
      client = BlurClient.getClientFromZooKeeperConnectionStr(blurZkConnection);
      TableDescriptor descriptor = client.describe(table);
      Map<String, String> tableProperties = descriptor.getTableProperties();
      String fastDir = tableProperties.get("blur.table.disable.fast.dir");
      if (fastDir == null || !fastDir.equals("true")) {
        LOG.error("Table [{0}] does not have blur.table.disable.fast.dir set to true; "
            + "the fast dir is not supported in fast MR update.", table);
        return 1;
      }
      waitForOtherSnapshotsToBeRemoved(client, table, MRUPDATE_SNAPSHOT);
      client.createSnapshot(table, MRUPDATE_SNAPSHOT);
      TableStats tableStats = client.tableStats(table);

      inprogressPathList = movePathList(fileSystem, inprogressData, srcPathList);

      switch (exec) {
      case MR_ONLY:
        success = runMrOnly(descriptor, inprogressPathList, table, fileCache, outputPath, reducerMultiplier);
        break;
      case MR_WITH_LOOKUP:
        success = runMrWithLookup(uuid, descriptor, inprogressPathList, table, fileCache, outputPath,
            reducerMultiplier, tmpPath, tableStats, MRUPDATE_SNAPSHOT);
        break;
      case AUTOMATIC:
        success = runAutomatic(uuid, descriptor, inprogressPathList, table, fileCache, outputPath, reducerMultiplier,
            tmpPath, tableStats, MRUPDATE_SNAPSHOT);
        break;
      default:
        throw new RuntimeException("Exec type [" + exec + "] not supported.");
      }
    } finally {
      if (success) {
        LOG.info("Associating lookup cache with new data.");
        associateLookupCache(uuid, fileCache, outputPath);
        LOG.info("Indexing job succeeded!");
        client.loadData(table, outputPathStr);
        LOG.info("Load data called.");
        movePathList(fileSystem, completeData, inprogressPathList);
        LOG.info("Input data moved to complete.");
        ClusterDriver.waitForDataToLoad(client, table);
        LOG.info("Data loaded.");
      } else {
        LOG.error("Indexing job failed!");
        movePathList(fileSystem, newData, inprogressPathList);
      }
      fileSystem.delete(tmpPath, true);
      if (client != null) {
        client.removeSnapshot(table, MRUPDATE_SNAPSHOT);
      }
    }
    return success ? 0 : 1;
  }

  private void associateLookupCache(String uuid, Path fileCache, Path outputPath) throws IOException {
    FileSystem fileSystem = fileCache.getFileSystem(getConf());
    cleanupExtraFileFromSpecX(fileSystem, uuid, fileCache);
    associateLookupCache(fileSystem, uuid, fileSystem.getFileStatus(fileCache), outputPath);
  }

  private void cleanupExtraFileFromSpecX(FileSystem fileSystem, String uuid, Path fileCache) throws IOException {
    FileStatus[] listStatus = fileSystem.listStatus(fileCache);
    List<FileStatus> uuidPaths = new ArrayList<FileStatus>();
    for (FileStatus fs : listStatus) {
      Path path = fs.getPath();
      if (fs.isDirectory()) {
        cleanupExtraFileFromSpecX(fileSystem, uuid, path);
      } else if (path.getName().startsWith(uuid)) {
        uuidPaths.add(fs);
      }
    }
    if (uuidPaths.size() > 1) {
      deleteIncomplete(fileSystem, uuidPaths);
    }
  }

  private void deleteIncomplete(FileSystem fileSystem, List<FileStatus> uuidPaths) throws IOException {
    // Keep the largest file for this uuid and delete the rest, treating the
    // smaller files as incomplete writes.
    long max = 0;
    FileStatus keeper = null;
    for (FileStatus fs : uuidPaths) {
      long len = fs.getLen();
      if (len > max) {
        keeper = fs;
        max = len;
      }
    }
    for (FileStatus fs : uuidPaths) {
      if (fs != keeper) {
        LOG.info("Deleting incomplete cache file [{0}]", fs.getPath());
        fileSystem.delete(fs.getPath(), false);
      }
    }
  }

  private void associateLookupCache(FileSystem fileSystem, String uuid, FileStatus fileCache, Path outputPath)
      throws IOException {
    Path path = fileCache.getPath();
    if (fileCache.isDirectory()) {
      FileStatus[] listStatus = fileSystem.listStatus(path);
      for (FileStatus fs : listStatus) {
        associateLookupCache(fileSystem, uuid, fs, outputPath);
      }
    } else if (path.getName().startsWith(uuid)) {
      Path parent = path.getParent();
      String shardName = parent.getName();
      Path indexPath = findOutputDirPath(outputPath, shardName);
      LOG.info("Path found for shard [{0}] outputPath [{1}]", shardName, outputPath);
      String id = MergeSortRowIdMatcher.getIdForSingleSegmentIndex(getConf(), indexPath);
      Path file = new Path(path.getParent(), id + ".seq");
      MergeSortRowIdMatcher.commitWriter(getConf(), file, path);
    }
  }
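  // Resolves the single committed shard directory (the "*.commit" dir) under
  // the job output path for the given shard name; zero or more than one commit
  // dir is reported as an error.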
  private Path findOutputDirPath(Path outputPath, String shardName) throws IOException {
    FileSystem fileSystem = outputPath.getFileSystem(getConf());
    Path shardPath = new Path(outputPath, shardName);
    if (!fileSystem.exists(shardPath)) {
      throw new IOException("Shard path [" + shardPath + "] does not exist.");
    }
    FileStatus[] listStatus = fileSystem.listStatus(shardPath, new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return path.getName().endsWith(".commit");
      }
    });
    if (listStatus.length == 1) {
      FileStatus fs = listStatus[0];
      return fs.getPath();
    } else {
      throw new IOException("Expected exactly one .commit dir under [" + shardPath + "] but found ["
          + listStatus.length + "].");
    }
  }

  private boolean runAutomatic(String uuid, TableDescriptor descriptor, List<Path> inprogressPathList, String table,
      Path fileCache, Path outputPath, int reducerMultiplier, Path tmpPath, TableStats tableStats, String snapshot)
      throws ClassNotFoundException, IOException, InterruptedException {
    PartitionedInputResult result = buildPartitionedInputData(uuid, tmpPath, descriptor, inprogressPathList, snapshot,
        fileCache);

    Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");

    InputSplitPruneUtil.setBlurLookupRowIdFromNewDataCounts(job, table, result._rowIdsFromNewData);
    InputSplitPruneUtil.setBlurLookupRowIdUpdateFromNewDataCounts(job, table, result._rowIdsToUpdateFromNewData);
    InputSplitPruneUtil.setBlurLookupRowIdFromIndexCounts(job, table, result._rowIdsFromIndex);
    InputSplitPruneUtil.setTable(job, table);

    BlurInputFormat.setLocalCachePath(job, fileCache);

    // Existing data - adds the existing table data; the files are opened and
    // streamed through all documents.
    {
      Path tablePath = new Path(descriptor.getTableUri());
      BlurInputFormat.addTable(job, descriptor, MRUPDATE_SNAPSHOT);
      MultipleInputs.addInputPath(job, tablePath, PrunedBlurInputFormat.class, ExistingDataMapper.class);
    }

    // Existing data - adds the row id lookup.
    {
      ExistingDataIndexLookupMapper.setSnapshot(job, MRUPDATE_SNAPSHOT);
      FileInputFormat.addInputPath(job, result._partitionedInputData);
      MultipleInputs.addInputPath(job, result._partitionedInputData, PrunedSequenceFileInputFormat.class,
          ExistingDataIndexLookupMapper.class);
    }

    // New data.
    for (Path p : inprogressPathList) {
      FileInputFormat.addInputPath(job, p);
      MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, NewDataMapper.class);
    }

    BlurOutputFormat.setOutputPath(job, outputPath);
    BlurOutputFormat.setupJob(job, descriptor);
    job.setReducerClass(UpdateReducer.class);
    job.setMapOutputKeyClass(IndexKey.class);
    job.setMapOutputValueClass(IndexValue.class);
    job.setPartitionerClass(IndexKeyPartitioner.class);
    job.setGroupingComparatorClass(IndexKeyWritableComparator.class);
    BlurOutputFormat.setReducerMultiplier(job, reducerMultiplier);

    boolean success = job.waitForCompletion(true);
    Counters counters = job.getCounters();
    LOG.info("Counters [" + counters + "]");
    return success;
  }
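  // Like runAutomatic, but the row id lookup input always runs through the
  // plain SequenceFileInputFormat, so every existing-row lookup is performed
  // rather than pruned per shard.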
  private boolean runMrWithLookup(String uuid, TableDescriptor descriptor, List<Path> inprogressPathList, String table,
      Path fileCache, Path outputPath, int reducerMultiplier, Path tmpPath, TableStats tableStats, String snapshot)
      throws ClassNotFoundException, IOException, InterruptedException {
    PartitionedInputResult result = buildPartitionedInputData(uuid, tmpPath, descriptor, inprogressPathList, snapshot,
        fileCache);

    Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");

    ExistingDataIndexLookupMapper.setSnapshot(job, MRUPDATE_SNAPSHOT);
    FileInputFormat.addInputPath(job, result._partitionedInputData);
    MultipleInputs.addInputPath(job, result._partitionedInputData, SequenceFileInputFormat.class,
        ExistingDataIndexLookupMapper.class);

    for (Path p : inprogressPathList) {
      FileInputFormat.addInputPath(job, p);
      MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, NewDataMapper.class);
    }

    BlurOutputFormat.setOutputPath(job, outputPath);
    BlurOutputFormat.setupJob(job, descriptor);
    job.setReducerClass(UpdateReducer.class);
    job.setMapOutputKeyClass(IndexKey.class);
    job.setMapOutputValueClass(IndexValue.class);
    job.setPartitionerClass(IndexKeyPartitioner.class);
    job.setGroupingComparatorClass(IndexKeyWritableComparator.class);
    BlurOutputFormat.setReducerMultiplier(job, reducerMultiplier);

    boolean success = job.waitForCompletion(true);
    Counters counters = job.getCounters();
    LOG.info("Counters [" + counters + "]");
    return success;
  }

  private boolean runMrOnly(TableDescriptor descriptor, List<Path> inprogressPathList, String table, Path fileCache,
      Path outputPath, int reducerMultiplier) throws IOException, ClassNotFoundException, InterruptedException {
    Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");

    Path tablePath = new Path(descriptor.getTableUri());
    BlurInputFormat.setLocalCachePath(job, fileCache);
    BlurInputFormat.addTable(job, descriptor, MRUPDATE_SNAPSHOT);
    MultipleInputs.addInputPath(job, tablePath, BlurInputFormat.class, ExistingDataMapper.class);

    for (Path p : inprogressPathList) {
      FileInputFormat.addInputPath(job, p);
      MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, NewDataMapper.class);
    }

    BlurOutputFormat.setOutputPath(job, outputPath);
    BlurOutputFormat.setupJob(job, descriptor);
    job.setReducerClass(UpdateReducer.class);
    job.setMapOutputKeyClass(IndexKey.class);
    job.setMapOutputValueClass(IndexValue.class);
    job.setPartitionerClass(IndexKeyPartitioner.class);
    job.setGroupingComparatorClass(IndexKeyWritableComparator.class);
    BlurOutputFormat.setReducerMultiplier(job, reducerMultiplier);

    boolean success = job.waitForCompletion(true);
    Counters counters = job.getCounters();
    LOG.info("Counters [" + counters + "]");
    return success;
  }
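  /**
   * Runs the partitioning job that builds the lookup input: LookupBuilderMapper
   * emits Text keys (row ids, per the counter names) from the new data and
   * LookupBuilderReducer writes one sequence file per shard.  The reduce task
   * reports supply the per-shard counts that PartitionedInputResult captures
   * for split pruning.
   */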
  private PartitionedInputResult buildPartitionedInputData(String uuid, Path tmpPath, TableDescriptor descriptor,
      List<Path> inprogressPathList, String snapshot, Path fileCachePath) throws IOException, ClassNotFoundException,
      InterruptedException {
    Job job = Job.getInstance(getConf(), "Partitioning data for table [" + descriptor.getName() + "]");
    // Needed for the bloom filter path information.
    job.getConfiguration().set(BLUR_UPDATE_ID, uuid);
    BlurOutputFormat.setTableDescriptor(job, descriptor);
    BlurInputFormat.setLocalCachePath(job, fileCachePath);
    ExistingDataIndexLookupMapper.setSnapshot(job, snapshot);

    for (Path p : inprogressPathList) {
      FileInputFormat.addInputPath(job, p);
    }
    Path outputPath = new Path(tmpPath, UUID.randomUUID().toString());
    job.setJarByClass(getClass());
    job.setMapperClass(LookupBuilderMapper.class);
    job.setReducerClass(LookupBuilderReducer.class);

    int shardCount = descriptor.getShardCount();
    job.setNumReduceTasks(shardCount);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BooleanWritable.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    if (job.waitForCompletion(true)) {
      return new PartitionedInputResult(outputPath, job.getCounters(), shardCount, job.getTaskReports(TaskType.REDUCE));
    } else {
      throw new IOException("Partitioning failed!");
    }
  }

  private void waitForOtherSnapshotsToBeRemoved(Iface client, String table, String snapshot) throws BlurException,
      TException, InterruptedException {
    while (true) {
      Map<String, List<String>> listSnapshots = client.listSnapshots(table);
      boolean mrupdateSnapshots = false;
      for (Entry<String, List<String>> e : listSnapshots.entrySet()) {
        List<String> value = e.getValue();
        if (value.contains(snapshot)) {
          mrupdateSnapshots = true;
        }
      }
      if (!mrupdateSnapshots) {
        return;
      } else {
        LOG.info("Snapshot [{0}] for table [{1}] already exists.", snapshot, table);
        Thread.sleep(TimeUnit.SECONDS.toMillis(10));
        LOG.info("Retrying");
      }
    }
  }

  private List<Path> movePathList(FileSystem fileSystem, Path dstDir, List<Path> lst) throws IOException {
    List<Path> result = new ArrayList<Path>();
    for (Path src : lst) {
      Path dst = new Path(dstDir, src.getName());
      if (fileSystem.rename(src, dst)) {
        LOG.info("Moved [{0}] to [{1}]", src, dst);
        result.add(dst);
      } else {
        LOG.error("Could not move [{0}] to [{1}]", src, dst);
      }
    }
    return result;
  }
}