package org.apache.blur.utils;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.blur.lucene.LuceneVersionConstant.LUCENE_VERSION;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.blur.lucene.codec.Blur024Codec;
import org.apache.blur.store.hdfs.HdfsDirectory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;

/**
 * This class is used to reduce the total number of shards of a table. The main
 * use would be if, during an indexing job, the number of reducers was increased
 * to make indexing faster, but the total number of shards in the table needs to
 * be smaller. This utility safely collapses indexes together, thus reducing the
 * total number of shards in the table.
 *
 * For example, if you wanted to run 1024 reducers but only wanted 128 shards in
 * the table, this utility could be executed after the bulk map reduce job
 * finishes:
 *
 * TableShardCountCollapser <hdfs path> 128
 *
 * The result would be 128 shards in the table path.
 */
public class TableShardCountCollapser extends Configured implements Tool {

  public static void main(String[] args) throws Exception {
    // Let ToolRunner handle generic command-line options
    int res = ToolRunner.run(new Configuration(), new TableShardCountCollapser(), args);
    System.exit(res);
  }

  private Path path;

  @Override
  public int run(String[] args) throws Exception {
    // prompt to make sure the table is not enabled
    Path path = new Path(args[0]);
    int count = Integer.parseInt(args[1]);
    setTablePath(path);
    collapseShardsTo(count);
    return 0;
  }

  public boolean validateCount(int count) throws IOException {
    return getCollapsePossibilities().contains(count);
  }

  public void setTablePath(Path path) {
    this.path = path;
  }

  /**
   * Returns the shard counts this table can be collapsed to: the divisors of
   * the current shard count, excluding the current count itself.
   */
  public List<Integer> getCollapsePossibilities() throws IOException {
    FileSystem fileSystem = path.getFileSystem(getConf());
    FileStatus[] listStatus = fileSystem.listStatus(path);
    SortedSet<String> shards = new TreeSet<String>();
    for (FileStatus status : listStatus) {
      Path shardPath = status.getPath();
      if (shardPath.getName().startsWith(BlurConstants.SHARD_PREFIX)) {
        shards.add(shardPath.getName());
      }
    }
    validateShards(shards);
    List<Integer> result = getFactors(shards.size());
    return result;
  }

  private List<Integer> getFactors(int size) {
    List<Integer> result = new ArrayList<Integer>();
    for (int i = 1; i < size; i++) {
      if (size % i == 0) {
        result.add(i);
      }
    }
    return result;
  }

  private void validateShards(SortedSet<String> shards) {
    // Every shard from 0 to count - 1 must be present for the table to be valid.
    int count = shards.size();
    for (int i = 0; i < count; i++) {
      if (!shards.contains(ShardUtil.getShardName(i))) {
        throw new RuntimeException("Invalid table");
      }
    }
  }

  public void collapseShardsTo(int newShardCount) throws IOException {
    if (!validateCount(newShardCount)) {
      throw new RuntimeException("Count [" + newShardCount + "] is not valid, valid values are ["
          + getCollapsePossibilities() + "]");
    }

    Path[] paths = getPaths();
    int numberOfShardsToMergePerPass = paths.length / newShardCount;
    for (int i = 0; i < newShardCount; i++) {
      System.out.println("Base Index [" + paths[i] + "]");
      IndexWriterConfig lconf = new IndexWriterConfig(LUCENE_VERSION, new KeywordAnalyzer());
      lconf.setCodec(new Blur024Codec());
      HdfsDirectory dir = new HdfsDirectory(getConf(), paths[i]);
      IndexWriter indexWriter = new IndexWriter(dir, lconf);
      Directory[] dirs = new Directory[numberOfShardsToMergePerPass - 1];
      Path[] pathsToDelete = new Path[numberOfShardsToMergePerPass - 1];
      for (int p = 1; p < numberOfShardsToMergePerPass; p++) {
        Path pathToMerge = paths[i + p * newShardCount];
        System.out.println("Merge [" + pathToMerge + "]");
        dirs[p - 1] = new HdfsDirectory(getConf(), pathToMerge);
        pathsToDelete[p - 1] = pathToMerge;
      }
      indexWriter.addIndexes(dirs);

      // Causes a rewrite of the index so that the symlinked files are
      // merged/rewritten.
      indexWriter.forceMerge(1);
      indexWriter.close();

      FileSystem fileSystem = path.getFileSystem(getConf());
      for (Path p : pathsToDelete) {
        fileSystem.delete(p, true);
      }
    }
  }

  /**
   * Returns the shard directories of the table in sorted name order.
   */
  private Path[] getPaths() throws IOException {
    FileSystem fileSystem = path.getFileSystem(getConf());
    FileStatus[] listStatus = fileSystem.listStatus(path);
    SortedSet<Path> shards = new TreeSet<Path>();
    for (FileStatus status : listStatus) {
      Path shardPath = status.getPath();
      if (shardPath.getName().startsWith(BlurConstants.SHARD_PREFIX)) {
        shards.add(shardPath);
      }
    }
    return shards.toArray(new Path[shards.size()]);
  }
}
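// Usage sketch (illustrative only): besides the ToolRunner entry point above,
// the collapse can be driven programmatically. The table path below is a
// hypothetical example; the target shard count must be one of the values
// reported by getCollapsePossibilities(), and the table should be disabled
// before collapsing.
//
//   TableShardCountCollapser collapser = new TableShardCountCollapser();
//   collapser.setConf(new Configuration());
//   collapser.setTablePath(new Path("hdfs://namenode/blur/tables/table1"));
//   if (collapser.validateCount(128)) {
//     collapser.collapseShardsTo(128);
//   }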