package org.apache.blur.utils;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.blur.lucene.LuceneVersionConstant.LUCENE_VERSION;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.blur.lucene.codec.Blur024Codec;
import org.apache.blur.store.hdfs.HdfsDirectory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
/**
 * This class is used to reduce the total number of shards in a table. The
 * main use case is an indexing job where the number of reducers was increased
 * to make indexing faster, but the table needs to end up with a smaller
 * number of shards. This utility safely collapses indexes together, thus
 * reducing the total number of shards in the table.
 *
 * For example, suppose you wanted to run 1024 reducers but only 128 shards
 * in a table. After the bulk MapReduce job finishes, this utility could be
 * executed:
 *
 * TableShardCountCollapser &lt;hdfs path&gt; 128
 *
 * The result would be 128 shards in the table path.
 *
 */
public class TableShardCountCollapser extends Configured implements Tool {
public static void main(String[] args) throws Exception {
// Let ToolRunner handle generic command-line options
int res = ToolRunner.run(new Configuration(), new TableShardCountCollapser(), args);
System.exit(res);
}
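  // A minimal programmatic usage sketch; the table path and target shard
  // count below are illustrative assumptions, not taken from a real
  // deployment:
  //
  //   TableShardCountCollapser collapser = new TableShardCountCollapser();
  //   collapser.setConf(new Configuration());
  //   collapser.setTablePath(new Path("hdfs://namenode/blur/tables/table1"));
  //   if (collapser.validateCount(128)) {
  //     collapser.collapseShardsTo(128);
  //   }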
private Path path;
@Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.println("Usage: TableShardCountCollapser <hdfs path> <new shard count>");
      return 1;
    }
    // NOTE: the table should be disabled before running this utility.
    Path path = new Path(args[0]);
    int count = Integer.parseInt(args[1]);
    setTablePath(path);
    collapseShardsTo(count);
    return 0;
  }
  public boolean validateCount(int count) throws IOException {
    return getCollapsePossibilities().contains(count);
  }
public void setTablePath(Path path) {
this.path = path;
}
public List<Integer> getCollapsePossibilities() throws IOException {
FileSystem fileSystem = path.getFileSystem(getConf());
FileStatus[] listStatus = fileSystem.listStatus(path);
SortedSet<String> shards = new TreeSet<String>();
for (FileStatus status : listStatus) {
Path shardPath = status.getPath();
if (shardPath.getName().startsWith(BlurConstants.SHARD_PREFIX)) {
shards.add(shardPath.getName());
}
}
validateShards(shards);
List<Integer> result = getFactors(shards.size());
return result;
}
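  // For example, a table with 8 shards can collapse to 1, 2, or 4 shards:
  // every divisor of the current shard count except the count itself
  // (collapsing to the same count would be a no-op).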
private List<Integer> getFactors(int size) {
List<Integer> result = new ArrayList<Integer>();
for (int i = 1; i < size; i++) {
if (size % i == 0) {
result.add(i);
}
}
return result;
}
private void validateShards(SortedSet<String> shards) {
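    // A valid table directory holds a contiguous, zero-based run of shard
    // directories, e.g. shard-00000000, shard-00000001, ... (the exact name
    // format comes from ShardUtil.getShardName).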
int count = shards.size();
for (int i = 0; i < count; i++) {
if (!shards.contains(ShardUtil.getShardName(i))) {
        throw new RuntimeException("Invalid table, missing shard [" + ShardUtil.getShardName(i) + "]");
}
}
}
public void collapseShardsTo(int newShardCount) throws IOException {
if (!validateCount(newShardCount)) {
throw new RuntimeException("Count [" + newShardCount + "] is not valid, valid values are ["
+ getCollapsePossibilities() + "]");
}
Path[] paths = getPaths();
int numberOfShardsToMergePerPass = paths.length / newShardCount;
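    // For example, collapsing 1024 shards down to 128 merges 1024 / 128 = 8
    // existing shards into each remaining shard: new shard i absorbs old
    // shards i + newShardCount, i + 2 * newShardCount, and so on.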
for (int i = 0; i < newShardCount; i++) {
System.out.println("Base Index [" + paths[i] + "]");
IndexWriterConfig lconf = new IndexWriterConfig(LUCENE_VERSION, new KeywordAnalyzer());
lconf.setCodec(new Blur024Codec());
HdfsDirectory dir = new HdfsDirectory(getConf(), paths[i]);
IndexWriter indexWriter = new IndexWriter(dir, lconf);
Directory[] dirs = new Directory[numberOfShardsToMergePerPass - 1];
Path[] pathsToDelete = new Path[numberOfShardsToMergePerPass - 1];
for (int p = 1; p < numberOfShardsToMergePerPass; p++) {
Path pathToMerge = paths[i + p * newShardCount];
System.out.println("Merge [" + pathToMerge + "]");
dirs[p - 1] = new HdfsDirectory(getConf(), pathToMerge);
pathsToDelete[p - 1] = pathToMerge;
}
indexWriter.addIndexes(dirs);
      // Causes a rewrite of the index so that the symlinked files are
      // merged/rewritten as well.
indexWriter.forceMerge(1);
indexWriter.close();
FileSystem fileSystem = path.getFileSystem(getConf());
for (Path p : pathsToDelete) {
fileSystem.delete(p, true);
}
}
}
private Path[] getPaths() throws IOException {
FileSystem fileSystem = path.getFileSystem(getConf());
FileStatus[] listStatus = fileSystem.listStatus(path);
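    // TreeSet orders the shard paths lexicographically; because shard names
    // are zero-padded this matches numeric shard order, which the pass
    // indexing in collapseShardsTo relies on.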
SortedSet<Path> shards = new TreeSet<Path>();
for (FileStatus status : listStatus) {
Path shardPath = status.getPath();
if (shardPath.getName().startsWith(BlurConstants.SHARD_PREFIX)) {
shards.add(shardPath);
}
}
return shards.toArray(new Path[shards.size()]);
}
}