/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.blur.mapreduce.lib;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.blur.command.BlurArray;
import org.apache.blur.command.BlurObject;
import org.apache.blur.log.Log;
import org.apache.blur.log.LogFactory;
import org.apache.blur.manager.writer.SnapshotIndexDeletionPolicy;
import org.apache.blur.store.hdfs.DirectoryUtil;
import org.apache.blur.store.hdfs.HdfsDirectory;
import org.apache.blur.thrift.BlurClient;
import org.apache.blur.thrift.generated.Blur.Iface;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.blur.utils.BlurConstants;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentInfoPerCommit;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.Directory;
public class BlurInputFormat extends FileInputFormat<Text, TableBlurRecord> {
// Configuration key read by getMaxNumberOfMaps(); its value caps how many
// combined splits getSplits(JobContext) returns.
private static final String BLUR_INPUT_FORMAT_MAX_MAPS = "blur.input.format.max.maps";
// Configuration key under which setLocalCachePath()/getLocalCachePath() store a path.
private static final String BLUR_INPUTFORMAT_FILE_CACHE_PATH = "blur.inputformat.file.cache.path";
// Logger for this class.
private static final Log LOG = LogFactory.getLog(BlurInputFormat.class);
// Key prefix; the table name is appended and the value is the table's path
// (see putPathToTable()/getTableFromPath()).
private static final String BLUR_TABLE_PATH_MAPPING = "blur.table.path.mapping.";
// Configuration key for the size of the thread pool used by
// getSplits(Configuration, Path[]); defaults to 10 there.
private static final String BLUR_INPUT_FORMAT_DISCOVERY_THREADS = "blur.input.format.discovery.threads";
// Key prefix; the table name is appended and the value is the snapshot name
// (see putSnapshotForTable()/getSnapshotForTable()).
private static final String BLUR_TABLE_SNAPSHOT_MAPPING = "blur.table.snapshot.mapping.";
/**
 * Computes the input splits for the job.
 *
 * When a ZooKeeper connection string is configured the splits are obtained by
 * running a {@code BlurInputFormatSplitCommand} through a Blur client;
 * otherwise they are discovered by reading the shard directories directly.
 * The per-segment splits are then grouped into at most
 * {@link #getMaxNumberOfMaps(Configuration)} combined splits.
 *
 * @param context the job context supplying the configuration and input paths
 * @return the combined splits
 * @throws IOException if split discovery fails
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
  final Path[] inputPaths = getInputPaths(context);
  final Configuration conf = context.getConfiguration();
  final List<BlurInputSplit> segmentSplits = isSplitCommandSupported(conf)
      ? getSplitsFromCommand(conf, inputPaths)
      : getSplits(conf, inputPaths);
  final int maxMaps = getMaxNumberOfMaps(conf);
  return toList(maxMaps, segmentSplits);
}
/**
 * Reads the configured maximum number of combined splits.
 *
 * @param configuration the configuration to read from
 * @return the configured value, or {@link Integer#MAX_VALUE} when none is set
 */
public static int getMaxNumberOfMaps(Configuration configuration) {
  final int noLimit = Integer.MAX_VALUE;
  return configuration.getInt(BLUR_INPUT_FORMAT_MAX_MAPS, noLimit);
}
/**
 * Stores the maximum number of combined splits in the given configuration.
 *
 * @param configuration the configuration to update
 * @param maxNumberOfMaps the limit to store
 */
public static void setMaxNumberOfMaps(Configuration configuration, int maxNumberOfMaps) {
  final String key = BLUR_INPUT_FORMAT_MAX_MAPS;
  configuration.setInt(key, maxNumberOfMaps);
}
/**
 * Stores the maximum number of combined splits in the job's configuration.
 *
 * @param job the job whose configuration is updated
 * @param maxNumberOfMaps the limit to store
 */
public static void setMaxNumberOfMaps(Job job, int maxNumberOfMaps) {
  final Configuration jobConfiguration = job.getConfiguration();
  setMaxNumberOfMaps(jobConfiguration, maxNumberOfMaps);
}
/**
 * Obtains the splits for every input path by running a
 * {@code BlurInputFormatSplitCommand} through a Blur client created from the
 * configured ZooKeeper connection string.
 *
 * @param configuration supplies the ZooKeeper connection string and the
 *        path-to-table and table-to-snapshot mappings
 * @param dirs the table paths to compute splits for
 * @return the splits of all tables, in the order the paths were given
 * @throws IOException if a table or snapshot mapping is missing or the command fails
 */
private static List<BlurInputSplit> getSplitsFromCommand(Configuration configuration, Path[] dirs) throws IOException {
  final Iface client = BlurClient.getClientFromZooKeeperConnectionStr(
      configuration.get(BlurConstants.BLUR_ZOOKEEPER_CONNECTION));
  final List<BlurInputSplit> allSplits = new ArrayList<BlurInputSplit>();
  for (int i = 0; i < dirs.length; i++) {
    final String tableName = BlurInputFormat.getTableFromPath(configuration, dirs[i]).toString();
    final String snapshotName = getSnapshotForTable(configuration, tableName);
    final BlurInputFormatSplitCommand command = new BlurInputFormatSplitCommand();
    command.setSnapshot(snapshotName);
    command.setTable(tableName);
    allSplits.addAll(toSplitList(command.run(client)));
  }
  return allSplits;
}
/**
 * Flattens a {@link BlurObject} whose every key maps to a {@link BlurArray} of
 * serialized splits into a single list.
 *
 * @param bo the object to read; each key is looked up with {@code getBlurArray}
 * @return the splits of all keys, in key iteration order
 */
private static List<BlurInputSplit> toSplitList(BlurObject bo) {
  final List<BlurInputSplit> flattened = new ArrayList<BlurInputSplit>();
  for (Iterator<String> shards = bo.keys(); shards.hasNext();) {
    final BlurArray shardSplits = bo.getBlurArray(shards.next());
    flattened.addAll(toSplits(shardSplits));
  }
  return flattened;
}
/**
 * Converts every element of the array into a {@link BlurInputSplit}.
 *
 * @param blurArray an array whose elements are {@link BlurObject}s in the
 *        layout produced by {@link #toBlurArray(List)}
 * @return the decoded splits, in array order
 */
public static List<BlurInputSplit> toSplits(BlurArray blurArray) {
  final List<BlurInputSplit> decoded = new ArrayList<BlurInputSplit>();
  int index = 0;
  while (index < blurArray.length()) {
    decoded.add(toSplit(blurArray.getBlurObject(index)));
    index++;
  }
  return decoded;
}
/**
 * Encodes the given splits as a {@link BlurArray} holding one
 * {@link BlurObject} per split.
 *
 * @param splits the splits to encode
 * @return the array of encoded splits, in list order
 * @throws IOException if encoding a split fails
 */
public static BlurArray toBlurArray(List<BlurInputSplit> splits) throws IOException {
  final BlurArray encoded = new BlurArray();
  final Iterator<BlurInputSplit> remaining = splits.iterator();
  while (remaining.hasNext()) {
    final BlurObject encodedSplit = toBlurObject(remaining.next());
    encoded.put(encodedSplit);
  }
  return encoded;
}
/**
 * Rebuilds a {@link BlurInputSplit} from the fields "dir", "segmentsName",
 * "segmentInfoName", "fileLength" and "table" of the given object.
 *
 * @param blurObject the encoded split
 * @return the decoded split
 */
private static BlurInputSplit toSplit(BlurObject blurObject) {
  // Arguments are evaluated left to right, so the fields are read in the same
  // order as the constructor parameters.
  return new BlurInputSplit(
      new Path(blurObject.getString("dir")),
      blurObject.getString("segmentsName"),
      blurObject.getString("segmentInfoName"),
      blurObject.getLong("fileLength"),
      new Text(blurObject.getString("table")));
}
/**
 * Encodes a split as a {@link BlurObject} with the fields "dir",
 * "segmentsName", "segmentInfoName", "fileLength" and "table".
 *
 * @param inputSplit the split to encode
 * @return the encoded split
 * @throws IOException declared for callers; see the individual getters
 */
private static BlurObject toBlurObject(BlurInputSplit inputSplit) throws IOException {
  final BlurObject encoded = new BlurObject();
  final String dir = inputSplit.getDir().toString();
  encoded.put("dir", dir);
  final String segmentsName = inputSplit.getSegmentsName();
  encoded.put("segmentsName", segmentsName);
  final String segmentInfoName = inputSplit.getSegmentInfoName();
  encoded.put("segmentInfoName", segmentInfoName);
  encoded.put("fileLength", inputSplit.getLength());
  final String table = inputSplit.getTable().toString();
  encoded.put("table", table);
  return encoded;
}
/**
 * Tells whether splits can be obtained through the split command, which is the
 * case exactly when a ZooKeeper connection string is present in the configuration.
 *
 * @param configuration the configuration to inspect
 * @return {@code true} if the ZooKeeper connection key has a value
 */
private boolean isSplitCommandSupported(Configuration configuration) {
  final String zkConnection = configuration.get(BlurConstants.BLUR_ZOOKEEPER_CONNECTION);
  return zkConnection != null;
}
/**
 * Groups the per-segment splits into at most {@code maxSplits} combined splits.
 *
 * New groups are opened until the limit is reached; after that each split is
 * added to the group that currently has the smallest total length.
 *
 * @param maxSplits the maximum number of groups to produce; values below 1 are
 *        treated as 1, because a non-positive limit would otherwise leave no
 *        group to add to and fail with an {@link IndexOutOfBoundsException}
 * @param splits the per-segment splits to distribute
 * @return the groups as {@link InputSplit}s; empty when {@code splits} is empty
 */
private List<InputSplit> toList(int maxSplits, List<BlurInputSplit> splits) {
  // At least one group must exist before findSmallest() can be consulted.
  final int limit = Math.max(1, maxSplits);
  List<BlurInputSplitColletion> collections = new ArrayList<BlurInputSplitColletion>();
  for (BlurInputSplit blurInputSplit : splits) {
    BlurInputSplitColletion target;
    if (collections.size() < limit) {
      target = new BlurInputSplitColletion();
      collections.add(target);
    } else {
      target = findSmallest(collections);
    }
    target.add(blurInputSplit);
  }
  return new ArrayList<InputSplit>(collections);
}
/**
 * Returns the group with the smallest total length.
 *
 * A single linear scan is used instead of sorting the whole list on every
 * call, so the caller's list is left in its existing order. When several
 * groups share the smallest length, the first one in list order is returned.
 *
 * @param collections the candidate groups; must not be empty
 * @return the group whose {@code getLength()} is smallest
 * @throws IllegalArgumentException if {@code collections} is empty
 */
private BlurInputSplitColletion findSmallest(List<BlurInputSplitColletion> collections) {
  BlurInputSplitColletion smallest = null;
  for (BlurInputSplitColletion candidate : collections) {
    if (smallest == null || candidate.getLength() < smallest.getLength()) {
      smallest = candidate;
    }
  }
  if (smallest == null) {
    throw new IllegalArgumentException("No split collections to choose from.");
  }
  return smallest;
}
/**
 * An {@link InputSplit} that bundles several {@link BlurInputSplit}s so they
 * can be handed out as one unit. The reported length is tracked in a separate
 * field that {@link #add(BlurInputSplit)} keeps up to date.
 */
public static class BlurInputSplitColletion extends InputSplit implements Writable {
  private List<BlurInputSplit> _splits = new ArrayList<BlurInputSplit>();
  private long _length;

  /** Creates an empty collection with length 0. */
  public BlurInputSplitColletion() {
  }

  /**
   * Adds a split and increases the total length by that split's length.
   *
   * @param blurInputSplit the split to add
   */
  public void add(BlurInputSplit blurInputSplit) {
    _splits.add(blurInputSplit);
    _length += blurInputSplit.getLength();
  }

  /**
   * Creates a collection holding the given splits.
   *
   * The splits are copied into an internal list, so later calls to
   * {@link #add(BlurInputSplit)} never modify the caller's list, and the total
   * length is initialized to the sum of the split lengths instead of being left at 0.
   *
   * @param splits the initial splits; {@code null} is treated as empty
   */
  public BlurInputSplitColletion(List<BlurInputSplit> splits) {
    if (splits != null) {
      for (BlurInputSplit split : splits) {
        _splits.add(split);
        _length += split.getLength();
      }
    }
  }

  /** @return the total length as maintained by {@code add} or {@code setLength} */
  @Override
  public long getLength() {
    return _length;
  }

  /** @return always an empty array */
  @Override
  public String[] getLocations() {
    return new String[] {};
  }

  /** @return the list of splits currently held */
  public List<BlurInputSplit> getSplits() {
    return _splits;
  }

  /**
   * Replaces the held list with the given one. The length is not recalculated;
   * use {@link #setLength(long)} to set it.
   *
   * @param splits the new list of splits
   */
  public void setSplits(List<BlurInputSplit> splits) {
    _splits = splits;
  }

  /**
   * Overrides the reported total length.
   *
   * @param length the new length
   */
  public void setLength(long length) {
    _length = length;
  }

  /**
   * Writes the length, the number of splits and then every split in order.
   */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeLong(_length);
    out.writeInt(_splits.size());
    for (BlurInputSplit split : _splits) {
      split.write(out);
    }
  }

  /**
   * Reads the state written by {@link #write(DataOutput)}, discarding any
   * splits held before.
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    // Start from a fresh list rather than clear(): the current list may have
    // been installed through setSplits() and could be null or unmodifiable.
    _splits = new ArrayList<BlurInputSplit>();
    _length = in.readLong();
    int size = in.readInt();
    for (int i = 0; i < size; i++) {
      BlurInputSplit blurInputSplit = new BlurInputSplit();
      blurInputSplit.readFields(in);
      _splits.add(blurInputSplit);
    }
  }
}
/**
 * Discovers the splits of every table path by reading its shard directories,
 * using a fixed thread pool that is shut down before returning.
 *
 * @param configuration supplies the pool size (default 10) and the
 *        path-to-table and table-to-snapshot mappings
 * @param dirs the table paths to scan
 * @return the splits of all tables, in the order the paths were given
 * @throws IOException if a mapping is missing or a shard cannot be read
 */
public static List<BlurInputSplit> getSplits(Configuration configuration, Path[] dirs) throws IOException {
  final ExecutorService pool = Executors.newFixedThreadPool(
      configuration.getInt(BLUR_INPUT_FORMAT_DISCOVERY_THREADS, 10));
  try {
    final List<BlurInputSplit> collected = new ArrayList<BlurInputSplit>();
    for (int i = 0; i < dirs.length; i++) {
      final Path tableDir = dirs[i];
      final Text table = BlurInputFormat.getTableFromPath(configuration, tableDir);
      final Text snapshot = new Text(getSnapshotForTable(configuration, table.toString()));
      collected.addAll(getSegmentSplits(tableDir, pool, configuration, table, snapshot));
    }
    return collected;
  } finally {
    pool.shutdownNow();
  }
}
/**
 * Records the path of a table in the configuration so that
 * {@link #getTableFromPath(Configuration, Path)} can map it back to the table name.
 *
 * @param configuration the configuration to update
 * @param tableName the table name, appended to the mapping key prefix
 * @param path the table path, stored in its string form
 */
public static void putPathToTable(Configuration configuration, String tableName, Path path) {
  final String key = BLUR_TABLE_PATH_MAPPING + tableName;
  configuration.set(key, path.toString());
}
/**
 * Looks up the table whose recorded path equals the given path.
 *
 * @param configuration the configuration holding the path mappings
 * @param path the table path to look up
 * @return the name of the first table whose recorded path equals {@code path}
 * @throws IOException if no recorded path matches
 */
public static Text getTableFromPath(Configuration configuration, Path path) throws IOException {
  final int prefixLength = BLUR_TABLE_PATH_MAPPING.length();
  for (Entry<String, String> entry : configuration) {
    final String key = entry.getKey();
    if (!key.startsWith(BLUR_TABLE_PATH_MAPPING)) {
      continue;
    }
    final Path recordedPath = new Path(entry.getValue());
    if (recordedPath.equals(path)) {
      return new Text(key.substring(prefixLength));
    }
  }
  throw new IOException("Table name not found for path [" + path + "]");
}
/**
 * Records which snapshot of a table should be read.
 *
 * @param configuration the configuration to update
 * @param tableName the table name, appended to the mapping key prefix
 * @param snapshot the snapshot name to store
 */
public static void putSnapshotForTable(Configuration configuration, String tableName, String snapshot) {
  final String key = BLUR_TABLE_SNAPSHOT_MAPPING + tableName;
  configuration.set(key, snapshot);
}
/**
 * Looks up the snapshot recorded for a table by
 * {@link #putSnapshotForTable(Configuration, String, String)}.
 *
 * @param configuration the configuration holding the snapshot mappings
 * @param tableName the table to look up
 * @return the snapshot name recorded for the table
 * @throws IOException if no snapshot is recorded for the table
 */
public static String getSnapshotForTable(Configuration configuration, String tableName) throws IOException {
  final int prefixLength = BLUR_TABLE_SNAPSHOT_MAPPING.length();
  for (Entry<String, String> e : configuration) {
    final String key = e.getKey();
    if (key.startsWith(BLUR_TABLE_SNAPSHOT_MAPPING) && key.substring(prefixLength).equals(tableName)) {
      return e.getValue();
    }
  }
  // The message previously misspelled "Snapshot" as "Snaphost".
  throw new IOException("Snapshot not found for table [" + tableName + "]");
}
/**
 * Computes the splits of every shard directory below a table directory,
 * processing the shards in parallel on the given executor.
 *
 * @param dir the table directory; children whose name starts with
 *        {@code BlurConstants.SHARD_PREFIX} are treated as shards
 * @param service the executor the per-shard work is submitted to
 * @param configuration the Hadoop configuration
 * @param table the table name attached to every split
 * @param snapshot the snapshot to read
 * @return the splits of all shards, in the order the shards were listed
 * @throws IOException if listing or reading a shard fails, or if the calling
 *         thread is interrupted while waiting (the interrupt flag is restored)
 */
private static List<BlurInputSplit> getSegmentSplits(final Path dir, ExecutorService service,
    final Configuration configuration, final Text table, final Text snapshot) throws IOException {
  FileSystem fileSystem = dir.getFileSystem(configuration);
  FileStatus[] shardDirs = fileSystem.listStatus(dir, new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return path.getName().startsWith(BlurConstants.SHARD_PREFIX);
    }
  });
  List<Future<List<BlurInputSplit>>> futures = new ArrayList<Future<List<BlurInputSplit>>>();
  for (final FileStatus shardFileStatus : shardDirs) {
    futures.add(service.submit(new Callable<List<BlurInputSplit>>() {
      @Override
      public List<BlurInputSplit> call() throws Exception {
        return getSegmentSplits(shardFileStatus.getPath(), configuration, table, snapshot);
      }
    }));
  }
  List<BlurInputSplit> results = new ArrayList<BlurInputSplit>();
  for (Future<List<BlurInputSplit>> future : futures) {
    try {
      results.addAll(future.get());
    } catch (InterruptedException e) {
      // Catching InterruptedException clears the interrupt status; restore it
      // so code further up the stack can still observe the interruption.
      Thread.currentThread().interrupt();
      throw new IOException(e);
    } catch (ExecutionException e) {
      // Surface the task's own IOException unchanged; wrap anything else.
      Throwable cause = e.getCause();
      if (cause instanceof IOException) {
        throw (IOException) cause;
      } else {
        throw new IOException(cause);
      }
    }
  }
  return results;
}
/**
 * Computes the splits of a single shard directory and logs how long it took.
 *
 * The directory opened for the shard is always closed, and the timing message
 * is logged whether or not the computation succeeded.
 *
 * @param shardDir the shard directory
 * @param configuration the Hadoop configuration
 * @param table the table name attached to every split
 * @param snapshot the snapshot to read
 * @return the splits found in the shard
 * @throws IOException if the shard cannot be opened or read
 */
private static List<BlurInputSplit> getSegmentSplits(Path shardDir, Configuration configuration, Text table,
    Text snapshot) throws IOException {
  final long start = System.nanoTime();
  Directory directory = getDirectory(configuration, table.toString(), shardDir);
  try {
    return getSplitForDirectory(shardDir, configuration, table, snapshot, directory);
  } finally {
    directory.close();
    final long end = System.nanoTime();
    // Nanoseconds to milliseconds is a division by 1e6; the previous divisor
    // of 1e9 produced seconds while the message says "ms".
    LOG.info("Found split in shard [{0}] in [{1} ms].", shardDir, (end - start) / 1000000.0);
  }
}
/**
 * Convenience overload taking the table and snapshot names as strings; they
 * are wrapped in {@link Text} and passed to the {@code Text} overload.
 *
 * @param shardDir the shard directory
 * @param configuration the Hadoop configuration
 * @param table the table name
 * @param snapshot the snapshot name
 * @param directory the opened index directory of the shard
 * @return the splits found in the shard
 * @throws IOException if the snapshot or its commit cannot be found or read
 */
public static List<BlurInputSplit> getSplitForDirectory(Path shardDir, Configuration configuration, String table,
    String snapshot, Directory directory) throws IOException {
  final Text tableText = new Text(table);
  final Text snapshotText = new Text(snapshot);
  return getSplitForDirectory(shardDir, configuration, tableText, snapshotText, directory);
}
/**
 * Builds one split per segment of the index commit that belongs to the given
 * snapshot. Segments whose deleted-document count equals their document count
 * are logged and skipped.
 *
 * @param shardDir the shard directory, used to locate the snapshot generations
 *        and recorded in every split
 * @param configuration the Hadoop configuration
 * @param table the table name recorded in every split
 * @param snapshot the snapshot whose commit is read
 * @param directory the opened index directory of the shard
 * @return the splits; each split's length is the summed length of the segment's files
 * @throws IOException if the snapshot or its generation is not found, or the
 *         index cannot be read
 */
public static List<BlurInputSplit> getSplitForDirectory(Path shardDir, Configuration configuration, Text table,
    Text snapshot, Directory directory) throws IOException {
  final List<BlurInputSplit> result = new ArrayList<BlurInputSplit>();
  final SnapshotIndexDeletionPolicy deletionPolicy = new SnapshotIndexDeletionPolicy(configuration,
      SnapshotIndexDeletionPolicy.getGenerationsPath(shardDir));
  final Long snapshotGeneration = deletionPolicy.getGeneration(snapshot.toString());
  if (snapshotGeneration == null) {
    throw new IOException("Snapshot [" + snapshot + "] not found in shard [" + shardDir + "]");
  }
  final IndexCommit snapshotCommit = findIndexCommit(DirectoryReader.listCommits(directory), snapshotGeneration,
      shardDir);
  final String segmentsFileName = snapshotCommit.getSegmentsFileName();
  final SegmentInfos infos = new SegmentInfos();
  infos.read(directory, segmentsFileName);
  for (SegmentInfoPerCommit perCommit : infos) {
    final SegmentInfo info = perCommit.info;
    if (perCommit.getDelCount() == info.getDocCount()) {
      LOG.info("Segment [{0}] in dir [{1}] has all records deleted.", info.name, shardDir);
      continue;
    }
    final String segmentName = info.name;
    long totalBytes = 0;
    for (String fileName : perCommit.files()) {
      totalBytes += directory.fileLength(fileName);
    }
    result.add(new BlurInputSplit(shardDir, segmentsFileName, segmentName, totalBytes, table));
  }
  return result;
}
/**
 * Finds the commit with the given generation.
 *
 * @param listCommits the commits to search
 * @param generation the generation to look for
 * @param shardDir the shard directory, used only in the error message
 * @return the first commit whose generation matches
 * @throws IOException if no commit has the requested generation
 */
private static IndexCommit findIndexCommit(List<IndexCommit> listCommits, long generation, Path shardDir)
    throws IOException {
  final Iterator<IndexCommit> candidates = listCommits.iterator();
  while (candidates.hasNext()) {
    final IndexCommit candidate = candidates.next();
    if (candidate.getGeneration() != generation) {
      continue;
    }
    return candidate;
  }
  throw new IOException("Generation [" + generation + "] not found in shard [" + shardDir + "]");
}
/**
 * Creates a record reader that delegates every call to a
 * {@code GenericRecordReaderCollection}.
 *
 * The delegate is initialized here with the given split, and initialized again
 * whenever {@code initialize} is called on the returned reader.
 *
 * @param split the split to read; cast to {@link BlurInputSplitColletion}
 * @param context supplies the configuration passed to the delegate
 * @return a reader forwarding to the delegate
 */
@Override
public RecordReader<Text, TableBlurRecord> createRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  final GenericRecordReaderCollection delegate = new GenericRecordReaderCollection();
  delegate.initialize((BlurInputSplitColletion) split, context.getConfiguration());
  return new RecordReader<Text, TableBlurRecord>() {
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext attemptContext) throws IOException,
        InterruptedException {
      delegate.initialize((BlurInputSplitColletion) inputSplit, attemptContext.getConfiguration());
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      return delegate.nextKeyValue();
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
      return delegate.getCurrentKey();
    }

    @Override
    public TableBlurRecord getCurrentValue() throws IOException, InterruptedException {
      return delegate.getCurrentValue();
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      return delegate.getProgress();
    }

    @Override
    public void close() throws IOException {
      delegate.close();
    }
  };
}
/**
 * A split describing one index segment of a shard: the shard directory, the
 * segments file name, the segment name, the table and the summed file length.
 * Usable with both the {@code mapreduce} and the {@code mapred} split APIs.
 */
public static class BlurInputSplit extends InputSplit implements org.apache.hadoop.mapred.InputSplit, Writable {
  private static final String UTF_8 = "UTF-8";
  private long _fileLength;
  private String _segmentsName;
  private Path _dir;
  private String _segmentInfoName;
  private Text _table = new Text();

  /** Creates an empty split, to be populated by {@link #readFields(DataInput)}. */
  public BlurInputSplit() {
  }

  /**
   * @param dir the shard directory
   * @param segmentsName the segments file name
   * @param segmentInfoName the segment name
   * @param fileLength the length reported by {@link #getLength()}
   * @param table the table name
   */
  public BlurInputSplit(Path dir, String segmentsName, String segmentInfoName, long fileLength, Text table) {
    _dir = dir;
    _segmentsName = segmentsName;
    _segmentInfoName = segmentInfoName;
    _fileLength = fileLength;
    _table = table;
  }

  /** @return the file length given at construction or read from the stream */
  @Override
  public long getLength() {
    return _fileLength;
  }

  /** @return always an empty array */
  @Override
  public String[] getLocations() {
    // @TODO create locations for fdt file
    return new String[] {};
  }

  /** @return the segment name */
  public String getSegmentInfoName() {
    return _segmentInfoName;
  }

  /** @return the segments file name */
  public String getSegmentsName() {
    return _segmentsName;
  }

  /** @return the shard directory */
  public Path getDir() {
    return _dir;
  }

  /** @return the table name */
  public Text getTable() {
    return _table;
  }

  /**
   * Writes the directory, segments name, segment name, table and file length,
   * in that order.
   */
  @Override
  public void write(DataOutput out) throws IOException {
    final String dirAsString = _dir.toString();
    writeUtf8(out, dirAsString);
    writeUtf8(out, _segmentsName);
    writeUtf8(out, _segmentInfoName);
    _table.write(out);
    out.writeLong(_fileLength);
  }

  /** Reads the fields in the order {@link #write(DataOutput)} emits them. */
  @Override
  public void readFields(DataInput in) throws IOException {
    final String dirAsString = readUtf8(in);
    _dir = new Path(dirAsString);
    _segmentsName = readUtf8(in);
    _segmentInfoName = readUtf8(in);
    _table.readFields(in);
    _fileLength = in.readLong();
  }

  /** Writes the string as a 4-byte length followed by its UTF-8 bytes. */
  private void writeUtf8(DataOutput out, String value) throws IOException {
    final byte[] encoded = value.getBytes(UTF_8);
    out.writeInt(encoded.length);
    out.write(encoded);
  }

  /** Reads a string written by {@code writeUtf8}. */
  private String readUtf8(DataInput in) throws IOException {
    final byte[] encoded = new byte[in.readInt()];
    in.readFully(encoded);
    return new String(encoded, UTF_8);
  }
}
/**
 * Stores the file cache path in the job's configuration.
 *
 * @param job the job whose configuration is updated
 * @param fileCachePath the path to store
 */
public static void setLocalCachePath(Job job, Path fileCachePath) {
  final Configuration jobConfiguration = job.getConfiguration();
  setLocalCachePath(jobConfiguration, fileCachePath);
}
/**
 * Stores the file cache path, in its string form, in the configuration.
 *
 * @param configuration the configuration to update
 * @param fileCachePath the path to store
 */
public static void setLocalCachePath(Configuration configuration, Path fileCachePath) {
  final String pathAsString = fileCachePath.toString();
  configuration.set(BLUR_INPUTFORMAT_FILE_CACHE_PATH, pathAsString);
}
/**
 * Reads the file cache path from the configuration.
 *
 * @param configuration the configuration to read from
 * @return the stored path, or {@code null} when none is set
 */
public static Path getLocalCachePath(Configuration configuration) {
  final String stored = configuration.get(BLUR_INPUTFORMAT_FILE_CACHE_PATH);
  return stored == null ? null : new Path(stored);
}
/**
 * Registers a table as job input: adds the table URI as an input path and
 * records both the path-to-table and the table-to-snapshot mapping in the
 * job's configuration.
 *
 * @param job the job to configure
 * @param tableDescriptor supplies the table name and table URI
 * @param snapshot the snapshot of the table to read
 * @throws IOException if adding the input path fails
 */
public static void addTable(Job job, TableDescriptor tableDescriptor, String snapshot)
    throws IllegalArgumentException, IOException {
  final String name = tableDescriptor.getName();
  final Path tablePath = new Path(tableDescriptor.getTableUri());
  FileInputFormat.addInputPath(job, tablePath);
  final Configuration jobConfiguration = job.getConfiguration();
  putPathToTable(jobConfiguration, name, tablePath);
  putSnapshotForTable(jobConfiguration, name, snapshot);
}
/**
 * Opens the index directory of a shard.
 *
 * The "fast" directory is disabled when the path returned by
 * {@code DirectoryUtil.getFastDirectoryPath} does not exist.
 *
 * @param configuration the Hadoop configuration
 * @param table the table name
 * @param shardDir the shard directory; its name is passed on as the shard name
 * @return the directory produced by {@code DirectoryUtil.getDirectory}
 * @throws IOException if the file system cannot be accessed
 */
public static Directory getDirectory(Configuration configuration, String table, Path shardDir) throws IOException {
  final Path fastDirPath = DirectoryUtil.getFastDirectoryPath(shardDir);
  final boolean fastDirMissing = !shardDir.getFileSystem(configuration).exists(fastDirPath);
  final HdfsDirectory hdfsDirectory = new HdfsDirectory(configuration, shardDir, null);
  final String shardName = shardDir.getName();
  return DirectoryUtil.getDirectory(configuration, hdfsDirectory, fastDirMissing, null, table, shardName, true);
}
/**
 * Stores the ZooKeeper connection string in the configuration. Once set,
 * {@link #getSplits(JobContext)} obtains its splits through the split command.
 *
 * @param configuration the configuration to update
 * @param zk the ZooKeeper connection string
 */
public static void setZooKeeperConnectionStr(Configuration configuration, String zk) {
  final String key = BlurConstants.BLUR_ZOOKEEPER_CONNECTION;
  configuration.set(key, zk);
}
/**
 * Stores the ZooKeeper connection string in the job's configuration.
 *
 * @param job the job whose configuration is updated
 * @param zk the ZooKeeper connection string
 */
public static void setZooKeeperConnectionStr(Job job, String zk) {
  final Configuration jobConfiguration = job.getConfiguration();
  setZooKeeperConnectionStr(jobConfiguration, zk);
}
}