/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.blur.indexer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

import org.apache.blur.indexer.mapreduce.ExistingDataIndexLookupMapper;
import org.apache.blur.indexer.mapreduce.ExistingDataMapper;
import org.apache.blur.indexer.mapreduce.LookupBuilderMapper;
import org.apache.blur.indexer.mapreduce.LookupBuilderReducer;
import org.apache.blur.indexer.mapreduce.NewDataMapper;
import org.apache.blur.indexer.mapreduce.PrunedBlurInputFormat;
import org.apache.blur.indexer.mapreduce.PrunedSequenceFileInputFormat;
import org.apache.blur.log.Log;
import org.apache.blur.log.LogFactory;
import org.apache.blur.mapreduce.lib.BlurInputFormat;
import org.apache.blur.mapreduce.lib.BlurOutputFormat;
import org.apache.blur.mapreduce.lib.update.IndexKey;
import org.apache.blur.mapreduce.lib.update.IndexKeyPartitioner;
import org.apache.blur.mapreduce.lib.update.IndexKeyWritableComparator;
import org.apache.blur.mapreduce.lib.update.IndexValue;
import org.apache.blur.mapreduce.lib.update.UpdateReducer;
import org.apache.blur.thirdparty.thrift_0_9_0.TException;
import org.apache.blur.thrift.BlurClient;
import org.apache.blur.thrift.generated.Blur.Iface;
import org.apache.blur.thrift.generated.BlurException;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.blur.thrift.generated.TableStats;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskReport;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class IndexerJobDriver extends Configured implements Tool {

  public static final String BLUR_UPDATE_ID = "blur.update.id";
  private static final String BLUR_EXEC_TYPE = "blur.exec.type";
  public static final String TMP = "tmp";

  public enum EXEC {
    MR_ONLY, MR_WITH_LOOKUP, AUTOMATIC
  }
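  // EXEC picks how existing rows reach the update job: MR_ONLY streams every
  // existing row through the mappers, MR_WITH_LOOKUP always builds the
  // partitioned row id lookup, and AUTOMATIC builds the lookup and lets the
  // pruned input formats decide per shard.  The mode is read from the
  // "blur.exec.type" key and defaults to AUTOMATIC.
  //
  // The constants below name the working subdirectories: inputs arrive in
  // "new", sit in "inprogress" while a run is active, and land in "complete"
  // on success; "cache" holds the reusable row id lookup files.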
  public static final String MRUPDATE_SNAPSHOT = "mrupdate-snapshot";
  public static final String CACHE = "cache";
  public static final String COMPLETE = "complete";
  public static final String INPROGRESS = "inprogress";
  public static final String NEW = "new";

  private static final Log LOG = LogFactory.getLog(IndexerJobDriver.class);

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new IndexerJobDriver(), args);
    System.exit(res);
  }

  /**
   * Holds the partitioned lookup data path plus the per-shard counter values
   * (indexed by reduce task id) gathered from the partitioning job's task
   * reports.
   */
  static class PartitionedInputResult {
    final Path _partitionedInputData;
    final Counters _counters;
    final long[] _rowIdsFromNewData;
    final long[] _rowIdsToUpdateFromNewData;
    final long[] _rowIdsFromIndex;

    PartitionedInputResult(Path partitionedInputData, Counters counters, int shards, TaskReport[] taskReports) {
      _partitionedInputData = partitionedInputData;
      _counters = counters;
      _rowIdsFromNewData = new long[shards];
      _rowIdsToUpdateFromNewData = new long[shards];
      _rowIdsFromIndex = new long[shards];
      for (TaskReport tr : taskReports) {
        int id = tr.getTaskID().getId();
        Counters taskCounters = tr.getTaskCounters();
        Counter total = taskCounters.findCounter(BlurIndexCounter.ROW_IDS_FROM_NEW_DATA);
        _rowIdsFromNewData[id] = total.getValue();
        Counter update = taskCounters.findCounter(BlurIndexCounter.ROW_IDS_TO_UPDATE_FROM_NEW_DATA);
        _rowIdsToUpdateFromNewData[id] = update.getValue();
        Counter index = taskCounters.findCounter(BlurIndexCounter.ROW_IDS_FROM_INDEX);
        _rowIdsFromIndex[id] = index.getValue();
      }
    }
  }
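  /**
   * One incremental update pass: snapshot the table, move the input files from
   * "new" to "inprogress", run the selected update strategy, and on success
   * load the job output into the table and move the inputs to "complete".  On
   * failure the inputs are moved back to "new" so the next run retries them;
   * the tmp dir and the snapshot are cleaned up either way.
   */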
  @Override
  public int run(String[] args) throws Exception {
    int c = 0;
    if (args.length < 5) {
      System.err.println("Usage: IndexerJobDriver <table> <mr inc working path> <output path> <zk connection> "
          + "<reducer multiplier> <extra config files...>");
      return 1;
    }
    String table = args[c++];
    String mrIncWorkingPathStr = args[c++];
    String outputPathStr = args[c++];
    String blurZkConnection = args[c++];
    int reducerMultiplier = Integer.parseInt(args[c++]);
    for (; c < args.length; c++) {
      String externalConfigFileToAdd = args[c];
      getConf().addResource(new Path(externalConfigFileToAdd));
    }

    Path outputPath = new Path(outputPathStr);
    Path mrIncWorkingPath = new Path(mrIncWorkingPathStr);
    FileSystem fileSystem = mrIncWorkingPath.getFileSystem(getConf());

    Path newData = new Path(mrIncWorkingPath, NEW);
    Path inprogressData = new Path(mrIncWorkingPath, INPROGRESS);
    Path completeData = new Path(mrIncWorkingPath, COMPLETE);
    Path fileCache = new Path(mrIncWorkingPath, CACHE);
    Path tmpPathDontDelete = new Path(mrIncWorkingPath, TMP);
    Path tmpPath = new Path(tmpPathDontDelete, UUID.randomUUID().toString());

    fileSystem.mkdirs(newData);
    fileSystem.mkdirs(inprogressData);
    fileSystem.mkdirs(completeData);
    fileSystem.mkdirs(fileCache);

    List<Path> srcPathList = new ArrayList<Path>();
    for (FileStatus fileStatus : fileSystem.listStatus(newData)) {
      srcPathList.add(fileStatus.getPath());
    }
    if (srcPathList.isEmpty()) {
      return 0;
    }

    List<Path> inprogressPathList = new ArrayList<Path>();
    boolean success = false;
    Iface client = null;
    EXEC exec = EXEC.valueOf(getConf().get(BLUR_EXEC_TYPE, EXEC.AUTOMATIC.name()).toUpperCase());
    String uuid = UUID.randomUUID().toString();
    try {
      client = BlurClient.getClientFromZooKeeperConnectionStr(blurZkConnection);
      TableDescriptor descriptor = client.describe(table);
      Map<String, String> tableProperties = descriptor.getTableProperties();
      String fastDir = tableProperties.get("blur.table.disable.fast.dir");
      if (fastDir == null || !fastDir.equals("true")) {
        LOG.error("Table [{0}] does not have blur.table.disable.fast.dir set to true; "
            + "the fast dir is not supported in fast MR update.", table);
        return 1;
      }
      waitForOtherSnapshotsToBeRemoved(client, table, MRUPDATE_SNAPSHOT);
      client.createSnapshot(table, MRUPDATE_SNAPSHOT);
      TableStats tableStats = client.tableStats(table);

      inprogressPathList = movePathList(fileSystem, inprogressData, srcPathList);

      switch (exec) {
      case MR_ONLY:
        success = runMrOnly(descriptor, inprogressPathList, table, fileCache, outputPath, reducerMultiplier);
        break;
      case MR_WITH_LOOKUP:
        success = runMrWithLookup(uuid, descriptor, inprogressPathList, table, fileCache, outputPath,
            reducerMultiplier, tmpPath, tableStats, MRUPDATE_SNAPSHOT);
        break;
      case AUTOMATIC:
        success = runAutomatic(uuid, descriptor, inprogressPathList, table, fileCache, outputPath, reducerMultiplier,
            tmpPath, tableStats, MRUPDATE_SNAPSHOT);
        break;
      default:
        throw new RuntimeException("Exec type [" + exec + "] not supported.");
      }
    } finally {
      if (success) {
        LOG.info("Associating lookup cache with new data.");
        associateLookupCache(uuid, fileCache, outputPath);
        LOG.info("Indexing job succeeded!");
        client.loadData(table, outputPathStr);
        LOG.info("Load data called.");
        movePathList(fileSystem, completeData, inprogressPathList);
        LOG.info("Input data moved to complete.");
        ClusterDriver.waitForDataToLoad(client, table);
        LOG.info("Data loaded.");
      } else {
        LOG.error("Indexing job failed!");
        movePathList(fileSystem, newData, inprogressPathList);
      }
      fileSystem.delete(tmpPath, true);
      if (client != null) {
        client.removeSnapshot(table, MRUPDATE_SNAPSHOT);
      }
    }
    return success ? 0 : 1;
  }

  private void associateLookupCache(String uuid, Path fileCache, Path outputPath) throws IOException {
    FileSystem fileSystem = fileCache.getFileSystem(getConf());
    cleanupExtraFileFromSpecX(fileSystem, uuid, fileCache);
    associateLookupCache(fileSystem, uuid, fileSystem.getFileStatus(fileCache), outputPath);
  }

  private void cleanupExtraFileFromSpecX(FileSystem fileSystem, String uuid, Path fileCache) throws IOException {
    FileStatus[] listStatus = fileSystem.listStatus(fileCache);
    List<FileStatus> uuidPaths = new ArrayList<FileStatus>();
    for (FileStatus fs : listStatus) {
      Path path = fs.getPath();
      if (fs.isDirectory()) {
        cleanupExtraFileFromSpecX(fileSystem, uuid, path);
      } else if (path.getName().startsWith(uuid)) {
        uuidPaths.add(fs);
      }
    }
    if (uuidPaths.size() > 1) {
      deleteIncomplete(fileSystem, uuidPaths);
    }
  }

  private void deleteIncomplete(FileSystem fileSystem, List<FileStatus> uuidPaths) throws IOException {
    // Keep the largest file for this uuid and delete the rest, treating the
    // smaller files as incomplete writes.
    long max = 0;
    FileStatus keeper = null;
    for (FileStatus fs : uuidPaths) {
      long len = fs.getLen();
      if (len > max) {
        keeper = fs;
        max = len;
      }
    }
    for (FileStatus fs : uuidPaths) {
      if (fs != keeper) {
        LOG.info("Deleting incomplete cache file [{0}]", fs.getPath());
        fileSystem.delete(fs.getPath(), false);
      }
    }
  }

  private void associateLookupCache(FileSystem fileSystem, String uuid, FileStatus fileCache, Path outputPath)
      throws IOException {
    Path path = fileCache.getPath();
    if (fileCache.isDirectory()) {
      FileStatus[] listStatus = fileSystem.listStatus(path);
      for (FileStatus fs : listStatus) {
        associateLookupCache(fileSystem, uuid, fs, outputPath);
      }
    } else if (path.getName().startsWith(uuid)) {
      Path parent = path.getParent();
      String shardName = parent.getName();
      Path indexPath = findOutputDirPath(outputPath, shardName);
      LOG.info("Path found for shard [{0}] outputPath [{1}]", shardName, outputPath);
      String id = MergeSortRowIdMatcher.getIdForSingleSegmentIndex(getConf(), indexPath);
      Path file = new Path(path.getParent(), id + ".seq");
      MergeSortRowIdMatcher.commitWriter(getConf(), file, path);
    }
  }
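  // Resolves the single committed shard directory (the "*.commit" dir) under
  // the job output path for the given shard name; zero or more than one commit
  // dir is reported as an error.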
  private Path findOutputDirPath(Path outputPath, String shardName) throws IOException {
    FileSystem fileSystem = outputPath.getFileSystem(getConf());
    Path shardPath = new Path(outputPath, shardName);
    if (!fileSystem.exists(shardPath)) {
      throw new IOException("Shard path [" + shardPath + "] does not exist.");
    }
    FileStatus[] listStatus = fileSystem.listStatus(shardPath, new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return path.getName().endsWith(".commit");
      }
    });
    if (listStatus.length == 1) {
      FileStatus fs = listStatus[0];
      return fs.getPath();
    } else {
      throw new IOException("Expected exactly one .commit dir under [" + shardPath + "] but found ["
          + listStatus.length + "].");
    }
  }

  private boolean runAutomatic(String uuid, TableDescriptor descriptor, List<Path> inprogressPathList, String table,
      Path fileCache, Path outputPath, int reducerMultiplier, Path tmpPath, TableStats tableStats, String snapshot)
      throws ClassNotFoundException, IOException, InterruptedException {
    PartitionedInputResult result = buildPartitionedInputData(uuid, tmpPath, descriptor, inprogressPathList, snapshot,
        fileCache);

    Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");

    InputSplitPruneUtil.setBlurLookupRowIdFromNewDataCounts(job, table, result._rowIdsFromNewData);
    InputSplitPruneUtil.setBlurLookupRowIdUpdateFromNewDataCounts(job, table, result._rowIdsToUpdateFromNewData);
    InputSplitPruneUtil.setBlurLookupRowIdFromIndexCounts(job, table, result._rowIdsFromIndex);
    InputSplitPruneUtil.setTable(job, table);

    BlurInputFormat.setLocalCachePath(job, fileCache);

    // Existing data - adds the existing table data; the files are opened and
    // streamed through all documents.
    {
      Path tablePath = new Path(descriptor.getTableUri());
      BlurInputFormat.addTable(job, descriptor, MRUPDATE_SNAPSHOT);
      MultipleInputs.addInputPath(job, tablePath, PrunedBlurInputFormat.class, ExistingDataMapper.class);
    }

    // Existing data - adds the row id lookup.
    {
      ExistingDataIndexLookupMapper.setSnapshot(job, MRUPDATE_SNAPSHOT);
      FileInputFormat.addInputPath(job, result._partitionedInputData);
      MultipleInputs.addInputPath(job, result._partitionedInputData, PrunedSequenceFileInputFormat.class,
          ExistingDataIndexLookupMapper.class);
    }

    // New data.
    for (Path p : inprogressPathList) {
      FileInputFormat.addInputPath(job, p);
      MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, NewDataMapper.class);
    }

    BlurOutputFormat.setOutputPath(job, outputPath);
    BlurOutputFormat.setupJob(job, descriptor);
    job.setReducerClass(UpdateReducer.class);
    job.setMapOutputKeyClass(IndexKey.class);
    job.setMapOutputValueClass(IndexValue.class);
    job.setPartitionerClass(IndexKeyPartitioner.class);
    job.setGroupingComparatorClass(IndexKeyWritableComparator.class);
    BlurOutputFormat.setReducerMultiplier(job, reducerMultiplier);

    boolean success = job.waitForCompletion(true);
    Counters counters = job.getCounters();
    LOG.info("Counters [" + counters + "]");
    return success;
  }
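  // Like runAutomatic, but the row id lookup input always runs through the
  // plain SequenceFileInputFormat, so every existing-row lookup is performed
  // rather than pruned per shard.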
  private boolean runMrWithLookup(String uuid, TableDescriptor descriptor, List<Path> inprogressPathList, String table,
      Path fileCache, Path outputPath, int reducerMultiplier, Path tmpPath, TableStats tableStats, String snapshot)
      throws ClassNotFoundException, IOException, InterruptedException {
    PartitionedInputResult result = buildPartitionedInputData(uuid, tmpPath, descriptor, inprogressPathList, snapshot,
        fileCache);

    Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");

    ExistingDataIndexLookupMapper.setSnapshot(job, MRUPDATE_SNAPSHOT);
    FileInputFormat.addInputPath(job, result._partitionedInputData);
    MultipleInputs.addInputPath(job, result._partitionedInputData, SequenceFileInputFormat.class,
        ExistingDataIndexLookupMapper.class);

    for (Path p : inprogressPathList) {
      FileInputFormat.addInputPath(job, p);
      MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, NewDataMapper.class);
    }

    BlurOutputFormat.setOutputPath(job, outputPath);
    BlurOutputFormat.setupJob(job, descriptor);
    job.setReducerClass(UpdateReducer.class);
    job.setMapOutputKeyClass(IndexKey.class);
    job.setMapOutputValueClass(IndexValue.class);
    job.setPartitionerClass(IndexKeyPartitioner.class);
    job.setGroupingComparatorClass(IndexKeyWritableComparator.class);
    BlurOutputFormat.setReducerMultiplier(job, reducerMultiplier);

    boolean success = job.waitForCompletion(true);
    Counters counters = job.getCounters();
    LOG.info("Counters [" + counters + "]");
    return success;
  }

  private boolean runMrOnly(TableDescriptor descriptor, List<Path> inprogressPathList, String table, Path fileCache,
      Path outputPath, int reducerMultiplier) throws IOException, ClassNotFoundException, InterruptedException {
    Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");

    Path tablePath = new Path(descriptor.getTableUri());
    BlurInputFormat.setLocalCachePath(job, fileCache);
    BlurInputFormat.addTable(job, descriptor, MRUPDATE_SNAPSHOT);
    MultipleInputs.addInputPath(job, tablePath, BlurInputFormat.class, ExistingDataMapper.class);

    for (Path p : inprogressPathList) {
      FileInputFormat.addInputPath(job, p);
      MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, NewDataMapper.class);
    }

    BlurOutputFormat.setOutputPath(job, outputPath);
    BlurOutputFormat.setupJob(job, descriptor);
    job.setReducerClass(UpdateReducer.class);
    job.setMapOutputKeyClass(IndexKey.class);
    job.setMapOutputValueClass(IndexValue.class);
    job.setPartitionerClass(IndexKeyPartitioner.class);
    job.setGroupingComparatorClass(IndexKeyWritableComparator.class);
    BlurOutputFormat.setReducerMultiplier(job, reducerMultiplier);

    boolean success = job.waitForCompletion(true);
    Counters counters = job.getCounters();
    LOG.info("Counters [" + counters + "]");
    return success;
  }
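  /**
   * Runs the partitioning job that builds the lookup input: LookupBuilderMapper
   * emits Text keys (row ids, per the counter names) from the new data and
   * LookupBuilderReducer writes one sequence file per shard.  The reduce task
   * reports supply the per-shard counts that PartitionedInputResult captures
   * for split pruning.
   */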
  private PartitionedInputResult buildPartitionedInputData(String uuid, Path tmpPath, TableDescriptor descriptor,
      List<Path> inprogressPathList, String snapshot, Path fileCachePath) throws IOException, ClassNotFoundException,
      InterruptedException {
    Job job = Job.getInstance(getConf(), "Partitioning data for table [" + descriptor.getName() + "]");
    // Needed for the bloom filter path information.
    job.getConfiguration().set(BLUR_UPDATE_ID, uuid);
    BlurOutputFormat.setTableDescriptor(job, descriptor);
    BlurInputFormat.setLocalCachePath(job, fileCachePath);
    ExistingDataIndexLookupMapper.setSnapshot(job, snapshot);

    for (Path p : inprogressPathList) {
      FileInputFormat.addInputPath(job, p);
    }
    Path outputPath = new Path(tmpPath, UUID.randomUUID().toString());
    job.setJarByClass(getClass());
    job.setMapperClass(LookupBuilderMapper.class);
    job.setReducerClass(LookupBuilderReducer.class);

    int shardCount = descriptor.getShardCount();
    job.setNumReduceTasks(shardCount);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BooleanWritable.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    if (job.waitForCompletion(true)) {
      return new PartitionedInputResult(outputPath, job.getCounters(), shardCount, job.getTaskReports(TaskType.REDUCE));
    } else {
      throw new IOException("Partitioning failed!");
    }
  }

  private void waitForOtherSnapshotsToBeRemoved(Iface client, String table, String snapshot) throws BlurException,
      TException, InterruptedException {
    while (true) {
      Map<String, List<String>> listSnapshots = client.listSnapshots(table);
      boolean mrupdateSnapshots = false;
      for (Entry<String, List<String>> e : listSnapshots.entrySet()) {
        List<String> value = e.getValue();
        if (value.contains(snapshot)) {
          mrupdateSnapshots = true;
        }
      }
      if (!mrupdateSnapshots) {
        return;
      } else {
        LOG.info("Snapshot [{0}] for table [{1}] already exists.", snapshot, table);
        Thread.sleep(TimeUnit.SECONDS.toMillis(10));
        LOG.info("Retrying");
      }
    }
  }

  private List<Path> movePathList(FileSystem fileSystem, Path dstDir, List<Path> lst) throws IOException {
    List<Path> result = new ArrayList<Path>();
    for (Path src : lst) {
      Path dst = new Path(dstDir, src.getName());
      if (fileSystem.rename(src, dst)) {
        LOG.info("Moved [{0}] to [{1}]", src, dst);
        result.add(dst);
      } else {
        LOG.error("Could not move [{0}] to [{1}]", src, dst);
      }
    }
    return result;
  }
}