/** * Copyright 2013 Netflix, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.netflix; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.netflix.aegisthus.input.AegisthusCombinedInputFormat; import com.netflix.aegisthus.input.AegisthusInputFormat; import com.netflix.aegisthus.io.writable.AegisthusKey; import com.netflix.aegisthus.io.writable.AegisthusKeyGroupingComparator; import com.netflix.aegisthus.io.writable.AegisthusKeyMapper; import com.netflix.aegisthus.io.writable.AegisthusKeyPartitioner; import com.netflix.aegisthus.io.writable.AegisthusKeySortingComparator; import com.netflix.aegisthus.io.writable.AtomWritable; import com.netflix.aegisthus.io.writable.RowWritable; import com.netflix.aegisthus.mapreduce.CassSSTableReducer; import com.netflix.aegisthus.output.CustomFileNameFileOutputFormat; import com.netflix.aegisthus.output.JsonOutputFormat; import com.netflix.aegisthus.output.SSTableOutputFormat; import com.netflix.aegisthus.tools.DirectoryWalker; import com.netflix.aegisthus.util.CFMetadataUtility; import com.netflix.aegisthus.util.JobKiller; import org.apache.cassandra.config.CFMetaData; import org.apache.cassandra.io.sstable.Descriptor; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.List; import java.util.Set; import java.util.jar.Attributes; import java.util.jar.Manifest; public class Aegisthus extends Configured implements Tool { private static final Logger LOG = LoggerFactory.getLogger(Aegisthus.class); private static void logAegisthusVersion() { String classPath = Aegisthus.class.getResource("Aegisthus.class").toString(); String manifestPath = classPath.replace("com/netflix/Aegisthus.class", "META-INF/MANIFEST.MF"); try (InputStream inputStream = new URL(manifestPath).openStream()) { Manifest manifest = new Manifest(inputStream); Attributes attr = manifest.getMainAttributes(); System.out.println("Running Aegisthus version " + attr.getValue("Implementation-Version") + " built from change " + attr.getValue("Change") + " on host " + attr.getValue("Build-Host") + " on " + attr.getValue("Build-Date") + " with Java " + attr.getValue("Build-Java-Version") ); } catch (IOException ignored) { System.out.println("Unable to locate Aegisthus manifest file"); } } public static void main(String[] args) throws Exception { logAegisthusVersion(); int res = ToolRunner.run(new Configuration(), new Aegisthus(), args); boolean exit = Boolean.valueOf(System.getProperty(Feature.CONF_SYSTEM_EXIT, "true")); if (exit) { System.exit(res); } else if (res != 0) { throw new RuntimeException("aegisthus finished with a non-zero exit code: " + res); } } private void setConfigurationFromCql(Configuration conf) { CFMetaData cfMetaData = CFMetadataUtility.initializeCfMetaData(conf); String keyType = cfMetaData.getKeyValidator().toString(); String columnType = cfMetaData.comparator.toString(); LOG.info("From CQL3, setting keyType({}) and columnType({}).", keyType, columnType); conf.set(Feature.CONF_KEYTYPE, keyType); conf.set(Feature.CONF_COLUMNTYPE, columnType); } List<Path> getDataFiles(Configuration conf, String dir) throws IOException { Set<Path> globs = Sets.newHashSet(); Iterable<Path> paths = DirectoryWalker.with(conf) .add(dir) .recursive(true) .omitHidden(true) .manifest(false) .threaded() .paths(); for (Path path : paths) { String pathName = path.getName(); if (pathName.endsWith("-Data.db")) { Path outputPath = new Path(path.getParent(), pathName.replaceAll("[^/]+-Data.db", "*-Data.db")); globs.add(outputPath); } } return ImmutableList.copyOf(globs); } @SuppressWarnings("static-access") CommandLine getOptions(String[] args) { Options opts = new Options(); opts.addOption(OptionBuilder.withArgName(Feature.CMD_ARG_INPUT_FILE) .withDescription("Each input location") .hasArgs() .create(Feature.CMD_ARG_INPUT_FILE)); opts.addOption(OptionBuilder.withArgName(Feature.CMD_ARG_OUTPUT_DIR) .isRequired() .withDescription("output location") .hasArg() .create(Feature.CMD_ARG_OUTPUT_DIR)); opts.addOption(OptionBuilder.withArgName(Feature.CMD_ARG_INPUT_DIR) .withDescription("a directory from which we will recursively pull sstables") .hasArgs() .create(Feature.CMD_ARG_INPUT_DIR)); opts.addOption(OptionBuilder.withArgName(Feature.CMD_ARG_PRODUCE_SSTABLE) .withDescription("produces sstable output (default is to produce json)") .create(Feature.CMD_ARG_PRODUCE_SSTABLE)); opts.addOption(OptionBuilder.withArgName(Feature.CMD_ARG_COMBINE_SPLITS) .withDescription("combine together small input splits (default is to not combine input splits)") .create(Feature.CMD_ARG_COMBINE_SPLITS)); opts.addOption(OptionBuilder.withArgName(Feature.CMD_ARG_SSTABLE_OUTPUT_VERSION) .withDescription("version of sstable to produce (default is to produce " + Descriptor.Version.current_version + ")") .hasArg() .create(Feature.CMD_ARG_SSTABLE_OUTPUT_VERSION)); CommandLineParser parser = new GnuParser(); try { CommandLine cl = parser.parse(opts, args, true); if (!(cl.hasOption(Feature.CMD_ARG_INPUT_FILE) || cl.hasOption(Feature.CMD_ARG_INPUT_DIR))) { System.out.println("Must have either an input or inputDir option"); HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(String.format("hadoop jar aegisthus.jar %s", Aegisthus.class.getName()), opts); return null; } return cl; } catch (ParseException e) { System.out.println("Unexpected exception:" + e.getMessage()); HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(String.format("hadoop jar aegisthus.jar %s", Aegisthus.class.getName()), opts); return null; } } @Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf()); Configuration configuration = job.getConfiguration(); job.setJarByClass(Aegisthus.class); CommandLine cl = getOptions(args); if (cl == null) { return 1; } // Check all of the paths and load the sstable version from the input filenames List<Path> paths = Lists.newArrayList(); if (cl.hasOption(Feature.CMD_ARG_INPUT_FILE)) { for (String input : cl.getOptionValues(Feature.CMD_ARG_INPUT_FILE)) { paths.add(new Path(input)); } } if (cl.hasOption(Feature.CMD_ARG_INPUT_DIR)) { paths.addAll(getDataFiles(configuration, cl.getOptionValue(Feature.CMD_ARG_INPUT_DIR))); } LOG.info("Processing paths: {}", paths); // At this point we have the version of sstable that we can use for this run Descriptor.Version version = Descriptor.Version.CURRENT; if (cl.hasOption(Feature.CMD_ARG_SSTABLE_OUTPUT_VERSION)) { version = new Descriptor.Version(cl.getOptionValue(Feature.CMD_ARG_SSTABLE_OUTPUT_VERSION)); } configuration.set(Feature.CONF_SSTABLE_VERSION, version.toString()); if (configuration.get(Feature.CONF_CQL_SCHEMA) != null) { setConfigurationFromCql(configuration); } if(cl.hasOption(Feature.CMD_ARG_COMBINE_SPLITS)) { job.setInputFormatClass(AegisthusCombinedInputFormat.class); } else { job.setInputFormatClass(AegisthusInputFormat.class); } job.setMapOutputKeyClass(AegisthusKey.class); job.setMapOutputValueClass(AtomWritable.class); job.setOutputKeyClass(AegisthusKey.class); job.setOutputValueClass(RowWritable.class); job.setMapperClass(AegisthusKeyMapper.class); job.setReducerClass(CassSSTableReducer.class); job.setGroupingComparatorClass(AegisthusKeyGroupingComparator.class); job.setPartitionerClass(AegisthusKeyPartitioner.class); job.setSortComparatorClass(AegisthusKeySortingComparator.class); TextInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()])); if (cl.hasOption(Feature.CMD_ARG_PRODUCE_SSTABLE)) { job.setOutputFormatClass(SSTableOutputFormat.class); } else { job.setOutputFormatClass(JsonOutputFormat.class); } CustomFileNameFileOutputFormat.setOutputPath(job, new Path(cl.getOptionValue(Feature.CMD_ARG_OUTPUT_DIR))); job.submit(); if (configuration.getBoolean(Feature.CONF_SHUTDOWN_HOOK, true)) { Runtime.getRuntime().addShutdownHook(new JobKiller(job)); } System.out.println(job.getJobID()); System.out.println(job.getTrackingURL()); boolean success = job.waitForCompletion(true); if (success) { Counter errorCounter = job.getCounters().findCounter("aegisthus", "error_skipped_input"); long errorCount = errorCounter != null ? errorCounter.getValue() : 0L; int maxAllowed = configuration.getInt(Feature.CONF_MAX_CORRUPT_FILES_TO_SKIP, 0); if (errorCounter != null && errorCounter.getValue() > maxAllowed) { LOG.error("Found {} corrupt files which is greater than the max allowed {}", errorCount, maxAllowed); success = false; } else if (errorCount > 0) { LOG.warn("Found {} corrupt files but not failing the job because the max allowed is {}", errorCount, maxAllowed); } } return success ? 0 : 1; } public static final class Feature { public static final String CMD_ARG_COMBINE_SPLITS = "combineSplits"; public static final String CMD_ARG_INPUT_DIR = "inputDir"; public static final String CMD_ARG_INPUT_FILE = "input"; public static final String CMD_ARG_OUTPUT_DIR = "output"; public static final String CMD_ARG_PRODUCE_SSTABLE = "produceSSTable"; public static final String CMD_ARG_SSTABLE_OUTPUT_VERSION = "sstable_output_version"; /** * If set this is the blocksize aegisthus will use when splitting input files otherwise the hadoop vaule will * be used. */ public static final String CONF_BLOCKSIZE = "aegisthus.blocksize"; /** * The column type, used for sorting columns in all output formats and also in the JSON output format. The * default is BytesType. */ public static final String CONF_COLUMNTYPE = "aegisthus.columntype"; /** * The converter to use for the column value, used in the JSON output format. The default is BytesType. */ public static final String CONF_COLUMN_VALUE_TYPE = "aegisthus.column_value_type"; /** * Name of the keyspace and dataset to use for the output sstable file name. The default is "keyspace-dataset". */ public static final String CONF_DATASET = "aegisthus.dataset"; /** * The converter to use for the key, used in the JSON output format. The default is BytesType. */ public static final String CONF_KEYTYPE = "aegisthus.keytype"; /** * Earlier versions of Aegisthus did extra formatting on just the column name. This defaults to false. */ public static final String CONF_LEGACY_COLUMN_NAME_FORMATTING = "aegisthus.legacy_column_name_formatting"; /** * If set rows with columns larger than this size will be dropped during the reduce stage. * For legacy reasons this is based on the size of the columns on disk in SSTable format not the string size of * the columns. */ public static final String CONF_MAXCOLSIZE = "aegisthus.maxcolsize"; /** * The maximum number of files that can be combined in a single input split. Defaults to 200. */ public static final String CONF_MAX_COMBINED_SPLITS = "aegisthus.max_combined_splits"; /** * The maximum number of corrupt files that Aegisthus can automatically skip. Defaults to 0. */ public static final String CONF_MAX_CORRUPT_FILES_TO_SKIP = "aegisthus.max_corrupt_files_to_skip"; /** * Whether to add a shutdown hook to kill the hadoop job. Defaults to true. */ public static final String CONF_SHUTDOWN_HOOK = "aegisthus.shutdown_hook"; /** * Sort the columns by name rather than by the order in Cassandra. This defaults to false. */ public static final String CONF_SORT_COLUMNS_BY_NAME = "aegisthus.sort_columns_by_name"; /** * The version of SSTable to input and output. */ public static final String CONF_SSTABLE_VERSION = "aegisthus.version_of_sstable"; /** * Configures if the System.exit should be called to end the processing in main. Defaults to true. */ public static final String CONF_SYSTEM_EXIT = "aegisthus.exit"; /** * The CQL "Create Table" statement that defines the schema of the input sstables. */ public static final String CONF_CQL_SCHEMA = "aegisthus.cql_schema"; /** * When this is enabled, Aegisthus keeps track of which source file all data came from. When used with * the json output format the filename will be output with the row. Note: that this is just for debugging, * when enabled rows from different source files will not be combined. Defaults to false. */ public static final String CONF_TRACE_DATA_FROM_SOURCE = "aegisthus.trace_source"; } }