package org.coursera;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.netflix.Aegisthus;
import com.netflix.aegisthus.input.AegisthusInputFormat;
import com.netflix.aegisthus.tools.DirectoryWalker;
import com.netflix.aegisthus.util.CFMetadataUtility;
import org.apache.avro.Schema;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.io.sstable.Descriptor;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.coursera.mapreducer.CQLMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.List;
import java.util.Set;
public class SSTableExport extends Configured implements Tool {
private static final Logger LOG = LoggerFactory.getLogger(SSTableExport.class);
private Descriptor.Version version;
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new SSTableExport(), args);
boolean exit = Boolean.valueOf(System.getProperty(Aegisthus.Feature.CONF_SYSTEM_EXIT, "true"));
if (exit) {
System.exit(res);
} else if (res != 0) {
throw new RuntimeException("SSTableExport finished with a non-zero exit code: " + res);
}
}
private void checkVersionFromFilename(String filename) {
Descriptor descriptor = Descriptor.fromFilename(filename);
if (this.version == null) {
this.version = descriptor.version;
} else if (!this.version.equals(descriptor.version)) {
throw new IllegalStateException("All files must have the same sstable version. File '" + filename
+ "' has version '" + descriptor.version + "' and we have already seen a file with version '"
+ version + "'");
}
}
private void setConfigurationFromCql(Configuration conf) {
CFMetaData cfMetaData = CFMetadataUtility.initializeCfMetaData(conf);
String keyType = cfMetaData.getKeyValidator().toString();
String columnType = cfMetaData.comparator.toString();
LOG.info("From CQL3, setting keyType({}) and columnType({}).", keyType, columnType);
conf.set(Aegisthus.Feature.CONF_KEYTYPE, keyType);
conf.set(Aegisthus.Feature.CONF_COLUMNTYPE, columnType);
}
List<Path> getDataFiles(Configuration conf, String dir) throws IOException {
Set<String> globs = Sets.newHashSet();
List<Path> output = Lists.newArrayList();
Path dirPath = new Path(dir);
FileSystem fs = dirPath.getFileSystem(conf);
List<FileStatus> input = Lists.newArrayList(fs.listStatus(dirPath));
for (String path : DirectoryWalker.with(conf).threaded().addAllStatuses(input).pathsString()) {
if (path.endsWith("-Data.db")) {
checkVersionFromFilename(path);
globs.add(path.replaceAll("[^/]+-Data.db", "*-Data.db"));
}
}
for (String path : globs) {
output.add(new Path(path));
}
return output;
}
@SuppressWarnings("static-access")
CommandLine getOptions(String[] args) {
Options opts = new Options();
opts.addOption(OptionBuilder.withArgName(Feature.CMD_ARG_INPUT_FILE)
.withDescription("Each input location")
.hasArgs()
.create(Feature.CMD_ARG_INPUT_FILE));
opts.addOption(OptionBuilder.withArgName(Feature.CMD_ARG_INPUT_DIR)
.withDescription("a directory from which we will recursively pull sstables")
.hasArgs()
.create(Feature.CMD_ARG_INPUT_DIR));
opts.addOption(OptionBuilder.withArgName(Feature.CMD_ARG_AVRO_SCHEMA_FILE)
.withDescription("location of avro schema")
.isRequired()
.hasArgs()
.create(Feature.CMD_ARG_AVRO_SCHEMA_FILE));
opts.addOption(OptionBuilder.withArgName(Feature.CMD_ARG_OUTPUT_DIR)
.isRequired()
.withDescription("output location")
.hasArg()
.create(Feature.CMD_ARG_OUTPUT_DIR));
CommandLineParser parser = new GnuParser();
try {
CommandLine cl = parser.parse(opts, args, true);
if (!(cl.hasOption(Feature.CMD_ARG_INPUT_FILE) || cl.hasOption(Feature.CMD_ARG_INPUT_DIR))) {
System.out.println("Must have either an input or inputDir option");
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(String.format("hadoop jar aegisthus.jar %s", SSTableExport.class.getName()), opts);
return null;
}
return cl;
} catch (ParseException e) {
System.out.println("Unexpected exception:" + e.getMessage());
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(String.format("hadoop jar aegisthus.jar %s", SSTableExport.class.getName()), opts);
return null;
}
}
private String getAvroSchema(String schemaLocation, Configuration conf) throws IOException {
Path schemaPath = new Path(schemaLocation);
return IOUtils.toString(schemaPath.getFileSystem(conf).open(schemaPath));
}
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(getConf());
job.setJarByClass(SSTableExport.class);
CommandLine cl = getOptions(args);
if (cl == null) {
return 1;
}
// Check all of the paths and load the sstable version from the input filenames
List<Path> paths = Lists.newArrayList();
if (cl.hasOption(Feature.CMD_ARG_INPUT_FILE)) {
for (String input : cl.getOptionValues(Feature.CMD_ARG_INPUT_FILE)) {
checkVersionFromFilename(input);
paths.add(new Path(input));
}
}
if (cl.hasOption(Feature.CMD_ARG_INPUT_DIR)) {
paths.addAll(getDataFiles(job.getConfiguration(), cl.getOptionValue(Feature.CMD_ARG_INPUT_DIR)));
}
String avroSchemaString = getAvroSchema(cl.getOptionValue(Feature.CMD_ARG_AVRO_SCHEMA_FILE), job.getConfiguration());
Schema avroSchema = new Schema.Parser().parse(avroSchemaString);
// At this point we have the version of sstable that we can use for this run
job.getConfiguration().set(Aegisthus.Feature.CONF_SSTABLE_VERSION, version.toString());
if (job.getConfiguration().get(Aegisthus.Feature.CONF_CQL_SCHEMA) != null) {
setConfigurationFromCql(job.getConfiguration());
}
job.setInputFormatClass(AegisthusInputFormat.class);
job.setMapperClass(CQLMapper.class);
job.setOutputFormatClass(AvroKeyOutputFormat.class);
AvroJob.setOutputKeySchema(job, avroSchema);
// Map-only job
job.setNumReduceTasks(0);
TextInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
FileOutputFormat.setOutputPath(job, new Path(cl.getOptionValue(Feature.CMD_ARG_OUTPUT_DIR)));
job.submit();
System.out.println(job.getJobID());
System.out.println(job.getTrackingURL());
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
public static final class Feature {
public static final String CMD_ARG_INPUT_DIR = "inputDir";
public static final String CMD_ARG_INPUT_FILE = "input";
public static final String CMD_ARG_OUTPUT_DIR = "output";
public static final String CMD_ARG_AVRO_SCHEMA_FILE = "avroSchemaFile";
}
}