// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.common;
import it.crs4.seal.common.ClusterUtils;
import java.io.IOException;
import java.io.File;
import java.io.FileReader;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.commons.cli.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class SealToolParser {
/** Default Seal configuration file: ".sealrc" inside the directory named by the "user.home" system property. */
public static final File DefaultConfigFile = new File(System.getProperty("user.home"), ".sealrc");
/** Initial value of the lower bound accepted for the number of reduce tasks. */
public static final int DEFAULT_MIN_REDUCE_TASKS = 0;
/** Initial value of the per-node multiplier used when the number of reduce tasks is computed. */
public static final int DEFAULT_REDUCE_TASKS_PER_NODE = 3;
/** Configuration property holding the selected input format name. */
public static final String INPUT_FORMAT_CONF = "seal.input-format";
/** Configuration property holding the selected output format name. */
public static final String OUTPUT_FORMAT_CONF = "seal.output-format";
/** Base help text of the input format option; the accepted names may be appended to it. */
protected static final String INPUT_FORMAT_DESC = "Input format name";
/** Base help text of the output format option; the accepted names may be appended to it. */
protected static final String OUTPUT_FORMAT_DESC = "Output format name";
/** Configuration property naming the base quality encoding of the input; parseOptions accepts only "sanger" or "illumina". */
public static final String INPUT_FORMAT_ENCODING = "seal.input.base-quality-encoding";
/** Lowest number of reduce tasks the user may request on the command line. */
private int minReduceTasks;
/**
* Configuration object used to parse the command line, cached for further queries.
*/
private Configuration myconf;
/** Command line options known to this parser. */
protected Options options;
// Handles to the options registered by the constructor.
private Option opt_nReduceTasks;
private Option opt_configFileOverride;
private Option opt_inputFormat;
private Option opt_outputFormat;
private Option opt_compressOutput;
/** Input format names accepted by validateIOFormat; null disables the check. */
private String[] acceptedInputFormats;
/** Output format names accepted by validateIOFormat; null disables the check. */
private String[] acceptedOutputFormats;
/** Number of reduce tasks: taken from the command line or computed and cached by getNReduceTasks; null while unknown. */
private Integer nReduceTasks;
/** Multiplier applied to the number of task trackers when nReduceTasks has to be computed. */
private int nReduceTasksPerNode;
/** Name of the configuration file section to load; the empty string when none was given. */
private String configSection;
/** Tool name shown in the usage message. */
protected String toolName;
/** Input paths collected from the positional arguments (globs already expanded). */
protected ArrayList<Path> inputs;
/** Qualified output path taken from the last positional argument; null until parsed. */
private Path outputDir;
/**
* Construct a SealToolParser instance.
*
* The instance is set to read the properties in configuration file's section sectionName,
* in addition to the default section. Properties set on the command line will override
* the file's settings.
*
* @param configSection Name of section of configuration to load, in addition to DEFAULT.
* If null, only DEFAULT is loaded
* @param toolName Name used in the help message
*/
@SuppressWarnings("static") // for OptionBuilder
public SealToolParser(String configSection, String toolName)
{
this.toolName = toolName;
options = new Options(); // empty
opt_nReduceTasks = OptionBuilder
.withDescription("Number of reduce tasks to use.")
.hasArg()
.withArgName("INT")
.withLongOpt("num-reducers")
.create("r");
options.addOption(opt_nReduceTasks);
opt_configFileOverride = OptionBuilder
.withDescription("Override default Seal config file (" + DefaultConfigFile + ")")
.hasArg()
.withArgName("FILE")
.withLongOpt("seal-config")
.create("sc");
options.addOption(opt_configFileOverride);
opt_inputFormat = OptionBuilder
.withDescription(INPUT_FORMAT_DESC)
.hasArg()
.withArgName("FORMAT")
.withLongOpt("input-format")
.create("if");
options.addOption(opt_inputFormat);
opt_outputFormat = OptionBuilder
.withDescription(OUTPUT_FORMAT_DESC)
.hasArg()
.withArgName("FORMAT")
.withLongOpt("output-format")
.create("of");
options.addOption(opt_outputFormat);
opt_compressOutput = OptionBuilder
.withDescription("Compress output files with CODEC (one of gzip, bzip2, snappy, auto)")
.hasArg()
.withArgName("CODEC")
.withLongOpt("compress-output")
.create("oc");
options.addOption(opt_compressOutput);
nReduceTasks = null;
inputs = new ArrayList<Path>(10);
outputDir = null;
this.configSection = (configSection == null) ? "" : configSection;
minReduceTasks = DEFAULT_MIN_REDUCE_TASKS;
myconf = null;
nReduceTasksPerNode = DEFAULT_REDUCE_TASKS_PER_NODE;
}
/**
 * Set the lower bound for the number of reduce tasks a user may request.
 * parseOptions raises an error when the number given on the command line
 * is below this limit.
 *
 * @param x the new minimum; must not be negative
 * @throws IllegalArgumentException if x is negative
 */
public void setMinReduceTasks(int x)
{
    final boolean negative = x < 0;
    if (negative)
    {
        throw new IllegalArgumentException("minimum number of reduce tasks must be >= 0");
    }
    minReduceTasks = x;
}
/**
 * @return the minimum number of reduce tasks currently accepted
 */
public int getMinReduceTasks()
{
    return this.minReduceTasks;
}
/**
 * Copy the properties found in a Seal configuration file into conf.
 *
 * The file is parsed with ConfigFileParser and every key/value pair
 * returned by the iterator for this parser's configuration section is set
 * on conf.  The file is always closed before returning.
 *
 * @param conf  configuration that receives the properties
 * @param fname configuration file to read
 * @throws ParseException if ConfigFileParser reports a FormatException;
 *                        the original exception is attached as the cause
 * @throws IOException    if the file can't be opened or read
 */
protected void loadConfig(Configuration conf, File fname) throws ParseException, IOException
{
    ConfigFileParser parser = new ConfigFileParser();
    FileReader reader = new FileReader(fname);
    try
    {
        parser.load(reader);
        Iterator<ConfigFileParser.KvPair> it = parser.getSectionIterator(configSection);
        while (it.hasNext())
        {
            ConfigFileParser.KvPair pair = it.next();
            conf.set(pair.getKey(), pair.getValue());
        }
    }
    catch (FormatException e)
    {
        // ParseException has no (message, cause) constructor, so attach the cause explicitly
        ParseException wrapped = new ParseException("Error reading config file " + fname + ". " + e);
        wrapped.initCause(e);
        throw wrapped;
    }
    finally
    {
        try
        {
            reader.close();
        }
        catch (IOException ignored)
        {
            // We only read from the file: a failure while closing it can't lose data
            // and must not hide an exception raised by the parsing above.
        }
    }
}
/**
 * Decides whether to use an rc file, and if so which one.
 *
 * This method is necessary only because we'd like the user to be able to override the default
 * location of the seal configuration file ($HOME/.sealrc).  So, it scans
 * the command line arguments looking for a user-specified seal configuration file.
 * The option is recognized in its short form with one or two leading dashes
 * and in its long form with two leading dashes.
 * If a file is specified, the method verifies that it exists and is readable.  If none is
 * specified it checks whether a configuration file is available at the default location,
 * and if it is the method verifies that it is readable.
 *
 * @param args command line arguments
 * @return the configuration file to load, or null if no usable file was found
 * @throws ParseException if the option is missing its argument, or if the file specified
 *                        on the command line doesn't exist or isn't readable
 */
protected File getRcFile(String[] args) throws ParseException
{
    final String shortName = opt_configFileOverride.getOpt();
    final String longName = opt_configFileOverride.getLongOpt();
    // Short options are conventionally written with a single dash ("-sc").
    // The double-dash short form is still accepted for backward compatibility.
    final String singleDashShort = "-" + shortName;
    final String doubleDashShort = "--" + shortName;
    final String doubleDashLong = "--" + longName;

    File fname = null;
    for (int i = 0; i < args.length; ++i)
    {
        String arg = args[i];
        if (arg.equals(singleDashShort) || arg.equals(doubleDashShort) || arg.equals(doubleDashLong))
        {
            if (i + 1 >= args.length)
                throw new ParseException("Missing file argument to " + arg);
            fname = new File(args[i + 1]);
            break; // only the first occurrence is considered
        }
    }

    if (fname != null) // a seal configuration file was specified
    {
        if (!fname.exists())
            throw new ParseException("Configuration file " + fname + " doesn't exist");
        if (!fname.canRead())
            throw new ParseException("Can't read configuration file " + fname);
        // at this point it should be all good.
    }
    else if (DefaultConfigFile.exists()) // none specified; presume that if the default exists the user intends to use it
    {
        if (DefaultConfigFile.canRead())
            fname = DefaultConfigFile;
        else
        {
            // The file exists but it can't be read.  Warn the user and
            // leave fname as null so no configuration file will be used.
            LogFactory.getLog(SealToolParser.class).warn("Seal configuration file " + DefaultConfigFile + " isn't readable");
        }
    }
    return fname;
}
/**
 * Apply the settings shared by every tool of the Seal suite.
 * At present this only switches on compression of the map output.
 *
 * @param conf configuration to modify
 */
protected void setDefaultProperties(Configuration conf)
{
    final String property = "mapred.compress.map.output";
    final String enabled = "true";
    conf.set(property, enabled);
}
/**
 * Parse the command line, reporting any parsing problem as a usage error.
 *
 * Delegates to parseOptions; if that raises a ParseException its message
 * is handed to defaultUsageError, which prints the help and exits.
 *
 * @param conf configuration updated with the parsed settings
 * @param args command line arguments
 * @throws IOException propagated from parseOptions
 */
public void parse(Configuration conf, String[] args) throws IOException
{
    try
    {
        parseOptions(conf, args);
    }
    catch (ParseException usageProblem)
    {
        final String reason = usageProblem.getMessage();
        defaultUsageError(reason); // doesn't return
    }
}
/**
 * Parses command line.
 *
 * Override this method to implement additional command line options,
 * but do make sure you call this method to parse the default options.
 *
 * The steps are, in order: set the suite-wide default properties, load
 * the Seal configuration file (if any), parse the command line with
 * Hadoop's GenericOptionsParser, then process the input/output formats,
 * the base quality encoding, the output compression, the number of
 * reducers and finally the positional input and output paths.
 *
 * @param conf configuration that receives all the settings; it is also cached for later queries
 * @param args command line arguments
 * @return the parsed command line
 * @throws ParseException if the command line or one of its values is invalid
 * @throws IOException    if the configuration file or the file systems can't be accessed
 */
protected CommandLine parseOptions(Configuration conf, String[] args)
    throws ParseException, IOException
{
    myconf = conf;
    setDefaultProperties(conf);

    // Load settings from the configuration file.  getRcFile scans the command line
    // looking for an option overriding the default seal configuration file.
    File configFile = getRcFile(args);
    if (configFile != null)
        loadConfig(conf, configFile);

    // Now parse the entire command line using the default hadoop parser, so that
    // the user can override properties specified in the config file with properties
    // specified on the command line.
    CommandLine line = new GenericOptionsParser(conf, options, args).getCommandLine();
    if (line == null)
        throw new ParseException("Error parsing command line"); // getCommandLine returns null if there was a parsing error

    applyIOFormatOptions(line);
    applyBaseQualityEncoding(conf);
    applyOutputCompression(line);
    applyReducerCount(line);
    resolvePaths(conf, line.getArgs());
    return line;
}

/**
 * Store the input/output format options in the configuration and validate them.
 * The properties are validated whether or not they were just set, so the
 * check also covers values set directly as properties.
 */
private void applyIOFormatOptions(CommandLine line) throws ParseException
{
    if (line.hasOption(opt_inputFormat.getOpt()))
        myconf.set(INPUT_FORMAT_CONF, line.getOptionValue(opt_inputFormat.getOpt()));
    validateIOFormat(INPUT_FORMAT_CONF, acceptedInputFormats);

    if (line.hasOption(opt_outputFormat.getOpt()))
        myconf.set(OUTPUT_FORMAT_CONF, line.getOptionValue(opt_outputFormat.getOpt()));
    validateIOFormat(OUTPUT_FORMAT_CONF, acceptedOutputFormats);
}

/** If a base quality encoding is configured, validate it and copy it to the hadoop-bam property. */
private void applyBaseQualityEncoding(Configuration conf) throws ParseException
{
    String value = conf.get(INPUT_FORMAT_ENCODING);
    if (value == null)
        return;

    if (value.equals("sanger") || value.equals("illumina"))
        conf.set(org.seqdoop.hadoop_bam.FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING, value);
    else
        throw new ParseException("Invalid " + INPUT_FORMAT_ENCODING + ". Expected 'sanger' or 'illumina'");
}

/** Turn on output compression and select the codec class when the compression option is given. */
private void applyOutputCompression(CommandLine line) throws ParseException
{
    if (!line.hasOption(opt_compressOutput.getOpt()))
        return;

    myconf.setBoolean("mapred.output.compress", true);
    String codec = line.getOptionValue(opt_compressOutput.getOpt());
    if (codec == null)
        return;

    String codecClass;
    if ("auto".equalsIgnoreCase(codec) || "gzip".equalsIgnoreCase(codec))
        codecClass = "org.apache.hadoop.io.compress.GzipCodec"; // default
    else if ("bzip2".equalsIgnoreCase(codec))
        codecClass = "org.apache.hadoop.io.compress.BZip2Codec";
    else if ("snappy".equalsIgnoreCase(codec))
        codecClass = "org.apache.hadoop.io.compress.SnappyCodec";
    else
    {
        throw new ParseException("Unknown codec " + codec + ". Valid values are gzip, bzip2, snappy and auto.\n" +
            "If you want to use an unsupported codec pass 'auto' and set the property mapred.output.compression.codec directly");
    }
    myconf.set("mapred.output.compression.codec", codecClass);
}

/** Read the number of reducers from the command line, enforcing the configured minimum. */
private void applyReducerCount(CommandLine line) throws ParseException
{
    if (!line.hasOption(opt_nReduceTasks.getOpt()))
        return;

    String rString = line.getOptionValue(opt_nReduceTasks.getOpt());
    int r;
    try
    {
        r = Integer.parseInt(rString);
    }
    catch (NumberFormatException e)
    {
        throw new ParseException("Invalid number of reduce tasks '" + rString + "'");
    }

    if (r < minReduceTasks)
        throw new ParseException("Number of reducers must be greater than or equal to " + minReduceTasks + " (got " + rString + ")");
    nReduceTasks = r;
}

/**
 * Interpret the positional arguments: every argument but the last is an
 * input path (glob patterns are expanded), the last one is the output path,
 * which must not exist yet.
 */
private void resolvePaths(Configuration conf, String[] otherArgs) throws ParseException, IOException
{
    if (otherArgs.length < 2) // require at least two: one input and one output
        throw new ParseException("You must provide input and output paths");

    final int outputIndex = otherArgs.length - 1;
    for (int i = 0; i < outputIndex; ++i)
    {
        Path p = new Path(otherArgs[i]);
        FileSystem inputFs = p.getFileSystem(conf);
        p = p.makeQualified(inputFs);
        FileStatus[] files = inputFs.globStatus(p);
        if (files == null || files.length == 0)
            throw new ParseException("Input path " + p.toString() + " doesn't exist");
        for (FileStatus status : files)
            inputs.add(status.getPath());
    }

    // now the last one, should be the output path
    outputDir = new Path(otherArgs[outputIndex]);
    FileSystem outputFs = outputDir.getFileSystem(conf);
    outputDir = outputDir.makeQualified(outputFs);
    if (outputFs.exists(outputDir))
        throw new ParseException("Output path " + outputDir.toString() + " already exists. Won't overwrite");
}
/**
 * Set accepted input format names.
 *
 * The help text of the input format option is updated to list the names.
 *
 * @param names if null, turns off checking of input format name.  Otherwise,
 *              specifying as a parameter to --input-format any name that is
 *              not in the list will generate an error.
 */
public void setAcceptedInputFormats(String[] names)
{
    acceptedInputFormats = names;
    final String description = (names == null)
        ? INPUT_FORMAT_DESC
        : INPUT_FORMAT_DESC + " (" + joinStrings(names, ",") + ")";
    opt_inputFormat.setDescription(description);
}
/**
 * Set accepted output format names.
 *
 * The help text of the output format option is updated to list the names,
 * in the same layout used for the input format option.
 *
 * @param names if null, turns off checking of output format name.  Otherwise,
 *              specifying as a parameter to --output-format any name that is
 *              not in the list will generate an error.
 */
public void setAcceptedOutputFormats(String[] names)
{
    acceptedOutputFormats = names;
    String help = OUTPUT_FORMAT_DESC;
    if (names != null)
        help += " (" + joinStrings(names, ",") + ")"; // leading space separates the list from the description
    opt_outputFormat.setDescription(help);
}
/**
 * Look up the input format name in the cached configuration.
 *
 * @param defaultName Return this value if the configuration isn't set.
 * @return the value of the input format property, or defaultName
 */
public String getInputFormatName(String defaultName)
{
    final String configured = myconf.get(INPUT_FORMAT_CONF, defaultName);
    return configured;
}
/**
 * Look up the input format name in the cached configuration.
 *
 * @return the value of the input format property, as returned by the configuration
 */
public String getInputFormatName()
{
    final String configured = myconf.get(INPUT_FORMAT_CONF);
    return configured;
}
/**
 * Look up the output format name in the cached configuration.
 *
 * @param defaultName Return this value if the configuration isn't set.
 * @return the value of the output format property, or defaultName
 */
public String getOutputFormatName(String defaultName)
{
    final String configured = myconf.get(OUTPUT_FORMAT_CONF, defaultName);
    return configured;
}
/**
 * Look up the output format name in the cached configuration.
 *
 * @return the value of the output format property, as returned by the configuration
 */
public String getOutputFormatName()
{
    final String configured = myconf.get(OUTPUT_FORMAT_CONF);
    return configured;
}
/**
 * Get total number of reduce tasks to run.
 * This option parser must have already parsed the command line.
 *
 * When no value is known yet, it is computed as the number of task
 * trackers (at least 1) times the number of reduce tasks per node, and
 * the result is cached.
 *
 * @return the number of reduce tasks
 * @throws IllegalStateException    if the command line hasn't been parsed yet
 * @throws IllegalArgumentException if the number of reduce tasks per node is negative
 */
public int getNReduceTasks() throws java.io.IOException
{
    if (myconf == null)
        throw new IllegalStateException("getNReduceTasks called before parsing the command line.");
    if (nReduceTasksPerNode < 0)
        throw new IllegalArgumentException("Invalid number of default reduce tasks per node: " + nReduceTasksPerNode);

    if (nReduceTasks == null)
    {
        // It can happen that a cluster doesn't yet have any task trackers,
        // so the tracker count is bounded below by 1.
        int computed = Math.max(ClusterUtils.getNumberTaskTrackers(myconf), 1) * nReduceTasksPerNode;
        nReduceTasks = computed; // cache for the next call
    }
    return nReduceTasks;
}
/**
 * Set the number of reduce tasks per node used to compute the total
 * number of reduce tasks.
 *
 * The stored number of reduce tasks is cleared, so it will be computed
 * again by the next call to getNReduceTasks.  Note that this also
 * discards a value parsed from the command line.
 *
 * @param value the new number of reduce tasks per node; must not be negative
 * @throws IllegalArgumentException if value is negative
 */
public void setNReduceTasksPerNode(int value)
{
    final boolean acceptable = value >= 0;
    if (!acceptable)
    {
        throw new IllegalArgumentException("number of reduce tasks per node must be >= 0 (got " + value + ")");
    }
    nReduceTasks = null; // reset cached value
    nReduceTasksPerNode = value;
}
/**
 * Return the specified output path.
 *
 * @return the output path stored by parseOptions; null if none has been parsed
 */
public Path getOutputPath()
{
    return this.outputDir;
}
/**
 * Return the input paths collected from the command line.
 *
 * @return a new list holding the input paths, in the order they were collected
 */
public List<Path> getInputPaths()
{
    final int expectedSize = getNumInputPaths();
    ArrayList<Path> copy = new ArrayList<Path>(expectedSize);
    copy.addAll(inputs);
    return copy;
}
/**
 * @return the number of input paths collected so far
 */
public int getNumInputPaths()
{
    final int count = inputs.size();
    return count;
}
/**
 * Report a usage error without a specific message.
 * Equivalent to calling defaultUsageError(String) with null.
 */
public void defaultUsageError()
{
    final String noMessage = null;
    defaultUsageError(noMessage);
}
/**
 * Prints help and exits with code 3.
 *
 * Both the error message and the help text are written to standard error.
 *
 * @param msg description of the error; if null only "Usage error" is printed
 */
public void defaultUsageError(String msg)
{
    System.err.print("Usage error");
    if (msg != null)
        System.err.println(": " + msg);
    System.err.print("\n");

    // The simple version of HelpFormatter.printHelp prints to System.out.  Use the
    // PrintWriter variant with the formatter's own layout settings, so the help goes
    // to System.err without redirecting System.out for the whole JVM.
    HelpFormatter formatter = new HelpFormatter();
    java.io.PrintWriter errWriter = new java.io.PrintWriter(System.err);
    formatter.printHelp(
        errWriter,
        formatter.getWidth(),
        "" + toolName + " [options] <in>+ <out>",
        null, // no header
        options,
        formatter.getLeftPadding(),
        formatter.getDescPadding(),
        null); // no footer
    errWriter.flush();
    System.exit(3);
}
/**
 * Concatenate the elements of an array, inserting a separator between
 * consecutive elements.
 *
 * @param strings    elements to join; must not be null
 * @param joinString separator inserted between elements
 * @return the empty string for an empty array, the element itself for a
 *         single-element array, otherwise the joined string
 */
protected static String joinStrings(String[] strings, String joinString)
{
    final int count = strings.length;
    if (count == 1)
        return strings[0]; // handed back untouched, nothing to join

    StringBuilder joined = new StringBuilder();
    for (int idx = 0; idx < count; ++idx)
    {
        if (idx > 0)
            joined.append(joinString);
        joined.append(strings[idx]);
    }
    return joined.toString();
}
/**
 * Check that the format stored in a configuration property is one of the
 * accepted ones.
 *
 * The check is skipped when no list of accepted formats is provided or
 * when the property isn't set.
 *
 * @param ioProperty      name of the configuration property holding the format name
 * @param acceptedFormats accepted format names, or null to accept anything
 * @throws ParseException if the property value isn't among the accepted formats
 */
protected void validateIOFormat(String ioProperty, String[] acceptedFormats) throws ParseException
{
    final String selectedFormat = myconf.get(ioProperty);
    if (acceptedFormats == null || selectedFormat == null)
        return; // nothing to check

    for (String candidate : acceptedFormats)
    {
        if (candidate.equals(selectedFormat))
            return;
    }
    throw new ParseException("Incompatible file format selected. " + ioProperty + " is " + selectedFormat + " but acceptable values are " + joinStrings(acceptedFormats, ", "));
}
}