// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.recab;
import it.crs4.seal.common.ClusterUtils;
import it.crs4.seal.common.ContextAdapter;
import it.crs4.seal.common.FormatNameMap;
import it.crs4.seal.common.IMRContext;
import it.crs4.seal.common.ReadPair;
import it.crs4.seal.common.SealToolRunner;
import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
import java.net.URISyntaxException;
import java.io.IOException;
import java.io.FileReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Collection;
/**
 * Hadoop {@link Tool} that configures and launches the RecabTable MapReduce job.
 *
 * <p>The driver parses the command line with {@code RecabTableOptionParser},
 * registers the known-variants file (VCF or Rod) with the distributed cache so
 * that every task finds it in its working directory under
 * {@link #LocalVariantsFile}, and then runs a job made of {@link Map},
 * {@link Combiner} and {@link Red}.  The nested Hadoop classes are thin
 * adapters that delegate to {@code RecabTableMapper}, {@code RecabTableCombiner}
 * and {@code RecabTableReducer} through an {@link IMRContext}.
 */
public class RecabTable extends Configured implements Tool
{
    private static final Log LOG = LogFactory.getLog(RecabTable.class);

    /** Presumably the default number of reduce tasks per task tracker; not referenced within this class. */
    public static final int DEFAULT_RED_TASKS_PER_TRACKER = 3;

    /** Name of the character set used to encode {@link #TableDelim}. */
    public static final String ASCII = "US-ASCII";

    /** Field delimiter, as a string. */
    public static final String TableDelim = ",";

    /** {@link #TableDelim} encoded with the {@link #ASCII} character set. */
    public static final byte[] TableDelimBytes;
    static {
        try {
            TableDelimBytes = TableDelim.getBytes(ASCII);
        } catch (UnsupportedEncodingException e) {
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new RuntimeException(ASCII + " character set not supported!", e);
        }
    }

    /** Name of the symlink, in each task's working directory, that points to the distributed variants file. */
    public static final String LocalVariantsFile = "variants_table.data";

    /** Configuration property carrying the type of the variants file. */
    public static final String VariantsFileTypeProperty = "seal.recab.variants-table-type";

    /** Value of {@link #VariantsFileTypeProperty} for VCF files. */
    public static final String VariantsFileTypeVcf = "vcf";

    /** Value of {@link #VariantsFileTypeProperty} for Rod files. */
    public static final String VariantsFileTypeRod = "rod";

    /** Whether to only consider SNP variation locations or to consider all variation types. */
    public static final String SnpsOnlyProperty = "seal.recab.snps-only";

    /**
     * Default: consider only SNPs.
     *
     * <p>This must be {@code true}: Rod variant files only support the
     * SNPs-only mode (see {@link Map#getVariantFileReader(Configuration)}), so a
     * {@code false} default would make every Rod-based run fail unless the
     * property were set explicitly.
     */
    public static final boolean SnpsOnlyDefault = true;

    /**
     * Mapper adapter: builds the variant reader and forwards each read pair to
     * a {@code RecabTableMapper}.
     */
    public static class Map extends Mapper<LongWritable, ReadPair, Text, ObservationCount>
    {
        private RecabTableMapper impl;
        private IMRContext<Text,ObservationCount> contextAdapter;

        /**
         * Opens the known-variants file found at {@link #LocalVariantsFile} and
         * wraps it in the reader matching {@link #VariantsFileTypeProperty}.
         * When the property is unset the file is assumed to be VCF.
         *
         * @param conf job configuration
         * @return a variant reader over the local variants file
         * @throws IOException if the variants file cannot be opened
         * @throws IllegalArgumentException if the configured file type is neither VCF nor Rod
         * @throws RuntimeException if a Rod file is requested together with all variant types
         */
        protected VariantReader getVariantFileReader(Configuration conf) throws IOException
        {
            String variantsFileType = conf.get(VariantsFileTypeProperty);
            if (variantsFileType == null)
            {
                LOG.warn("Configuration property " + VariantsFileTypeProperty + " isn't set. Assuming variants file is VCF");
                variantsFileType = VariantsFileTypeVcf;
            }

            final boolean isVcf = variantsFileType.equals(VariantsFileTypeVcf);
            final boolean isRod = variantsFileType.equals(VariantsFileTypeRod);
            final boolean snpsOnly = conf.getBoolean(SnpsOnlyProperty, SnpsOnlyDefault);

            // Validate the configuration before touching the file system, so that
            // a configuration error never leaves an open file handle behind.
            if (!isVcf && !isRod)
            {
                throw new IllegalArgumentException("unrecognized variants file type set in " + VariantsFileTypeProperty + " (accepted values are " + VariantsFileTypeVcf + " and " + VariantsFileTypeRod + ")");
            }
            if (isRod && !snpsOnly)
            {
                throw new RuntimeException("Sorry. Using all variant types is currently not supported for Rod files. Please let the Seal developers know if this is important to you.");
            }

            // known variation sites
            Reader in = new FileReader(LocalVariantsFile);
            boolean handedOff = false;
            try {
                VariantReader reader;
                if (isVcf)
                {
                    VcfVariantReader vcfReader = new VcfVariantReader(in);
                    vcfReader.setReadSnpsOnly(snpsOnly);
                    reader = vcfReader;
                }
                else
                {
                    reader = new RodFileVariantReader(in);
                }
                handedOff = true;
                return reader;
            }
            finally {
                // If a reader constructor failed, nobody else holds the stream: close it here.
                if (!handedOff)
                {
                    closeQuietly(in);
                }
            }
        }

        /** Closes {@code in}, logging (rather than propagating) any failure so it cannot mask an earlier exception. */
        private static void closeQuietly(Reader in)
        {
            try {
                in.close();
            } catch (IOException e) {
                LOG.warn("Failed to close " + LocalVariantsFile, e);
            }
        }

        /**
         * Creates the delegate mapper and initialises it with the variant
         * reader, the context adapter and the job configuration.
         */
        @Override
        public void setup(Context context) throws IOException
        {
            impl = new RecabTableMapper();
            contextAdapter = new ContextAdapter<Text,ObservationCount>(context);

            Configuration conf = context.getConfiguration();
            VariantReader reader = getVariantFileReader(conf);
            impl.setup(reader, contextAdapter, conf);
        }

        /** Forwards the record to the delegate mapper, which emits through the context adapter. */
        @Override
        public void map(LongWritable pos, ReadPair pair, Context context) throws java.io.IOException, InterruptedException
        {
            impl.map(pos, pair, contextAdapter);
        }
    }

    /** Combiner adapter delegating to a {@code RecabTableCombiner}. */
    public static class Combiner extends Reducer<Text, ObservationCount, Text, ObservationCount>
    {
        private RecabTableCombiner impl;
        private IMRContext<Text,ObservationCount> contextAdapter;

        /** Creates the context adapter and the delegate combiner, configured from the job configuration. */
        @Override
        public void setup(Context context) throws IOException
        {
            contextAdapter = new ContextAdapter<Text,ObservationCount>(context);
            impl = new RecabTableCombiner();
            impl.setup(context.getConfiguration());
        }

        /** Forwards the key and its values to the delegate combiner. */
        @Override
        public void reduce(Text key, Iterable<ObservationCount> values, Context context) throws IOException, InterruptedException
        {
            impl.reduce(key, values, contextAdapter);
        }
    }

    /** Reducer adapter delegating to a {@code RecabTableReducer}. */
    public static class Red extends Reducer<Text, ObservationCount, Text, Text>
    {
        private RecabTableReducer impl;
        private IMRContext<Text,Text> contextAdapter;

        /** Creates the context adapter and the delegate reducer, configured from the job configuration. */
        @Override
        public void setup(Context context) throws IOException
        {
            contextAdapter = new ContextAdapter<Text,Text>(context);
            impl = new RecabTableReducer();
            impl.setup(context.getConfiguration());
        }

        /** Forwards the key and its values to the delegate reducer. */
        @Override
        public void reduce(Text key, Iterable<ObservationCount> values, Context context) throws IOException, InterruptedException
        {
            impl.reduce(key, values, contextAdapter);
        }
    }

    /**
     * Registers the variants file selected on the command line with the
     * distributed cache, symlinked as {@link #LocalVariantsFile}, and records
     * its type in {@link #VariantsFileTypeProperty}.  The VCF file takes
     * precedence when both a VCF and a Rod file are provided.
     *
     * @param parser the already-parsed command line options
     * @throws RuntimeException if neither file is set, or if the resulting path is not a valid URI
     */
    private void distributeVariantsFile(RecabTableOptionParser parser)
    {
        Configuration conf = getConf();
        DistributedCache.createSymlink(conf); // create symlinks in each task's working directory for the distributed files

        final String variantsFileType;
        final String sourcePath;
        if (parser.getVcfFile() != null)
        {
            variantsFileType = VariantsFileTypeVcf;
            sourcePath = parser.getVcfFile().toString();
        }
        else if (parser.getRodFile() != null)
        {
            variantsFileType = VariantsFileTypeRod;
            sourcePath = parser.getRodFile().toString();
        }
        else
        {
            throw new RuntimeException("BUG!! RecabTableOptionParser defined with getRodFile and getVcfFile both null!");
        }
        conf.set(VariantsFileTypeProperty, variantsFileType);

        // The URI fragment names the symlink created in the task's working directory.
        final String distPath = sourcePath + "#" + LocalVariantsFile;
        try {
            DistributedCache.addCacheFile(new URI(distPath), conf);
        }
        catch (URISyntaxException e) {
            throw new RuntimeException("Invalid syntax in path to variants file. " + e, e);
        }
    }

    /**
     * Parses the arguments, configures the job and waits for its completion.
     *
     * @param args command line arguments
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception on parsing, configuration or job submission failure
     */
    @Override
    public int run(String[] args) throws Exception {
        LOG.info("starting");

        RecabTableOptionParser parser = new RecabTableOptionParser();
        parser.parse(getConf(), args);

        // NOTE(review): the reduce task count is only logged here; presumably the
        // option parser stores it in the configuration -- TODO confirm.
        LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

        // must be called before creating the job, since the job
        // *copies* the Configuration.
        distributeVariantsFile(parser);

        // Create a Job using the processed conf
        Job job = new Job(getConf(), "RecabTable " + parser.getInputPaths().get(0));
        job.setJarByClass(RecabTable.class);

        job.setInputFormatClass(
            FormatNameMap.getInputFormat(job.getConfiguration().get(RecabTableOptionParser.INPUT_FORMAT_CONF, "sam")));
        LOG.info("Using input format " + job.getInputFormatClass().getName());

        // input paths
        for (Path p: parser.getInputPaths())
        {
            FileInputFormat.addInputPath(job, p);
        }

        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ObservationCount.class);

        job.setCombinerClass(Combiner.class);

        job.setReducerClass(Red.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // output
        FileOutputFormat.setOutputPath(job, parser.getOutputPath());

        // Submit the job, then poll for progress until the job is complete
        boolean result = job.waitForCompletion(true);
        if (result)
        {
            LOG.info("done");
            return 0;
        }
        else
        {
            LOG.fatal(this.getClass().getName() + " failed!");
            return 1;
        }
    }

    /** Command line entry point: runs the tool through {@code SealToolRunner} and exits with its return code. */
    public static void main(String[] args) throws Exception {
        int res = new SealToolRunner().run(new RecabTable(), args);
        System.exit(res);
    }
}