package picard.util;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMProgramRecord;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.Log;
import htsjdk.variant.vcf.VCFFileReader;
import picard.PicardException;
import picard.cmdline.CommandLineParser;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Intervals;
import picard.cmdline.StandardOptionDefinitions;
import java.io.File;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* Little class to aid working with interval lists.
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = IntervalListTools.USAGE_SUMMARY + IntervalListTools.USAGE_DETAILS,
usageShort = IntervalListTools.USAGE_SUMMARY,
programGroup = Intervals.class
)
public class IntervalListTools extends CommandLineProgram {
static final String USAGE_SUMMARY = "Manipulates interval lists. ";
static final String USAGE_DETAILS = "This tool offers multiple interval list file manipulation capabilities include sorting, " +
"merging, subtracting, padding, customizing, and other set-theoretic operations. If given one or more inputs, the default " +
"operation is to merge and sort them. Other options e.g. interval subtraction are controlled by the arguments. The tool " +
"lists intervals with respect to a reference sequence." +
"<br /><br />" +
"Both interval_list and VCF files are accepted as input. The interval_list file format is relatively simple" +
" and reflects the SAM alignment format to a degree. A SAM style header must be present in the file that " +
"lists the sequence records against which the intervals are described. After the header, the file then" +
" contains records, one per line in text format with the following" +
" values tab-separated: " +
"<pre>" +
" -Sequence name (SN) <br />" +
" -Start position (1-based)** <br />" +
" -End position (1-based, end inclusive) <br />" +
" -Strand (either + or -) <br />" +
" -Interval name (ideally unique names for intervals)" +
"</pre>" +
"The coordinate system of interval_list files is such that the first base or position in a sequence is position \"1\"." +
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar IntervalListTools \\<br />" +
" I=input.interval_list \\<br />" +
" SI=input_2.interval_list \\<br />" +
" O=new.interval_list" +
"</pre>" +
"<hr />";
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
doc = "One or more interval lists. If multiple interval lists are provided the output is the" +
"result of merging the inputs. Supported formats are interval_list and VCF.", minElements = 1)
public List<File> INPUT;
@Option(doc = "The output interval list file to write (if SCATTER_COUNT is 1) or the directory into which " +
"to write the scattered interval sub-directories (if SCATTER_COUNT > 1)", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, optional = true)
public File OUTPUT;
@Option(doc = "The amount to pad each end of the intervals by before other operations are undertaken. Negative numbers are allowed " +
"and indicate intervals should be shrunk. Resulting intervals < 0 bases long will be removed. Padding is applied to the interval lists <b> before </b> the ACTION is performed.", optional = true)
public int PADDING = 0;
@Option(doc = "If true, merge overlapping and adjacent intervals to create a list of unique intervals. Implies SORT=true")
public boolean UNIQUE = false;
@Option(doc = "If true, sort the resulting interval list by coordinate.")
public boolean SORT = true;
@Option(doc = "Action to take on inputs.")
public Action ACTION = Action.CONCAT;
@Option(shortName = "SI", doc = "Second set of intervals for SUBTRACT and DIFFERENCE operations.", optional = true)
public List<File> SECOND_INPUT;
@Option(doc = "One or more lines of comment to add to the header of the output file.", optional = true)
public List<String> COMMENT = null;
@Option(doc = "The number of files into which to scatter the resulting list by locus; in some situations, fewer intervals may be emitted. " +
"Note - if > 1, the resultant scattered intervals will be sorted and uniqued. The sort will be inverted if the INVERT flag is set.")
public int SCATTER_COUNT = 1;
@Option(doc = "Whether to include filtered variants in the vcf when generating an interval list from vcf", optional = true)
public boolean INCLUDE_FILTERED=false;
@Option(shortName = "BRK", doc = "If set to a positive value will create a new interval list with the original intervals broken up at integer multiples of this value. Set to 0 to NOT break up intervals", optional = true)
public int BREAK_BANDS_AT_MULTIPLES_OF = 0;
@Option(shortName = "M", doc = "Do not subdivide ")
public IntervalListScatterer.Mode SUBDIVISION_MODE = IntervalListScatterer.Mode.INTERVAL_SUBDIVISION;
@Option(doc = "Produce the inverse list", optional = true)
public boolean INVERT = false;
private static final Log LOG = Log.getInstance(IntervalListTools.class);
public enum Action implements CommandLineParser.ClpEnum {
CONCAT("The concatenation of all the INPUTs, no sorting or merging of overlapping/abutting intervals implied. Will result in an unsorted list unless requested otherwise.") {
@Override
IntervalList act(final List<IntervalList> list, final List<IntervalList> unused) {
if (!unused.isEmpty())
throw new IllegalArgumentException(String.format("Second List found when action was %s. Ignoring second list.", this.name()));
return IntervalList.concatenate(list);
}
},
UNION("Like CONCATENATE but with UNIQUE and SORT implied, the result being the set-wise union of all INPUTS.") {
@Override
IntervalList act(final List<IntervalList> list, final List<IntervalList> unused) {
if (!unused.isEmpty())
throw new IllegalArgumentException(String.format("Second List found when action was %s. Ignoring second list.", this.name()));
return IntervalList.union(list);
}
},
INTERSECT("The sorted, uniqued set of all loci that are contained in all of the INPUTs.") {
@Override
IntervalList act(final List<IntervalList> list, final List<IntervalList> unused) {
if (!unused.isEmpty())
throw new IllegalArgumentException(String.format("Second List found when action was %s. Ignoring second list.", this.name()));
return IntervalList.intersection(list);
}
},
SUBTRACT("Subtracts SECOND_INPUT from INPUT. The resulting loci are there in INPUT that are not in SECOND_INPUT") {
@Override
IntervalList act(final List<IntervalList> list1, final List<IntervalList> list2) {
return IntervalList.subtract(list1, list2);
}
},
SYMDIFF("Find loci that are in INPUT or SECOND_INPUT but are not in both.") {
@Override
IntervalList act(final List<IntervalList> list1, final List<IntervalList> list2) {
return IntervalList.difference(list1, list2);
}
};
String helpdoc;
Action(final String helpdoc) {
this.helpdoc = helpdoc;
}
@Override
public String getHelpDoc() {
return helpdoc;
}
abstract IntervalList act(final List<IntervalList> list1, final List<IntervalList> list2);
}
// Stock main method
public static void main(final String[] args) {
new IntervalListTools().instanceMainWithExit(args);
}
@Override
protected int doWork() {
// Check inputs
for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
for (final File f : SECOND_INPUT) IOUtil.assertFileIsReadable(f);
if (OUTPUT != null) {
if (SCATTER_COUNT == 1) {
IOUtil.assertFileIsWritable(OUTPUT);
} else {
IOUtil.assertDirectoryIsWritable(OUTPUT);
}
}
// Read in the interval lists and apply any padding
final List<IntervalList> lists = openIntervalLists(INPUT);
// same for the second list
final List<IntervalList> secondLists = openIntervalLists(SECOND_INPUT);
if (UNIQUE && !SORT) {
LOG.warn("UNIQUE=true requires sorting but SORT=false was specified. Results will be sorted!");
}
final IntervalList result = ACTION.act(lists, secondLists);
if (SCATTER_COUNT > 1) {
// Scattering requires a uniqued, sorted interval list. We want to do this up front (before BREAKING AT BANDS)
SORT = true;
UNIQUE = true;
}
if (INVERT) {
SORT = false; // no need to sort, since return will be sorted by definition.
UNIQUE = true;
}
final IntervalList possiblySortedResult = SORT ? result.sorted() : result;
final IntervalList possiblyInvertedResult = INVERT ? IntervalList.invert(possiblySortedResult) : possiblySortedResult;
//only get unique if this has been asked unless inverting (since the invert will return a unique list)
List<Interval> finalIntervals = UNIQUE ? possiblyInvertedResult.uniqued().getIntervals() : possiblyInvertedResult.getIntervals();
if (BREAK_BANDS_AT_MULTIPLES_OF > 0) {
finalIntervals = IntervalList.breakIntervalsAtBandMultiples(finalIntervals, BREAK_BANDS_AT_MULTIPLES_OF);
}
// Decide on a PG ID and make a program group
final SAMFileHeader header = result.getHeader();
final Set<String> pgs = new HashSet<String>();
for (final SAMProgramRecord pg : header.getProgramRecords()) pgs.add(pg.getId());
for (int i = 1; i < Integer.MAX_VALUE; ++i) {
if (!pgs.contains(String.valueOf(i))) {
final SAMProgramRecord pg = new SAMProgramRecord(String.valueOf(i));
pg.setCommandLine(getCommandLine());
pg.setProgramName(getClass().getSimpleName());
header.addProgramRecord(pg);
break;
}
}
// Add any comments
if (COMMENT != null) {
for (final String comment : COMMENT) {
header.addComment(comment);
}
}
final IntervalList output = new IntervalList(header);
for (final Interval i : finalIntervals) {
output.add(i);
}
final List<IntervalList> resultIntervals;
if (OUTPUT != null) {
if (SCATTER_COUNT == 1) {
output.write(OUTPUT);
resultIntervals = Arrays.asList(output);
} else {
final List<IntervalList> scattered = writeScatterIntervals(output);
LOG.info(String.format("Wrote %s scatter subdirectories to %s.", scattered.size(), OUTPUT));
if (scattered.size() != SCATTER_COUNT) {
LOG.warn(String.format(
"Requested scatter width of %s, but only emitted %s. (This may be an expected consequence of running in %s mode.)",
SCATTER_COUNT,
scattered.size(),
SUBDIVISION_MODE
));
}
resultIntervals = scattered;
}
} else {
resultIntervals = Arrays.asList(output);
}
long totalUniqueBaseCount = 0;
long intervalCount = 0;
for (final IntervalList finalInterval : resultIntervals) {
totalUniqueBaseCount += finalInterval.getUniqueBaseCount();
intervalCount += finalInterval.size();
}
LOG.info("Produced " + intervalCount + " intervals totalling " + totalUniqueBaseCount + " unique bases.");
return 0;
}
private List<IntervalList> openIntervalLists(final List<File> files){
final List<IntervalList> lists = new ArrayList<IntervalList>();
for (final File f : files) {
lists.add(TYPE.getIntervalList(f, INCLUDE_FILTERED).padded(PADDING));
}
return lists;
}
@Override
protected String[] customCommandLineValidation() {
final List<String> errorMsgs = new ArrayList<String>();
if (SCATTER_COUNT < 1) {
errorMsgs.add("SCATTER_COUNT must be greater than 0.");
}
if (BREAK_BANDS_AT_MULTIPLES_OF < 0) {
errorMsgs.add("BREAK_BANDS_AT_MULTIPLES_OF must be greater than or equal to 0.");
}
return errorMsgs.isEmpty() ? null : errorMsgs.toArray(new String[errorMsgs.size()]);
}
/**
* Method to scatter an interval list by locus.
*
* @param list The list of intervals to scatter
* @return The scattered intervals, represented as a {@link List} of {@link IntervalList}
*/
private List<IntervalList> writeScatterIntervals(final IntervalList list) {
final IntervalListScatterer scatterer = new IntervalListScatterer(SUBDIVISION_MODE);
final List<IntervalList> scattered = scatterer.scatter(list, SCATTER_COUNT, UNIQUE);
final DecimalFormat fileNameFormatter = new DecimalFormat("0000");
int fileIndex = 1;
for (final IntervalList intervals : scattered) {
intervals.write(createDirectoryAndGetScatterFile(OUTPUT, scattered.size(), fileNameFormatter.format(fileIndex++)));
}
return scattered;
}
public static File getScatteredFileName(final File scatterDirectory, final long scatterTotal, final String formattedIndex) {
return new File(scatterDirectory.getAbsolutePath() + "/temp_" + formattedIndex + "_of_" +
scatterTotal + "/scattered" + IntervalList.INTERVAL_LIST_FILE_EXTENSION);
}
private static File createDirectoryAndGetScatterFile(final File outputDirectory, final long scatterCount, final String formattedIndex) {
createDirectoryOrFail(outputDirectory);
final File result = getScatteredFileName(outputDirectory, scatterCount, formattedIndex);
createDirectoryOrFail(result.getParentFile());
return result;
}
private static void createDirectoryOrFail(final File directory) {
if (!directory.exists() && !directory.mkdir()) {
throw new PicardException("Unable to create directory: " + directory.getAbsolutePath());
}
}
enum TYPE {
VCF(IOUtil.VCF_EXTENSIONS) {
@Override
protected IntervalList getIntervalListInternal(final File vcf, final boolean includeFiltered) {
return VCFFileReader.fromVcf(vcf, includeFiltered);
}
},
INTERVAL_LIST(IOUtil.INTERVAL_LIST_FILE_EXTENSION) {
@Override
protected IntervalList getIntervalListInternal(final File intervalList, final boolean includeFiltered) {
return IntervalList.fromFile(intervalList);
}
};
final Collection<String> applicableExtensions;
TYPE(final String... s) {
applicableExtensions = CollectionUtil.makeSet(s);
}
TYPE(final Collection<String> extensions) {
applicableExtensions = extensions;
}
abstract protected IntervalList getIntervalListInternal(final File file, final boolean includeFiltered);
static TYPE forFile(final File intervalListExtractable) {
for (final TYPE type : TYPE.values()) {
for (final String s : type.applicableExtensions) {
if (intervalListExtractable.getName().endsWith(s)) {
return type;
}
}
}
throw new SAMException("Cannot figure out type of file " + intervalListExtractable.getAbsolutePath() + " from extension. Current implementation understands the following types: " + Arrays.toString(TYPE.values()));
}
static public IntervalList getIntervalList(final File file, final boolean includeFiltered){
return forFile(file).getIntervalListInternal(file, includeFiltered);
}
@Override
public String toString() {
return super.toString() + ": " + applicableExtensions.toString();
}
}
}