/*
* The MIT License (MIT)
*
* Copyright (c) 2007-2015 Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.broad.igv.tools.motiffinder;
import com.google.common.collect.Iterators;
import htsjdk.samtools.util.SequenceUtil;
import org.apache.log4j.Logger;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.IGVFeature;
import org.broad.igv.feature.LocusScore;
import org.broad.igv.feature.Strand;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.session.SessionXmlAdapters;
import org.broad.igv.session.SubtlyImportant;
import org.broad.igv.track.FeatureSource;
import htsjdk.tribble.Feature;
import javax.xml.bind.annotation.XmlAccessType;
import javax.xml.bind.annotation.XmlAccessorType;
import javax.xml.bind.annotation.XmlAttribute;
import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Class for searching for pattern in a sequence.
*
* We recognize single letter codes, including ambiguity codes,
* only.
* See http://en.wikipedia.org/wiki/Nucleic_acid_notation
* or http://www.chem.qmul.ac.uk/iubmb/misc/naseq.html
* User: jacob
* Date: 2013-Jan-22
*/
@XmlAccessorType(XmlAccessType.NONE)
public class MotifFinderSource implements FeatureSource<Feature> {
private static Logger log = Logger.getLogger(MotifFinderSource.class);
@XmlAttribute private String pattern;
@XmlJavaTypeAdapter(SessionXmlAdapters.Genome.class)
@XmlAttribute private Genome genome;
@XmlAttribute private int featureWindowSize = (int) 10e6;
@XmlAttribute private Strand strand;
@SubtlyImportant
private MotifFinderSource(){}
/**
*
* @param pattern The regex pattern to search
* @param genome Genome from which to get sequence data
*/
public MotifFinderSource(String pattern, Strand strand, Genome genome){
this.pattern = pattern;
assert strand == Strand.POSITIVE || strand == Strand.NEGATIVE;
this.strand = strand;
this.genome = genome;
}
/**
* See {@link #search(String, Strand, String, int, byte[])} for explanation of parameters
* @param pattern
* @param strand POSITIVE or NEGATIVE
* @param chr
* @param posStart
* @param sequence
* @return
*/
static Iterator<Feature> searchSingleStrand(String pattern, Strand strand, String chr, int posStart, byte[] sequence){
Matcher matcher = getMatcher(pattern, strand, sequence);
return new MatchFeatureIterator(chr, strand, posStart, sequence.length, matcher);
}
/**
* See {@link #search(String, Strand, String, int, byte[])} for explanation of parameters
* @param pattern
* @param strand
* @param sequence
* @return
*/
static Matcher getMatcher(String pattern, Strand strand, byte[] sequence){
byte[] seq = sequence;
if(strand == Strand.NEGATIVE){
//sequence could be quite long, cloning it might take up a lot of memory
//and is un-necessary if we are careful.
//seq = seq.clone();
SequenceUtil.reverseComplement(seq);
}
Pattern regex = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
String stringSeq = new String(seq);
return regex.matcher(stringSeq);
}
/**
* Search the provided sequence for the provided pattern
* {@code chr} and {@code seqStart} are used in constructing the resulting
* {@code Feature}s. Search is performed over the specified {@code strand}
* @param pattern
* @param strand
* @param chr
* @param posStart The 0-based offset from the beginning of the genome that the {@code sequence} is based
* Always relative to positive strand
* @param sequence The positive-strand nucleotide sequence. This may be altered during execution!
* @return
*/
public static Iterator<Feature> search(String pattern, Strand strand, String chr, int posStart, byte[] sequence){
switch(strand){
case POSITIVE:
return searchSingleStrand(pattern, strand, chr, posStart, sequence);
case NEGATIVE:
Iterator<Feature> negIter = searchSingleStrand(pattern, Strand.NEGATIVE, chr, posStart, sequence);
List<Feature> negStrandFeatures = new ArrayList<Feature>();
Iterators.addAll(negStrandFeatures, negIter);
Collections.reverse(negStrandFeatures);
return negStrandFeatures.iterator();
default:
throw new IllegalArgumentException("Strand must be either POSITIVE or NEGATIVE");
}
}
@Override
public Iterator<Feature> getFeatures(String chr, int start, int end) throws IOException {
byte[] seq = genome.getSequence(chr, start, end);
if(seq == null) Collections.emptyList().iterator();
return search(this.pattern, this.strand, chr, start, seq);
}
@Override
public List<LocusScore> getCoverageScores(String chr, int start, int end, int zoom) {
//TODO Precalculate and/or store?
return null;
}
@Override
public int getFeatureWindowSize() {
return this.featureWindowSize;
}
@Override
public void setFeatureWindowSize(int size) {
this.featureWindowSize = size;
}
/**
* Iterator which turns regex Matcher results into Features.
* The ordering of features will be either forwards or backwards, depending
* on the strand.
*/
private static class MatchFeatureIterator implements Iterator<Feature>{
private final int sequenceLength;
private String chr;
private int posOffset;
private Strand strand;
/**
* Number of characters from the start of the string at which the
* last match was found.
*
* We want to find overlapping matches. By default, matcher.find()
* starts from the end of the previous match, which would preclude overlapping matches.
* By storing the last start position we reset each time
*/
private int lastMatchStart = -1;
private Matcher matcher;
private IGVFeature nextFeat;
/**
*
* @param chr The chromosome we are searching
* @param strand The strand we are searching
* @param posOffset The position within the chromosome that we started searching.
* Always in positive strand coordinates
* @param sequenceLength Length of the string which we are matching
* @param matcher Matcher over sequence.
*/
private MatchFeatureIterator(String chr, Strand strand, int posOffset, int sequenceLength, Matcher matcher){
this.chr = chr;
this.strand = strand;
this.posOffset = posOffset;
this.matcher = matcher;
if(this.strand == Strand.NEGATIVE){
this.posOffset += sequenceLength;
}
this.sequenceLength = sequenceLength;
findNext();
}
private void findNext(){
try {
if(matcher.find(lastMatchStart + 1)){
lastMatchStart = matcher.start();
//The start/end coordinates are always in positive-strand coordinates
int start, end;
if(strand == Strand.POSITIVE){
start = posOffset + lastMatchStart;
end = posOffset + matcher.end();
}else{
start = posOffset - matcher.end();
end = posOffset - lastMatchStart;
}
nextFeat = new BasicFeature(chr, start, end, this.strand);
}else{
nextFeat = null;
}
} catch (IndexOutOfBoundsException e) {
log.error(e);
nextFeat = null;
}
}
@Override
public boolean hasNext() {
return nextFeat != null;
}
@Override
public IGVFeature next() {
IGVFeature nF = nextFeat;
findNext();
return nF;
}
@Override
public void remove() {
throw new RuntimeException("Cannot remove from this iterator");
}
}
}