package org.seqcode.tools.sequence;
import java.util.*;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import org.seqcode.genome.Genome;
import org.seqcode.genome.location.Gene;
import org.seqcode.genome.location.NamedStrandedRegion;
import org.seqcode.genome.location.NamedTypedRegion;
import org.seqcode.genome.location.Region;
import org.seqcode.genome.location.ScoredRegion;
import org.seqcode.genome.sequence.SequenceGenerator;
import org.seqcode.gsebricks.*;
import org.seqcode.gsebricks.verbs.*;
import org.seqcode.gsebricks.verbs.location.ChromRegionIterator;
import org.seqcode.gsebricks.verbs.location.GeneToPromoter;
import org.seqcode.gsebricks.verbs.location.PhastConsGenerator;
import org.seqcode.gsebricks.verbs.location.RefGeneGenerator;
import org.seqcode.gsebricks.verbs.location.ScoredRegionGenerator;
import org.seqcode.gseutils.Args;
import org.seqcode.gseutils.NotFoundException;
/**
* Returns the sequences of named gene promoter regions
*
* cat gene_names.txt | java GenePromoters --species "$SC;sacCer3" --genes sgdGene --upstream 500 --downstream 100
* cat gene_names.txt | java GenePromoters --species "$SC;sacCer3" --genes sgdGene --upstream 500 --downstream 100 --fasta
* java GenePromoters --species "$SC;sacCer3" --genes sgdGene --upstream 500 --downstream 100 --fasta --allgenes
*
* [--fasta] fasta formatted output
* [--dontoverlaporfs] don't overlap annotated genes or some other features like refBase and sgdOther
* [--phastmax] don't include regions with a phastcons score greater than this value
* [--phastmin] only include regions with a phastcons score greater than this value
* [--maskkeep] provide "name;score" for a ScoredRegionGenerator and minimum score to specify regions to keep
* [--maskout] provide "name;score" for a ScoredRegionGenerator and maximum score to specify regions to keep
* [--minlen] specify a minimum length for the output regions
*
*
*/
public class GenePromoters {
private int upstream, downstream, minlen;
private List<RefGeneGenerator> geneGenerators;
private SequenceGenerator seqgen;
private Genome genome;
private boolean allGenes, toFasta, dontOverlapOrfs;
private GeneToPromoter promgen;
private Map<Expander<Region,? extends ScoredRegion>, Double> maskKeep, maskOut;
public static void main(String args[]) throws Exception {
GenePromoters gp = new GenePromoters();
gp.parseArgs(args);
gp.run();
}
public GenePromoters() {}
public void parseArgs(String args[]) throws NotFoundException {
geneGenerators = Args.parseGenes(args);
for (RefGeneGenerator r : geneGenerators) {
r.retrieveExons(false);
}
upstream = Args.parseInteger(args,"upstream",10000);
downstream = Args.parseInteger(args,"downstream",2000);
minlen = Args.parseInteger(args,"minlen",0);
genome = Args.parseGenome(args).getLast();
seqgen = new SequenceGenerator(genome);
allGenes = Args.parseFlags(args).contains("allgenes");
toFasta = Args.parseFlags(args).contains("fasta");
dontOverlapOrfs = Args.parseFlags(args).contains("dontoverlaporfs");
if (dontOverlapOrfs) {
Collection<Expander<Region, ? extends Region>> dontoverlap =
new ArrayList<Expander<Region, ? extends Region>>();
for (RefGeneGenerator g : geneGenerators) {
dontoverlap.add(g);
}
try {
RegionExpanderFactoryLoader<NamedTypedRegion> annotLoader =
new RegionExpanderFactoryLoader<NamedTypedRegion>("annots");
RegionExpanderFactory factory = annotLoader.getFactory(genome,
"repBase");
dontoverlap.add(factory.getExpander(genome));
System.err.println("Also filtering against repbase");
} catch (Exception e) {
System.err.println("Trying to add repbase to list of things not to overlap (will continue):");
e.printStackTrace();
}
try {
RegionExpanderFactoryLoader<NamedTypedRegion> annotLoader =
new RegionExpanderFactoryLoader<NamedTypedRegion>("annots");
RegionExpanderFactory factory = annotLoader.getFactory(genome,
"sgdOther");
dontoverlap.add(factory.getExpander(genome));
System.err.println("Also filtering against sgdOther");
} catch (Exception e) {
System.err.println("Trying to add sgdOther to list of things not to overlap (will continue):");
e.printStackTrace();
}
promgen = new GeneToPromoter(upstream, downstream, dontoverlap);
} else {
promgen = new GeneToPromoter(upstream, downstream);
}
maskKeep = new HashMap<Expander<Region,? extends ScoredRegion>, Double>();
maskOut = new HashMap<Expander<Region,? extends ScoredRegion>, Double>();
for (String s : Args.parseStrings(args,"phastmin")) {
String pieces[] = s.split(";");
if (!pieces[1].equals("0")) {
maskKeep.put(new PhastConsGenerator(genome, pieces[0]), Double.parseDouble(pieces[1]));
}
}
for (String s : Args.parseStrings(args,"phastmax")) {
String pieces[] = s.split(";");
if (!pieces[1].equals("0")) {
maskOut.put(new PhastConsGenerator(genome, pieces[0]), Double.parseDouble(pieces[1]));
}
}
for (String s : Args.parseStrings(args,"maskkeep")) {
String pieces[] = s.split(";");
maskKeep.put(new ScoredRegionGenerator(genome, pieces[0]), Double.parseDouble(pieces[1]));
}
for (String s : Args.parseStrings(args,"maskout")) {
String pieces[] = s.split(";");
maskOut.put(new ScoredRegionGenerator(genome, pieces[0]), Double.parseDouble(pieces[1]));
}
}
public Collection<Region> getMasksForRegion(Region r) {
Collection<Region> maskedout = new ArrayList<Region>();
if (maskOut.size() == 0 && maskKeep.size() == 0) {
return maskedout;
}
if (maskOut.size() == 0) {
maskedout.add(r);
} else {
for (Expander<Region,? extends ScoredRegion> exp : maskOut.keySet()) {
Iterator<? extends ScoredRegion> iter = exp.execute(r);
while (iter.hasNext()) {
ScoredRegion sr = iter.next();
if (sr.getScore() >= maskOut.get(exp)) {maskedout.add(sr);}
}
}
}
for (Expander<Region,? extends ScoredRegion> exp : maskKeep.keySet()) {
Iterator<? extends ScoredRegion> iter = exp.execute(r);
while (iter.hasNext()) {
ScoredRegion sr = iter.next();
if (sr.getScore() >= maskKeep.get(exp)) {
ArrayList<Region> newmo = new ArrayList<Region>();
for (Region old : maskedout) {
if (old.overlaps(sr)) {
if (old.contains(sr)) {
newmo.add(new Region(old.getGenome(), old.getChrom(), old.getStart(), sr.getStart()));
newmo.add(new Region(old.getGenome(), old.getChrom(), sr.getEnd(), old.getEnd()));
} else if (old.before(sr)) {
newmo.add(new Region(old.getGenome(), old.getChrom(), old.getStart(), sr.getStart()));
} else {
newmo.add(new Region(old.getGenome(), old.getChrom(), Math.min(sr.getEnd(), old.getEnd()), Math.max(sr.getEnd(), old.getEnd())));
}
} else {
newmo.add(old);
}
}
maskedout = newmo;
}
}
}
return maskedout;
}
public char[] getMaskedRegion(Region r, Collection<Region> masks) {
boolean[] keep = new boolean[r.getWidth()];
for (int i = 0; i < keep.length; i++) {
keep[i] = true;
}
for (Region mask : masks) {
Region overlap = mask.getOverlap(r);
for (int pos = overlap.getStart() - r.getStart(); pos < overlap.getEnd() - r.getStart(); pos++) {
keep[pos] = false;
}
}
char[] chars = seqgen.execute(r).toUpperCase().toCharArray();
for (int i = 0; i < chars.length; i++) {
if (!keep[i]) {
chars[i] = 'X';
}
}
return chars;
}
public void run() throws IOException {
if (allGenes) {
ChromRegionIterator chroms = new ChromRegionIterator(genome);
while (chroms.hasNext()) {
Region chrom = chroms.next();
/* we'll use all the gene generators provided but don't want to output duplicate regions.
seen is keyed on 5' and contains a list of 3' ends that have already been output.
*/
Map<Integer,List<Integer>> seen = new HashMap<Integer,List<Integer>>();
for (RefGeneGenerator refgene : geneGenerators) {
Iterator<Gene> iter = refgene.execute(chrom);
while (iter.hasNext()) {
Gene g = iter.next();
if (seen.containsKey(g.getFivePrime()) &&
seen.get(g.getFivePrime()).contains(g.getThreePrime())) {
continue;
} else {
if (!seen.containsKey(g.getFivePrime())) {
seen.put(g.getFivePrime(), new ArrayList<Integer>());
}
seen.get(g.getFivePrime()).add(g.getThreePrime());
}
output(promgen.execute(g));
}
}
}
} else {
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
String line = null;
while ((line = reader.readLine()) != null) {
Gene g = null;
String pieces[] = line.split("\\t");
for (int i = 0; i < pieces.length; i++) {
for (RefGeneGenerator refgene : geneGenerators) {
Iterator<Gene> iter = refgene.byName(pieces[i]);
while (iter.hasNext()) {
if (g == null) {
g = iter.next();
} else {
iter.next();
}
}
if (g != null) {
break;
}
}
if (g != null) {
break;
}
}
if (g != null) {
g.setName(line);
output(promgen.execute(g));
}
}
}
}
public void output(NamedStrandedRegion r) {
char[] c = getMaskedRegion(r, getMasksForRegion(r));
if (r.getWidth() < minlen) {
return;
}
if (toFasta) {
System.out.println(">" + r.getName());
for (int pos = 0; pos < c.length; pos += 60) {
for (int i = 0; i < 60 && pos + i < c.length; i++) {
System.out.print(c[pos+i]);
}
System.out.println();
}
} else {
System.out.println(String.format("%s\t%s:%d-%d:%s",
r.toString(),
r.getChrom(), r.getStart(), r.getEnd(), r.getStrand()));
}
}
}