package org.seqcode.data.io;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import org.seqcode.data.io.parsing.BEDLine;
import org.seqcode.data.io.parsing.BEDParser;
import org.seqcode.data.io.parsing.GFFEntry;
import org.seqcode.data.io.parsing.ParseGFF;
import org.seqcode.genome.Genome;
import org.seqcode.genome.location.NamedRegion;
import org.seqcode.genome.location.Point;
import org.seqcode.genome.location.Region;
import org.seqcode.genome.location.StrandedPoint;
import org.seqcode.genome.location.StrandedRegion;
import org.seqcode.genome.sequence.SequenceGenerator;
import org.seqcode.genome.sequence.SequenceUtils;
import org.seqcode.gsebricks.verbs.location.ChromRegionIterator;
import org.seqcode.gsebricks.verbs.location.PointParser;
import org.seqcode.gsebricks.verbs.location.RegionParser;
import org.seqcode.gsebricks.verbs.location.StrandedPointParser;
import org.seqcode.gsebricks.verbs.location.StrandedRegionParser;
import org.seqcode.gseutils.Pair;
public class RegionFileUtilities {
/** Creates an instance. Every method declared in this class is static, so an instance is not needed to call them. */
public RegionFileUtilities(){}
/**
 * Loads a set of points from the third or first column of a file
 * (suitable for GPS and StatisticalPeakFinder files).
 * <p>
 * Lines whose first token contains "#", or equals "Region" or "Position", are skipped.
 * If a line has at least three tokens and the third contains ":", the third token is
 * parsed as a point. Otherwise, if the first token contains ":", it is parsed as a region
 * (when it also contains "-"; the region midpoint is kept) or as a point.
 * Tokens for which the parser returns null are ignored.
 * <p>
 * The JVM exits with status 1 if {@code filename} is not a regular file. I/O errors are
 * reported with a stack trace and whatever was read so far is returned.
 *
 * @param filename path of the file to read
 * @param gen genome handed to the point and region parsers
 * @return the loaded points (possibly empty, never null)
 */
public static List<Point> loadPointsFromFile(String filename, Genome gen){
	List<Point> points = new ArrayList<Point>();
	// Build the parsers once rather than once per input line.
	PointParser pparser = new PointParser(gen);
	RegionParser rparser = new RegionParser(gen);
	try{
		File pFile = new File(filename);
		if(!pFile.isFile()){System.err.println("Invalid file name: "+filename);System.exit(1);}
		BufferedReader reader = new BufferedReader(new FileReader(pFile));
		try{
			String line;
			while ((line = reader.readLine()) != null) {
				String[] words = line.trim().split("\\s+");
				if(words.length>0 && !words[0].contains("#") && !words[0].equals("Region") && !words[0].equals("Position")){
					if(words.length>=3 && words[2].contains(":")){
						Point p = pparser.execute(words[2]);
						if(p!=null)
							points.add(p);
					}else if(words[0].contains(":")){
						if(words[0].contains("-")){
							Region q = rparser.execute(words[0]);
							if(q!=null)
								points.add(q.getMidpoint());
						}else{
							Point p = pparser.execute(words[0]);
							if(p!=null)
								points.add(p);
						}
					}
				}
			}
		}finally{
			// Close the reader even when reading or parsing fails part-way through.
			reader.close();
		}
	} catch (FileNotFoundException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	}
	System.err.println("Loaded "+points.size()+" points from "+filename);
	return(points);
}
/**
 * Loads a set of points from a GFF file, one point per entry, placed at the
 * midpoint reported by the entry.
 * <p>
 * An I/O failure is reported on stderr (with a stack trace) and the points read
 * up to that moment are returned; the method does not throw.
 *
 * @param filename path of the GFF file
 * @param gen genome used to construct each point
 * @return the loaded points (possibly empty, never null)
 */
public static List<Point> loadPointsFromGFFFile(String filename, Genome gen){
	List<Point> points = new ArrayList<Point>();
	try {
		ParseGFF parser = new ParseGFF(new File(filename));
		while(parser.hasNext()){
			GFFEntry site = parser.next();
			points.add(new Point(gen, site.getChr(), site.getMidPoint()));
		}
	} catch (IOException e) {
		// Previously swallowed silently, which made a failed read look like an empty file.
		System.err.println("Error reading GFF file: "+filename);
		e.printStackTrace();
	}
	System.err.println("Loaded "+points.size()+" points from "+filename);
	return(points);
}
/**
 * Loads pairs of points (interactions) from a tab-delimited file whose first two
 * columns each hold a point string understood by {@code Point.fromString}.
 * <p>
 * Lines with fewer than two columns, or where either column yields a null point,
 * are echoed to stderr and skipped. Any exception raised while reading is reported
 * with a stack trace and the pairs collected so far are returned.
 *
 * @param filename path of the file to read
 * @param gen genome passed to {@code Point.fromString}
 * @return the loaded pairs (possibly empty, never null)
 */
public static List<Pair<Point,Point>> loadIntersFromFile(String filename, Genome gen) {
	List<Pair<Point,Point>> inters = new ArrayList<Pair<Point,Point>>();
	try {
		BufferedReader r = new BufferedReader(new FileReader(filename));
		try {
			String s;
			while ((s = r.readLine()) != null) {
				String[] split = s.split("\t");
				Point tmp1 = null;
				Point tmp2 = null;
				// A short line used to raise ArrayIndexOutOfBoundsException and abort the whole load.
				if (split.length >= 2) {
					tmp1 = Point.fromString(gen, split[0]);
					tmp2 = Point.fromString(gen, split[1]);
				}
				if (tmp1==null || tmp2==null) {
					System.err.println(s);
				} else {
					inters.add(new Pair<Point,Point>(tmp1,tmp2));
				}
			}
		} finally {
			// The reader was never closed before.
			r.close();
		}
	} catch (Exception e) {
		// Kept broad on purpose: parse failures from Point.fromString were already
		// handled here as "stop and return what we have".
		e.printStackTrace();
	}
	return inters;
}
/**
 * Loads a set of regions from the third or first column of a file
 * (suitable for GPS and StatisticalPeakFinder files).
 * <p>
 * Lines whose first token contains "#", or equals "Region" or "Position", are skipped.
 * <ul>
 * <li>Third token contains ":": when {@code win==-1} and the first token looks like a
 *     region (contains ":" and "-"), the first token is parsed as a region and kept as is;
 *     otherwise the third token is parsed as a point and expanded by {@code win/2}.</li>
 * <li>Otherwise, first token contains ":": a region token is kept as is when
 *     {@code win==-1}, else its midpoint is expanded by {@code win/2}; a point token is
 *     expanded by {@code win/2}.</li>
 * </ul>
 * Tokens for which the parser returns null are skipped.
 * The JVM exits with status 1 if {@code filename} is not a regular file.
 *
 * @param filename path of the file to read
 * @param gen genome handed to the parsers
 * @param win integer width of region to impose (-1 leaves region width alone)
 * @return the loaded regions (possibly empty, never null, no null elements)
 */
public static List<Region> loadRegionsFromFile(String filename, Genome gen, int win){
	List<Region> regs = new ArrayList<Region>();
	RegionParser rparser = new RegionParser(gen);
	PointParser pparser = new PointParser(gen);
	try{
		File pFile = new File(filename);
		if(!pFile.isFile()){System.err.println("Invalid file name: "+filename);System.exit(1);}
		BufferedReader reader = new BufferedReader(new FileReader(pFile));
		try{
			String line;
			while ((line = reader.readLine()) != null) {
				String[] words = line.trim().split("\\s+");
				if(words.length>0 && !words[0].contains("#") && !words[0].equals("Region") && !words[0].equals("Position")){
					if(words.length>=3 && words[2].contains(":")){
						if(win==-1 && words[0].contains(":") && words[0].contains("-")){
							Region q = rparser.execute(words[0]);
							if(q!=null){regs.add(q);}
						}else{
							// Note: when win==-1 this is expand(0), since -1/2 truncates to 0.
							Point p = pparser.execute(words[2]);
							if(p!=null){regs.add(p.expand(win/2));}
						}
					}else if(words[0].contains(":")){
						if(words[0].contains("-")){
							Region q = rparser.execute(words[0]);
							if(q!=null){
								if(win==-1)
									regs.add(q);
								else
									regs.add(q.getMidpoint().expand(win/2));
							}
						}else{
							Point p = pparser.execute(words[0]);
							if(p!=null){regs.add(p.expand(win/2));}
						}
					}
				}
			}
		}finally{
			// Close the reader even when reading or parsing fails part-way through.
			reader.close();
		}
	} catch (FileNotFoundException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	}
	return(regs);
}
/**
 * Loads a set of stranded points from the first column of a file.
 * <p>
 * Lines whose first token contains "#", or equals "Region" or "Position", are skipped,
 * as are lines whose first token has no ":". A token that splits into more than two
 * ":"-separated fields is parsed as a stranded point; any other token is parsed as a
 * plain point and given strand '+'. Tokens for which the parser returns null are skipped.
 * The JVM exits with status 1 if {@code filename} is not a regular file.
 *
 * @param gen genome handed to the parsers
 * @param filename path of the file to read
 * @return the loaded stranded points (possibly empty, never null)
 */
public static List<StrandedPoint> loadStrandedPointsFromFile(Genome gen, String filename){
	List<StrandedPoint> points = new ArrayList<StrandedPoint>();
	StrandedPointParser spparser = new StrandedPointParser(gen);
	PointParser pparser = new PointParser(gen);
	final char defaultStrand = '+';
	try{
		File pFile = new File(filename);
		if(!pFile.isFile()){System.err.println("Invalid file name: "+filename);System.exit(1);}
		BufferedReader reader = new BufferedReader(new FileReader(pFile));
		try{
			String line;
			while ((line = reader.readLine()) != null) {
				String[] words = line.trim().split("\\s+");
				if(words.length>0 && !words[0].contains("#") && !words[0].equals("Region") && !words[0].equals("Position")){
					if(words[0].contains(":")){
						if(words[0].split(":").length>2){
							StrandedPoint sq = spparser.execute(words[0]);
							if(sq!=null){points.add(sq);}
						}else{
							Point p = pparser.execute(words[0]);
							// Guard added: a null point used to be handed straight to the constructor.
							if(p!=null){points.add(new StrandedPoint(p, defaultStrand));}
						}
					}
				}
			}
		}finally{
			// Close the reader even when reading or parsing fails part-way through.
			reader.close();
		}
	} catch (FileNotFoundException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	}
	return(points);
}
/**
 * Load a set of regions from a peak file.
 * <p>
 * Header and comment lines (first token contains "#", or equals "Region" or "Position")
 * are ignored. For each remaining line:
 * <ul>
 * <li>if there are at least three tokens and the third contains ":", the first token is
 *     used as a region when {@code win==-1} and it contains both ":" and "-"; otherwise
 *     the third token is parsed as a point and expanded by {@code win/2};</li>
 * <li>else if the first token contains ":", it is treated as a region (when it contains
 *     "-") or as a point, and is either kept unchanged ({@code win==-1}, regions only)
 *     or replaced by its midpoint/point expanded by {@code win/2}.</li>
 * </ul>
 * Entries for which a parser returns null are dropped rather than stored or dereferenced.
 * The JVM exits with status 1 if {@code filename} is not a regular file.
 *
 * @param gen genome handed to the parsers
 * @param filename path of the peak file
 * @param win width to impose on each region (-1 leaves parsed regions alone)
 * @return the loaded regions (possibly empty, never null, no null elements)
 */
public static List<Region> loadRegionsFromPeakFile(Genome gen, String filename, int win){
	List<Region> regs = new ArrayList<Region>();
	PointParser pparser = new PointParser(gen);
	RegionParser rparser = new RegionParser(gen);
	try{
		File pFile = new File(filename);
		if(!pFile.isFile()){System.err.println("Invalid file name: "+filename);System.exit(1);}
		BufferedReader reader = new BufferedReader(new FileReader(pFile));
		try{
			String line;
			while ((line = reader.readLine()) != null) {
				String[] words = line.trim().split("\\s+");
				if(words.length==0 || words[0].contains("#") || words[0].equals("Region") || words[0].equals("Position"))
					continue;
				boolean firstIsLocus = words[0].contains(":");
				boolean firstIsRegion = firstIsLocus && words[0].contains("-");
				if(words.length>=3 && words[2].contains(":")){
					if(win==-1 && firstIsRegion){
						Region q = rparser.execute(words[0]);
						if(q!=null){regs.add(q);}
					}else{
						// With win==-1 this is expand(0): integer division -1/2 yields 0.
						Point p = pparser.execute(words[2]);
						if(p!=null){regs.add(p.expand(win/2));}
					}
				}else if(firstIsRegion){
					Region q = rparser.execute(words[0]);
					if(q!=null){
						regs.add(win==-1 ? q : q.getMidpoint().expand(win/2));
					}
				}else if(firstIsLocus){
					Point p = pparser.execute(words[0]);
					if(p!=null){regs.add(p.expand(win/2));}
				}
			}
		}finally{
			// Close the reader even when reading or parsing fails part-way through.
			reader.close();
		}
	} catch (FileNotFoundException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	}
	return(regs);
}
/**
 * Load stranded points from a stranded peak file; usually used to load TSSs.
 * <p>
 * The first token of each line is expected to have at least three ":"-separated fields:
 * the first two are joined with ":" and parsed as a point, and the first character of the
 * third is used as the strand. Lines whose first token contains "#", or equals "Region" or
 * "Position", are skipped. Lines that do not have three fields (including blank lines),
 * have an empty strand field, or whose point fails to parse are skipped instead of
 * raising an index or null-pointer exception.
 * The JVM exits with status 1 if {@code filename} is not a regular file.
 *
 * @param gen genome handed to the point parser
 * @param filename path of the file to read
 * @return the loaded stranded points (possibly empty, never null)
 */
public static List<StrandedPoint> loadStrandedPointFromRefTssFile(Genome gen, String filename){
	List<StrandedPoint> pts = new ArrayList<StrandedPoint>();
	PointParser pparser = new PointParser(gen);
	try{
		File pFile = new File(filename);
		if(!pFile.isFile()){System.err.println("Invalid file name: "+filename);System.exit(1);}
		BufferedReader reader = new BufferedReader(new FileReader(pFile));
		try{
			String line;
			while((line = reader.readLine()) != null){
				String[] words = line.trim().split("\\s+");
				if(words.length >=1 && !words[0].contains("#") && !words[0].equals("Region") && !words[0].equals("Position")){
					String[] subwords = words[0].split(":");
					// A blank line yields a single empty field; indexing [1]/[2] used to crash here.
					if(subwords.length<3 || subwords[2].length()==0)
						continue;
					Point p = pparser.execute(subwords[0]+":"+subwords[1]);
					if(p!=null)
						pts.add(new StrandedPoint(p,subwords[2].charAt(0)));
				}
			}
		}finally{
			// Close the reader even when reading or parsing fails part-way through.
			reader.close();
		}
	} catch(FileNotFoundException e){
		e.printStackTrace();
	} catch(IOException e){
		e.printStackTrace();
	}
	return pts;
}
/**
 * Load a set of stranded regions from a file.
 * <p>
 * Lines whose first token contains "#", or equals "Region" or "Position", are skipped.
 * <ul>
 * <li>Third token contains ":": when {@code win==-1} and the first token contains ":" and
 *     "-", the first token is parsed as a stranded region and kept; otherwise the third
 *     token is parsed as a point, expanded by {@code win/2} and given strand '+'.</li>
 * <li>Otherwise, first token contains ":": if its second ":"-field contains "-", the token
 *     is parsed as a stranded region and either kept ({@code win==-1}) or replaced by its
 *     midpoint expanded by {@code win/2} with the same strand. If not, the first two fields
 *     are parsed as a point, expanded by {@code win/2}, and the strand is taken from the
 *     first character of the third field when present (default '+').</li>
 * </ul>
 * Tokens that fail to parse (parser returns null) or have fewer than two ":"-fields are
 * skipped. The JVM exits with status 1 if {@code filename} is not a regular file.
 *
 * @param gen genome handed to the parsers
 * @param filename path of the file to read
 * @param win width to impose on each region (-1 leaves parsed regions alone)
 * @return the loaded stranded regions (possibly empty, never null, no null elements)
 */
public static List<StrandedRegion> loadStrandedRegionsFromMotifFile(Genome gen, String filename, int win){
	List<StrandedRegion> regs = new ArrayList<StrandedRegion>();
	StrandedRegionParser rparser = new StrandedRegionParser(gen);
	PointParser pparser = new PointParser(gen);
	final char defaultStrand = '+';
	try{
		File pFile = new File(filename);
		if(!pFile.isFile()){System.err.println("Invalid file name: "+filename);System.exit(1);}
		BufferedReader reader = new BufferedReader(new FileReader(pFile));
		try{
			String line;
			while ((line = reader.readLine()) != null) {
				String[] words = line.trim().split("\\s+");
				if(words.length>0 && !words[0].contains("#") && !words[0].equals("Region") && !words[0].equals("Position")){
					if(words.length>=3 && words[2].contains(":")){
						if(win==-1 && words[0].contains(":") && words[0].contains("-")){
							StrandedRegion sq = rparser.execute(words[0]);
							if(sq!=null){regs.add(sq);}
						}else{
							Point p = pparser.execute(words[2]);
							if(p!=null){regs.add(new StrandedRegion(p.expand(win/2), defaultStrand));}
						}
					}else if(words[0].contains(":")){
						String[] subwords = words[0].split(":");
						// split() drops trailing empty fields, so "chr1:" has only one field.
						if(subwords.length<2)
							continue;
						if(subwords[1].contains("-")){
							StrandedRegion sq = rparser.execute(words[0]);
							if(sq!=null){
								if(win==-1)
									regs.add(sq);
								else
									regs.add(new StrandedRegion(sq.getMidpoint().expand(win/2), sq.getStrand()));
							}
						}else{
							Point p = pparser.execute(subwords[0]+":"+subwords[1]);
							if(p!=null){
								char str = defaultStrand;
								if(subwords.length>=3 && subwords[2].length()>0)
									str = subwords[2].charAt(0);
								regs.add(new StrandedRegion(p.expand(win/2), str));
							}
						}
					}
				}
			}
		}finally{
			// Close the reader even when reading or parsing fails part-way through.
			reader.close();
		}
	} catch (FileNotFoundException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	}
	return(regs);
}
/**
 * Load a set of stranded regions from a BED file.
 * <p>
 * Each BED line becomes a stranded region spanning {@code chromStart+1} to
 * {@code chromEnd} (presumably converting BED's 0-based start to 1-based coordinates;
 * confirm against the Region coordinate convention). When {@code win} is not -1 the
 * region is replaced by its midpoint passed through {@code expand(win)}, keeping the strand.
 * <p>
 * NOTE(review): the other loaders in this class expand by {@code win/2}; this one expands
 * by {@code win}. Left unchanged because callers may depend on it; confirm which is intended.
 *
 * @param gen genome used to construct each region
 * @param filename path of the BED file
 * @param win -1 to keep the BED intervals, otherwise the value passed to {@code expand}
 * @return the loaded stranded regions (possibly empty, never null)
 */
public static List<StrandedRegion> loadStrandedRegionsFromBEDFile(Genome gen, String filename, int win){
	List<StrandedRegion> regs = new ArrayList<StrandedRegion>();
	try{
		BEDParser parser = new BEDParser(new File(filename));
		try{
			while (parser.hasNext()) {
				BEDLine line = parser.next();
				StrandedRegion sq = new StrandedRegion(
						gen,
						line.getChrom(),
						line.getChromStart()+1,
						line.getChromEnd(),
						line.getStrand());
				if(win==-1)
					regs.add(sq);
				else
					regs.add(new StrandedRegion(sq.getMidpoint().expand(win), sq.getStrand()));
			}
		}finally{
			// Previously the parser stayed open if iteration threw.
			parser.close();
		}
	} catch (FileNotFoundException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	}
	return(regs);
}
/**
 * Load a set of stranded points from a file.
 * <p>
 * Lines are interpreted as stranded regions using the same column rules as the motif
 * region loader (third token as a point when it contains ":", otherwise the first token
 * as a stranded region or a point with optional ":strand" suffix, default strand '+'),
 * and each resulting region is then reduced to a stranded point at its midpoint.
 * Lines whose first token contains "#", or equals "Region" or "Position", are skipped.
 * Tokens that fail to parse (parser returns null) or have fewer than two ":"-fields are
 * skipped. The JVM exits with status 1 if {@code filename} is not a regular file.
 *
 * @param gen genome handed to the parsers
 * @param filename path of the file to read
 * @param win width used for the intermediate regions (-1 leaves parsed regions alone)
 * @return the loaded stranded points (possibly empty, never null)
 */
public static List<StrandedPoint> loadStrandedPointsFromMotifFile(Genome gen, String filename, int win){
	List<StrandedRegion> regs = new ArrayList<StrandedRegion>();
	List<StrandedPoint> peaks = new ArrayList<StrandedPoint>();
	PointParser pparser = new PointParser(gen);
	StrandedRegionParser rparser = new StrandedRegionParser(gen);
	final char defaultStrand = '+';
	try{
		File pFile = new File(filename);
		if(!pFile.isFile()){System.err.println("Invalid file name: "+filename);System.exit(1);}
		BufferedReader reader = new BufferedReader(new FileReader(pFile));
		try{
			String line;
			while ((line = reader.readLine()) != null) {
				String[] words = line.trim().split("\\s+");
				if(words.length>0 && !words[0].contains("#") && !words[0].equals("Region") && !words[0].equals("Position")){
					if(words.length>=3 && words[2].contains(":")){
						if(win==-1 && words[0].contains(":") && words[0].contains("-")){
							StrandedRegion sq = rparser.execute(words[0]);
							if(sq!=null){regs.add(sq);}
						}else{
							Point p = pparser.execute(words[2]);
							if(p!=null){regs.add(new StrandedRegion(p.expand(win/2), defaultStrand));}
						}
					}else if(words[0].contains(":")){
						String[] subwords = words[0].split(":");
						// split() drops trailing empty fields, so "chr1:" has only one field.
						if(subwords.length<2)
							continue;
						if(subwords[1].contains("-")){
							StrandedRegion sq = rparser.execute(words[0]);
							if(sq!=null){
								if(win==-1)
									regs.add(sq);
								else
									regs.add(sq.expand(win/2, win/2));
							}
						}else{
							Point p = pparser.execute(subwords[0]+":"+subwords[1]);
							if(p!=null){
								char str = defaultStrand;
								if(subwords.length>=3 && subwords[2].length()>0)
									str = subwords[2].charAt(0);
								regs.add(new StrandedRegion(p.expand(win/2), str));
							}
						}
					}
				}
			}
		}finally{
			// Close the reader even when reading or parsing fails part-way through.
			reader.close();
		}
	} catch (FileNotFoundException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	}
	// regs holds no nulls (guarded above), so the conversion below cannot hit a null region.
	for(StrandedRegion r : regs)
		peaks.add(new StrandedPoint(r.getGenome(), r.getChrom(), r.getMidpoint().getLocation(), r.getStrand()));
	return(peaks);
}
/**
 * Load a set of peak positions (points) from a peak file.
 * <p>
 * Lines whose first token contains "#", or equals "Region" or "Position", are skipped.
 * If a line has at least three tokens and the third contains ":", the third token is
 * parsed as a point. Otherwise, if the first token contains ":", it is parsed as a region
 * (when it contains "-"; its midpoint is kept) or as a point. Tokens for which the parser
 * returns null are skipped, so the result never contains null.
 * The JVM exits with status 1 if {@code filename} is not a regular file.
 *
 * @param gen genome handed to the parsers
 * @param filename path of the peak file
 * @param win not used by this method; kept so existing callers still compile
 * @return the loaded points (possibly empty, never null)
 */
public static List<Point> loadPeaksFromPeakFile(Genome gen, String filename, int win){
	List<Point> peaks = new ArrayList<Point>();
	PointParser pparser = new PointParser(gen);
	// Build the region parser once rather than once per region line.
	RegionParser rparser = new RegionParser(gen);
	try{
		File pFile = new File(filename);
		// Message aligned with the other loaders: it now names the offending file.
		if(!pFile.isFile()){System.err.println("Invalid file name: "+filename);System.exit(1);}
		BufferedReader reader = new BufferedReader(new FileReader(pFile));
		try{
			String line;
			while ((line = reader.readLine()) != null) {
				String[] words = line.trim().split("\\s+");
				if(words.length>0 && !words[0].contains("#") && !words[0].equals("Region") && !words[0].equals("Position")){
					if(words.length>=3 && words[2].contains(":")){
						Point p = pparser.execute(words[2]);
						if(p!=null){peaks.add(p);}
					}else if(words[0].contains(":")){
						if(words[0].contains("-")){
							Region q = rparser.execute(words[0]);
							if(q!=null){peaks.add(q.getMidpoint());}
						}else{
							Point p = pparser.execute(words[0]);
							if(p!=null){peaks.add(p);}
						}
					}
				}
			}
		}finally{
			// Close the reader even when reading or parsing fails part-way through.
			reader.close();
		}
	} catch (FileNotFoundException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	}
	return(peaks);
}
/**
 * Loads the trimmed lines of a text file, dropping header and comment lines.
 * <p>
 * A line is dropped when its first whitespace-delimited token contains "#" or equals
 * "Region" or "Position". Every other line is returned with leading and trailing
 * whitespace removed; note that a blank line is therefore returned as an empty string.
 * The JVM exits with status 1 if {@code filename} is not a regular file. I/O errors are
 * reported with a stack trace and the lines read so far are returned.
 *
 * @param filename path of the file to read
 * @return the retained lines in file order (possibly empty, never null)
 */
public static List<String> loadLinesFromFile(String filename){
	List<String> lines = new ArrayList<String>();
	try{
		File inFile = new File(filename);
		if(!inFile.isFile()){System.err.println("Invalid file name: "+filename);System.exit(1);}
		BufferedReader reader = new BufferedReader(new FileReader(inFile));
		try{
			String line;
			while ((line = reader.readLine()) != null) {
				line = line.trim();
				String[] words = line.split("\\s+");
				if(words.length>0 && !words[0].contains("#") && !words[0].equals("Region") && !words[0].equals("Position")){
					lines.add(line);
				}
			}
		}finally{
			// Close the reader even when reading fails part-way through.
			reader.close();
		}
	} catch (FileNotFoundException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	}
	return(lines);
}
/**
 * Fetches the upper-cased sequence of every region, in list order.
 *
 * @param regions regions to look up
 * @param seqgen generator used to fetch each sequence; a new SequenceGenerator is created when null
 * @return one upper-case sequence string per input region
 */
public static List<String> getSequencesForRegions(List<Region> regions, SequenceGenerator seqgen){
	ArrayList<String> sequences = new ArrayList<String>();
	if(seqgen==null){
		seqgen = new SequenceGenerator();
	}
	Iterator<Region> cursor = regions.iterator();
	while(cursor.hasNext()){
		Region current = cursor.next();
		String fetched = seqgen.execute(current);
		sequences.add(fetched.toUpperCase());
	}
	return sequences;
}
/**
 * Fetches the upper-cased sequence of every stranded region, in list order.
 * Sequences of regions on the '-' strand are reverse-complemented.
 *
 * @param regions stranded regions to look up
 * @param seqgen generator used to fetch each sequence; a new SequenceGenerator is created when null
 * @return one upper-case sequence string per input region
 */
public static List<String> getSequencesForStrandedRegions(List<StrandedRegion> regions, SequenceGenerator seqgen){
	ArrayList<String> sequences = new ArrayList<String>();
	if(seqgen==null){
		seqgen = new SequenceGenerator();
	}
	Iterator<StrandedRegion> cursor = regions.iterator();
	while(cursor.hasNext()){
		StrandedRegion current = cursor.next();
		String upper = seqgen.execute(current).toUpperCase();
		boolean minusStrand = current.getStrand()=='-';
		sequences.add(minusStrand ? SequenceUtils.reverseComplement(upper) : upper);
	}
	return sequences;
}
/**
 * Randomly picks {@code numSamples} regions from the genome.
 * <p>
 * Chromosomes are laid end to end; a uniformly random offset into that concatenation is
 * drawn, mapped back to a chromosome, and accepted when the window starting there ends
 * before the chromosome does and overlaps no blacklisted region. Draws are repeated
 * until enough regions are accepted. Each returned region spans {@code sampleSize}
 * positions beyond its start (start and end differ by {@code sampleSize}).
 * <p>
 * The method can still run for a very long time (or forever) if the blacklist leaves
 * little or no room for a valid window; only the "no chromosome is long enough" case is
 * detected up front.
 *
 * @param gen genome to sample from
 * @param blackList regions that picked windows must not overlap; may be null
 * @param numSamples number of regions to return
 * @param sampleSize difference between end and start of each picked region
 * @return the picked regions
 * @throws IllegalArgumentException if {@code numSamples > 0} but no chromosome can hold a
 *         window of {@code sampleSize} (this case previously looped forever)
 */
public static List<Region> randomRegionPick(Genome gen, List<Region> blackList, int numSamples, int sampleSize){
	List<Region> regs = new ArrayList<Region>();
	Random rand = new Random();
	int validSamples=0;

	//First see how big the genome is:
	int numChroms=0;
	long genomeSize=0;
	long [] chromoSize = new long[gen.getChromList().size()];
	String [] chromoNames = new String[gen.getChromList().size()];
	Iterator<NamedRegion> chroms = new ChromRegionIterator(gen);
	while (chroms.hasNext()) {
		NamedRegion currentChrom = chroms.next();
		genomeSize += currentChrom.getWidth();
		chromoSize[numChroms]=currentChrom.getWidth();
		chromoNames[numChroms]=currentChrom.getChrom();
		numChroms++;
	}

	//Refuse to start if no draw could ever be accepted, instead of spinning forever.
	if(numSamples>0){
		boolean anyFits=false;
		long offset=0;
		for(int c=0; c<numChroms && !anyFits; c++){
			//Smallest random position that maps to this chromosome (positions start at 1).
			long firstPos = Math.max(1L, offset);
			anyFits = firstPos+sampleSize < offset+chromoSize[c];
			offset += chromoSize[c];
		}
		if(!anyFits)
			throw new IllegalArgumentException("No chromosome is long enough for sampleSize " + sampleSize);
	}

	//Now, iteratively generate random positions and check that they are valid and not blacklisted.
	while(validSamples<numSamples){
		long randPos = (long)(1+(rand.nextDouble()*genomeSize));
		//find the chr
		boolean found=false;
		long total=0;
		for(int c=0; c<numChroms && !found; c++){
			if(randPos<total+chromoSize[c]){
				found=true;
				if(randPos+sampleSize<total+chromoSize[c]){
					Region potential = new Region(gen, chromoNames[c], (int)(randPos-total), (int)(randPos+sampleSize-total));
					//is this region in the blacklist?
					boolean valid=true;
					if(blackList!=null){
						for(Region r : blackList){
							//One overlap is enough to reject; no need to scan the rest.
							if(potential.overlaps(r)){valid=false; break;}
						}
					}
					if(valid){
						validSamples++;
						regs.add(potential);
					}
				}
			}
			total+=chromoSize[c];
		}
	}
	return(regs);
}
/**
 * Maps each region to its midpoint, preserving list order.
 *
 * @param regs regions to convert
 * @return a new list holding one midpoint per input region
 */
public static List<Point> regions2midpoints(List<Region> regs){
	List<Point> midpoints = new ArrayList<Point>();
	Iterator<Region> cursor = regs.iterator();
	while(cursor.hasNext()){
		Region current = cursor.next();
		midpoints.add(current.getMidpoint());
	}
	return midpoints;
}
/**
 * Convert a base to an int value: 'A' is 0, 'C' is 1, 'G' is 2, 'T' is 3.
 * Only these four upper-case characters are accepted.
 *
 * @param base nucleotide character
 * @return the code of the base, between 0 and 3
 * @throws IllegalArgumentException if base is not one of 'A', 'C', 'G', 'T'
 */
public static int base2int(char base) {
	// The position of the base within this string is its code.
	int code = "ACGT".indexOf(base);
	if (code < 0) {
		throw new IllegalArgumentException("Invalid character: " + base);
	}
	return code;
}
/**
 * Return the base for the specified integer: 0 is 'A', 1 is 'C', 2 is 'G', 3 is 'T'.
 *
 * @param x code of the base, between 0 and 3
 * @return the corresponding nucleotide character
 * @throws IllegalArgumentException if x is outside 0..3
 */
public static char int2base(int x) {
	if (x < 0 || x > 3) {
		throw new IllegalArgumentException("Invalid int: " + x);
	}
	// The code doubles as the index into the alphabet.
	return "ACGT".charAt(x);
}
/**
 * Convert a nucleotide sequence to an integer value, packing two bits per base with the
 * first base in the most significant position (base codes come from {@code base2int}).
 * An empty sequence yields 0.
 * <p>
 * The result is a 32-bit int, so only 15 bases fit without touching the sign bit; a
 * 16-base sequence uses all 32 bits and may be negative, and for longer sequences the
 * leading bases are shifted out and lost.
 *
 * @param seq sequence made of the characters accepted by {@code base2int}
 * @return the packed integer value
 * @throws IllegalArgumentException (from {@code base2int}) if seq contains an invalid character
 */
public static int seq2int(String seq) {
	int intVal = 0;
	int len = seq.length();
	for (int i = 0; i < len; i++) {
		// base2int throws on invalid characters, so no -1 sentinel needs checking here.
		intVal = (intVal << 2) + base2int(seq.charAt(i));
	}
	return intVal;
}
/**
 * Convert an integer value to the nucleotide sequence of length {@code kmerLen} it
 * encodes: two bits per base, first base most significant, with 0='A', 1='C', 2='G',
 * 3='T'. Values shorter than {@code kmerLen} bases are left-padded with 'A'.
 *
 * @param x value to decode; must satisfy {@code 0 <= x < 4^kmerLen}
 * @param kmerLen number of bases to produce; must not be negative
 * @return the decoded sequence, exactly {@code kmerLen} characters long
 * @throws IllegalArgumentException if x or kmerLen is out of range
 */
public static String int2seq(long x, int kmerLen) {
	if (kmerLen < 0) {
		throw new IllegalArgumentException("Invalid kmerLen: " + kmerLen);
	}
	// 4^kmerLen == 1L << (2 * kmerLen). The shift is done on a long: the former int shift
	// wrapped around for kmerLen >= 16 (and for kmerLen == 0) and rejected valid values.
	// From kmerLen 32 upwards 4^kmerLen exceeds the long range, so any non-negative x fits.
	boolean tooLarge = kmerLen < 32 && x > (1L << (2 * kmerLen)) - 1;
	if (x < 0 || tooLarge) {
		throw new IllegalArgumentException("Invalid int value, " + x + ", for kmerLen " + kmerLen);
	}
	final String bases = "ACGT";
	StringBuilder seq = new StringBuilder(kmerLen);
	// Emit the least significant base first, then reverse.
	for (int i = 0; i < kmerLen; i++) {
		seq.append(bases.charAt((int) (x & 3)));
		x = x >> 2;
	}
	return seq.reverse().toString();
}
/**
 * Enumerates every k-mer of the given length by decoding the integers
 * 0 .. 4^kmerLen - 1 with {@code int2seq}, in increasing order of that integer.
 *
 * @param kmerLen length of the k-mers to generate
 * @return list of 4^kmerLen sequences
 */
public static List<String> getAllKmers(int kmerLen){
	List<String> all = new ArrayList<String>();
	int total = (int) Math.pow(4, kmerLen);
	int code = 0;
	while(code < total){
		String kmer = int2seq(code, kmerLen);
		all.add(kmer);
		code++;
	}
	return all;
}
}