package org.seqcode.genome.sequence;
import java.io.IOException;
import java.io.File;
import java.util.*;
import java.sql.*;
import org.seqcode.data.connections.DatabaseConnectionManager;
import org.seqcode.data.connections.DatabaseException;
import org.seqcode.data.connections.UnknownRoleException;
import org.seqcode.data.io.parsing.FASTAStream;
import org.seqcode.genome.Genome;
import org.seqcode.genome.location.Region;
import org.seqcode.genome.location.StrandedRegion;
import org.seqcode.gsebricks.types.*;
import org.seqcode.gsebricks.verbs.Mapper;
import org.seqcode.gseutils.*;
/**
* <code>SequenceGenerator</code> maps a Region to the genomic
* sequence included in that Region.
*
* 1-based genome
*/
public class SequenceGenerator<X extends Region> implements Mapper<X,String>, SelfDescribingVerb {
private Genome genome;
private static Map<Integer,String> cache;
private boolean useCache = false;
private boolean useLocalFiles = false;
private String genomePath = null;
private boolean genomePathIsFullGenomeFile=false;
private int maxQuery = -1;
private static Map<String, String[]> regionCache;
private static Map<String, int[]> regionStarts;
private static boolean regionIsCached = false;
public boolean isRegionCached(){return regionIsCached;}
public boolean usingLocalFiles(){return useLocalFiles;}
/**
* SequenceGenerator
* maxQuery is here to avoid calling the sequence generator if the Region width is greater
* than a given size. This is really only useful for SeqView painters.
* @param g
*/
public SequenceGenerator (Genome g) {this(g, -1);}
public SequenceGenerator (Genome g, int maxQuery) {
genome = g;
this.maxQuery=maxQuery;
}
public SequenceGenerator() {}
public void useCache(boolean b) {
if (b && cache == null) {
cache = new HashMap<Integer,String>();
}
useCache = b;
}
public void useLocalFiles(boolean b) {
useLocalFiles = b;
}
public void setGenomePath(String genomePath){
this.genomePath = genomePath;
File test = new File(genomePath);
if(!test.exists()){
System.err.println("Genome sequence directory/file does not exist!");
System.exit(1);
}else{
if(test.isFile() && (genomePath.endsWith(".fa")||genomePath.endsWith(".fasta")||genomePath.endsWith(".seq")))
genomePathIsFullGenomeFile=true;
}
}
/** cache the whole chromosome of this region */
private void cache(X region) throws SQLException, IOException {
int chromid = region.getGenome().getChromID(region.getChrom());
synchronized(cache) {
if (cache.containsKey(chromid)) {
return;
}
}
String chromseq = null;
if (useLocalFiles) {
if (genomePath==null)
genomePath = "/scratch/" + region.getGenome().getVersion();
File f=null;
if(genomePathIsFullGenomeFile){
f = new File(genomePath);
}else{
f = new File( genomePath + "/chr" + region.getChrom() + ".fa");
if (!f.exists())
f = new File( genomePath+ "/chr" + region.getChrom() + ".fasta");
if (!f.exists())
f = new File( genomePath+ "/" + region.getChrom() + ".fa");
if (!f.exists())
f = new File( genomePath+ "/" + region.getChrom() + ".fasta");
if (!f.exists())
f = new File( genomePath+ "/chromosome" + region.getChrom() + ".fa");
if (!f.exists())
f = new File( genomePath+ "/chromosome" + region.getChrom() + ".fasta");
if (!f.exists())
f = new File( genomePath+ "/chrom" + region.getChrom() + ".fa");
if (!f.exists())
f = new File( genomePath+ "/chrom" + region.getChrom() + ".fasta");
}
if (f.exists()) {
FASTAStream stream = new FASTAStream(f);
while (stream.hasNext()) {
Pair<String,String> pair = stream.next();
String pairchrom = pair.car().replaceFirst("^chromosome", "").replaceFirst("^chrom", "").replaceFirst("^chr","");
if (pairchrom.equals(region.getChrom())) {
chromseq = pair.cdr();
break;
}
}
stream.close();
}else{
System.out.print("FASTA file for chromosome "+region.getChrom() +" is not found at "+genomePath+". \n");
System.exit(-1);
}
}
if (chromseq == null) {
java.sql.Connection cxn = DatabaseConnectionManager.getConnection("core");
PreparedStatement ps = cxn.prepareStatement("select sequence from chromsequence where id = ?");
ps.setInt(1,chromid);
ResultSet rs = ps.executeQuery();
if (rs.next()) {
chromseq = rs.getString(1);
}
rs.close();
ps.close();
if(cxn!=null) try {cxn.close();}catch (Exception ex) {throw new DatabaseException("Couldn't close connection with role core", ex); }
}
if (chromseq == null) {
return;
}
synchronized(cache) {
if (!cache.containsKey(chromid)) {
cache.put(chromid, chromseq);
}
}
}
/**
* get sequence of specified region (including start and end)
*/
public String execute(X region) {
String result = null;
if(maxQuery==-1 || region.getWidth()<=maxQuery){
if (regionIsCached)
return getRegionCacheSequence(region);
String chromname = region.getChrom();
try {
Genome genome = region.getGenome();
int chromid = genome.getChromID(chromname);
if (useCache) {
cache(region);
String chromString = null;
synchronized(cache) {
if (!cache.containsKey(chromid)) {
return null;
}
chromString = cache.get(chromid);
}
//1-based version (string.substr is 0-based)
result = chromString.substring(region.getStart()-1, region.getEnd());
}
if (result == null) {
java.sql.Connection cxn =
DatabaseConnectionManager.getConnection("core");
PreparedStatement ps;
//1-based version (mysql substr is 1-based)
int start = Math.max(region.getStart(), 0);
ps = cxn.prepareStatement("select substr(sequence,?,?) from chromsequence where id = ?");
ps.setInt(1,start);
ps.setInt(2,region.getEnd() - region.getStart() + 1);
ps.setInt(3,chromid);
ResultSet rs = ps.executeQuery();
if (rs.next()) {
result = rs.getString(1);
}
rs.close();
ps.close();
if(cxn!=null) try {cxn.close();}catch (Exception ex) {throw new DatabaseException("Couldn't close connection with role core", ex); }
}
} catch (SQLException ex) {
ex.printStackTrace();
} catch (UnknownRoleException ex) {
ex.printStackTrace();
throw new DatabaseException("Couldn't connect to core",ex);
} catch (IOException ex) {
ex.printStackTrace();
throw new RuntimeException("Couldn't load file to cache " + ex.toString(), ex);
} catch (StringIndexOutOfBoundsException ex){
System.err.println("Error in "+region.getLocationString());
ex.printStackTrace();
}
if (result == null) {
throw new DatabaseException("Couldn't get any sequence for " + region);
}
if (result.length() != region.getWidth()) {
System.err.println("Wanted " + region + "(" +
region.getWidth() + ") but only got " + result.length());
}
}
return result;
}
/**
* Setup light-weight region cache of genome sequences, cover only the specified regions<br>
* So that it does not cache the whole chromosome, save memory space. <br>
* At the same time, retrieve some one-time sequences in rs.
* @param regions sorted, non-overlapping regions for cache
* @param rs regions for one-time sequence retrieval
*/
public String[] setupRegionCache(List<Region> regions, List<Region> rs){
ArrayList<String> seqs = new ArrayList<String>();
if (regions==null||regions.isEmpty())
return null;
Collections.sort(regions);
// group one-time regions by chrom
Map<String, ArrayList<Region>> chr2rs = new HashMap<String, ArrayList<Region>>();
for(Region r:rs) {
String chrom = r.getChrom();
if(!chr2rs.containsKey(chrom))
chr2rs.put(chrom, new ArrayList<Region>());
chr2rs.get(chrom).add(r);
}
useCache(true);
regionCache = new HashMap<String, String[]>();
regionStarts = new HashMap<String, int[]>();
Genome g = regions.get(0).getGenome();
Region lastRegion = regions.get(regions.size()-1);
// setup the region space
String chrom = regions.get(0).getChrom();
int count = 0;
for(Region r: regions){
if (!r.getChrom().equals(chrom)){ // new chrom
regionCache.put(chrom, new String[count]);
regionStarts.put(chrom, new int[count]);
chrom = r.getChrom();
count = 1;
}
else // same chrom
count ++;
}
regionCache.put(chrom, new String[count]);
regionStarts.put(chrom, new int[count]);
chrom = regions.get(0).getChrom();
count = 0;
for (int i=0;i<regions.size();i++){
Region r = regions.get(i);
if (!r.getChrom().equals(chrom)){ // new Chrom
if (cache!=null){ // for previous chrom
if (chr2rs.containsKey(chrom)){ // piggy-back to retrieve one-time sequences
for (Region r1:chr2rs.get(chrom))
seqs.add(execute((X)r1));
}
synchronized(cache) {
cache.put(g.getChromID(chrom), null);
cache.remove(g.getChromID(chrom)); // clean cache
}
System.gc();
}
chrom = r.getChrom();
count = 0;
}
// cache region using the current cache
synchronized(regionCache) {
regionStarts.get(chrom)[count]=r.getStart();
regionCache.get(chrom)[count]=execute((X)r);
}
count ++;
}
if (cache!=null){
if (chr2rs.containsKey(lastRegion.getChrom())){ // piggy-back to retrieve one-time sequences
for (Region r1:chr2rs.get(lastRegion.getChrom()))
seqs.add(execute((X)r1));
}
synchronized(cache) {
cache.put(g.getChromID(lastRegion.getChrom()), null);
cache.remove(g.getChromID(lastRegion.getChrom()));
}
// retrieve those regions that are not in the chromosomes of cache regions
for(String chr:chr2rs.keySet()){
if (!regionCache.containsKey(chr)){
for (Region r1:chr2rs.get(chr))
seqs.add(execute((X)r1));
synchronized(cache) {
cache.put(g.getChromID(chrom), null);
cache.remove(g.getChromID(chrom)); // clean cache for this chrom
}
System.gc();
}
}
cache=null;
System.gc();
}
regionIsCached = true;
String[] result = new String[seqs.size()];
for (int i=0;i<result.length;i++)
result[i]=seqs.get(i);
return result;
}
private String getRegionCacheSequence(Region r){
int[] starts = regionStarts.get(r.getChrom());
int idx = Arrays.binarySearch(starts, r.getStart());
if( idx < 0 ) { idx = -idx - 2; }
if (!regionCache.containsKey(r.getChrom()))
return null;
synchronized(regionCache) {
try{
return regionCache.get(r.getChrom())[idx].substring(r.getStart()-starts[idx], r.getEnd()-starts[idx]+1);
}
catch(Exception e){
e.printStackTrace(System.out);
System.out.println(r.toString()+" idx="+idx+" starts[idx]="+starts[idx]);
return null;
}
}
}
public static void clearCache() {
synchronized(cache) {
cache.clear();
}
}
// the following method has bee add by akshay
public static void setOffRegionCache(){regionIsCached = false;}
private static final String[] inputNames = { "Regions" };
private static final EchoType[] inputTypes = { new ClassType(Region.class) };
private static final EchoType outputType = new ClassType(String.class);
public EchoType[] getInputClasses() { return inputTypes; }
public String[] getInputNames() { return inputNames; }
public EchoType getOutputClass() { return outputType; }
public EchoType[] getParameterClasses() {
return null;
}
public String[] getParameterNames() {
return null;
}
public void init(Map<String, Object> params) {
}
}