package com.github.lindenb.jvarkit.tools.biostar; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Random; import htsjdk.samtools.util.CloserUtil; import com.beust.jcommander.Parameter; import com.github.lindenb.jvarkit.util.bio.bed.BedLine; import com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec; import com.github.lindenb.jvarkit.util.jcommander.Launcher; import com.github.lindenb.jvarkit.util.jcommander.Program; import com.github.lindenb.jvarkit.util.log.Logger; @Program(name="biostar77828", description="Divide the human genome among X cores, taking into account gaps See http://www.biostars.org/p/77828/ ", biostars=77828 ) public class Biostar77828 extends Launcher { private static final Logger LOG = Logger.build(Biostar77828.class).make(); @Parameter(names={"-o","--output"},description="Output file. Optional . Default: stdout") private File outputFile = null; @Parameter(names={"-minc","--minc"},description="min core") private int MINC = 20 ; @Parameter(names={"-maxc","--maxc"},description="max core") private int MAXC = 30 ; @Parameter(names={"-iter","--iter"},description="number of iterations") private long N_ITERATIONS = 1000000 ; private List<Segment> all_segments=new ArrayList<Segment>(); private final Random random=new Random(System.currentTimeMillis()); private long effective_genome_size=0L; private static class Segment implements Comparable<Segment> { String chrom; int start; int end; String name; Segment(String chrom,int start,int end) { this(chrom,start,end,chrom+":"+start+":"+end); } Segment(String chrom,int start,int end,String name) { this.chrom=chrom; this.start=start; this.end=end; this.name=name; } int size() { return end-start; } @Override public int compareTo(Segment o) { int i=chrom.compareTo(o.chrom); if(i!=0) return i; i=start-o.start; if(i!=0) return i; return end-o.end; } } private static class Core { List<Segment> segments=new ArrayList<Segment>(); long length() { long L=0L; for(Segment s:this.segments) L+=s.size(); return L; } void simplify() { Collections.sort(this.segments); int i=0; while(i+1< this.segments.size()) { Segment S1=this.segments.get(i); Segment S2=this.segments.get(i+1); if(S1.chrom.equals(S2.chrom) && S1.end==S2.start) { this.segments.set(i, new Segment(S1.chrom, S1.start, S1.end,S1.name)); this.segments.remove(i+1); } else { i++; } } } } private static class Solution implements Comparable<Solution> { long generation=0L; List<Core> cores=new ArrayList<Core>(); private Double _mean=null; private Double _stddev=null; double mean() { if(_mean==null) { _mean=0.0; for(Core s:this.cores) _mean+=s.length(); _mean/=this.cores.size(); } return _mean; } Double stddev() { if(_stddev==null) { double m=mean(); _stddev=0.0; for(Core s:this.cores) _stddev+=Math.pow(s.length()-m,2.0); _stddev=Math.sqrt(_stddev/(this.cores.size())); } return _stddev; } @Override public int compareTo(Solution other) { return this.stddev().compareTo(other.stddev()); } public void print(PrintStream out) { String header="##mean:"+mean()+"\n##stdev:"+stddev()+" \n##cores:"+this.cores.size()+"\n##generation:"+this.generation; out.println(header); for(int i=0;i< this.cores.size();++i) { this.cores.get(i).simplify(); out.println("#cores["+i+"]. N="+this.cores.get(i).segments.size()+" size_bp="+this.cores.get(i).length()+" bp."); for(Segment seg:this.cores.get(i).segments) { out.print(seg.chrom); out.print('\t'); out.print(seg.start); out.print('\t'); out.print(seg.end); out.print('\t'); out.print(seg.size()); out.print('\t'); out.print(seg.name); out.println(); } } out.println(header); } @Override public String toString() { return "cores:"+this.cores.size()+" mean "+mean()+" stddev:"+stddev()+" gen:"+generation; } } private Solution createSolution() { int n_cores= this.MINC+(this.MINC>=this.MAXC?0:this.random.nextInt(this.MAXC-this.MINC)) ; long optimal_size= (this.effective_genome_size/n_cores); Solution sol=new Solution(); //create a copy, split the segments ArrayList<Segment> segments=new ArrayList<Segment>(this.all_segments.size()); for(Segment seg:this.all_segments) { if((long)seg.size()<=optimal_size) { segments.add(seg); } else { int split_count=(int)(Math.ceil(seg.size()/(double)optimal_size)); int fragment_size=seg.size()/split_count; int start=seg.start; for(int s=0;s<split_count;++s) { int end=(s+1==split_count?seg.end:start+fragment_size); segments.add(new Segment(seg.chrom, start, end,seg.name)); start=end; } } } //shuffle Collections.shuffle(segments, this.random); //create cores, and add one segment for(int i=0;i< n_cores && i< segments.size();++i) { //get last Core core=new Core(); core.segments.add(segments.get(i)); sol.cores.add(core); } double seq_total_size=0.0; int seq_count=0; //fill core with random segments for(int i=sol.cores.size();i< segments.size();++i) { Segment seg=segments.get(i); //start a new sequence of insertion if(seq_count%sol.cores.size()==0) { //put it in a random core Core core=sol.cores.get(this.random.nextInt(sol.cores.size())); core.segments.add(seg); seq_total_size=seg.size(); seq_count=1; } else { double mean=seq_total_size/seq_count; //find the best core to put this segment best=smaller distance to mean Core best=null; for(Core core:sol.cores) { if(best==null || (Math.abs((core.length()+seg.size())-mean) < Math.abs((best.length()+seg.size())-mean))) { best=core; } } best.segments.add(seg); seq_total_size+=seg.size(); seq_count++; } } return sol; } @Override public int doWork(List<String> args) { PrintStream pw =null; try { LOG.info("load BED"); BufferedReader in=super.openBufferedReader(oneFileOrNull(args)); final BedLineCodec codec = new BedLineCodec(); String line; while((line=in.readLine())!=null) { if(line.isEmpty() || line.startsWith("#")) continue; final BedLine bedLine = codec.decode(line); if(bedLine==null) continue; if(bedLine.getColumnCount()<3) throw new IOException("bad BED input "+bedLine); final Segment seg=new Segment( bedLine.getContig(), bedLine.getStart()-1, bedLine.getEnd() ) ; if(seg.size()==0) continue; this.effective_genome_size+=seg.size(); this.all_segments.add(seg); } pw = super.openFileOrStdoutAsPrintStream(this.outputFile); Solution best=null; for(long generation=0;generation< this.N_ITERATIONS;++generation) { if(generation%100000==0) LOG.info("generation:"+generation+"/"+this.N_ITERATIONS+" "+ (int)((generation/(double)this.N_ITERATIONS)*100.0)+"% "+String.valueOf(best)); Solution sol=createSolution(); sol.generation=generation; if(best==null || sol.compareTo(best)<0) { best=sol; /* if(LOG.isDebugEnabled()) { LOG.info("%%generation:"+generation); best.print(stderr()); }*/ } } if(best!=null) { best.print(pw); } pw.flush(); pw.close(); return RETURN_OK; } catch(final Exception err) { return wrapException(err); } finally { CloserUtil.close(pw); } } /** * @param args */ public static void main(String[] args) { new Biostar77828().instanceMainWithExit(args); } }