package abra; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Set; import java.util.TreeSet; /** * Produces BED files indicating genomic windows and kmer sizes that lend themselves to assembly. * * @author Lisle E. Mose (lmose at unc dot edu) */ public class KmerSizeEvaluator { public static final int MIN_KMER = 5; private int readLength; private CompareToReference2 c2r; private String outputFile; private BufferedWriter output; private Set<Feature> outputRegions; private ThreadManager threadManager; private String regionsBed; public KmerSizeEvaluator(int readLength, CompareToReference2 c2r, String outputFile, int numThreads, String regionsBed) { this.readLength = readLength; this.c2r = c2r; this.outputFile = outputFile; this.regionsBed = regionsBed; this.threadManager = new ThreadManager(numThreads); } public KmerSizeEvaluator() { } private String getBases(Feature region, CompareToReference2 c2r) { return c2r.getSequence(region.getSeqname(), (int) region.getStart()+1-(readLength-1), (int) region.getLength() + (readLength*2-2)); } public void run() throws IOException, InterruptedException { // new NativeLibraryLoader().load("."); List<Feature> regions = ReAligner.getRegions(regionsBed, readLength, false); output = new BufferedWriter(new FileWriter(outputFile, false)); outputRegions = Collections.synchronizedSortedSet(new TreeSet<Feature>(new RegionComparator())); String lastChromosome = ""; for (Feature region : regions) { if (!region.getSeqname().equals(lastChromosome)) { // Write all entries for this chromosome before moving on to the next threadManager.waitForAllThreadsToComplete(); outputRegions(output, outputRegions); outputRegions.clear(); threadManager = new ThreadManager(threadManager.getNumThreads()); } String regionBases = getBases(region, c2r); //TODO: Handle other ambiguous bases if (!regionBases.contains("N")) { threadManager.spawnThread(new EvalRunnable(threadManager, this, region, regionBases)); } else { excludeRegion(region); outputRegions.add(region); } lastChromosome = region.getSeqname(); } threadManager.waitForAllThreadsToComplete(); // Because assembly regions are overlapped, there is overlap between final include/exclude output outputRegions(output, outputRegions); output.close(); System.err.println("Done."); } private void excludeRegion(Feature region) { region.setAdditionalInfo((readLength+1) + "\tN"); } public int identifyMinKmer(int readLength, CompareToReference2 c2r, List<Feature> regions) { List<String> regionBases = new ArrayList<String>(); for (Feature region : regions) { regionBases.add(getBases(region, c2r)); } boolean isEditDistanceOK = false; int distKmer = MIN_KMER; while (!isEditDistanceOK && distKmer < readLength) { isEditDistanceOK = isHammingDistanceAtLeast2(regionBases, distKmer); if (!isEditDistanceOK) { distKmer += 2; } } return distKmer; } private void evalRegion(Feature region, String regionBases) { List<Feature> regionList = new ArrayList<Feature>(); regionList.add(region); int distKmer = identifyMinKmer(readLength, c2r, regionList); region.setAdditionalInfo(String.valueOf(distKmer) + "\t."); outputRegions.add(region); } private void outputRegions(BufferedWriter writer, Collection<Feature> regions) throws IOException { for (Feature region : regions) { String output = region.getSeqname() + "\t" + region.getStart() + "\t" + region.getEnd(); if (region.getAdditionalInfo() != null) { output += "\t" + region.getAdditionalInfo(); } output += "\n"; writer.write(output); } } // Basic Region comparator. Only considers start position, so must not // be used across chromosomes. static class RegionComparator implements Comparator<Feature> { @Override public int compare(Feature region1, Feature region2) { return (int) (region1.getStart() - region2.getStart()); } } static class EvalRunnable extends AbraRunnable { private KmerSizeEvaluator evaluator; private Feature region; private String regionBases; public EvalRunnable(ThreadManager threadManager, KmerSizeEvaluator evaluator, Feature region, String regionBases) { super(threadManager); this.evaluator = evaluator; this.region = region; this.regionBases = regionBases; } @Override public void go() throws Exception { evaluator.evalRegion(region, regionBases); } } static int[] getKmers(String str) { String[] strings = str.split(","); int[] kmers = new int[strings.length]; for (int i=0; i<strings.length; i++) { kmers[i] = Integer.parseInt(strings[i]); } return kmers; } // Is the hamming distance between all bases in this region at least 2 private boolean isHammingDistanceAtLeast2(List<String> basesList, int k) { List<String> kmers = new ArrayList<String>(); for (String bases : basesList) { for (int i=0; i<=bases.length()-k; i++) { String kmer = bases.substring(i, i+k); for (String kmer2 : kmers) { if (!isHammingDistanceAtLeast2(kmer, kmer2)) { return false; } } kmers.add(kmer); } } return true; } private boolean isHammingDistanceAtLeast2(String kmer1, String kmer2) { int dist = 0; for (int i=0; i<kmer1.length(); i++) { if (kmer1.charAt(i) != kmer2.charAt(i)) { dist += 1; if (dist >=2) { return true; } } } return false; } public static void main(String[] args) throws Exception { // NativeLibraryLoader l = new NativeLibraryLoader(); // l.load("/home/lmose/code/abra/target"); // // int readLength = 100; // String reference = "/home/lmose/reference/chr20/20.fa"; // //String reference = "/home/lmose/dev/abra/dream/test.fa"; // // // int threads = 1; // String outputBed = "/home/lmose/dev/abra/dream/round2/output.bed"; // String regionsBed = "/home/lmose/dev/abra/dream/round2/20.orig.bed"; if (args.length != 5) { System.err.println("KmerSizeEvaluator <readLength> <reference> <output_bed> <num_threads> <input_bed>"); System.exit(-1); } int readLength = Integer.parseInt(args[0]); String reference = args[1]; String outputBed = args[2]; int threads = Integer.parseInt(args[3]); String regionsBed = args[4]; double s = System.currentTimeMillis(); CompareToReference2 c2r = new CompareToReference2(); c2r.init(reference); KmerSizeEvaluator re = new KmerSizeEvaluator(readLength, c2r, outputBed, threads, regionsBed); re.run(); double e = System.currentTimeMillis(); double elapsed = (e-s)/1000; elapsed /= 60; System.err.println("Elapsed mins: " + elapsed); } }