package abra; import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Set; import java.util.TreeSet; /** * Produces BED files indicating genomic windows that lend themselves to assembly. * * @author Lisle E. Mose (lmose at unc dot edu) */ public class ReferenceEvaluator { private int readLength; private String reference; private int[] kmers; private String in; private String out; private String qualities; private BufferedWriter include; private BufferedWriter exclude; private Set<Feature> includeRegions; private Set<Feature> excludeRegions; private ThreadManager threadManager; private static final int MAX_NODES = 9000; public ReferenceEvaluator(int readLength, String reference, int[] kmers, String in, String out, int numThreads) { this.readLength = readLength; this.reference = reference; this.kmers = kmers; this.in = in; this.out = out; this.qualities = new String(); for (int i=0; i<readLength; i++) { qualities += "H"; } this.threadManager = new ThreadManager(numThreads); } public void run() throws IOException, InterruptedException { new NativeLibraryLoader().load("."); CompareToReference2 c2r = new CompareToReference2(); c2r.init8bit(reference); include = new BufferedWriter(new FileWriter(in, false)); exclude = new BufferedWriter(new FileWriter(out, false)); for (String chr : c2r.getChromosomes()) { includeRegions = Collections.synchronizedSortedSet(new TreeSet<Feature>(new RegionComparator())); excludeRegions = Collections.synchronizedSortedSet(new TreeSet<Feature>(new RegionComparator())); int i = 0; int chromosomeLength = c2r.getReferenceLength(chr); while (i < chromosomeLength - ReAligner.MAX_REGION_LENGTH) { int regionStart = i; int regionStop = i + ReAligner.MAX_REGION_LENGTH; int start = Math.max(regionStart - readLength, 0); int stop = Math.min(regionStop + readLength, chromosomeLength-1); String regionBases = c2r.getSequence(chr, start+1, stop-start); Feature region = new Feature(chr, regionStart, regionStop); //TODO: Handle other ambiguous bases if (!regionBases.contains("N")) { threadManager.spawnThread(new EvalRunnable(threadManager, this, region, regionBases)); } else { excludeRegions.add(region); } i += ReAligner.REGION_OVERLAP; } threadManager.waitForAllThreadsToComplete(); //TODO: Because assembly regions are overlapped, there is overlap between final include/exclude output outputRegions(include, includeRegions); outputRegions(exclude, excludeRegions); } include.close(); exclude.close(); System.err.println("Done."); } private void evalRegion(Feature region, String regionBases) { boolean shouldInclude = false; NativeAssembler assembler = new NativeAssembler(); StringBuffer readBuf = new StringBuffer((ReAligner.MAX_REGION_LENGTH + 2*readLength) * readLength); for (int j=0; j<=regionBases.length() - readLength; j++) { readBuf.append("0"); // forward strand only String read = regionBases.substring(j, j+readLength); readBuf.append(read); readBuf.append(qualities); } String contig = assembler.nativeAssemble(readBuf.toString(), region.getDescriptor(), "eval", 0, 1, (ReAligner.MAX_REGION_LENGTH + 2*readLength)*2, readLength, kmers, 1, 0, .01, 1, MAX_NODES); int basesIdx = contig.indexOf('\n') + 1; if (basesIdx < contig.length()) { String contigBases = contig.substring(basesIdx, contig.length()-1); if (regionBases.equals(contigBases)) { shouldInclude = true; } } if (shouldInclude) { includeRegions.add(region); } else { excludeRegions.add(region); } } private void outputRegions(BufferedWriter writer, Collection<Feature> regions) throws IOException { List<Feature> mergedRegions = RegionLoader.collapseRegions(regions, 0); for (Feature region : mergedRegions) { writer.write(region.getSeqname() + "\t" + region.getStart() + "\t" + region.getEnd() + "\n"); } } // Basic Region comparator. Only considers start position, so must not // be used across chromosomes. static class RegionComparator implements Comparator<Feature> { @Override public int compare(Feature region1, Feature region2) { return (int) (region1.getStart() - region2.getStart()); } } static class EvalRunnable extends AbraRunnable { private ReferenceEvaluator evaluator; private Feature region; private String regionBases; public EvalRunnable(ThreadManager threadManager, ReferenceEvaluator evaluator, Feature region, String regionBases) { super(threadManager); this.evaluator = evaluator; this.region = region; this.regionBases = regionBases; } @Override public void go() throws Exception { evaluator.evalRegion(region, regionBases); } } static int[] getKmers(String str) { String[] strings = str.split(","); int[] kmers = new int[strings.length]; for (int i=0; i<strings.length; i++) { kmers[i] = Integer.parseInt(strings[i]); } return kmers; } public static void main(String[] args) throws Exception { /* NativeLibraryLoader l = new NativeLibraryLoader(); l.load("/home/lmose/code/abra/target"); int readLength = 100; //String reference = "/home/lmose/reference/chr20/20.fa"; String reference = "/home/lmose/dev/abra/dream/test.fa"; int[] kmers = new int[] { 43,53,63,73,83 }; String in = "/home/lmose/dev/abra/dream/include.bed"; String out = "/home/lmose/dev/abra/dream/exclude.bed"; */ if (args.length != 6) { System.out.println("ReferenceEvaluator <readLength> <reference> <kmers> <include_bed> <exclude_bed> <num_threads>"); } int readLength = Integer.parseInt(args[0]); String reference = args[1]; int kmers[] = getKmers(args[2]); String includeBed = args[3]; String excludeBed = args[4]; int threads = Integer.parseInt(args[5]); ReferenceEvaluator re = new ReferenceEvaluator(readLength, reference, kmers, includeBed, excludeBed, threads); re.run(); } }