/* Copyright 2013 University of North Carolina at Chapel Hill. All rights reserved. */ package abra.utils; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import java.util.zip.GZIPInputStream; public class DbSnpSearcher { private SortedSet<Variant> variants = new TreeSet<Variant>(); public void run(String dbSnpFile, String variantFile) throws Exception { load(dbSnpFile); BufferedReader reader = new BufferedReader(new FileReader(variantFile)); String line = reader.readLine(); int matchCount = 0; while (line != null) { String[] fields = line.split("\t"); String chr = "chr" + fields[2]; int pos = Integer.parseInt(fields[3]); int length = Math.max(Integer.parseInt(fields[5])-1, Integer.parseInt(fields[6])-1); // int fuzz = 100 + length; int fuzz = 100; Variant floor = new Variant("", chr, pos-fuzz, length); Variant ceil = new Variant("", chr, pos+fuzz, length); Set<Variant> matches = variants.subSet(floor, ceil); StringBuffer matchStr = new StringBuffer(); boolean isMatch = false; for (Variant match : matches) { // if (match.length == length) { if (Math.abs(match.length - length) <= (length/3)+1) { matchStr.append(match.toString()); matchStr.append(','); isMatch = true; } } if (isMatch) { matchCount++; } System.out.println(line.trim() + "\t" + matchStr); line = reader.readLine(); } reader.close(); System.out.println("match count: " + matchCount); } private void load(String dbSnpFile) throws FileNotFoundException, IOException { GZIPInputStream gzip = new GZIPInputStream(new FileInputStream(dbSnpFile)); BufferedReader br = new BufferedReader(new InputStreamReader(gzip)); String line = br.readLine(); int count = 0; while (line != null) { String[] fields = line.split("\t"); String chr = fields[1]; int pos = Integer.parseInt(fields[2]); String name = fields[4]; int length = fields[8].length(); Variant var = new Variant(name, chr, pos, length); variants.add(var); line = br.readLine(); count++; } br.close(); gzip.close(); // System.err.println("" + count + " variants loaded"); } static class Variant implements Comparable<Variant> { String name; String chr; int pos; int length; public Variant(String name, String chr, int pos, int length) { this.name = name; this.chr = chr; this.pos = pos; this.length = length; } @Override public int compareTo(Variant that) { // TODO Auto-generated method stub int compare = this.chr.compareTo(that.chr); if (compare == 0) { compare = this.pos - that.pos; } if (compare == 0) { compare = this.length - that.length; } if (compare == 0) { compare = this.name.compareTo(that.name); } return compare; } @Override public int hashCode() { return name.hashCode(); } @Override public boolean equals(Object obj) { Variant that = (Variant) obj; return ((this.name.equals(that.name)) && (this.chr.equals(that.chr)) && (this.pos == that.pos) && (this.length == that.length)); } public String toString() { return name + ":" + chr + ":" + pos + ":" + length; } } public static void main(String[] args) throws Exception { DbSnpSearcher s = new DbSnpSearcher(); // s.run("/home/lmose/dev/ayc/germline_analysis/dbsnp_indels.txt.gz", "/home/lmose/dev/ayc/germline_analysis/abra_only.txt"); // s.run("/home/lmose/dev/ayc/germline_analysis/dbsnp_indels.txt.gz", "/home/lmose/dev/ayc/germline_analysis/all_abra_only.txt"); // s.run("/home/lmose/dev/ayc/germline_analysis/dbsnp_indels.txt.gz", "/home/lmose/dev/ayc/germline_analysis/abra_only_qual.txt"); // s.run("/home/lmose/dev/ayc/germline_analysis/dbsnp_indels.txt.gz", "/home/lmose/dev/ayc/germline_analysis/abra_only_high_freq.txt"); // s.run("/home/lmose/dev/ayc/germline_analysis/dbsnp_indels.txt.gz", "/home/lmose/Documents/abra/abra46_results/abra_germline_indels2.txt"); // s.run("/home/lmose/dev/ayc/germline_analysis/dbsnp_indels.txt.gz", "/home/lmose/Documents/abra/abra46_results/gl3.tsv"); // s.run("/home/lmose/dev/ayc/germline_analysis/dbsnp_indels.txt.gz", "/home/lmose/dev/ayc/germline_analysis/round2/abra_only.txt"); // s.run("/home/lmose/dev/ayc/germline_analysis/dbsnp_indels.txt.gz", "/home/lmose/dev/abra/calls2/germline/data/all_germline.sort.txt"); // s.run("/home/lmose/dev/ayc/germline_analysis/dbsnp_indels.txt.gz", "/home/lmose/dev/abra/calls5/data/all.calls.txt"); // s.run("/home/lmose/dev/ayc/germline_analysis/dbsnp_indels.txt.gz", "/home/lmose/dev/abra/calls6/germline/data/abra48_freebayes_prim.tsv"); s.run("/home/lmose/dev/ayc/germline_analysis/dbsnp_indels.txt.gz", "/home/lmose/dev/abra/calls6/round2/germline/tcga_brca_germline1.txt"); } }