package net.sf.cram;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.Log;
import net.sf.cram.CramTools.LevelConverter;
import net.sf.cram.common.Utils;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.beust.jcommander.converters.FileConverter;
public class RefRepo {
private static Log log = Log.getInstance(RefRepo.class);
public static final String COMMAND = "repo";
private static void printUsage(JCommander jc) {
StringBuilder sb = new StringBuilder();
sb.append("\n");
jc.usage(sb);
System.out.println("Version " + RefRepo.class.getPackage().getImplementationVersion());
System.out.println(sb.toString());
}
public static void main(String[] args) throws IOException {
Params params = new Params();
JCommander jc = new JCommander(params);
try {
jc.parse(args);
} catch (Exception e) {
System.out.println("Failed to parse parameteres, detailed message below: ");
System.out.println(e.getMessage());
System.out.println();
System.out.println("See usage: -h");
System.exit(1);
}
if (args.length == 0 || params.help) {
printUsage(jc);
System.exit(1);
}
Log.setGlobalLogLevel(params.logLevel);
if (params.repoFile == null) {
System.out.println("Expecting repo file.");
printUsage(jc);
System.err.println(1);
}
if (params.repoFile.exists())
readRepoFile(params.repoFile);
Set<String> files = listAllFiles(params.pathsToAdd);
for (String path : files) {
if (byFile.containsKey(path) && !params.refreshPathCollisions)
continue;
for (Entry e : readFile(new File(path)))
put(e);
}
for (Entry e : readFiles(files, params.parallel)) {
put(e);
}
save(params.repoFile);
}
private static void save(File file) throws IOException {
FileWriter w = new FileWriter(file, false);
for (Entry e : map.values()) {
w.write(e.toString());
w.write('\n');
}
w.close();
}
private static List<Entry> readFiles(Collection<String> paths, int threads) {
ExecutorService pool = Executors.newFixedThreadPool(threads);
List<Entry> list;
try {
Set<Future<List<Entry>>> futures = new HashSet<Future<List<Entry>>>();
for (String path : paths) {
FileJob job = new FileJob(new File(path));
log.info("Submitting job: ", path);
Future<List<Entry>> future = pool.submit(job);
futures.add(future);
}
list = new ArrayList<RefRepo.Entry>();
for (Future<List<Entry>> f : futures) {
try {
list.addAll(f.get());
} catch (Exception e) {
throw new RuntimeException(e);
}
}
} finally {
pool.shutdownNow();
}
return list;
}
private static List<Entry> readFile(File file) {
List<Entry> entries = new ArrayList<RefRepo.Entry>();
return entries;
}
private static class FileJob implements Callable<List<Entry>> {
private File file;
public FileJob(File file) {
this.file = file;
}
@Override
public List<Entry> call() throws Exception {
List<Entry> list = new ArrayList<RefRepo.Entry>();
ReferenceSequenceFile rsFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(file);
ReferenceSequence sequence = null;
while ((sequence = rsFile.nextSequence()) != null) {
sequence.getBases();
Entry e = new Entry();
e.md5 = Utils.calculateMD5String(sequence.getBases());
e.file = "file://" + file.getAbsolutePath();
e.name = sequence.getName();
e.length = sequence.length();
log.info(String.format("New entry: %s", e.toString()));
list.add(e);
}
return list;
}
};
private static Set<String> listAllFiles(Collection<String> paths) {
Set<String> set = new TreeSet<String>();
for (String path : paths) {
File file = new File(path);
if (!file.exists()) {
log.warn("File or directory does not exist: " + path);
continue;
}
if (!file.canRead()) {
log.warn("Cannot read file or directory: " + path);
continue;
}
if (file.isDirectory()) {
List<String> subPaths = new ArrayList<String>();
for (File f : file.listFiles())
subPaths.add(f.getAbsolutePath());
set.addAll(listAllFiles(subPaths));
continue;
}
if (!file.isFile()) {
log.warn("Neither file nor directory: " + path);
continue;
}
if (file.getName().endsWith(".fasta") || file.getName().endsWith(".fa")) {
set.add(file.getAbsolutePath());
}
}
return set;
}
private static Map<String, Entry> byFile = new TreeMap<String, RefRepo.Entry>();
private static Map<String, Entry> byFileAndName = new TreeMap<String, RefRepo.Entry>();
private static Map<String, Entry> map = new HashMap<String, RefRepo.Entry>();
private static Entry parse(String line) {
Pattern pattern = Pattern.compile("^@SQ\tSN:(\\w+)\tLN:(\\d+)\tUR:(\\w+)\tM5:([a-z0-9]+)$");
Matcher m = pattern.matcher(line);
if (!m.matches())
throw new RuntimeException("Improper format: " + line);
Entry e = new Entry();
e.name = m.group(1);
e.length = Integer.valueOf(m.group(2));
e.file = m.group(3);
e.md5 = m.group(4);
return e;
}
private static void readRepoFile(File file) throws FileNotFoundException {
Scanner scanner = new Scanner(file);
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
Entry e = parse(line);
put(e);
}
}
private static void put(Entry e) {
map.put(e.md5, e);
byFile.put(e.file, e);
byFileAndName.put(String.format("%s:%s", e.file, e.name), e);
}
private static class Entry {
String md5;
int length;
String file;
String name;
@Override
public String toString() {
return String.format("@SQ\tSN:%s\tLN:%d\tUR:%s\tM5:%s", name, length, file, md5);
}
}
@Parameters(commandDescription = "Register local reference fasta files in the repo file.")
static class Params {
@Parameter(names = { "-l", "--log-level" }, description = "Change log level: DEBUG, INFO, WARNING, ERROR.", converter = LevelConverter.class)
Log.LogLevel logLevel = Log.LogLevel.ERROR;
@Parameter(names = { "-h", "--help" }, description = "Print help and quit")
boolean help = false;
@Parameter(names = { "--repo-file", "-R" }, converter = FileConverter.class, description = "The path to the repository description file.")
File repoFile;
@Parameter(names = { "--refresh", "-f" }, description = "Update all entries for the given paths.")
boolean refreshPathCollisions = false;
@Parameter(description = "A list of directories of files to add to the repository.")
List<String> pathsToAdd;
@Parameter(names = { "--parallel", "-p" }, description = "Use this many parallel threads to calculate checksums.")
int parallel = 1;
}
}