package org.genedb.crawl.business;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import org.apache.log4j.Logger;
/**
 * Converts every GFF file ({@code .gff} or {@code .gff.gz}) found in an input
 * folder into indexed files in an output folder.
 *
 * <p>For each input file the generator:
 * <ol>
 * <li>copies it into the output folder (external {@code cp}),</li>
 * <li>unzips it if its name ends with {@code .gz} (external {@code gunzip}),</li>
 * <li>splits it into a {@code .annotations} file and a {@code .fasta} file,</li>
 * <li>sorts and bgzip-compresses the annotations, then indexes them with tabix,</li>
 * <li>indexes the fasta file with {@code samtools faidx}.</li>
 * </ol>
 *
 * <p>All of this work happens inside the constructor.
 */
public class TabixGenerator {
/** Location of the bgzip executable used to compress the sorted annotations. */
private static final String BGZIP = "/Users/gv1/bin/tabix/bgzip";
/** Location of the tabix executable used to index the compressed annotations. */
private static final String TABIX = "/Users/gv1/bin/tabix/tabix";
/** Location of the samtools executable used to index the fasta file. */
private static final String SAMTOOLS = "/Users/gv1/bin/samtools/samtools";
/** Extension stripped from the GFF file name when deriving the output file names. */
private static final String GFF_EXTENSION = ".gff";
private static final Logger logger = Logger.getLogger(TabixGenerator.class);
private File inputFolder;
private File outputFolder;
/** Pair of files produced by splitting one GFF file. */
class GFFFileMap {
/** File holding the lines that followed the {@code ##FASTA} marker. */
public File fasta;
/** File holding the non-comment lines that preceded the {@code ##FASTA} marker. */
public File annotations;
}
/**
 * Processes every GFF file of {@code inputFolder} into {@code outputFolder}.
 *
 * @param inputFolder  existing directory that is scanned (non-recursively) for GFF files
 * @param outputFolder existing directory that receives all generated files
 * @throws IllegalArgumentException if either argument is not a directory
 * @throws IOException              if the input folder cannot be listed, a file cannot be
 *                                  read or written, or a required external command fails
 * @throws InterruptedException     if interrupted while waiting for an external command
 */
public TabixGenerator(File inputFolder, File outputFolder) throws Exception {
if (! inputFolder.isDirectory() || ! outputFolder.isDirectory()) {
throw new IllegalArgumentException("Both the input and output should be folders");
}
this.inputFolder = inputFolder;
this.outputFolder = outputFolder;
File[] inputFiles = this.inputFolder.listFiles(new GFFFileFilter());
if (inputFiles == null) {
// listFiles() returns null (rather than throwing) when the directory cannot be read.
throw new IOException("Could not list the contents of " + inputFolder.getAbsolutePath());
}
for (File inputFile : inputFiles) {
GFFFileMap map = extractAnnotationsAndSequence( unzipIfZipped( copyFileToDestination( inputFile )));
File sortedFile = sortFileColumns(map.annotations);
indexAnnotations(sortedFile);
indexFasta(map.fasta);
}
}
/**
 * Starts an external command, logs everything it prints and waits for it to finish.
 *
 * <p>The arguments are handed to the process as-is (no shell word splitting), so
 * paths containing whitespace are safe. Standard error is merged into standard
 * output and the merged stream is drained <em>before</em> waiting for the exit
 * code, so the child can never block on a full pipe.
 *
 * @param command the executable followed by its arguments
 * @return the exit code of the process
 * @throws IOException          if the process cannot be started or its output cannot be read
 * @throws InterruptedException if interrupted while waiting for the process to exit
 */
private int runCommand(String... command) throws IOException, InterruptedException {
logger.info(Arrays.toString(command));
ProcessBuilder builder = new ProcessBuilder(command);
builder.redirectErrorStream(true);
Process process = builder.start();
BufferedReader output = new BufferedReader(new InputStreamReader(process.getInputStream()));
try {
String line;
while ((line = output.readLine()) != null) {
logger.info(line);
}
} finally {
output.close();
}
return process.waitFor();
}
/**
 * Runs an external command whose success the following steps depend on.
 *
 * @param command the executable followed by its arguments
 * @throws IOException          if the command cannot be run or exits with a non-zero code
 * @throws InterruptedException if interrupted while waiting for the process to exit
 */
private void runRequiredCommand(String... command) throws IOException, InterruptedException {
int exitCode = runCommand(command);
if (exitCode != 0) {
throw new IOException("Command " + Arrays.toString(command) + " failed with exit code " + exitCode);
}
}
/**
 * Copies {@code file} into the output folder using {@code cp}.
 *
 * @param file the file to copy
 * @return the copy inside the output folder
 * @throws IOException          if the copy fails
 * @throws InterruptedException if interrupted while waiting for {@code cp}
 */
private File copyFileToDestination(File file) throws IOException, InterruptedException {
File destination = new File(outputFolder.getAbsoluteFile(), file.getName());
if (destination.getCanonicalFile().equals(file.getCanonicalFile())) {
// Input and output folder are the same: cp would refuse to copy a file onto
// itself, and the file is already where it needs to be.
return destination;
}
runRequiredCommand("cp", file.getAbsolutePath(), outputFolder.getAbsolutePath());
return destination;
}
/**
 * Unzips {@code file} in place with {@code gunzip} when its name ends with {@code .gz}.
 *
 * @param file the file to unzip
 * @return the unzipped file (same name without the {@code .gz} suffix), or
 *         {@code file} itself when it is not zipped
 * @throws IOException          if unzipping fails
 * @throws InterruptedException if interrupted while waiting for {@code gunzip}
 */
private File unzipIfZipped(File file) throws IOException, InterruptedException {
String name = file.getName();
if (! name.endsWith(".gz")) {
return file;
}
// -f: overwrite an unzipped file left behind by a previous run instead of
// refusing and silently leaving the stale data in place.
runRequiredCommand("gunzip", "-f", file.getAbsolutePath());
String unzippedName = name.substring(0, name.length() - 3);
return new File(file.getParentFile(), unzippedName);
}
/**
 * Splits a GFF file into an annotations file and a fasta file.
 *
 * <p>Lines starting with {@code #} are dropped. All other lines go to the
 * annotations file until a line containing {@code ##FASTA} is seen; the
 * remaining lines go to the fasta file. Both output files are created next to
 * {@code file}, named after it with the {@code .gff} extension replaced by
 * {@code .annotations} and {@code .fasta} respectively.
 *
 * @param file the (unzipped) GFF file
 * @return the two generated files
 * @throws IOException if the file cannot be read or the outputs cannot be written
 */
private GFFFileMap extractAnnotationsAndSequence(File file) throws IOException {
String name = file.getName();
String baseName = name.endsWith(GFF_EXTENSION)
? name.substring(0, name.length() - GFF_EXTENSION.length())
: name;
File fastaFile = new File(file.getParentFile(), baseName + ".fasta");
File annotationFile = new File(file.getParentFile(), baseName + ".annotations");
// Nested try/finally so that a failing close() cannot prevent the others from running.
BufferedReader reader = new BufferedReader(new FileReader(file.getAbsolutePath()));
try {
BufferedWriter annotationWriter = new BufferedWriter(new FileWriter(annotationFile));
try {
BufferedWriter fastaWriter = new BufferedWriter(new FileWriter(fastaFile));
try {
boolean parsingAnnotations = true;
String line;
while ((line = reader.readLine()) != null) {
if (line.contains("##FASTA")) {
parsingAnnotations = false;
}
if (line.startsWith("#")) {
continue;
}
BufferedWriter target = parsingAnnotations ? annotationWriter : fastaWriter;
target.write(line);
target.write('\n');
}
} finally {
fastaWriter.close();
}
} finally {
annotationWriter.close();
}
} finally {
reader.close();
}
GFFFileMap map = new GFFFileMap();
map.fasta = fastaFile;
map.annotations = annotationFile;
return map;
}
/**
 * Sorts the annotations by column 1, then numerically by column 4, and
 * compresses the result with bgzip.
 *
 * @param file the annotations file
 * @return the compressed file, {@code file} with {@code .gz} appended
 * @throws IOException          if sorting or compressing fails
 * @throws InterruptedException if interrupted while waiting for the pipeline
 */
private File sortFileColumns(File file) throws IOException, InterruptedException {
File compressed = new File(file.getAbsolutePath() + ".gz");
// A shell is needed for the pipe. The paths are passed as positional
// parameters ($1, $2) rather than spliced into the script, so whitespace or
// shell metacharacters in them are harmless. pipefail makes a failing sort
// visible in the exit code instead of only the exit code of bgzip.
String script = "set -o pipefail; sort -k1,1 -k4,4n \"$1\" | " + BGZIP + " > \"$2\"";
runRequiredCommand("/bin/bash", "-c", script, "bash", file.getAbsolutePath(), compressed.getAbsolutePath());
return compressed;
}
/**
 * Indexes the fasta file with {@code samtools faidx}.
 *
 * <p>A failing indexer is logged as a warning rather than thrown, so that the
 * remaining files are still processed.
 *
 * @param file the fasta file
 * @return the expected index file, {@code file} with {@code .fai} appended
 * @throws IOException          if samtools cannot be started
 * @throws InterruptedException if interrupted while waiting for samtools
 */
private File indexFasta(File file) throws IOException, InterruptedException {
logger.info("Indexing the fasta " + file.getName());
int exitCode = runCommand(SAMTOOLS, "faidx", file.getAbsolutePath());
if (exitCode != 0) {
logger.warn("Indexing the fasta " + file.getAbsolutePath() + " failed with exit code " + exitCode);
}
return new File(file.getAbsolutePath() + ".fai");
}
/**
 * Indexes the bgzip-compressed annotations with {@code tabix -p gff -f}.
 *
 * <p>Waits for tabix to finish before returning. A failing indexer is logged
 * as a warning rather than thrown, so that the remaining files are still
 * processed.
 *
 * @param file the compressed annotations file
 * @return the expected index file, {@code file} with {@code .tbi} appended
 * @throws IOException          if tabix cannot be started
 * @throws InterruptedException if interrupted while waiting for tabix
 */
private File indexAnnotations(File file) throws IOException, InterruptedException {
logger.info("Indexing the annotations " + file.getAbsolutePath());
int exitCode = runCommand(TABIX, "-p", "gff", "-f", file.getAbsolutePath());
if (exitCode != 0) {
logger.warn("Indexing the annotations " + file.getAbsolutePath() + " failed with exit code " + exitCode);
}
return new File(file.getAbsolutePath() + ".tbi");
}
/** Accepts files whose name ends with one of {@link #extensions}. */
static class GFFFileFilter implements FileFilter {
/** File name suffixes recognised as GFF files. */
public static final String[] extensions = {".gff.gz", ".gff"};
@Override
public boolean accept(File pathname) {
for (String extension : extensions) {
if (pathname.getName().endsWith(extension)) {
return true;
}
}
return false;
}
}
/**
 * Command line entry point.
 *
 * @param args exactly two elements: the input folder and the output folder
 * @throws Exception if the arguments are invalid or processing fails
 */
public static void main(String[] args) throws Exception {
if (args.length != 2) {
throw new IllegalArgumentException("Please supply an input folder and an output folder");
}
// All the work is done by the constructor.
new TabixGenerator(new File(args[0]), new File(args[1]));
}
}