package org.seqcode.data.core.tools;
import java.sql.*;
import java.io.*;
import org.seqcode.data.connections.ClobHandler;
import org.seqcode.data.connections.DatabaseConnectionManager;
import org.seqcode.data.connections.DatabaseSequence;
import org.seqcode.data.io.parsing.FASTAStream;
import org.seqcode.genome.Genome;
import org.seqcode.genome.Species;
import org.seqcode.gseutils.Args;
import org.seqcode.gseutils.NotFoundException;
import org.seqcode.gseutils.Pair;
/**
* Reads a FASTA file and creates the chromosomes and their sequence in the database.
* Leading "chr" is stripped off names in the FASTA file:
*
* AddChromosomes --species "$MM;mm8" --fasta mm8.fa
* AddChromosomes --species "$MM;mm8" < mm8.fa
*
* --incremental does writes in little chunks if your PreparedStatement.setString()
* can't handle a full chromosome sequence at once.
*/
public class AddChromosomes {
public static void main(String[] args) {
try {
boolean incrementalLoad = Args.parseFlags(args).contains("incremental");
if(!incrementalLoad) {
System.out.println("-> Bulk loading");
bulk_load(args);
} else {
System.out.println("-> Incremental Loading");
incremental_load(args);
}
} catch(IOException e) {
e.printStackTrace(System.err);
} catch(SQLException e) {
e.printStackTrace(System.err);
} catch(NotFoundException e) {
e.printStackTrace(System.err);
}
}
public static void incremental_load(String[] args) throws IOException, NotFoundException, SQLException {
Pair<Species,Genome> pair = Args.parseGenome(args);
Genome genome = pair.getLast();
String fastaname = Args.parseString(args,"fasta",null);
BufferedReader reader = null;
FASTAStream fasta;
if (fastaname == null) {
reader = new BufferedReader(new InputStreamReader(System.in));
} else {
reader = new BufferedReader(new FileReader(new File(fastaname)));
}
Connection cxn = DatabaseConnectionManager.getConnection("core");
boolean ac = cxn.getAutoCommit();
cxn.setAutoCommit(false);
PreparedStatement insertChrom =
cxn.prepareStatement("insert into chromosome(id, genome, name) values (" +
DatabaseSequence.getInsertSQL(cxn, "chromosome_id") + "," +
genome.getDBID() + ",?)");
PreparedStatement getChromID =
cxn.prepareStatement("select id from chromosome where genome = ? and name = ?");
PreparedStatement insertEmptySequence =
cxn.prepareStatement("insert into chromsequence(id,len,sequence) values (?,0,'')");
PreparedStatement appendSequence =
cxn.prepareStatement("update chromsequence " +
"set sequence=CONCAT(sequence,?) where id=?");
PreparedStatement updateLength =
cxn.prepareStatement("update chromsequence " +
"set len=length(sequence) where id=?");
String line = null;
int lastChromID = -1;
int length = 0;
int hashTick = 100000;
int nextHash = 0;
while((line = reader.readLine()) != null) {
line = line.trim();
if(line.length() > 0) {
if(line.startsWith(">")) {
String name = line.substring(1, line.length()).trim();
name = name.replaceFirst("^chr","");
try {
insertChrom.setString(1,name);
insertChrom.execute();
} catch (SQLException e) {
System.err.println("Didn't add chromosome " + name + ". Going to try sequence");
}
getChromID.setInt(1,genome.getDBID());
getChromID.setString(2,name);
ResultSet rs = getChromID.executeQuery();
rs.next();
lastChromID = rs.getInt(1);
rs.close();
insertEmptySequence.setInt(1,lastChromID);
insertEmptySequence.execute();
length = 0;
nextHash = hashTick;
System.out.println();
System.out.print(String.format("\"%s\"", name));
System.out.flush();
} else {
String seq = line;
//Append sequence and update length
//I could probably do both in one update,
//but I got worried about whether the sequence
//would be set by the time the length() operation executes
appendSequence.setString(1, seq);
appendSequence.setInt(2, lastChromID);
appendSequence.executeUpdate();
updateLength.setInt(1, lastChromID);
updateLength.executeUpdate();
length += seq.length();
if(length-seq.length() < nextHash && length >= nextHash) {
int lower = length-(length%hashTick);
System.out.print(String.format(" %dk", lower/1000));
System.out.flush();
nextHash = lower + hashTick;
}
}
}
}
System.out.println("\n-> Finished.");
System.out.println("-> Committing to DB.");
cxn.commit();
cxn.setAutoCommit(ac);
insertChrom.close();
getChromID.close();
insertEmptySequence.close();
appendSequence.close();
updateLength.close();
cxn.close();
}
public static void bulk_load(String args[]) throws IOException, NotFoundException, SQLException {
Pair<Species,Genome> pair = Args.parseGenome(args);
Genome genome = pair.getLast();
String fastaname = Args.parseString(args,"fasta",null);
FASTAStream fasta;
if (fastaname == null) {
fasta = new FASTAStream(new BufferedReader(new InputStreamReader(System.in)));
} else {
fasta = new FASTAStream(new File(fastaname));
}
Connection cxn = DatabaseConnectionManager.getConnection("core");
boolean ac = cxn.getAutoCommit();
cxn.setAutoCommit(false);
PreparedStatement insertChrom = cxn.prepareStatement("insert into chromosome(id, genome, name) values (" +
DatabaseSequence.getInsertSQL(cxn, "chromosome_id") + "," +
genome.getDBID() + ",?)");
PreparedStatement getChromID = cxn.prepareStatement("select id from chromosome where genome = ? and name = ?");
PreparedStatement insertSequence = cxn.prepareStatement("insert into chromsequence(id,len,sequence) values (?,?,?)");
PreparedStatement getSequence = cxn.prepareStatement("select sequence from chromsequence where id = ?");
while (fasta.hasNext()) {
Pair<String,String> next = fasta.next();
String name = next.car();
name = name.replaceFirst("^chr","");
String seq = next.cdr();
try {
insertChrom.setString(1,name);
insertChrom.execute();
} catch (SQLException e) {
System.err.println("Didn't add chromosome " + name + ". Going to try sequence");
}
getChromID.setInt(1,genome.getDBID());
getChromID.setString(2,name);
ResultSet rs = getChromID.executeQuery();
rs.next();
int id = rs.getInt(1);
rs.close();
insertSequence.setInt(1,id);
insertSequence.setInt(2,seq.length());
ClobHandler.setClob(cxn, insertSequence, 3, seq);
insertSequence.execute();
}
cxn.commit();
cxn.setAutoCommit(ac);
insertChrom.close();
getChromID.close();
insertSequence.close();
getSequence.close();
cxn.close();
}
}