/*
The MIT License (MIT)
Copyright (c) 2014 Pierre Lindenbaum
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
History:
* 2015 : creation
*/
package com.github.lindenb.jvarkit.tools.misc;
import java.io.File;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFlag;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.Interval;
import com.beust.jcommander.Parameter;
import com.github.lindenb.jvarkit.io.IOUtils;
import com.github.lindenb.jvarkit.lang.JvarkitException;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
import com.github.lindenb.jvarkit.util.log.Logger;
import com.github.lindenb.jvarkit.util.picard.GenomicSequence;
import com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress;
import com.github.lindenb.jvarkit.util.samtools.SAMSequenceDictionaryHelper;
/**
BEGIN_DOC
## Motivation
Inserting a BAM in a SQL is not a good idea of course !
But it might be interesting to get some informations about the bases in a segment of bam.
## Schema
The schema can change if some options (-c , -f) are used.
At the time of writing the schema is :
```sql
CREATE TABLE IF NOT EXISTS SamFile
(
id INTEGER PRIMARY KEY,
filename TEXT
);
CREATE TABLE IF NOT EXISTS Dictionary
(
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
length INT NOT NULL,
tid INT NOT NULL,
samfile_id INT NOT NULL,
FOREIGN KEY(samfile_id) REFERENCES SamFile(id)
);
CREATE TABLE IF NOT EXISTS ReadGroup
(
id INTEGER PRIMARY KEY,
groupId TEXT NOT NULL,
sample TEXT NOT NULL,
samfile_id INT NOT NULL,
FOREIGN KEY(samfile_id) REFERENCES SamFile(id)
);
CREATE TABLE IF NOT EXISTS Read
(
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
flag INTEGER NOT NULL,
rname TEXT,
pos INTEGER,
mapq INTEGER NOT NULL,
cigar TEXT,
rnext TEXT,
pnext INTEGER,
tlen INTEGER,
sequence TEXT NOT NULL,
qualities TEXT NOT NULL,
samfile_id INT NOT NULL,
group_id INT,
FOREIGN KEY(samfile_id) REFERENCES SamFile(id),
FOREIGN KEY(group_id) REFERENCES ReadGroup(id)
);
CREATE TABLE IF NOT EXISTS Cigar
(
id INTEGER PRIMARY KEY,
read_pos INT ,
read_base TEXT,
read_qual INT ,
ref_pos INT ,
ref_base TEXT,
operator TEXT NOT NULL,
read_id INT NOT NULL,
FOREIGN KEY(read_id) REFERENCES Read(id)
);
```
## Example
Build a sqlite3 database for a set of BAM files in the region "rotavirus:1-10""
```
$java -jar dist/bam2sql.jar -r 'rotavirus:1-10' -R ref.fa -c S*.bam |\
sqlite3 database.sqlite
```
Select data from sqlite database where the genomic position is "rotavirus:5"
```sql
select SamFile.filename,
ReadGroup.sample,
Read.flag,
Read.rname,
Cigar.operator,
Cigar.read_pos,
Cigar.read_base,
Cigar.read_qual,
Cigar.ref_pos,
Cigar.ref_base
from
SamFile,Read,Cigar,ReadGroup
where
SamFile.id = Read.samfile_id AND
ReadGroup.id = Read.group_id AND
Cigar.read_id = Read.id and
Read.rname = "rotavirus" and
Cigar.ref_pos= 5
;
```
query:
```
$ sqlite3 -header -separator ' ' database.sqlite < query.sql | column -t
```
output:
```
filename sample flag rname operator read_pos read_base read_qual ref_pos ref_base
S1.bam S1 99 rotavirus M 4 T 10 5 T
S1.bam S1 163 rotavirus M 4 T 10 5 T
S1.bam S1 163 rotavirus M 4 T 10 5 T
S1.bam S1 99 rotavirus M 4 T 10 5 T
S1.bam S1 163 rotavirus M 4 T 10 5 T
S1.bam S1 99 rotavirus M 3 T 10 5 T
S1.bam S1 99 rotavirus M 3 T 10 5 T
S1.bam S1 99 rotavirus M 3 T 10 5 T
S1.bam S1 99 rotavirus M 3 T 10 5 T
S1.bam S1 163 rotavirus M 3 T 10 5 T
S1.bam S1 163 rotavirus M 3 T 10 5 T
S1.bam S1 163 rotavirus M 3 T 10 5 T
S1.bam S1 99 rotavirus M 3 T 10 5 T
S1.bam S1 163 rotavirus M 3 T 10 5 T
S1.bam S1 99 rotavirus M 2 T 10 5 T
S1.bam S1 99 rotavirus M 2 T 10 5 T
S1.bam S1 163 rotavirus M 2 T 10 5 T
S1.bam S1 163 rotavirus M 2 T 10 5 T
S1.bam S1 99 rotavirus M 1 T 10 5 T
S1.bam S1 99 rotavirus M 1 T 10 5 T
S1.bam S1 163 rotavirus M 1 T 10 5 T
S1.bam S1 163 rotavirus M 1 T 10 5 T
S1.bam S1 163 rotavirus M 4 T 10 5 T
S1.bam S1 163 rotavirus M 0 T 10 5 T
S1.bam S1 163 rotavirus M 0 T 10 5 T
S1.bam S1 163 rotavirus M 0 T 10 5 T
S1.bam S1 163 rotavirus M 0 T 10 5 T
S1.bam S1 99 rotavirus S 3 T 10 5 T
S1.bam S1 163 rotavirus S 0 T 10 5 T
S1.bam S1 99 rotavirus S 3 T 10 5 T
S2.bam S2 99 rotavirus M 4 T 10 5 T
S2.bam S2 163 rotavirus M 4 T 10 5 T
S2.bam S2 99 rotavirus M 4 T 10 5 T
S2.bam S2 99 rotavirus M 4 T 10 5 T
S2.bam S2 163 rotavirus M 4 T 10 5 T
S2.bam S2 163 rotavirus M 3 T 10 5 T
S2.bam S2 99 rotavirus M 3 T 10 5 T
S2.bam S2 99 rotavirus M 3 T 10 5 T
S2.bam S2 99 rotavirus M 3 T 10 5 T
S2.bam S2 99 rotavirus M 2 A 10 5 T
S2.bam S2 163 rotavirus M 2 T 10 5 T
S2.bam S2 99 rotavirus M 1 T 10 5 T
S2.bam S2 99 rotavirus M 1 T 10 5 T
S2.bam S2 163 rotavirus M 3 T 10 5 T
S2.bam S2 99 rotavirus M 1 T 10 5 T
S2.bam S2 99 rotavirus M 1 T 10 5 T
S2.bam S2 99 rotavirus M 0 T 10 5 T
S2.bam S2 99 rotavirus M 0 T 10 5 T
S2.bam S2 163 rotavirus S 4 T 10 5 T
S2.bam S2 99 rotavirus S 2 A 10 5 T
S3.bam S3 99 rotavirus M 4 A 10 5 T
S3.bam S3 163 rotavirus M 4 T 10 5 T
S3.bam S3 99 rotavirus M 4 T 10 5 T
S3.bam S3 99 rotavirus M 3 T 10 5 T
S3.bam S3 99 rotavirus M 3 T 10 5 T
S3.bam S3 99 rotavirus M 3 T 10 5 T
S3.bam S3 163 rotavirus M 3 T 10 5 T
S3.bam S3 163 rotavirus M 3 T 10 5 T
S3.bam S3 163 rotavirus M 2 T 10 5 T
S3.bam S3 163 rotavirus M 2 T 10 5 T
S3.bam S3 99 rotavirus M 2 T 10 5 T
S3.bam S3 163 rotavirus M 2 T 10 5 T
S3.bam S3 99 rotavirus M 1 T 10 5 T
S3.bam S3 163 rotavirus M 1 A 10 5 T
S3.bam S3 99 rotavirus M 1 A 10 5 T
S3.bam S3 99 rotavirus M 1 A 10 5 T
S3.bam S3 99 rotavirus M 1 T 10 5 T
S3.bam S3 163 rotavirus M 1 T 10 5 T
S3.bam S3 99 rotavirus M 0 T 10 5 T
S3.bam S3 163 rotavirus M 0 T 10 5 T
S3.bam S3 163 rotavirus M 0 T 10 5 T
S3.bam S3 163 rotavirus M 0 T 10 5 T
S3.bam S3 163 rotavirus M 0 A 10 5 T
S3.bam S3 99 rotavirus M 0 T 10 5 T
S3.bam S3 163 rotavirus M 0 T 10 5 T
S3.bam S3 99 rotavirus S 2 A 10 5 T
S4.bam S4 163 rotavirus M 4 T 10 5 T
S4.bam S4 163 rotavirus M 4 T 10 5 T
S4.bam S4 99 rotavirus M 4 T 10 5 T
S4.bam S4 163 rotavirus M 4 T 10 5 T
S4.bam S4 163 rotavirus M 3 T 10 5 T
S4.bam S4 163 rotavirus M 3 T 10 5 T
S4.bam S4 99 rotavirus M 3 T 10 5 T
S4.bam S4 163 rotavirus M 2 T 10 5 T
S4.bam S4 99 rotavirus M 1 T 10 5 T
S4.bam S4 99 rotavirus M 0 T 10 5 T
S4.bam S4 99 rotavirus M 4 T 10 5 T
S4.bam S4 163 rotavirus M 0 T 10 5 T
S4.bam S4 163 rotavirus M 0 T 10 5 T
```
END_DOC
*/
@Program(name="bam2sql",description="Convert a SAM/BAM to sqlite statements",keywords={"bam","sam","sql","sqlite"})
public class BamToSql
extends Launcher
{
private static final Logger LOG = Logger.build(BamToSql.class).make();
@Parameter(names={"-o","--out"},description="Output file or stdout")
private File outputFile = null;
@Parameter(names={"-r","--region"},description="Restrict to a given region")
private String regionStr = "";
@Parameter(names={"-c","--cigar"},description="print cigar data")
private boolean printcigar = false;
@Parameter(names={"-f","--flag"},description="expands details about sam flag")
private boolean printflag = false;
@Parameter(names={"-R","--reference"},description="The Reference fasta file",required=true)
private File faidxFile=null;
public BamToSql()
{
}
private String quote(final String s ){
if(s==null) return "NULL";
final StringBuilder sb=new StringBuilder(s.length()+2);
sb.append("'");
sb.append(s);
sb.append("'");
return sb.toString();
}
@Override
public int doWork(List<String> args) {
if(this.faidxFile==null) {
LOG.error("ref sequence faidx not defined");
return -1;
}
SAMRecordIterator iter=null;
SamReader sfr=null;
PrintWriter out =null;
GenomicSequence genomicSequence=null;
IndexedFastaSequenceFile indexedFastaSequenceFile=null;
args = new ArrayList<String>(IOUtils.unrollFiles(args));
try
{
out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
indexedFastaSequenceFile=new IndexedFastaSequenceFile(this.faidxFile);
out.println("CREATE TABLE IF NOT EXISTS SamFile");
out.println("(");
out.println("id INTEGER PRIMARY KEY,");
out.println("filename TEXT");
out.println(");");
out.println("CREATE TABLE IF NOT EXISTS Dictionary");
out.println("(");
out.println("id INTEGER PRIMARY KEY,");
out.println("name TEXT NOT NULL,");
out.println("length INT NOT NULL,");
out.println("tid INT NOT NULL,");
out.println("samfile_id INT NOT NULL,");
out.println("FOREIGN KEY(samfile_id) REFERENCES SamFile(id)");
out.println(");");
out.println("CREATE TABLE IF NOT EXISTS ReadGroup");
out.println("(");
out.println("id INTEGER PRIMARY KEY,");
out.println("groupId TEXT NOT NULL,");
out.println("sample TEXT NOT NULL,");
out.println("samfile_id INT NOT NULL,");
out.println("FOREIGN KEY(samfile_id) REFERENCES SamFile(id)");
out.println(");");
out.println("CREATE TABLE IF NOT EXISTS Read");
out.println("(");
out.println("id INTEGER PRIMARY KEY,");
out.println("name TEXT NOT NULL,");
out.println("flag INTEGER NOT NULL,");
if(this.printflag){
for(final SAMFlag flg: SAMFlag.values()) {
out.println(flg.name()+" INTEGER NOT NULL,");
}
}
out.println("rname TEXT,");
out.println("pos INTEGER,");
out.println("mapq INTEGER NOT NULL,");
out.println("cigar TEXT,");
out.println("rnext TEXT,");
out.println("pnext INTEGER,");
out.println("tlen INTEGER,");
out.println("sequence TEXT NOT NULL,");
out.println("qualities TEXT NOT NULL,");
out.println("samfile_id INT NOT NULL,");
out.println("group_id INT,");
out.println("FOREIGN KEY(samfile_id) REFERENCES SamFile(id),");
out.println("FOREIGN KEY(group_id) REFERENCES ReadGroup(id)");
out.println(");");
out.println("CREATE TABLE IF NOT EXISTS Cigar");
out.println("(");
out.println("id INTEGER PRIMARY KEY,");
out.println("read_pos INT ,");
out.println("read_base TEXT,");
out.println("read_qual INT ,");
out.println("ref_pos INT ,");
out.println("ref_base TEXT,");
out.println("operator TEXT NOT NULL,");
out.println("read_id INT NOT NULL,");
out.println("FOREIGN KEY(read_id) REFERENCES Read(id)");
out.println(");");
out.println("begin transaction;");
int samIndex=0;
do {
final String inputName;
if(samIndex==0 && args.isEmpty()) {
sfr = openSamReader(null);
inputName="<stdin>";
}
else
{
inputName= args.get(samIndex);
sfr = openSamReader(inputName);
}
final SAMFileHeader header1=sfr.getFileHeader();
if(header1==null)
{
throw new JvarkitException.FileFormatError("File header missing");
}
final SAMSequenceDictionary dict=header1.getSequenceDictionary();
if(dict==null) {
throw new JvarkitException.DictionaryMissing("No Dictionary in input");
}
final SAMSequenceDictionaryHelper dix = new SAMSequenceDictionaryHelper(dict);
final Interval userInterval;
iter= null;
if(this.regionStr==null || this.regionStr.isEmpty()) {
LOG.warn("You're currently scanning the whole BAM ???!!!");
iter = sfr.iterator();
userInterval = null;
}
else
{
final Optional<Interval> interval = dix.parseInterval(this.regionStr);
if(!interval.isPresent()) {
throw new JvarkitException.UserError("cannot parse interval "+this.regionStr);
}
userInterval = interval.get();
iter = sfr.query(
interval.get().getContig(),
interval.get().getStart(),
interval.get().getEnd(),
false
);
}
out.println(String.join(" ",
"insert into SamFile(filename) values(",
quote(inputName),
");"
));
for(int i=0;i< dict.size();++i) {
final SAMSequenceRecord ssr = dict.getSequence(i);
out.println(
"insert into Dictionary(name,length,tid,samfile_id) select "+
quote(inputName) + ","+
ssr.getSequenceLength()+","+
i+",max(id) from SamFile;"
);
}
for(final SAMReadGroupRecord g:header1.getReadGroups()){
out.println(
"insert into ReadGroup(groupId,sample,samfile_id) select "+
quote(g.getId()) + ","+
quote(g.getSample())+","+
"max(id) from SamFile;"
);
}
final SAMSequenceDictionaryProgress progress=new SAMSequenceDictionaryProgress(header1);
while(iter.hasNext())
{
final SAMRecord rec= progress.watch(iter.next());
final StringBuilder sql = new StringBuilder();
sql.append("insert into Read("
+ "name,flag,");
if(this.printflag){
for(final SAMFlag flg: SAMFlag.values()) {
sql.append(flg.name()).append(",");
}
}
sql.append("rname,pos,mapq,cigar,rnext,pnext,tlen,sequence,qualities,group_id,samfile_id) select ");
sql.append(quote(rec.getReadName())).append(",");
sql.append(rec.getFlags()).append(",");
if(this.printflag){
for(final SAMFlag flg: SAMFlag.values()) {
sql.append(flg.isSet(rec.getFlags())?1:0);
sql.append(",");
}
}
if(rec.getReferenceName()==null || rec.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
sql.append("NULL,NULL");
} else
{
sql.append(quote(rec.getReferenceName()));
sql.append(",");
sql.append(rec.getAlignmentStart());
}
sql.append(",");
sql.append(rec.getMappingQuality());
sql.append(",");
//cigar
if(rec.getCigarString()==null || rec.getCigarString().equals(SAMRecord.NO_ALIGNMENT_CIGAR)) {
sql.append("NULL");
} else
{
sql.append(quote(rec.getCigarString()));
}
sql.append(",");
//rnext
if(rec.getMateReferenceName()==null || rec.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
sql.append("NULL,NULL");
} else
{
sql.append(quote(rec.getMateReferenceName()));
sql.append(",");
sql.append(rec.getMateAlignmentStart());
}
sql.append(",");
//tlen
sql.append(rec.getInferredInsertSize());
sql.append(",");
//sequence
sql.append(quote(rec.getReadString()));
sql.append(",");
//qualities
sql.append(quote(rec.getBaseQualityString()));
sql.append(",");
if(rec.getReadGroup()==null)
{
sql.append("NULL");
}
else
{
sql.append("G.id");
}
sql.append(",F.id FROM SamFile as F");
if(rec.getReadGroup()!=null)
{
sql.append(" , ReadGroup as G where G.groupId=").
append(quote(rec.getReadGroup().getId())).
append(" and F.id = G.samfile_id ");
}
sql.append(" ORDER BY F.id DESC LIMIT 1;");
out.println(sql.toString());
if(this.printcigar && !rec.getReadUnmappedFlag() && rec.getCigar()!=null) {
if(genomicSequence==null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
int ref = rec.getUnclippedStart();
final byte bases[]=rec.getReadBases();
final byte quals[]=rec.getBaseQualities();
int read = 0;
for(final CigarElement ce:rec.getCigar()) {
final CigarOperator op=ce.getOperator();
if(op.equals(CigarOperator.P)) continue;
for(int i=0;i< ce.getLength();++i) {
sql.setLength(0);
boolean in_user_interval=true;
sql.append("insert into Cigar(operator,read_pos,read_base,read_qual,ref_pos,ref_base,read_id) ");
sql.append("select '");
sql.append(op.name());
sql.append("',");
if(userInterval!=null &&
!(rec.getReferenceName().equals(userInterval.getContig()) &&
ref>=userInterval.getStart() && ref<=userInterval.getEnd()))
{
in_user_interval = false;
}
switch(op){
case I: {
sql.append(read);
sql.append(",");
sql.append("'"+(char)bases[read]+"',");
sql.append(""+quals[read]+"");
sql.append(",");
sql.append("NULL,NULL");
read++;
break;
}
case D:case N:case H://yes H (hard clip)
{
sql.append("NULL,NULL,NULL,");
sql.append(ref);
sql.append(",'");
sql.append((ref<1 || ref-1>=genomicSequence.length())?'*':genomicSequence.charAt(ref-1));
sql.append("'");
ref++;
break;
}
case M:case X:case EQ:case S: //yes S, soft clip
{
sql.append(read);
sql.append(",");
sql.append("'"+(char)bases[read]+"',");
sql.append(""+quals[read]+"");
sql.append(",");
sql.append(ref);
sql.append(",'");
sql.append((ref<1 || ref-1>=genomicSequence.length())?'*':genomicSequence.charAt(ref-1));
sql.append("'");
ref++;
read++;
break;
}
default: throw new IllegalStateException();
}
sql.append(", id from Read ORDER BY id DESC LIMIT 1;");
if(in_user_interval) out.println(sql.toString());
}
}
}
}
iter.close();iter=null;
sfr.close();sfr=null;
progress.finish();
samIndex++;
} while(samIndex< args.size());
out.println("COMMIT;");
out.flush();
out.close();
LOG.info("done");
return 0;
}
catch(Exception err)
{
LOG.error(err);
return -1;
}
finally
{
CloserUtil.close(iter);
CloserUtil.close(sfr);
CloserUtil.close(out);
CloserUtil.close(indexedFastaSequenceFile);
}
}
/**
* @param args
*/
public static void main(String[] args)
{
new BamToSql().instanceMainWithExit(args);
}
}