package org.opencb.opencga.storage.core.alignment.local; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectWriter; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMSequenceRecord; import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.biodata.models.alignment.RegionCoverage; import org.opencb.biodata.models.core.Region; import org.opencb.biodata.tools.alignment.AlignmentOptions; import org.opencb.biodata.tools.alignment.BamManager; import org.opencb.biodata.tools.alignment.BamUtils; import org.opencb.biodata.tools.alignment.stats.AlignmentGlobalStats; import org.opencb.commons.utils.FileUtils; import org.opencb.opencga.storage.core.StoragePipeline; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.PrintWriter; import java.net.URI; import java.nio.ByteBuffer; import java.nio.file.Path; import java.nio.file.Paths; import java.sql.*; import java.util.*; /** * Created by pfurio on 31/10/16. */ public class LocalAlignmentStoragePipeline implements StoragePipeline { private static final String COVERAGE_SUFFIX = ".coverage"; private static final String COVERAGE_DATABASE_NAME = "coverage.db"; private static final int MINOR_CHUNK_SIZE = 1000; public LocalAlignmentStoragePipeline() { super(); } @Override public URI extract(URI input, URI ouput) throws StorageEngineException { return input; } @Override public URI preTransform(URI input) throws IOException, FileFormatException, StorageEngineException { // Check if a BAM file is passed and it is sorted. // Only binaries and sorted BAM files are accepted at this point. Path inputPath = Paths.get(input.getRawPath()); BamUtils.checkBamOrCramFile(new FileInputStream(inputPath.toFile()), inputPath.getFileName().toString(), true); return input; } @Override public URI transform(URI input, URI pedigree, URI output) throws Exception { Path path = Paths.get(input.getRawPath()); FileUtils.checkFile(path); Path workspace = Paths.get(output.getRawPath()); FileUtils.checkDirectory(workspace); // 1. Check if the bai does not exist and create it BamManager bamManager = new BamManager(path); if (!path.getParent().resolve(path.getFileName().toString() + ".bai").toFile().exists()) { bamManager.createIndex(); } // 2. Calculate stats and store in a file Path statsPath = workspace.resolve(path.getFileName() + ".stats"); if (!statsPath.toFile().exists()) { AlignmentGlobalStats stats = bamManager.stats(); ObjectMapper objectMapper = new ObjectMapper(); ObjectWriter objectWriter = objectMapper.writerFor(AlignmentGlobalStats.class); objectWriter.writeValue(statsPath.toFile(), stats); } // 3. Calculate coverage and store in SQLite SAMFileHeader fileHeader = BamUtils.getFileHeader(path); // long start = System.currentTimeMillis(); initDatabase(fileHeader.getSequenceDictionary().getSequences(), workspace); // System.out.println("SQLite database initialization, in " + ((System.currentTimeMillis() - start) / 1000.0f) // + " s."); Path coveragePath = workspace.toAbsolutePath().resolve(path.getFileName() + COVERAGE_SUFFIX); AlignmentOptions options = new AlignmentOptions(); options.setContained(false); Iterator<SAMSequenceRecord> iterator = fileHeader.getSequenceDictionary().getSequences().iterator(); PrintWriter writer = new PrintWriter(coveragePath.toFile()); StringBuilder line; // start = System.currentTimeMillis(); while (iterator.hasNext()) { SAMSequenceRecord next = iterator.next(); for (int i = 0; i < next.getSequenceLength(); i += MINOR_CHUNK_SIZE) { Region region = new Region(next.getSequenceName(), i + 1, Math.min(i + MINOR_CHUNK_SIZE, next.getSequenceLength())); RegionCoverage regionCoverage = bamManager.coverage(region, null, options); int meanDepth = Math.min(regionCoverage.meanCoverage(), 255); // File columns: chunk chromosome start end coverage // chunk format: chrom_id_suffix, where: // id: int value starting at 0 // suffix: chunkSize + k // eg. 3_4_1k line = new StringBuilder(); line.append(region.getChromosome()).append("_"); line.append(i / MINOR_CHUNK_SIZE).append("_").append(MINOR_CHUNK_SIZE / 1000).append("k"); line.append("\t").append(region.getChromosome()); line.append("\t").append(region.getStart()); line.append("\t").append(region.getEnd()); line.append("\t").append(meanDepth); writer.println(line.toString()); } } writer.close(); // System.out.println("Mean coverage file creation, in " + ((System.currentTimeMillis() - start) / 1000.0f) + " s."); // save file to db // start = System.currentTimeMillis(); insertCoverageDB(path, workspace); // System.out.println("SQLite database population, in " + ((System.currentTimeMillis() - start) / 1000.0f) + " s."); return input; } @Override public URI postTransform(URI input) throws Exception { return input; } @Override public URI preLoad(URI input, URI output) throws IOException, StorageEngineException { return null; } @Override public URI load(URI input) throws IOException, StorageEngineException { return null; } @Override public URI postLoad(URI input, URI output) throws IOException, StorageEngineException { return null; } private void initDatabase(List<SAMSequenceRecord> sequenceRecordList, Path workspace) { Path coverageDBPath = workspace.toAbsolutePath().resolve(COVERAGE_DATABASE_NAME); if (!coverageDBPath.toFile().exists()) { Statement stmt; try { Class.forName("org.sqlite.JDBC"); Connection connection = DriverManager.getConnection("jdbc:sqlite:" + coverageDBPath); // Create tables stmt = connection.createStatement(); String sql = "CREATE TABLE chunk " + "(id INTEGER PRIMARY KEY AUTOINCREMENT," + "chunk_id VARCHAR NOT NULL," + "chromosome VARCHAR NOT NULL, " + "start INT NOT NULL, " + "end INT NOT NULL); " + "CREATE UNIQUE INDEX chunk_id_idx ON chunk (chunk_id);" + "CREATE INDEX chrom_start_end_idx ON chunk (chromosome, start, end);"; stmt.executeUpdate(sql); sql = "CREATE TABLE file " + "(id INTEGER PRIMARY KEY AUTOINCREMENT," + "path VARCHAR NOT NULL," + "name VARCHAR NOT NULL);" + "CREATE UNIQUE INDEX path_idx ON file (path);"; stmt.executeUpdate(sql); sql = "CREATE TABLE mean_coverage " + "(chunk_id INTEGER," + "file_id INTEGER," + "v1 INTEGER, " + "v2 INTEGER, " + "v3 INTEGER, " + "v4 INTEGER, " + "v5 INTEGER, " + "v6 INTEGER, " + "v7 INTEGER, " + "v8 INTEGER," + "PRIMARY KEY(chunk_id, file_id));"; stmt.executeUpdate(sql); // Insert all the chunks String minorChunkSuffix = (MINOR_CHUNK_SIZE / 1000) * 64 + "k"; PreparedStatement insertChunk = connection.prepareStatement("insert into chunk (chunk_id, chromosome, start, end) " + "values (?, ?, ?, ?)"); connection.setAutoCommit(false); for (SAMSequenceRecord samSequenceRecord : sequenceRecordList) { String chromosome = samSequenceRecord.getSequenceName(); int sequenceLength = samSequenceRecord.getSequenceLength(); int cont = 0; for (int i = 0; i < sequenceLength; i += 64 * MINOR_CHUNK_SIZE) { String chunkId = chromosome + "_" + cont + "_" + minorChunkSuffix; insertChunk.setString(1, chunkId); insertChunk.setString(2, chromosome); insertChunk.setInt(3, i + 1); insertChunk.setInt(4, i + 64 * MINOR_CHUNK_SIZE); insertChunk.addBatch(); cont++; } insertChunk.executeBatch(); } connection.commit(); stmt.close(); connection.close(); } catch (Exception e) { System.err.println(e.getClass().getName() + ": " + e.getMessage()); System.exit(0); } System.out.println("Opened database successfully"); } } private void insertCoverageDB(Path bamPath, Path workspace) throws IOException { FileUtils.checkFile(bamPath); String absoluteBamPath = bamPath.toFile().getAbsolutePath(); Path coveragePath = workspace.toAbsolutePath().resolve(bamPath.getFileName() + COVERAGE_SUFFIX); String fileName = bamPath.toFile().getName(); Path coverageDBPath = workspace.toAbsolutePath().resolve("coverage.db"); try { // Insert into file table Class.forName("org.sqlite.JDBC"); Connection connection = DriverManager.getConnection("jdbc:sqlite:" + coverageDBPath); Statement stmt = connection.createStatement(); String insertFileSql = "insert into file (path, name) values ('" + absoluteBamPath + "', '" + fileName + "');"; stmt.executeUpdate(insertFileSql); stmt.close(); ResultSet rs = stmt.executeQuery("SELECT id FROM file where path = '" + absoluteBamPath + "';"); int fileId = -1; while (rs.next()) { fileId = rs.getInt("id"); } if (fileId != -1) { Map chunkIdMap = new HashMap<String, Integer>(); String sql = "SELECT id, chromosome, start FROM chunk"; // System.out.println(sql); rs = stmt.executeQuery(sql); while (rs.next()) { chunkIdMap.put(rs.getString("chromosome") + "_" + rs.getInt("start"), rs.getInt("id")); } // Iterate file PreparedStatement insertCoverage = connection.prepareStatement("insert into mean_coverage (chunk_id, " + " file_id, v1, v2, v3, v4, v5, v6, v7, v8) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"); connection.setAutoCommit(false); BufferedReader bufferedReader = FileUtils.newBufferedReader(coveragePath); // Checkstyle plugin is not happy with assignations inside while/for int chunkId = -1; byte[] meanCoverages = new byte[8]; // contains 8 coverages long[] packedCoverages = new long[8]; // contains 8 x 8 coverages int counter1 = 0; // counter for 8-byte mean coverages array int counter2 = 0; // counter for 8-long packed coverages array String prevChromosome = null; String line = bufferedReader.readLine(); while (line != null) { String[] fields = line.split("\t"); if (prevChromosome == null) { prevChromosome = fields[1]; System.out.println("Processing chromosome " + prevChromosome + "..."); } else if (!prevChromosome.equals(fields[1])) { // we have to write the current results into the DB if (counter1 > 0 || counter2 > 0) { packedCoverages[counter2] = bytesToLong(meanCoverages); insertPackedCoverages(insertCoverage, chunkId, fileId, packedCoverages); } prevChromosome = fields[1]; System.out.println("Processing chromosome " + prevChromosome + "..."); // reset arrays, counters,... Arrays.fill(meanCoverages, (byte) 0); Arrays.fill(packedCoverages, 0); counter2 = 0; counter1 = 0; chunkId = -1; } if (chunkId == -1) { String key = fields[1] + "_" + fields[2]; if (chunkIdMap.containsKey(key)) { chunkId = (int) chunkIdMap.get(key); } else { throw new SQLException("Internal error: coverage chunk " + fields[1] + ":" + fields[2] + "-, not found in database"); } } meanCoverages[counter1] = (byte) Integer.parseInt(fields[4]); if (++counter1 == 8) { // packed mean coverages and save into the packed coverages array packedCoverages[counter2] = bytesToLong(meanCoverages); if (++counter2 == 8) { // write packed coverages array to DB insertPackedCoverages(insertCoverage, chunkId, fileId, packedCoverages); // reset packed coverages array and counter2 Arrays.fill(packedCoverages, 0); counter2 = 0; chunkId = -1; } // reset mean coverages array and counter1 counter1 = 0; Arrays.fill(meanCoverages, (byte) 0); } line = bufferedReader.readLine(); } bufferedReader.close(); if (counter1 > 0 || counter2 > 0) { packedCoverages[counter2] = bytesToLong(meanCoverages); insertPackedCoverages(insertCoverage, chunkId, fileId, packedCoverages); } // insert batch to the DB insertCoverage.executeBatch(); } connection.commit(); stmt.close(); connection.close(); } catch (ClassNotFoundException e) { e.printStackTrace(); } catch (SQLException e) { e.printStackTrace(); } } private void insertPackedCoverages(PreparedStatement insertCoverage, int chunkId, int fileId, long[] packedCoverages) throws SQLException { assert(chunkId != -1); insertCoverage.setInt(1, chunkId); insertCoverage.setInt(2, fileId); for (int i = 0; i < 8; i++) { insertCoverage.setLong(i + 3, packedCoverages[i]); } insertCoverage.addBatch(); } private long bytesToLong(byte[] bytes) { ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES); buffer.put(bytes); buffer.flip(); // need flip return buffer.getLong(); } }