/* * Copyright 2015-2016 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.opencga.storage.core.sequence; import org.opencb.biodata.formats.io.FileFormatException; import org.opencb.biodata.formats.sequence.fasta.Fasta; import org.opencb.biodata.formats.sequence.fasta.dbadaptor.SequenceDBAdaptor; import org.opencb.biodata.formats.sequence.fasta.io.FastaReader; import org.opencb.biodata.models.core.Region; import org.opencb.opencga.core.common.XObject; import org.opencb.opencga.storage.core.utils.SqliteManager; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.sql.SQLException; import java.util.List; /** * Created by jacobo on 18/08/14. */ public class SqliteSequenceDBAdaptor extends SequenceDBAdaptor { public static final String SEQUENCE_TABLE = "SEQUENCE"; public static final String META_TABLE = "META"; private static final int CHUNK_SIZE = 2000; private Path dbPath; //private Path fastaPath; private SqliteManager sqliteManager; private int chunkStart; public SqliteSequenceDBAdaptor() { sqliteManager = new SqliteManager(); } public SqliteSequenceDBAdaptor(Path input) { this(); if (input.toString().endsWith(".fasta") || input.toString().endsWith(".fasta.gz")) { //createDB(input); throw new UnsupportedOperationException("Unimplemented. Needs to call \"this.createDB()\" first."); //TODO: Search db? } else if (input.toString().endsWith(".properties")) { throw new UnsupportedOperationException("Unimplemented"); } else if (input.toString().endsWith(".sqlite.db")) { dbPath = input; } } @Override public void open() throws IOException { try { sqliteManager.connect(dbPath, true); } catch (ClassNotFoundException | SQLException e) { throw new IOException(e); } } @Override public void close() throws IOException { try { sqliteManager.disconnect(true); } catch (SQLException e) { throw new IOException(e); } } /** * Creates the ChunkId for SQLite. * * @param chromosome Region name or chromosome * @param pos Absolute position 1-based * @return ChunkID. <chromosome>_<pos/CHUNK_SIZE> */ private String getChunkId(String chromosome, int pos) { return String.format("%s_%06d", chromosome, (pos - 1) / CHUNK_SIZE); } /** * Returns the sequence stored in the DB of a given region. * The stored sequence will be interpreted as 1-based. * e.g. The first 10 elements will correspond to the region 1-10 * * @param region Region requested. * @return Sequence 1-based for [region.start, region.end] * @throws IOException An exception if file not found */ @Override public String getSequence(Region region) throws IOException { /* * [0-1999],[2000,3999],[4000,5999] ==> 0-based * [1-2000],[2001,4000],[4001,6000] ==> 1-based */ /* * A B C * |----|----|----|----|----| == seq * |-------------------| == region * D E * A : chunkStart * CHUNK_SIZE * B : chunkEnd * CHUNK_SIZE * C : (chunkEnd+1) * CHUNK_SIZE * D : region.getStart() * E : region.getEnd() */ List<XObject> query; int chunkStart = (region.getStart() - 1) / CHUNK_SIZE; int chunkEnd = (region.getEnd() - 1) / CHUNK_SIZE; int regionLength = region.getEnd() - region.getStart() + 1; //+1 to include last position. [start-end] if (regionLength <= 0) { return ""; //Reject bad regions. } try { // query = sqliteManager.query( // "SELECT seq FROM " + SEQUENCE_TABLE + // " WHERE id IS " + region.getChromosome() + // " AND chunk BETWEEN " + chunkStart + " AND " + chunkEnd ); query = sqliteManager.query( "SELECT seq FROM " + SEQUENCE_TABLE + " WHERE id BETWEEN '" + getChunkId(region.getChromosome(), region.getStart()) + "'" + " AND '" + getChunkId(region.getChromosome(), region.getEnd()) + "'"); } catch (SQLException e) { throw new IOException(e); } String seq = ""; for (XObject xo : query) { seq += xo.getString("seq"); } int startIndex = (region.getStart() - 1) - chunkStart * CHUNK_SIZE; // D - A int endIndex = startIndex + regionLength; // //System.out.println(0 + " - " + startIndex + " - " + endIndex + " - " + seq.length()); //System.out.println(chunkStart * CHUNK_SIZE + " - " + region.getStart() + " - " + region.getEnd() + " - " + (chunkEnd+1) * // CHUNK_SIZE); seq = seq.substring(startIndex, endIndex); return seq; } @Override public String getSequence(Region region, String species) throws IOException { return getSequence(region); } /** * Creates a <input>.sqlite.db. * <p> * Contents 2 tables: * SEQUENCES: * id TEXT : <chromosome>_<pos/CHUNK_SIZE> * seq TEXT : sequence [ chunk_id*CHUNK_SIZE , (chunk_id+1)*CHUNK_SIZE ) * META: * id TEXT : * description TEXT : * length TEXT : * * @param fastaInput Accept formats: *.fasta, *.fasta.gz * @param outdir Destination folder for the index * @return A file object for the index * @throws IOException If any IO problem occurs * @throws SQLException If any problem with SQLite * @throws FileFormatException If format is not correct */ public File index(File fastaInput, Path outdir) throws IOException, SQLException, FileFormatException { if (fastaInput == null || !fastaInput.exists()) { throw new FileNotFoundException("Fasta '" + fastaInput + "' file not found"); } if (outdir == null) { outdir = Paths.get(fastaInput.toPath().toAbsolutePath().getParent().toString()); } Path output = Paths.get(outdir.toAbsolutePath().toString(), fastaInput.getName() + ".sqlite.db"); try { sqliteManager.connect(output, false); } catch (ClassNotFoundException e) { throw new IOException(e); } //Create Tables XObject seqColumns = new XObject(); seqColumns.put("id", "TEXT"); //seqColumns.put("chunk", "INTEGER"); seqColumns.put("seq", "CHARACTER(" + CHUNK_SIZE + ")"); XObject metaColumns = new XObject(); metaColumns.put("id", "TEXT"); metaColumns.put("description", "TEXT"); metaColumns.put("length", "INTEGER"); sqliteManager.createTable(SEQUENCE_TABLE, seqColumns); sqliteManager.createTable(META_TABLE, metaColumns); FastaReader reader; Fasta fasta; //Insert Sequences reader = new FastaReader(fastaInput.toPath()); while ((fasta = reader.read()) != null) { serializeGenomeSequence(fasta); } //Create Index XObject indices = new XObject(); indices.put("id", 0); //indices.put("chunk", 1); sqliteManager.createIndex(SEQUENCE_TABLE, "id", indices); dbPath = output; sqliteManager.disconnect(true); return output.toFile(); } private void serializeGenomeSequence(Fasta fasta) throws SQLException { System.out.println(fasta.getDescription()); System.out.println(fasta.getId()); System.out.println(fasta.getSeq().length()); String tablename = SEQUENCE_TABLE; // XObject seqColumns = new XObject(); // seqColumns.put("id", "INTEGER"); // seqColumns.put("seq", "CHARACTER(2000)"); // tablename = "SEQ_"+fasta.getId(); // sqliteManager.createTable(tablename, seqColumns); XObject meta = new XObject(); meta.put("id", fasta.getId()); meta.put("description", fasta.getDescription()); meta.put("length", fasta.getSeq().length()); sqliteManager.insert(meta, META_TABLE); int chunks = (fasta.getSeq().length() + CHUNK_SIZE - 1) / CHUNK_SIZE; //ceil(length/chunkSize) XObject seq = new XObject(); int end; for (int i = 0; i < chunks; i++) { //seq.put("id", fasta.getId()); seq.put("id", getChunkId(fasta.getId(), i * CHUNK_SIZE)); //seq.put("chunk", i); end = (i + 1) * CHUNK_SIZE; if (end >= fasta.getSeq().length()) { end = fasta.getSeq().length() - 1; } seq.put("seq", fasta.getSeq().substring(i * CHUNK_SIZE, end)); sqliteManager.insert(seq, tablename); } } // private void parse(Path fastaInputFile){ // try { // String sequenceName = ""; // String sequenceType = ""; // String sequenceAssembly = ""; // String line; // StringBuilder sequenceStringBuilder = new StringBuilder(); // // Preparing input and output files // BufferedReader br; // // if(fastaInputFile.toString().endsWith(".gz")) { // br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(fastaInputFile.toFile())))); // } else { // //br = Files.newBufferedReader(Paths.get(genomeReferenceFastaFile.getAbsolutePath()), Charset.defaultCharset()); // br = FileUtils.newBufferedReader(fastaInputFile); // } // // while ((line = br.readLine()) != null) { // if (!line.startsWith(">")) { // sequenceStringBuilder.append(line); // } else { // // new chromosome, save data // if (sequenceStringBuilder.length() > 0) { // if(!sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR")) { // System.out.println(sequenceName); // serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString()); // } // } // // initialize data structures // sequenceName = line.replace(">", "").split(" ")[0]; // sequenceType = line.replace(">", "").split(" ")[2].split(":")[0]; // sequenceAssembly = line.replace(">", "").split(" ")[2].split(":")[1]; // sequenceStringBuilder.delete(0, sequenceStringBuilder.length()); // } // } // // Last chromosome must be processed // if(!sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR")) { // serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString()); // } // br.close(); // } catch (IOException e) { // e.printStackTrace(); // } // // } // private void serializeGenomeSequence(String chromosome, String sequenceType, String sequenceAssembly, String sequence){ // System.out.println(chromosome + " " + sequenceType + " " + sequenceAssembly + "[" + sequence.length() + "]"); // System.out.println(""); // } }