/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.datamanagers;
import org.opencb.opencga.core.common.XObject;
import org.opencb.opencga.storage.indices.DefaultParser;
import org.opencb.opencga.storage.indices.SqliteManager;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.zip.GZIPInputStream;
public class GffManager {
private int CHUNKSIZE = 2000;
String recordTableName;
XObject recordColumns;
String recordIndexName;
XObject recordIndices;
DefaultParser recordDefaultParser;
String chunkTableName;
XObject chunkColumns;
String chunkIndexName;
XObject chunkIndices;
String statsTableName;
XObject statsColumns;
XObject gffColumns;
public GffManager() {
//record_query_fields
recordTableName = "record_query_fields";
recordColumns = new XObject();
recordColumns.put("chromosome", "TEXT");
recordColumns.put("start", "INT");
recordColumns.put("end", "INT");
recordColumns.put("offset", "BIGINT");
recordIndexName = "chromosome_start_end";
recordIndices = new XObject();
recordIndices.put("chromosome", 0);
recordIndices.put("start", 3);
recordIndices.put("end", 4);
recordDefaultParser = new DefaultParser(recordIndices);
//chunk
chunkTableName = "chunk";
chunkColumns = new XObject();
chunkColumns.put("chromosome", "TEXT");
chunkColumns.put("chunk_id", "TEXT");
chunkColumns.put("start", "INT");
chunkColumns.put("end", "INT");
chunkColumns.put("features_count", "INT");
chunkIndexName = "chromosome_chunk_id";
chunkIndices = new XObject();
chunkIndices.put("chromosome", 0);
chunkIndices.put("chunk_id", -1);
//stats
statsTableName = "global_stats";
statsColumns = new XObject();
statsColumns.put("name", "TEXT");
statsColumns.put("title", "TEXT");
statsColumns.put("value", "TEXT");
gffColumns = new XObject();
gffColumns.put("seqname", 0);
gffColumns.put("source", 1);
gffColumns.put("feature", 2);
gffColumns.put("start", 3);
gffColumns.put("end", 4);
gffColumns.put("score", 5);
gffColumns.put("strand", 6);
gffColumns.put("frame", 7);
gffColumns.put("group", 8);
}
public void createIndex(Path filePath) throws SQLException, IOException, ClassNotFoundException {
SqliteManager sqliteManager = new SqliteManager();
sqliteManager.connect(filePath, false);
//record_query_fields
sqliteManager.createTable(recordTableName, recordColumns);
//chunk
sqliteManager.createTable(chunkTableName, chunkColumns);
//stats
sqliteManager.createTable(statsTableName, statsColumns);
//chunk visited hash
HashMap<Integer, XObject> visitedChunks = new HashMap<>();
HashMap<String, XObject> visitedChromosomes = new HashMap<>();
//Read file
BufferedReader br;
Boolean gzip = false;
if (gzip) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(filePath))));
} else {
br = Files.newBufferedReader(filePath, Charset.defaultCharset());
}
String line = null;
long offsetPos = 0;
int numberLines = 0;
while ((line = br.readLine()) != null) {
numberLines++;
XObject offsetXO = recordDefaultParser.parse(line);
offsetXO.put("offset", offsetPos);
//offset table
sqliteManager.insert(offsetXO, recordTableName);
offsetPos += line.length() + 1;
//calculate chromosome stats
XObject chrXo = visitedChromosomes.get(offsetXO.get("chromosome"));
if (chrXo == null) {
chrXo = new XObject();
chrXo.put("start", offsetXO.getInt("start"));
chrXo.put("end", offsetXO.getInt("end"));
visitedChromosomes.put(offsetXO.getString("chromosome"), chrXo);
}
chrXo.put("start", Math.min(chrXo.getInt("start"), offsetXO.getInt("start")));
chrXo.put("end", Math.max(chrXo.getInt("end"), offsetXO.getInt("end")));
//chunk table
int firstChunkId = getChunkId(offsetXO.getInt("start"));
int lastChunkId = getChunkId(offsetXO.getInt("end"));
for (int i = firstChunkId; i <= lastChunkId; i++) {
if (visitedChunks.get(i) == null) {
XObject xoChunk = new XObject();
int chunkStart = getChunkStart(i);
int chunkEnd = getChunkEnd(i);
xoChunk.put("chunk_id", i);
xoChunk.put("chromosome", offsetXO.getString("chr"));
xoChunk.put("start", chunkStart);
xoChunk.put("end", chunkEnd);
xoChunk.put("features_count", 0);
visitedChunks.put(i, xoChunk);
}
XObject xoUpdate = visitedChunks.get(i);
xoUpdate.put("features_count", xoUpdate.getInt("features_count") + 1);
}
}
br.close();
//table record_query_fields
sqliteManager.commit(recordTableName);
sqliteManager.createIndex(recordTableName, recordIndexName, recordIndices);
//table chunk
for (Integer key : visitedChunks.keySet()) {
sqliteManager.insert(visitedChunks.get(key), chunkTableName);
}
sqliteManager.commit(chunkTableName);
sqliteManager.createIndex(chunkTableName, chunkIndexName, chunkIndices);
//table stats
XObject values = new XObject();
values.put("title", "File number lines");
values.put("name", "NUM_LINES");
values.put("value", numberLines);
sqliteManager.insert(values, statsTableName);
values = new XObject();
values.put("title", "Number of chromosomes (or sequences)");
values.put("name", "NUM_CHR");
values.put("value", visitedChromosomes.keySet().size());
sqliteManager.insert(values, statsTableName);
values = new XObject();
String chrStr = visitedChromosomes.keySet().toString();
values.put("title", "Chromosomes (or sequences)");
values.put("name", "CHR_LIST");
values.put("value", chrStr.substring(1, chrStr.length() - 1));
sqliteManager.insert(values, statsTableName);
for (String key : visitedChromosomes.keySet()) {
XObject chrXo = visitedChromosomes.get(key);
String chromosomePrefix = "";
String chrkey = key;
if (key.contains("chr")) {
chromosomePrefix = "chr";
chrkey = key.replace("chr", "");
}
//check chromosome prefix
values = new XObject();
values.put("title", "Chromsome " + chrkey + " prefix");
values.put("name", "CHR_" + chrkey + "_PREFIX");
values.put("value", chromosomePrefix);
sqliteManager.insert(values, statsTableName);
values = new XObject();
values.put("title", "Chromosome");
values.put("name", "CHR_" + chrkey + "_NAME");
values.put("value", key);
sqliteManager.insert(values, statsTableName);
values = new XObject();
values.put("title", "Length");
values.put("name", "CHR_" + chrkey + "_LENGTH");
values.put("value", chrXo.getInt("end") - chrXo.getInt("start") + 1);
sqliteManager.insert(values, statsTableName);
}
sqliteManager.commit(statsTableName);
//disconnect
sqliteManager.disconnect(true);
}
public List<XObject> queryRegion(Path filePath, String chromosome, int start, int end) throws SQLException, IOException, ClassNotFoundException {
SqliteManager sqliteManager = new SqliteManager();
sqliteManager.connect(filePath, true);
String tableName = "global_stats";
String queryString = "SELECT value FROM " + tableName + " WHERE name='CHR_" + chromosome + "_PREFIX'";
String chrPrefix = sqliteManager.query(queryString).get(0).getString("value");
tableName = "record_query_fields";
queryString = "SELECT offset FROM " + tableName + " WHERE chromosome='" + chrPrefix + chromosome + "' AND start<=" + end + " AND end>=" + start;
List<XObject> queryResults = sqliteManager.query(queryString);
//disconnect
sqliteManager.disconnect(true);
//access file
List<XObject> results = new ArrayList<>();
DefaultParser GFFParser = new DefaultParser(gffColumns);
RandomAccessFile raf = new RandomAccessFile(filePath.toString(), "r");
for (XObject queryResult : queryResults) {
raf.seek(queryResult.getInt("offset"));
results.add(GFFParser.parse(raf.readLine()));
}
return results;
}
private int getChunkId(int position) {
return position / CHUNKSIZE;
}
private int getChunkStart(int id) {
return (id == 0) ? 1 : id * CHUNKSIZE;
}
private int getChunkEnd(int id) {
return (id * CHUNKSIZE) + CHUNKSIZE - 1;
}
}