/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.datamanagers;

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import org.opencb.cellbase.core.common.Region;
import org.opencb.commons.bioformats.variant.vcf4.VcfRecord;
import org.opencb.opencga.core.SgeManager;
import org.opencb.opencga.core.common.Config;
import org.opencb.opencga.core.common.IOUtils;
import org.opencb.opencga.core.common.StringUtils;
import org.opencb.opencga.core.common.XObject;
import org.opencb.opencga.storage.TabixReader;
import org.opencb.opencga.storage.indices.SqliteManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Queries VCF files through the artefacts stored in a hidden sibling directory
 * named {@code .meta_<fileName>}: a bgzip-compressed copy with its Tabix index
 * ({@code <fileName>.gz}, {@code <fileName>.gz.tbi}) and a SQLite database
 * ({@code <fileName>.db}).
 */
public class VcfManager {

    /** Number of histogram bins used when the request gives no usable {@code histogramMax}. */
    private static final int DEFAULT_HISTOGRAM_MAX = 500;

    protected static ObjectMapper jsonObjectMapper;
    protected static ObjectWriter jsonObjectWriter;
    protected static Logger logger = LoggerFactory.getLogger(VcfManager.class);

    private static Path indexerManagerScript = Paths.get(Config.getGcsaHome(),
            Config.getAnalysisProperties().getProperty("OPENCGA.ANALYSIS.BINARIES.PATH"), "indexer", "indexerManager.py");

    static {
        // Configure the shared JSON machinery once, instead of replacing it every time an instance is created.
        jsonObjectMapper = new ObjectMapper();
        jsonObjectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
        jsonObjectWriter = jsonObjectMapper.writer();
    }

    /** Maps the name of each fixed VCF column to its zero-based index in a tab-separated line. */
    XObject vcfColumns;

    /**
     * Creates a manager and fills in the VCF column-name to column-index table.
     *
     * @throws IOException never thrown by the current body; kept so existing callers still compile
     */
    public VcfManager() throws IOException {
        vcfColumns = new XObject();
        vcfColumns.put("chromosome", 0);
        vcfColumns.put("position", 1);
        vcfColumns.put("id", 2);
        vcfColumns.put("ref", 3);
        vcfColumns.put("alt", 4);
        vcfColumns.put("qual", 5);
        vcfColumns.put("filter", 6);
        vcfColumns.put("info", 7);
    }

    /**
     * Returns the {@code .meta_<fileName>} directory that sits next to {@code file}.
     * Works for paths without a parent component as well.
     */
    private static Path getMetaDir(Path file) {
        String inputName = file.getFileName().toString();
        return file.resolveSibling(".meta_" + inputName);
    }

    /**
     * Queues an indexing job for the given VCF file, deleting any previous meta directory first.
     * A failure to queue the job is logged and does not propagate.
     *
     * @param inputPath VCF file to index
     * @return the job name, {@code "indexer_"} followed by a random 8-character id
     * @throws IOException          declared by the existing signature
     * @throws InterruptedException declared by the existing signature
     */
    public static String createIndex(Path inputPath) throws IOException, InterruptedException {
        Path metaDir = getMetaDir(inputPath);
        if (Files.exists(metaDir)) {
            IOUtils.deleteDirectory(metaDir);
        }

        String jobId = StringUtils.randomString(8);
        String commandLine = indexerManagerScript + " -t vcf -i " + inputPath + " --outdir " + metaDir;
        try {
            SgeManager.queueJob("indexer", jobId, 0, inputPath.getParent().toString(), commandLine);
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Do not lose the interruption request while keeping the best-effort contract.
                Thread.currentThread().interrupt();
            }
            logger.error("Could not queue indexer job " + jobId + " for " + inputPath, e);
        }
        return "indexer_" + jobId;
    }

    /**
     * Returns the Tabix index file ({@code <fileName>.gz.tbi}) when both it and the compressed
     * file ({@code <fileName>.gz}) exist in the meta directory, or {@code null} otherwise.
     */
    private static File checkVcfIndex(Path inputPath) {
        Path metaDir = getMetaDir(inputPath);
        String fileName = inputPath.getFileName().toString();

        Path inputCompressedFile = metaDir.resolve(Paths.get(fileName + ".gz"));
        Path inputIndexFile = metaDir.resolve(Paths.get(fileName + ".gz.tbi"));
        if (Files.exists(inputIndexFile) && Files.exists(inputCompressedFile)) {
            return inputIndexFile.toFile();
        }
        return null;
    }

    /**
     * Tells whether the SQLite database {@code <fileName>.db} exists in the meta directory.
     *
     * @param filePath VCF file whose index is checked
     * @return {@code true} when the database file exists
     */
    public static boolean checkIndex(Path filePath) {
        Path metaDir = getMetaDir(filePath);
        String fileName = filePath.getFileName().toString();
        return Files.exists(metaDir.resolve(Paths.get(fileName + ".db")));
    }

    /**
     * Answers a region query as JSON.
     *
     * <p>With {@code histogram=true} the rows of the {@code chunk} table overlapping the region are
     * returned; when there are more rows than {@code histogramMax} (default 500), consecutive rows
     * are merged so that at most {@code histogramMax} bins are returned. With
     * {@code histogramLogarithm=true} each {@code features_count} is replaced by its natural
     * logarithm (0 for non-positive counts).
     *
     * <p>Otherwise the positions listed in the {@code record_query_fields} table for the region are
     * looked up, and the matching lines of the compressed VCF are returned as records.
     *
     * @param filePath  VCF file; the artefacts are read from its meta directory
     * @param regionStr region in the format accepted by {@code Region.parseRegion}
     * @param params    request parameters; only the first value of {@code histogram},
     *                  {@code histogramLogarithm} and {@code histogramMax} is read. May be {@code null}.
     * @return a JSON array, or {@code null} when the compressed VCF or its Tabix index is missing
     * @throws IllegalArgumentException when {@code regionStr} cannot be parsed into a region
     * @throws SQLException             declared by the existing signature
     * @throws IOException              declared by the existing signature
     * @throws ClassNotFoundException   declared by the existing signature
     */
    public String queryRegion(Path filePath, String regionStr, Map<String, List<String>> params)
            throws SQLException, IOException, ClassNotFoundException {
        Path metaDir = getMetaDir(filePath);
        String fileName = filePath.getFileName().toString();
        Path gzFilePath = metaDir.resolve(Paths.get(fileName + ".gz"));
        Path dbBasePath = metaDir.resolve(Paths.get(fileName));

        Region region = Region.parseRegion(regionStr);
        if (region == null) {
            throw new IllegalArgumentException("Invalid region: " + regionStr);
        }
        // The chromosome ends up inside a quoted SQL literal: double any single quote so it cannot break out.
        String chromosome = region.getChromosome().replace("'", "''");
        int start = region.getStart();
        int end = region.getEnd();

        boolean histogram = Boolean.parseBoolean(firstParam(params, "histogram"));
        boolean histogramLogarithm = Boolean.parseBoolean(firstParam(params, "histogramLogarithm"));
        int histogramMax = parseHistogramMax(firstParam(params, "histogramMax"));

        // The per-chromosome prefix lookup (global_stats table) is disabled; no prefix is applied.
        String chrPrefix = "";

        if (histogram) {
            long tq = System.currentTimeMillis();
            String queryString = "SELECT * FROM chunk WHERE chromosome='" + chrPrefix + chromosome
                    + "' AND start<=" + end + " AND end>=" + start;
            List<XObject> chunks = runQuery(dbBasePath, queryString);

            List<XObject> bins;
            if (chunks.size() > histogramMax) {
                bins = mergeChunks(chunks, histogramMax, histogramLogarithm);
            } else {
                if (histogramLogarithm) {
                    for (XObject chunk : chunks) {
                        chunk.put("features_count", logScale(chunk.getInt("features_count")));
                    }
                }
                bins = chunks;
            }
            logger.debug("Query time {}ms", System.currentTimeMillis() - tq);
            return jsonObjectWriter.writeValueAsString(bins);
        }

        String queryString = "SELECT position FROM record_query_fields WHERE chromosome='" + chrPrefix + chromosome
                + "' AND position<=" + end + " AND position>=" + start;
        List<XObject> queryResults = runQuery(dbBasePath, queryString);
        int queryResultsLength = queryResults.size();

        Map<String, XObject> queryResultsMap = new HashMap<>();
        for (XObject r : queryResults) {
            queryResultsMap.put(r.getString("position"), r);
        }
        logger.debug("queryResultsLength {}", queryResultsLength);

        // Check for the compressed file and its index BEFORE opening a reader on them.
        if (checkVcfIndex(filePath) == null) {
            logger.info("VcfManager: vcf index not found for: {}", filePath);
            return null;
        }
        // NOTE(review): the reader is never closed here; confirm whether this TabixReader exposes close().
        TabixReader tabixReader = new TabixReader(gzFilePath.toString());

        logger.info("regionStr: {}", regionStr);
        TabixReader.Iterator lines = null;
        try {
            lines = tabixReader.query(regionStr);
        } catch (Exception e) {
            // Best effort: a failed Tabix query yields an empty result rather than an error.
            logger.error("Tabix query failed for region " + regionStr, e);
        }
        logger.info("lines != null: {}", lines != null);

        List<XObject> records = new ArrayList<>();
        String line;
        while (lines != null && (line = lines.next()) != null) {
            String[] fields = line.split("\t", 10);
            if (fields.length < 8) {
                logger.warn("Skipping VCF line with only {} columns", fields.length);
                continue;
            }
            Integer position = Integer.valueOf(fields[1]);

            XObject record = new XObject();
            record.put("chromosome", fields[0]);
            record.put("start", position);
            record.put("end", position);
            record.put("id", fields[2]);
            record.put("reference", fields[3]);
            record.put("alternate", fields[4]);
            record.put("quality", fields[5]);
            record.put("filter", fields[6]);
            record.put("info", fields[7]);
            // FORMAT and sample columns are absent in sites-only VCF lines.
            record.put("format", fields.length > 8 ? fields[8] : "");
            record.put("samples", fields.length > 9 ? fields[9].split("\\s") : new String[0]);

            if (queryResultsMap.containsKey(String.valueOf(position))) {
                records.add(record);
                queryResultsLength--;
            }
            if (queryResultsLength < 0) {
                break;
            }
        }

        return jsonObjectWriter.writeValueAsString(records);
    }

    /** Returns the first value stored under {@code key}, or {@code null} when there is none. */
    private static String firstParam(Map<String, List<String>> params, String key) {
        if (params == null) {
            return null;
        }
        List<String> values = params.get(key);
        return (values == null || values.isEmpty()) ? null : values.get(0);
    }

    /** Parses {@code histogramMax}; absent, malformed or non-positive values give the default. */
    private static int parseHistogramMax(String raw) {
        if (raw == null) {
            return DEFAULT_HISTOGRAM_MAX;
        }
        try {
            int value = Integer.parseInt(raw.trim());
            if (value > 0) {
                return value;
            }
        } catch (NumberFormatException e) {
            // Fall through to the default below.
        }
        logger.warn("Ignoring invalid histogramMax '{}', using {}", raw, DEFAULT_HISTOGRAM_MAX);
        return DEFAULT_HISTOGRAM_MAX;
    }

    /** Runs one query against the SQLite index and always disconnects afterwards. */
    private static List<XObject> runQuery(Path dbBasePath, String queryString)
            throws SQLException, IOException, ClassNotFoundException {
        SqliteManager sqliteManager = new SqliteManager();
        sqliteManager.connect(dbBasePath, true);
        try {
            return sqliteManager.query(queryString);
        } finally {
            sqliteManager.disconnect(true);
        }
    }

    /** Natural logarithm of a positive count; 0 for zero or negative counts. */
    private static double logScale(int count) {
        return (count > 0) ? Math.log(count) : 0;
    }

    /**
     * Merges consecutive chunk rows into at most {@code maxBins} bins. Each bin takes the chromosome
     * and start of its first row, the end of its last row and the sum of their feature counts.
     * Expects {@code chunks.size() > maxBins >= 1}.
     */
    private static List<XObject> mergeChunks(List<XObject> chunks, int maxBins, boolean logarithm) {
        int total = chunks.size();
        // Ceiling division, so the number of bins never exceeds maxBins.
        int chunksPerBin = (total - 1) / maxBins + 1;

        List<XObject> bins = new ArrayList<>();
        XObject bin = null;
        int featuresCount = 0;
        int chunksInBin = 0;
        for (int idx = 0; idx < total; idx++) {
            XObject chunk = chunks.get(idx);
            if (chunksInBin == 0) {
                bin = new XObject("chromosome", chunk.getString("chromosome"));
                bin.put("start", chunk.getString("start"));
            }
            featuresCount += chunk.getInt("features_count");
            chunksInBin++;

            // Checked independently of the opening test above, so one-row bins are closed too.
            if (chunksInBin == chunksPerBin || idx == total - 1) {
                if (logarithm) {
                    bin.put("features_count", logScale(featuresCount));
                } else {
                    bin.put("features_count", featuresCount);
                }
                bin.put("end", chunk.getString("end"));
                bins.add(bin);
                chunksInBin = 0;
                featuresCount = 0;
            }
        }
        return bins;
    }

    /**
     * Returns the VCF records of a region as a JSON array, reading directly from a Tabix-indexed file.
     * Any failure while querying or serializing is logged and yields {@code "[]"}.
     *
     * @param fullFilePath path handed to the Tabix reader
     * @param regionStr    region string handed to the Tabix query
     * @param params       not used
     * @return a JSON array of records; {@code "[]"} on error or when nothing matches
     * @throws IOException declared by the existing signature
     * @deprecated superseded by {@link #queryRegion(Path, String, Map)}
     */
    @Deprecated
    public String getByRegion(Path fullFilePath, String regionStr, Map<String, List<String>> params) throws IOException {
        TabixReader tabixReader = new TabixReader(fullFilePath.toString());

        StringBuilder sb = new StringBuilder();
        try {
            TabixReader.Iterator lines = tabixReader.query(regionStr);

            sb.append("[");
            if (lines != null) {
                String line;
                boolean first = true;
                while ((line = lines.next()) != null) {
                    String[] fields = line.split("\t");
                    VcfRecord vcfRecord = new VcfRecord(fields[0], Integer.parseInt(fields[1]), fields[2], fields[3],
                            fields[4], fields[5], fields[6], fields[7]);
                    if (!first) {
                        sb.append(',');
                    }
                    sb.append(jsonObjectWriter.writeValueAsString(vcfRecord));
                    first = false;
                }
            }
            sb.append("]");
        } catch (Exception e) {
            logger.error("Could not read region " + regionStr + " from " + fullFilePath, e);
            // Drop any partial output so the result is always valid JSON.
            sb.setLength(0);
            sb.append("[]");
        }
        return sb.toString();
    }
}