/* * Copyright (C) 2010-2013 "Bio4j" * * This file is part of Bio4j * * Bio4j is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package com.bio4j.neo4jdb.programs; import com.bio4j.neo4jdb.model.nodes.refseq.CDSNode; import com.bio4j.neo4jdb.model.nodes.refseq.GeneNode; import com.bio4j.neo4jdb.model.nodes.refseq.GenomeElementNode; import com.bio4j.neo4jdb.model.nodes.refseq.rna.*; import com.bio4j.neo4jdb.model.relationships.refseq.*; import com.bio4j.neo4jdb.model.util.Bio4jManager; import com.ohnosequences.util.Executable; import com.ohnosequences.util.genbank.GBCommon; import java.io.*; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import java.util.logging.FileHandler; import java.util.logging.Level; import java.util.logging.Logger; import java.util.logging.SimpleFormatter; import org.neo4j.helpers.collection.MapUtil; import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider; import org.neo4j.unsafe.batchinsert.*; /** * Imports RefSeq complete release into Bio4j * * @author Pablo Pareja Tobes <ppareja@era7.com> */ public class ImportRefSeq implements Executable { //--------indexing API constans----- private static String PROVIDER_ST = "provider"; private static String EXACT_ST = "exact"; private static String LUCENE_ST = "lucene"; private static String TYPE_ST = "type"; //----------------------------------- public static final String BASE_FOLDER = "refseq/release/complete/"; private static final Logger logger = Logger.getLogger("ImportRefSeq"); private static FileHandler fh; //------------------nodes properties maps----------------------------------- public static Map<String, Object> genomeElementProperties = new HashMap<String, Object>(); public static Map<String, Object> geneProperties = new HashMap<String, Object>(); public static Map<String, Object> cdsProperties = new HashMap<String, Object>(); public static Map<String, Object> miscRnaProperties = new HashMap<String, Object>(); public static Map<String, Object> mRnaProperties = new HashMap<String, Object>(); public static Map<String, Object> ncRnaProperties = new HashMap<String, Object>(); public static Map<String, Object> rRnaProperties = new HashMap<String, Object>(); public static Map<String, Object> tmRnaProperties = new HashMap<String, Object>(); public static Map<String, Object> tRnaProperties = new HashMap<String, Object>(); //---------------------------------------------------------------------------------- //--------------------------------relationships------------------------------------------ public static GenomeElementGeneRel genomeElementGeneRel = new GenomeElementGeneRel(null); public static GenomeElementCDSRel genomeElementCDSRel = new GenomeElementCDSRel(null); public static GenomeElementMiscRnaRel genomeElementMiscRnaRel = new GenomeElementMiscRnaRel(null); public static GenomeElementMRnaRel genomeElementMRnaRel = new GenomeElementMRnaRel(null); public static GenomeElementNcRnaRel genomeElementNcRnaRel = new GenomeElementNcRnaRel(null); public static GenomeElementRRnaRel genomeElementRRnaRel = new GenomeElementRRnaRel(null); public static GenomeElementTmRnaRel genomeElementTmRnaRel = new GenomeElementTmRnaRel(null); public static GenomeElementTRnaRel genomeElementTRnaRel = new GenomeElementTRnaRel(null); //---------------------------------------------------------------------------------- @Override public void execute(ArrayList<String> array) { String[] args = new String[array.size()]; for (int i = 0; i < array.size(); i++) { args[i] = array.get(i); } main(args); } public static void main(String[] args) { if (args.length != 3) { System.out.println("This program expects the following parameters: \n" + "1. Folder name with all the .gbk files \n" + "2. Bio4j DB folder \n" + "3. batch inserter .properties file"); } else { long initTime = System.nanoTime(); File inFolder = new File(args[0]); File[] files = inFolder.listFiles(); BatchInserter inserter = null; BatchInserterIndexProvider indexProvider = null; //---------------------------------------------------------------------------------- //---------------------initializing node type properties---------------------------- genomeElementProperties.put(GenomeElementNode.NODE_TYPE_PROPERTY, GenomeElementNode.NODE_TYPE); geneProperties.put(GeneNode.NODE_TYPE_PROPERTY, GeneNode.NODE_TYPE); cdsProperties.put(CDSNode.NODE_TYPE_PROPERTY, CDSNode.NODE_TYPE); miscRnaProperties.put(MiscRNANode.NODE_TYPE_PROPERTY, MiscRNANode.NODE_TYPE); mRnaProperties.put(MRNANode.NODE_TYPE_PROPERTY, MRNANode.NODE_TYPE); ncRnaProperties.put(NcRNANode.NODE_TYPE_PROPERTY, NcRNANode.NODE_TYPE); rRnaProperties.put(RRNANode.NODE_TYPE_PROPERTY, RRNANode.NODE_TYPE); tmRnaProperties.put(TmRNANode.NODE_TYPE_PROPERTY, TmRNANode.NODE_TYPE); tRnaProperties.put(TRNANode.NODE_TYPE_PROPERTY, TRNANode.NODE_TYPE); //---------------------------------------------------------------------------------- //---------------------------------------------------------------------------------- BufferedWriter statsBuff = null; int genomeElementCounter = 0; try { // This block configures the logger with handler and formatter fh = new FileHandler("ImportRefSeq.log", false); SimpleFormatter formatter = new SimpleFormatter(); fh.setFormatter(formatter); logger.addHandler(fh); logger.setLevel(Level.ALL); //---creating writer for stats file----- statsBuff = new BufferedWriter(new FileWriter(new File("ImportRefSeqStats.txt"))); // create the batch inserter inserter = BatchInserters.inserter(args[1], MapUtil.load(new File(args[2]))); // create the batch index service indexProvider = new LuceneBatchInserterIndexProvider(inserter); //-----------------create batch indexes---------------------------------- //---------------------------------------------------------------------- BatchInserterIndex genomeElementVersionIndex = indexProvider.nodeIndex(GenomeElementNode.GENOME_ELEMENT_VERSION_INDEX, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); BatchInserterIndex nodeTypeIndex = indexProvider.nodeIndex(Bio4jManager.NODE_TYPE_INDEX_NAME, MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST)); for (File file : files) { if (file.getName().endsWith(".gbff")) { logger.log(Level.INFO, ("file: " + file.getName())); BufferedReader reader = new BufferedReader(new FileReader(file)); String line; while ((line = reader.readLine()) != null) { //this is the first line where the locus is String accessionSt = ""; String definitionSt = ""; String versionSt = ""; String commentSt = ""; StringBuilder seqStBuilder = new StringBuilder(); ArrayList<String> cdsList = new ArrayList<String>(); ArrayList<String> geneList = new ArrayList<String>(); ArrayList<String> miscRnaList = new ArrayList<String>(); ArrayList<String> mRnaList = new ArrayList<String>(); ArrayList<String> ncRnaList = new ArrayList<String>(); ArrayList<String> rRnaList = new ArrayList<String>(); ArrayList<String> tmRnaList = new ArrayList<String>(); ArrayList<String> tRnaList = new ArrayList<String>(); boolean originFound = false; //Now I get all the lines till I reach the string '//' do { boolean readLineFlag = true; if (line.startsWith(GBCommon.LOCUS_STR)) { // do nothing right now } else if (line.startsWith(GBCommon.ACCESSION_STR)) { accessionSt = line.split(GBCommon.ACCESSION_STR)[1].trim(); } else if (line.startsWith(GBCommon.VERSION_STR)) { versionSt = line.split(GBCommon.VERSION_STR)[1].trim().split(" ")[0]; } else if (line.startsWith(GBCommon.DEFINITION_STR)) { definitionSt += line.split(GBCommon.DEFINITION_STR)[1].trim(); do { line = reader.readLine(); if (line.startsWith(" ")) { definitionSt += line.trim(); } } while (line.startsWith(" ")); readLineFlag = false; } else if (line.startsWith(GBCommon.COMMENT_STR)) { commentSt += line.split(GBCommon.COMMENT_STR)[1].trim(); do { line = reader.readLine(); if (line.startsWith(" ")) { commentSt += "\n" + line.trim(); } } while (line.startsWith(" ")); readLineFlag = false; } else if (line.startsWith(GBCommon.FEATURES_STR)) { do { line = reader.readLine(); String lineSubstr5 = line.substring(5); if (lineSubstr5.startsWith(GBCommon.CDS_STR)) { String positionsSt = ""; positionsSt += line.trim().split(GBCommon.CDS_STR)[1].trim(); line = reader.readLine(); while (!line.trim().startsWith("/")) { positionsSt += line.trim(); line = reader.readLine(); } cdsList.add(positionsSt); } else if (lineSubstr5.startsWith(GBCommon.GENE_STR)) { String positionsSt = ""; positionsSt += line.trim().split(GBCommon.GENE_STR)[1].trim(); line = reader.readLine(); while (!line.trim().startsWith("/")) { positionsSt += line.trim(); line = reader.readLine(); } geneList.add(positionsSt); } else if (lineSubstr5.startsWith(GBCommon.MISC_RNA_STR)) { String positionsSt = ""; positionsSt += line.trim().split(GBCommon.MISC_RNA_STR)[1].trim(); line = reader.readLine(); while (!line.trim().startsWith("/")) { positionsSt += line.trim(); line = reader.readLine(); } miscRnaList.add(positionsSt); } else if (lineSubstr5.startsWith(GBCommon.TM_RNA_STR)) { String positionsSt = ""; positionsSt += line.trim().split(GBCommon.TM_RNA_STR)[1].trim(); line = reader.readLine(); while (!line.trim().startsWith("/")) { positionsSt += line.trim(); line = reader.readLine(); } tmRnaList.add(positionsSt); } else if (lineSubstr5.startsWith(GBCommon.R_RNA_STR)) { String positionsSt = ""; positionsSt += line.trim().split(GBCommon.R_RNA_STR)[1].trim(); line = reader.readLine(); while (!line.trim().startsWith("/")) { positionsSt += line.trim(); line = reader.readLine(); } rRnaList.add(positionsSt); } else if (lineSubstr5.startsWith(GBCommon.M_RNA_STR)) { String positionsSt = ""; positionsSt += line.trim().split(GBCommon.M_RNA_STR)[1].trim(); line = reader.readLine(); while (!line.trim().startsWith("/")) { positionsSt += line.trim(); line = reader.readLine(); } mRnaList.add(positionsSt); } else if (lineSubstr5.startsWith(GBCommon.NC_RNA_STR)) { String positionsSt = ""; positionsSt += line.trim().split(GBCommon.NC_RNA_STR)[1].trim(); line = reader.readLine(); while (!line.trim().startsWith("/")) { positionsSt += line.trim(); line = reader.readLine(); } ncRnaList.add(positionsSt); } else if (lineSubstr5.startsWith(GBCommon.T_RNA_STR)) { String positionsSt = ""; positionsSt += line.trim().split(GBCommon.T_RNA_STR)[1].trim(); line = reader.readLine(); while (!line.trim().startsWith("/")) { positionsSt += line.trim(); line = reader.readLine(); } tRnaList.add(positionsSt); } } while (line.startsWith(" ")); readLineFlag = false; } else if (line.startsWith(GBCommon.ORIGIN_STR)) { originFound = true; do { line = reader.readLine(); String[] tempArray = line.trim().split(" "); for (int i = 1; i < tempArray.length; i++) { seqStBuilder.append(tempArray[i]); } } while (line.startsWith(" ")); readLineFlag = false; } if (readLineFlag) { line = reader.readLine(); } } while (line != null && !line.startsWith(GBCommon.LAST_LINE_STR)); //--------create genome element node-------------- long genomeElementId = createGenomeElementNode(versionSt, commentSt, definitionSt, inserter, genomeElementVersionIndex, nodeTypeIndex); //-----------genes----------------- for (String genePositionsSt : geneList) { geneProperties.put(GeneNode.POSITIONS_PROPERTY, genePositionsSt); long geneId = inserter.createNode(geneProperties); inserter.createRelationship(genomeElementId, geneId, genomeElementGeneRel, null); //indexing gene node by its node_type nodeTypeIndex.add(geneId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, GeneNode.NODE_TYPE)); } //-----------CDS----------------- for (String cdsPositionsSt : cdsList) { cdsProperties.put(CDSNode.POSITIONS_PROPERTY, cdsPositionsSt); long cdsID = inserter.createNode(cdsProperties); inserter.createRelationship(genomeElementId, cdsID, genomeElementCDSRel, null); //indexing CDS node by its node_type nodeTypeIndex.add(cdsID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, CDSNode.NODE_TYPE)); } //-----------misc rna----------------- for (String miscRnaPositionsSt : miscRnaList) { miscRnaProperties.put(MiscRNANode.POSITIONS_PROPERTY, miscRnaPositionsSt); long miscRnaID = inserter.createNode(miscRnaProperties); inserter.createRelationship(genomeElementId, miscRnaID, genomeElementMiscRnaRel, null); //indexing MiscRNA node by its node_type nodeTypeIndex.add(miscRnaID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, MiscRNANode.NODE_TYPE)); } //-----------m rna----------------- for (String mRnaPositionsSt : mRnaList) { mRnaProperties.put(MRNANode.POSITIONS_PROPERTY, mRnaPositionsSt); long mRnaID = inserter.createNode(mRnaProperties); inserter.createRelationship(genomeElementId, mRnaID, genomeElementMRnaRel, null); //indexing MRNA node by its node_type nodeTypeIndex.add(mRnaID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, MRNANode.NODE_TYPE)); } //-----------nc rna----------------- for (String ncRnaPositionsSt : ncRnaList) { ncRnaProperties.put(NcRNANode.POSITIONS_PROPERTY, ncRnaPositionsSt); long ncRnaID = inserter.createNode(ncRnaProperties); inserter.createRelationship(genomeElementId, ncRnaID, genomeElementNcRnaRel, null); //indexing NCRNA node by its node_type nodeTypeIndex.add(ncRnaID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, NcRNANode.NODE_TYPE)); } //-----------r rna----------------- for (String rRnaPositionsSt : rRnaList) { rRnaProperties.put(RRNANode.POSITIONS_PROPERTY, rRnaPositionsSt); long rRnaID = inserter.createNode(rRnaProperties); inserter.createRelationship(genomeElementId, rRnaID, genomeElementRRnaRel, null); //indexing RRNA node by its node_type nodeTypeIndex.add(rRnaID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, RRNANode.NODE_TYPE)); } //-----------tm rna----------------- for (String tmRnaPositionsSt : tmRnaList) { tmRnaProperties.put(TmRNANode.POSITIONS_PROPERTY, tmRnaPositionsSt); long tmRnaID = inserter.createNode(tmRnaProperties); inserter.createRelationship(genomeElementId, tmRnaID, genomeElementTmRnaRel, null); //indexing TmRNA node by its node_type nodeTypeIndex.add(tmRnaID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, TmRNANode.NODE_TYPE)); } //-----------t rna----------------- for (String tRnaPositionsSt : tRnaList) { tRnaProperties.put(TRNANode.POSITIONS_PROPERTY, tRnaPositionsSt); long tRnaID = inserter.createNode(tRnaProperties); inserter.createRelationship(genomeElementId, tRnaID, genomeElementTRnaRel, null); //indexing TRNA node by its node_type nodeTypeIndex.add(tRnaID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, TRNANode.NODE_TYPE)); } logger.log(Level.INFO, (versionSt + " saved!")); genomeElementCounter++; } reader.close(); } } } catch (Exception e) { logger.log(Level.SEVERE, e.getMessage()); StackTraceElement[] trace = e.getStackTrace(); for (StackTraceElement stackTraceElement : trace) { logger.log(Level.SEVERE, stackTraceElement.toString()); } } finally { // shutdown, makes sure all changes are written to disk indexProvider.shutdown(); inserter.shutdown(); try { //-----------------writing stats file--------------------- long elapsedTime = System.nanoTime() - initTime; long elapsedSeconds = Math.round((elapsedTime / 1000000000.0)); long hours = elapsedSeconds / 3600; long minutes = (elapsedSeconds % 3600) / 60; long seconds = (elapsedSeconds % 3600) % 60; statsBuff.write("Statistics for program ImportRefSeq:\nInput folder: " + inFolder.getName() + "\nThere were " + genomeElementCounter + " genome elements stored.\n" + "The elapsed time was: " + hours + "h " + minutes + "m " + seconds + "s\n"); //---closing stats writer--- statsBuff.close(); } catch (Exception e) { e.printStackTrace(); } // closing logger file handler fh.close(); } } } private static long createGenomeElementNode(String version, String comment, String definition, BatchInserter inserter, BatchInserterIndex genomeElementVersionIndex, BatchInserterIndex nodeTypeIndex) { genomeElementProperties.put(GenomeElementNode.VERSION_PROPERTY, version); genomeElementProperties.put(GenomeElementNode.COMMENT_PROPERTY, comment); genomeElementProperties.put(GenomeElementNode.DEFINITION_PROPERTY, definition); long genomeElementId = inserter.createNode(genomeElementProperties); genomeElementVersionIndex.add(genomeElementId, MapUtil.map(GenomeElementNode.GENOME_ELEMENT_VERSION_INDEX, version)); nodeTypeIndex.add(genomeElementId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, GenomeElementNode.NODE_TYPE)); return genomeElementId; } }