/*
 * Copyright (C) 2010-2013 "Bio4j"
 *
 * This file is part of Bio4j
 *
 * Bio4j is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */
package com.bio4j.neo4jdb.programs;

import com.bio4j.neo4jdb.model.nodes.OrganismNode;
import com.bio4j.neo4jdb.model.nodes.ncbi.NCBITaxonNode;
import com.bio4j.neo4jdb.model.relationships.ncbi.NCBITaxonParentRel;
import com.bio4j.neo4jdb.model.relationships.ncbi.NCBITaxonRel;
import com.bio4j.neo4jdb.model.util.Bio4jManager;
import com.bio4j.neo4jdb.model.util.NodeRetriever;
import com.ohnosequences.util.Executable;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Transaction;

/**
 * Imports NCBI taxonomy into Bio4j
 *
 * <p>The import runs in four passes: taxon nodes are created from the nodes
 * dump, scientific names are attached from the names dump, parent
 * relationships are created, and finally the merged dump is used to index
 * current taxa under their old identifiers. Work is committed in batches of
 * {@link #TXN_LIMIT_FOR_COMMIT} operations.
 *
 * @author Pablo Pareja Tobes <ppareja@era7.com>
 */
public class ImportNCBITaxonomy implements Executable {

    private static final Logger logger = Logger.getLogger("ImportNCBITaxonomy");
    private static FileHandler fh;

    /** Number of operations performed before the running transaction is committed. */
    private static final int TXN_LIMIT_FOR_COMMIT = 10000;

    /**
     * Runs the import with the given parameters.
     *
     * @param array program parameters, in the order expected by {@link #main(String[])}
     */
    @Override
    public void execute(ArrayList<String> array) {
        main(array.toArray(new String[array.size()]));
    }

    /**
     * Program entry point.
     *
     * <p>Errors raised during the import are logged and the current
     * transaction is marked as failed; they are not rethrown. A stats file
     * ({@code ImportNCBITaxonomyStats.txt}) is written at the end whenever it
     * could be opened.
     *
     * @param args exactly five values: nodes DMP filename, names DMP filename,
     *             merged DMP filename, Bio4j DB folder, and a true/false flag
     *             telling whether Uniprot organisms must be linked to their
     *             NCBI taxon. With any other number of values only the usage
     *             message is printed.
     */
    public static void main(String[] args) {

        if (args.length != 5) {
            System.out.println("This program expects the following parameters: \n"
                    + "1. Nodes DMP filename \n"
                    + "2. Names DMP filename \n"
                    + "3. Merged DMP filename \n"
                    + "4. Bio4j DB folder \n"
                    + "5. Associate Uniprot taxonomy (true/false)");
            return;
        }

        long initTime = System.nanoTime();

        Bio4jManager manager = null;
        Transaction txn = null;
        BufferedReader reader = null;
        BufferedWriter statsBuff = null;

        int taxonCounter = 0;
        int txnCounter = 0;

        boolean associateUniprotTaxonomy = Boolean.parseBoolean(args[4]);

        File nodesDumpFile = new File(args[0]);
        File namesDumpFile = new File(args[1]);
        File mergedDumpFile = new File(args[2]);

        try {

            // This block configures the logger with handler and formatter
            fh = new FileHandler("ImportNCBITaxonomy.log", true);
            fh.setFormatter(new SimpleFormatter());
            logger.addHandler(fh);
            logger.setLevel(Level.ALL);

            //---creating writer for stats file-----
            statsBuff = new BufferedWriter(new FileWriter(new File("ImportNCBITaxonomyStats.txt")));

            reader = new BufferedReader(new FileReader(nodesDumpFile));
            String line;

            logger.log(Level.INFO, "creating manager...");
            manager = new Bio4jManager(args[3], true, false);
            NodeRetriever nodeRetriever = new NodeRetriever(manager);

            // tax id -> parent tax id, filled while reading the nodes file
            HashMap<String, String> nodeParentMap = new HashMap<String, String>();

            txn = manager.beginTransaction();

            //------------reading nodes file-----------------
            logger.log(Level.INFO, "reading nodes file...");

            while ((line = reader.readLine()) != null) {
                if (line.trim().length() > 0) {

                    // columns used: 0 = tax id, 1 = parent tax id, 2 = rank, 3 = EMBL code
                    String[] columns = line.split("\\|");

                    NCBITaxonNode node = new NCBITaxonNode(manager.createNode());
                    //setting node_type property
                    node.setNodeType(NCBITaxonNode.NODE_TYPE);

                    node.setTaxId(columns[0].trim());
                    node.setRank(columns[2].trim());
                    node.setEmblCode(columns[3].trim());

                    //indexing the node..
                    manager.getNCBITaxonIdIndex().add(node.getNode(), NCBITaxonNode.NCBI_TAXON_ID_INDEX, node.getTaxId());
                    //indexing the node by its node_type
                    manager.getNodeTypeIndex().add(node.getNode(), Bio4jManager.NODE_TYPE_INDEX_NAME, NCBITaxonNode.NODE_TYPE);

                    //saving the parent of the node for later
                    nodeParentMap.put(node.getTaxId(), columns[1].trim());

                    taxonCounter++;
                    txnCounter++;

                    if (txnCounter % TXN_LIMIT_FOR_COMMIT == 0) {
                        txn = commitAndRestart(manager, txn);
                    }
                }
            }

            //committing and 'restarting' transaction
            txn = commitAndRestart(manager, txn);
            txnCounter = 0;

            reader.close();
            logger.log(Level.INFO, "done!");

            //------------reading names file-----------------
            logger.log(Level.INFO, "reading names file...");
            reader = new BufferedReader(new FileReader(namesDumpFile));

            while ((line = reader.readLine()) != null) {

                String[] columns = line.split("\\|");

                // only rows whose last column is "scientific name" are used
                if (columns[columns.length - 1].trim().equals("scientific name")) {

                    String taxId = columns[0].trim();
                    String nameSt = columns[1].trim();

                    // NOTE(review): if the retriever can return null for an unknown
                    // tax id this line throws and aborts the import - confirm
                    NCBITaxonNode node = nodeRetriever.getNCBITaxonByTaxId(taxId);
                    node.setScientificName(nameSt);

                    txnCounter++;
                    if (txnCounter % TXN_LIMIT_FOR_COMMIT == 0) {
                        txn = commitAndRestart(manager, txn);
                    }
                }
            }

            reader.close();
            logger.log(Level.INFO, "done!");

            //------------storing parent relationships-----------------
            logger.log(Level.INFO, "storing relationships...");

            //committing and 'restarting' transaction
            txn = commitAndRestart(manager, txn);
            txnCounter = 0;

            Set<String> nodesSet = nodeParentMap.keySet();
            for (String nodeTaxId : nodesSet) {

                String parentTaxId = nodeParentMap.get(nodeTaxId);
                NCBITaxonNode currentNode = nodeRetriever.getNCBITaxonByTaxId(nodeTaxId);

                // a node listed as its own parent gets no parent relationship
                if (!nodeTaxId.equals(parentTaxId)) {
                    NCBITaxonNode parentNode = nodeRetriever.getNCBITaxonByTaxId(parentTaxId);
                    parentNode.getNode().createRelationshipTo(currentNode.getNode(), new NCBITaxonParentRel(null));
                }

                txnCounter++;
                if (txnCounter % TXN_LIMIT_FOR_COMMIT == 0) {
                    txn = commitAndRestart(manager, txn);
                }
            }

            txn = commitAndRestart(manager, txn);
            logger.log(Level.INFO, "Done!");

            if (associateUniprotTaxonomy) {
                logger.log(Level.INFO, "Associating uniprot taxonomy...");
                associateTaxonomy(manager, nodeRetriever, new NCBITaxonRel(null));
                logger.log(Level.INFO, "Done!");
            }

            //------------reading merged file-----------------
            logger.log(Level.INFO, "reading merged file...");
            reader = new BufferedReader(new FileReader(mergedDumpFile));

            while ((line = reader.readLine()) != null) {

                // skip blank lines, as the nodes pass does, instead of failing on columns[1]
                if (line.trim().length() == 0) {
                    continue;
                }

                String[] columns = line.split("\\|");

                String oldId = columns[0].trim();
                String goodId = columns[1].trim();

                NCBITaxonNode goodNode = nodeRetriever.getNCBITaxonByTaxId(goodId);
                //indexing the current node under its old id as well
                manager.getNCBITaxonIdIndex().add(goodNode.getNode(), NCBITaxonNode.NCBI_TAXON_ID_INDEX, oldId);

                txnCounter++;
                if (txnCounter % TXN_LIMIT_FOR_COMMIT == 0) {
                    txn = commitAndRestart(manager, txn);
                }
            }

            reader.close();
            logger.log(Level.INFO, "done!");

            txn.success();

        } catch (Exception ex) {
            // reported through the class logger so the error also reaches the log file
            logger.log(Level.SEVERE, "Error importing NCBI taxonomy", ex);
            // txn is still null when the failure happened before the first transaction
            if (txn != null) {
                txn.failure();
            }
        } finally {
            try {
                //finishing the last transaction (if one was ever started)
                if (txn != null) {
                    txn.finish();
                }
            } finally {
                // the reader is still open when a read loop was interrupted by an error
                closeAndLog(reader);

                logger.log(Level.INFO, "Closing up inserter and index service....");
                // shutdown, makes sure all changes are written to disk
                if (manager != null) {
                    manager.shutDown();
                }

                writeStats(statsBuff, nodesDumpFile, taxonCounter, initTime);

                //closing logger file handler last so that the messages above reach the file
                if (fh != null) {
                    logger.removeHandler(fh);
                    fh.close();
                }
            }
        }
    }

    /**
     * Marks the given transaction as successful, finishes it and begins a new one.
     *
     * @param manager manager used to begin the new transaction
     * @param txn transaction to be committed
     * @return the newly started transaction
     */
    private static Transaction commitAndRestart(Bio4jManager manager, Transaction txn) {
        txn.success();
        txn.finish();
        return manager.beginTransaction();
    }

    /**
     * Writes the summary of the run to the stats writer and closes it.
     * Does nothing when the writer could not be created.
     *
     * @param statsBuff writer for the stats file, may be null
     * @param nodesDumpFile nodes dump file, reported by name
     * @param taxonCounter number of taxon nodes created
     * @param initTime value of {@link System#nanoTime()} taken when the program started
     */
    private static void writeStats(BufferedWriter statsBuff, File nodesDumpFile, int taxonCounter, long initTime) {
        if (statsBuff == null) {
            return;
        }
        try {
            long elapsedTime = System.nanoTime() - initTime;
            long elapsedSeconds = Math.round((elapsedTime / 1000000000.0));
            long hours = elapsedSeconds / 3600;
            long minutes = (elapsedSeconds % 3600) / 60;
            long seconds = (elapsedSeconds % 3600) % 60;

            statsBuff.write("Statistics for program ImportNCBITaxonomy:\nInput file: " + nodesDumpFile.getName()
                    + "\nThere were " + taxonCounter + " taxonomic units inserted.\n"
                    + "The elapsed time was: " + hours + "h " + minutes + "m " + seconds + "s\n");
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Error writing stats file", e);
        } finally {
            //---closing stats writer---
            closeAndLog(statsBuff);
        }
    }

    /**
     * Closes the given resource, logging (not propagating) any I/O error.
     *
     * @param resource resource to close, may be null
     */
    private static void closeAndLog(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.log(Level.WARNING, "Error closing resource", e);
        }
    }

    /**
     * Creates a relationship from every node found in the node type index
     * under the organism node type to the NCBI taxon node retrieved for its
     * NCBI taxonomy id.
     *
     * <p>No transaction handling is done here; the caller is expected to have
     * one open.
     *
     * @param manager manager giving access to the node type index
     * @param nodeRetriever retriever used to look up NCBI taxon nodes by tax id
     * @param nCBITaxonRel relationship type used for every created relationship
     */
    private static void associateTaxonomy(Bio4jManager manager,
            NodeRetriever nodeRetriever,
            NCBITaxonRel nCBITaxonRel) {

        Iterator<Node> organismIterator = manager.getNodeTypeIndex().get(Bio4jManager.NODE_TYPE_INDEX_NAME, OrganismNode.NODE_TYPE).iterator();

        while (organismIterator.hasNext()) {
            OrganismNode organismNode = new OrganismNode(organismIterator.next());
            // NOTE(review): this runs before the merged file is indexed, so organisms
            // carrying an old (merged) tax id may not be found; if the retriever returns
            // null in that case this line throws - confirm against NodeRetriever
            Node ncbiNode = nodeRetriever.getNCBITaxonByTaxId(organismNode.getNcbiTaxonomyId()).getNode();
            organismNode.getNode().createRelationshipTo(ncbiNode, nCBITaxonRel);
        }
    }
}