/* * Copyright (C) 2010-2013 "Bio4j * * This file is part of Bio4j * * Bio4j is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package com.bio4j.neo4jdb.programs; import com.bio4j.neo4jdb.model.nodes.ncbi.NCBITaxonNode; import com.ohnosequences.util.Executable; import java.io.*; import java.util.ArrayList; import java.util.Map; import java.util.logging.FileHandler; import java.util.logging.Level; import java.util.logging.Logger; import java.util.logging.SimpleFormatter; import org.neo4j.helpers.collection.MapUtil; import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider; import org.neo4j.unsafe.batchinsert.*; /** * Indexes NCBI taxonomy elements by GI (gene identifiers) as specified in the * official mapping file * * @author Pablo Pareja Tobes <ppareja@era7.com> */ public class IndexNCBITaxonomyByGiId implements Executable { private static final Logger logger = Logger.getLogger("IndexNCBITaxonomyByGiId"); private static FileHandler fh; @Override public void execute(ArrayList<String> array) { String[] args = new String[array.size()]; for (int i = 0; i < array.size(); i++) { args[i] = array.get(i); } main(args); } public static void main(String[] args) { if (args.length != 3) { System.out.println("This program expects the following parameters: \n" + "1. Tax-id <--> Gi-id table file \n" + "2. Bio4j DB folder \n" + "3. Batch inserter .properties file name"); } else { long initTime = System.nanoTime(); BatchInserter inserter = null; BatchInserterIndexProvider indexProvider = null; BatchInserterIndex giIndex; BatchInserterIndex taxonIndex; //-------writer for storing incorrect gene identifiers-taxon id pairs---- BufferedWriter outBufferedWriter; BufferedWriter statsBuff = null; int lineCounter = 0; File inFile = new File(args[0]); try { // This block configure the logger with handler and formatter fh = new FileHandler("IndexNCBITaxonomyByGiId.log", true); SimpleFormatter formatter = new SimpleFormatter(); fh.setFormatter(formatter); logger.addHandler(fh); logger.setLevel(Level.ALL); outBufferedWriter = new BufferedWriter(new FileWriter(new File("incorrectGiTaxIdPairs.txt"))); //---creating writer for stats file----- statsBuff = new BufferedWriter(new FileWriter(new File("IndexNCBITaxonomyByGIIdStats.txt"))); // create the batch inserter inserter = BatchInserters.inserter(args[1], MapUtil.load(new File(args[2]))); // create the batch index service indexProvider = new LuceneBatchInserterIndexProvider(inserter); Map<String, String> indexProps = MapUtil.stringMap("provider", "lucene", "type", "exact"); giIndex = indexProvider.nodeIndex(NCBITaxonNode.NCBI_TAXON_GI_ID_INDEX, indexProps); taxonIndex = indexProvider.nodeIndex(NCBITaxonNode.NCBI_TAXON_ID_INDEX, indexProps); BufferedReader reader = new BufferedReader(new FileReader(inFile)); String line; while ((line = reader.readLine()) != null) { String[] columns = line.split("\t"); int giId = Integer.parseInt(columns[0]); int taxId = Integer.parseInt(columns[1]); Long nCBITaxonNodeId = taxonIndex.get(NCBITaxonNode.NCBI_TAXON_ID_INDEX, String.valueOf(taxId)).getSingle(); if (nCBITaxonNodeId != null) { giIndex.add(nCBITaxonNodeId, MapUtil.map(NCBITaxonNode.NCBI_TAXON_GI_ID_INDEX, giId)); } else { outBufferedWriter.write(giId + "\t" + taxId + "\n"); } lineCounter++; if (lineCounter % 100000 == 0) { logger.log(Level.INFO, (lineCounter + " lines parsed...")); outBufferedWriter.flush(); } } reader.close(); outBufferedWriter.close(); } catch (Exception e) { logger.log(Level.SEVERE, e.getMessage()); } finally { //closing logger file handler fh.close(); logger.log(Level.INFO, "Closing up inserter and index service...."); // shutdown, makes sure all changes are written to disk indexProvider.shutdown(); inserter.shutdown(); try { //-----------------writing stats file--------------------- long elapsedTime = System.nanoTime() - initTime; long elapsedSeconds = Math.round((elapsedTime / 1000000000.0)); long hours = elapsedSeconds / 3600; long minutes = (elapsedSeconds % 3600) / 60; long seconds = (elapsedSeconds % 3600) % 60; statsBuff.write("Statistics for program IndexNCBITaxonomyByGiId:\nInput file: " + inFile.getName() + "\nThere were " + lineCounter + " association pairs processed.\n" + "The elapsed time was: " + hours + "h " + minutes + "m " + seconds + "s\n"); //---closing stats writer--- statsBuff.close(); } catch (Exception e) { e.printStackTrace(); } } } } }