/*
* Copyright (C) 2010-2013 "Bio4j"
*
* This file is part of Bio4j
*
* Bio4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package com.bio4j.neo4jdb.programs;
import com.bio4j.neo4jdb.model.nodes.EnzymeNode;
import com.bio4j.neo4jdb.model.util.Bio4jManager;
import com.ohnosequences.util.Executable;
import java.io.*;
import java.util.*;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider;
import org.neo4j.unsafe.batchinsert.*;
/**
* Imports the Expasy Enzyme DB into Bio4j (everything except the Uniprot associations, which are
* imported from the Uniprot xml files).
* @author Pablo Pareja Tobes <ppareja@era7.com>
*/
public class ImportEnzymeDB implements Executable {

    // Two-character line codes that prefix every line of the Enzyme DB data file.
    public static final String IDENTIFICATION_LINE_CODE = "ID";
    public static final String OFFICIAL_NAME_LINE_CODE = "DE";
    public static final String ALTERNATE_NAME_LINE_CODE = "AN";
    public static final String CATALYTIC_ACTIVITY_LINE_CODE = "CA";
    public static final String COMMENTS_LINE_CODE = "CC";
    public static final String COFACTORS_LINE_CODE = "CF";
    public static final String PROSITE_CROSS_REFERENCES_LINE_CODE = "PR";
    public static final String SWISS_PROT_CROSS_REFERENCES_LINE_CODE = "DR";
    public static final String TERMINATION_LINE_CODE = "//";

    /** Character offset at which the content of a data line starts (after the line code). */
    private static final int LINE_CONTENT_OFFSET = 5;

    private static final Logger logger = Logger.getLogger("ImportEnzymeDB");
    private static FileHandler fh;

    /**
     * Runs the importer with the given parameters; simply delegates to {@link #main(String[])}.
     *
     * @param array program arguments: data file, Bio4j DB folder, batch inserter properties file
     */
    @Override
    public void execute(ArrayList<String> array) {
        main(array.toArray(new String[0]));
    }

    /**
     * Reads the Enzyme DB data file and creates one indexed enzyme node per entry.
     * Entries flagged as deleted or transferred are logged and skipped. A log file
     * ({@code ImportEnzymeDB.log}) and a stats file ({@code ImportEnzymeDBStats.txt})
     * are written to the working directory.
     *
     * @param args exactly three values: 1. Enzyme DB data file (.dat), 2. Bio4j DB folder,
     *             3. batch inserter .properties file; otherwise only the usage text is printed
     */
    public static void main(String[] args) {
        if (args.length != 3) {
            System.out.println("This program expects the following parameters: \n"
                    + "1. Enzyme DB data file (.dat) \n"
                    + "2. Bio4j DB folder \n"
                    + "3. Batch inserter .properties file");
            return;
        }

        long initTime = System.nanoTime();
        BatchInserter inserter = null;
        BatchInserterIndexProvider indexProvider = null;
        BufferedWriter statsBuff = null;
        File inFile = new File(args[0]);
        int enzymeCounter = 0;

        try {
            // This block configures the logger with handler and formatter
            fh = new FileHandler("ImportEnzymeDB.log", true);
            fh.setFormatter(new SimpleFormatter());
            logger.addHandler(fh);
            logger.setLevel(Level.ALL);

            //---creating writer for stats file-----
            statsBuff = new BufferedWriter(new FileWriter(new File("ImportEnzymeDBStats.txt")));

            // create the batch inserter
            inserter = BatchInserters.inserter(args[1], MapUtil.load(new File(args[2])));

            // create the batch index service
            indexProvider = new LuceneBatchInserterIndexProvider(inserter);
            Map<String, String> indexProps = MapUtil.stringMap("provider", "lucene", "type", "exact");
            BatchInserterIndex enzymeIdIndex = indexProvider.nodeIndex(EnzymeNode.ENZYME_ID_INDEX, indexProps);
            BatchInserterIndex nodeTypeIndex = indexProvider.nodeIndex(Bio4jManager.NODE_TYPE_INDEX_NAME, indexProps);

            // The same property map is reused for every entry; the node type never changes,
            // so it is set once here and the remaining keys are overwritten per entry.
            Map<String, Object> enzymeProperties = new HashMap<>();
            enzymeProperties.put(EnzymeNode.NODE_TYPE_PROPERTY, EnzymeNode.NODE_TYPE);

            // State of the entry currently being parsed; reset on every termination line.
            boolean enzymeFound = false;
            boolean deletedEntry = false;
            boolean transferredEntry = false;
            String officialName = "";
            String enzymeId = "";
            String commentsSt = "";
            String catalyticActivity = "";
            List<String> alternateNames = new ArrayList<>();
            List<String> cofactors = new ArrayList<>();
            List<String> prositeCrossRefs = new ArrayList<>();

            System.out.println("Reading file....");

            // try-with-resources so the reader is closed even if parsing or insertion fails
            try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
                String line;
                while ((line = reader.readLine()) != null) {

                    if (line.startsWith(IDENTIFICATION_LINE_CODE)) {
                        enzymeFound = true;
                        enzymeId = lineContent(line);
                        continue;
                    }
                    if (!enzymeFound) {
                        // lines outside of an entry are ignored
                        continue;
                    }

                    if (line.startsWith(OFFICIAL_NAME_LINE_CODE)) {
                        officialName += lineContent(line);
                        if (officialName.contains("Deleted entry.")) {
                            deletedEntry = true;
                        } else if (officialName.contains("Transferred entry:")) {
                            transferredEntry = true;
                        }
                    } else if (line.startsWith(ALTERNATE_NAME_LINE_CODE)) {
                        alternateNames.add(lineContent(line));
                    } else if (line.startsWith(COFACTORS_LINE_CODE)) {
                        cofactors.addAll(splitEntries(lineContent(line)));
                    } else if (line.startsWith(PROSITE_CROSS_REFERENCES_LINE_CODE)) {
                        for (String prositeSt : splitEntries(lineContent(line))) {
                            // the leading "PROSITE" token is a marker, not a reference
                            if (!prositeSt.equals("PROSITE")) {
                                prositeCrossRefs.add(prositeSt);
                            }
                        }
                    } else if (line.startsWith(COMMENTS_LINE_CODE)) {
                        commentsSt += lineContent(line) + " ";
                    } else if (line.startsWith(CATALYTIC_ACTIVITY_LINE_CODE)) {
                        catalyticActivity += lineContent(line) + " ";
                    } else if (line.startsWith(TERMINATION_LINE_CODE)) {

                        if (deletedEntry) {
                            logger.log(Level.INFO, ("Entry with id " + enzymeId + " was deleted. It won't be stored..."));
                        } else if (transferredEntry) {
                            logger.log(Level.INFO, ("Entry with id " + enzymeId + " was transferred. It won't be stored..."));
                        } else {
                            // Only regular entries are stored, as the log messages above promise.
                            enzymeProperties.put(EnzymeNode.ID_PROPERTY, enzymeId);
                            enzymeProperties.put(EnzymeNode.OFFICIAL_NAME_PROPERTY, officialName);
                            enzymeProperties.put(EnzymeNode.ALTERNATE_NAMES_PROPERTY, alternateNames.toArray(new String[0]));
                            enzymeProperties.put(EnzymeNode.COFACTORS_PROPERTY, cofactors.toArray(new String[0]));
                            enzymeProperties.put(EnzymeNode.PROSITE_CROSS_REFERENCES_PROPERTY, prositeCrossRefs.toArray(new String[0]));
                            enzymeProperties.put(EnzymeNode.CATALYTIC_ACTIVITY_PROPERTY, catalyticActivity);
                            enzymeProperties.put(EnzymeNode.COMMENTS_PROPERTY, commentsSt);

                            //creating node
                            long enzymeNodeId = inserter.createNode(enzymeProperties);
                            //indexing node
                            enzymeIdIndex.add(enzymeNodeId, MapUtil.map(EnzymeNode.ENZYME_ID_INDEX, enzymeId));
                            //--------indexing node by node_type index----------
                            nodeTypeIndex.add(enzymeNodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, EnzymeNode.NODE_TYPE));

                            enzymeCounter++;
                            if (enzymeCounter % 100 == 0) {
                                System.out.println(enzymeCounter + " enzymes inserted...");
                            }
                        }

                        // reset the per-entry state, including both skip flags
                        enzymeFound = false;
                        deletedEntry = false;
                        transferredEntry = false;
                        officialName = "";
                        enzymeId = "";
                        commentsSt = "";
                        catalyticActivity = "";
                        alternateNames.clear();
                        cofactors.clear();
                        prositeCrossRefs.clear();
                    }
                }
            }

        } catch (Exception e) {
            // Passing the throwable keeps the exception type and full stack trace in one record.
            logger.log(Level.SEVERE, "Enzyme DB import failed: " + e.getMessage(), e);
        } finally {
            // Every cleanup step is guarded on its own so that one failure (or a resource
            // that was never created) does not prevent the remaining steps from running.
            logger.log(Level.INFO, "Closing up inserter and index service....");

            // Same shutdown order as before: index provider first, then the inserter.
            if (indexProvider != null) {
                try {
                    indexProvider.shutdown();
                } catch (Exception e) {
                    logger.log(Level.SEVERE, "Error shutting down the index provider: " + e.getMessage(), e);
                }
            }
            if (inserter != null) {
                try {
                    inserter.shutdown();
                } catch (Exception e) {
                    logger.log(Level.SEVERE, "Error shutting down the batch inserter: " + e.getMessage(), e);
                }
            }

            //-----------------writing stats file---------------------
            if (statsBuff != null) {
                try {
                    writeStats(statsBuff, inFile, enzymeCounter, initTime);
                } catch (IOException e) {
                    logger.log(Level.SEVERE, "Error writing the stats file: " + e.getMessage(), e);
                }
            }

            // The log file handler is closed last so that all messages above reach the file,
            // and detached so repeated runs do not accumulate closed handlers on the logger.
            if (fh != null) {
                logger.removeHandler(fh);
                fh.close();
                fh = null;
            }
        }
    }

    /**
     * Returns the trimmed content of a data line, i.e. everything after the line code prefix.
     *
     * @param line a line of the data file
     * @return the trimmed content, or an empty string when the line is too short to have any
     */
    private static String lineContent(String line) {
        if (line.length() <= LINE_CONTENT_OFFSET) {
            return "";
        }
        return line.substring(LINE_CONTENT_OFFSET).trim();
    }

    /**
     * Splits a semicolon-separated line content into its trimmed, non-empty items.
     *
     * @param content line content as returned by {@link #lineContent(String)}
     * @return the items in their original order
     */
    private static List<String> splitEntries(String content) {
        List<String> entries = new ArrayList<>();
        for (String token : content.split(";")) {
            String trimmed = token.trim();
            if (!trimmed.isEmpty()) {
                entries.add(trimmed);
            }
        }
        return entries;
    }

    /**
     * Writes the run statistics and closes the writer, even if writing fails.
     *
     * @param statsBuff     writer for the stats file; closed by this method
     * @param inFile        input data file (only its name is reported)
     * @param enzymeCounter number of enzymes inserted
     * @param initTime      {@link System#nanoTime()} value taken when the import started
     * @throws IOException if writing or closing the stats file fails
     */
    private static void writeStats(BufferedWriter statsBuff, File inFile, int enzymeCounter, long initTime)
            throws IOException {
        try (BufferedWriter out = statsBuff) {
            long elapsedTime = System.nanoTime() - initTime;
            long elapsedSeconds = Math.round((elapsedTime / 1000000000.0));
            long hours = elapsedSeconds / 3600;
            long minutes = (elapsedSeconds % 3600) / 60;
            long seconds = elapsedSeconds % 60;

            out.write("Statistics for program ImportEnzymeDB:\nInput file: " + inFile.getName()
                    + "\nThere were " + enzymeCounter + " enzymes inserted.\n"
                    + "The elapsed time was: " + hours + "h " + minutes + "m " + seconds + "s\n");
        }
    }
}