/*
* Copyright (C) 2010-2013 "Bio4j"
*
* This file is part of Bio4j
*
* Bio4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package com.bio4j.neo4jdb.programs;
import com.bio4j.neo4jdb.model.relationships.go.PartOfGoRel;
import com.bio4j.neo4jdb.model.relationships.go.RegulatesGoRel;
import com.bio4j.neo4jdb.model.relationships.go.HasPartOfGoRel;
import com.bio4j.neo4jdb.model.relationships.go.NegativelyRegulatesGoRel;
import com.bio4j.neo4jdb.model.relationships.go.IsAGoRel;
import com.bio4j.neo4jdb.model.relationships.go.PositivelyRegulatesGoRel;
import com.bio4j.neo4jdb.model.nodes.GoTermNode;
import com.bio4j.neo4jdb.model.util.Bio4jManager;
import com.ohnosequences.util.Executable;
import com.ohnosequences.xml.api.model.XMLElement;
import java.io.*;
import java.util.*;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import org.jdom2.Element;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider;
import org.neo4j.unsafe.batchinsert.*;
/**
* Imports the Gene Ontology into Bio4j
* @author Pablo Pareja Tobes <ppareja@era7.com>
*/
public class ImportGeneOntology implements Executable {
public static final String TERM_TAG_NAME = "term";
public static final String ID_TAG_NAME = "id";
public static final String NAME_TAG_NAME = "name";
public static final String DEF_TAG_NAME = "def";
public static final String DEFSTR_TAG_NAME = "defstr";
public static final String IS_ROOT_TAG_NAME = "is_root";
public static final String IS_OBSOLETE_TAG_NAME = "is_obsolete";
public static final String COMMENT_TAG_NAME = "comment";
public static final String NAMESPACE_TAG_NAME = "namespace";
public static final String RELATIONSHIP_TAG_NAME = "relationship";
public static final String MOLECULAR_FUNCTION_GO_ID = "GO:0003674";
public static final String BIOLOGICAL_PROCESS_GO_ID = "GO:0008150";
public static final String CELLULAR_COMPONENT_GO_ID = "GO:0005575";
private static final Logger logger = Logger.getLogger("ImportGeneOntology");
private static FileHandler fh;
@Override
public void execute(ArrayList<String> array) {
String[] args = new String[array.size()];
for (int i = 0; i < array.size(); i++) {
args[i] = array.get(i);
}
main(args);
}
public static void main(String[] args) {
if (args.length != 3) {
System.out.println("This program expects the following parameters: \n"
+ "1. Gene ontology xml filename \n"
+ "2. Bio4j DB folder \n"
+ "3. Batch inserter .properties file");
} else {
long initTime = System.nanoTime();
File inFile = new File(args[0]);
BatchInserter inserter = null;
BatchInserterIndexProvider indexProvider = null;
BatchInserterIndex goTermIdIndex;
BatchInserterIndex isAGoRelIndex;
BatchInserterIndex nodeTypeIndex;
BufferedWriter statsBuff = null;
int termCounter = 0;
int limitForPrintingOut = 10000;
try {
// This block configures the logger with handler and formatter
fh = new FileHandler("ImportGeneOntology.log", true);
SimpleFormatter formatter = new SimpleFormatter();
fh.setFormatter(formatter);
logger.addHandler(fh);
logger.setLevel(Level.ALL);
//---creating writer for stats file-----
statsBuff = new BufferedWriter(new FileWriter(new File("ImportGeneOntologyStats.txt")));
// create the batch inserter
inserter = BatchInserters.inserter(args[1], MapUtil.load(new File(args[2])));
// create the batch index service
indexProvider = new LuceneBatchInserterIndexProvider( inserter );
Map<String,String> indexProps = MapUtil.stringMap( "provider", "lucene", "type", "exact" );
goTermIdIndex = indexProvider.nodeIndex( GoTermNode.GO_TERM_ID_INDEX, indexProps);
isAGoRelIndex = indexProvider.relationshipIndex(IsAGoRel.IS_A_REL_INDEX, indexProps);
nodeTypeIndex = indexProvider.nodeIndex( Bio4jManager.NODE_TYPE_INDEX_NAME, indexProps);
//------------------nodes properties maps-----------------------------------
Map<String, Object> goProperties = new HashMap<String, Object>();
goProperties.put(GoTermNode.NODE_TYPE_PROPERTY, GoTermNode.NODE_TYPE);
//--------------------------------------------------------------------------
//--------------------------------relationships------------------------------------------
IsAGoRel isAGoRel = new IsAGoRel(null);
RegulatesGoRel regulatesGoRel = new RegulatesGoRel(null);
NegativelyRegulatesGoRel negativelyRegulatesGoRel = new NegativelyRegulatesGoRel(null);
PositivelyRegulatesGoRel positivelyRegulatesGoRel = new PositivelyRegulatesGoRel(null);
PartOfGoRel partOfGoRel = new PartOfGoRel(null);
HasPartOfGoRel hasPartGoRel = new HasPartOfGoRel(null);
//--------------------------------------------------------------------------
Map<String, ArrayList<String>> termParentsMap = new HashMap<String, ArrayList<String>>();
Map<String, ArrayList<String>> regulatesMap = new HashMap<String, ArrayList<String>>();
Map<String, ArrayList<String>> negativelyRegulatesMap = new HashMap<String, ArrayList<String>>();
Map<String, ArrayList<String>> positivelyRegulatesMap = new HashMap<String, ArrayList<String>>();
Map<String, ArrayList<String>> partOfMap = new HashMap<String, ArrayList<String>>();
Map<String, ArrayList<String>> hasPartMap = new HashMap<String, ArrayList<String>>();
BufferedReader reader = new BufferedReader(new FileReader(inFile));
String line;
StringBuilder termStBuilder = new StringBuilder();
logger.log(Level.INFO, "inserting nodes....");
//-----first I create all the elements whitout their relationships-------------
while ((line = reader.readLine()) != null) {
if (line.trim().startsWith("<" + TERM_TAG_NAME)) {
while (!line.trim().startsWith("</" + TERM_TAG_NAME + ">")) {
termStBuilder.append(line);
line = reader.readLine();
}
//linea final del organism
termStBuilder.append(line);
//System.out.println("organismStBuilder.toString() = " + organismStBuilder.toString());
XMLElement termXMLElement = new XMLElement(termStBuilder.toString());
termStBuilder.delete(0, termStBuilder.length());
String goId = termXMLElement.asJDomElement().getChildText(ID_TAG_NAME);
String goName = termXMLElement.asJDomElement().getChildText(NAME_TAG_NAME);
if (goName == null) {
goName = "";
}
String goNamespace = termXMLElement.asJDomElement().getChildText(NAMESPACE_TAG_NAME);
if (goNamespace == null) {
goNamespace = "";
}
String goDefinition = "";
Element defElem = termXMLElement.asJDomElement().getChild(DEF_TAG_NAME);
if (defElem != null) {
Element defstrElem = defElem.getChild(DEFSTR_TAG_NAME);
if (defstrElem != null) {
goDefinition = defstrElem.getText();
}
}
String goComment = termXMLElement.asJDomElement().getChildText(COMMENT_TAG_NAME);
if(goComment == null){
goComment = "";
}
String goIsObsolete = termXMLElement.asJDomElement().getChildText(IS_OBSOLETE_TAG_NAME);
if(goIsObsolete == null){
goIsObsolete = "";
}else{
if(goIsObsolete.equals("1")){
goIsObsolete = "true";
}else{
goIsObsolete = "false";
}
}
List<Element> altIdElems = termXMLElement.asJDomElement().getChildren("alt_id");
String[] alternativeIds = new String[altIdElems.size()];
for (int i = 0; i < altIdElems.size(); i++) {
alternativeIds[i] = altIdElems.get(i).getText();
}
//----term parents----
List<Element> termParentTerms = termXMLElement.asJDomElement().getChildren(IsAGoRel.OBOXML_RELATIONSHIP_NAME);
ArrayList<String> array = new ArrayList<String>();
for (Element elem : termParentTerms) {
array.add(elem.getText().trim());
}
termParentsMap.put(goId, array);
//---------------------
//-------relationship tags-----------
List<Element> relationshipTags = termXMLElement.asJDomElement().getChildren(RELATIONSHIP_TAG_NAME);
for (Element relationshipTag : relationshipTags) {
String relType = relationshipTag.getChildText("type");
String toSt = relationshipTag.getChildText("to");
if(relType.equals(RegulatesGoRel.OBOXML_RELATIONSHIP_NAME)){
ArrayList<String> tempArray = regulatesMap.get(goId);
if(tempArray == null){
tempArray = new ArrayList<String>();
regulatesMap.put(goId, tempArray);
}
tempArray.add(toSt);
}else if(relType.equals(PositivelyRegulatesGoRel.OBOXML_RELATIONSHIP_NAME)){
ArrayList<String> tempArray = positivelyRegulatesMap.get(goId);
if(tempArray == null){
tempArray = new ArrayList<String>();
positivelyRegulatesMap.put(goId, tempArray);
}
tempArray.add(toSt);
}else if(relType.equals(NegativelyRegulatesGoRel.OBOXML_RELATIONSHIP_NAME)){
ArrayList<String> tempArray = negativelyRegulatesMap.get(goId);
if(tempArray == null){
tempArray = new ArrayList<String>();
negativelyRegulatesMap.put(goId, tempArray);
}
tempArray.add(toSt);
}else if(relType.equals(PartOfGoRel.OBOXML_RELATIONSHIP_NAME)){
ArrayList<String> tempArray = partOfMap.get(goId);
if(tempArray == null){
tempArray = new ArrayList<String>();
partOfMap.put(goId, tempArray);
}
tempArray.add(toSt);
}else if(relType.equals(HasPartOfGoRel.OBOXML_RELATIONSHIP_NAME)){
ArrayList<String> tempArray = hasPartMap.get(goId);
if(tempArray == null){
tempArray = new ArrayList<String>();
hasPartMap.put(goId, tempArray);
}
tempArray.add(toSt);
}
}
//-------------------------------------
goProperties.put(GoTermNode.ID_PROPERTY, goId);
goProperties.put(GoTermNode.NAME_PROPERTY, goName);
goProperties.put(GoTermNode.DEFINITION_PROPERTY, goDefinition);
goProperties.put(GoTermNode.NAMESPACE_PROPERTY, goNamespace);
goProperties.put(GoTermNode.ALTERNATIVE_IDS_PROPERTY, alternativeIds);
goProperties.put(GoTermNode.OBSOLETE_PROPERTY, goIsObsolete);
goProperties.put(GoTermNode.COMMENT_PROPERTY, goComment);
long currentGoTermId = inserter.createNode(goProperties);
//--------indexing term by id (and alternative ids)----------
goTermIdIndex.add(currentGoTermId, MapUtil.map(GoTermNode.GO_TERM_ID_INDEX,goId));
for (int i = 0; i < alternativeIds.length; i++) {
goTermIdIndex.add(currentGoTermId, MapUtil.map(GoTermNode.GO_TERM_ID_INDEX,alternativeIds[i]));
}
//--------indexing node by node_type index----------
nodeTypeIndex.add(currentGoTermId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME,GoTermNode.NODE_TYPE));
}
termCounter++;
if ((termCounter % limitForPrintingOut) == 0) {
logger.log(Level.INFO, (termCounter + " terms inserted!!"));
}
}
reader.close();
//flushing index
goTermIdIndex.flush();
//-----------------------------------------------------------------------
logger.log(Level.INFO, "Inserting relationships....");
logger.log(Level.INFO, "'is_a' relationships....");
//-------------------'is_a' relationships-----------------
Set<String> keys = termParentsMap.keySet();
for (String key : keys) {
long currentNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, key).getSingle();
ArrayList<String> tempArray = termParentsMap.get(key);
for (String string : tempArray) {
long tempNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, string).getSingle();
long isAGorelId = inserter.createRelationship(currentNodeId, tempNodeId, isAGoRel, null);
//System.out.println("key = " + key);
isAGoRelIndex.add(isAGorelId, MapUtil.map(IsAGoRel.IS_A_REL_INDEX,String.valueOf(currentNodeId)));
//System.out.println("indexing key = " + key);
}
}
logger.log(Level.INFO, "'regulates' relationships....");
//-------------------'regulates' relationships----------------------
keys = regulatesMap.keySet();
for (String key : keys) {
long currentNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, key).getSingle();
ArrayList<String> tempArray = regulatesMap.get(key);
for (String string : tempArray) {
long tempNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, string).getSingle();
inserter.createRelationship(currentNodeId, tempNodeId, regulatesGoRel, null);
}
}
logger.log(Level.INFO, "'negatively_regulates' relationships....");
//-------------------'regulates' relationships----------------------
keys = negativelyRegulatesMap.keySet();
for (String key : keys) {
long currentNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, key).getSingle();
ArrayList<String> tempArray = negativelyRegulatesMap.get(key);
for (String string : tempArray) {
long tempNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, string).getSingle();
inserter.createRelationship(currentNodeId, tempNodeId, negativelyRegulatesGoRel, null);
}
}
logger.log(Level.INFO, "'positively_regulates' relationships....");
//-------------------'regulates' relationships----------------------
keys = positivelyRegulatesMap.keySet();
for (String key : keys) {
long currentNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, key).getSingle();
ArrayList<String> tempArray = positivelyRegulatesMap.get(key);
for (String string : tempArray) {
long tempNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, string).getSingle();
inserter.createRelationship(currentNodeId, tempNodeId, positivelyRegulatesGoRel, null);
}
}
logger.log(Level.INFO, "'part_of' relationships....");
//-------------------'regulates' relationships----------------------
keys = partOfMap.keySet();
for (String key : keys) {
long currentNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, key).getSingle();
ArrayList<String> tempArray = partOfMap.get(key);
for (String string : tempArray) {
long tempNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, string).getSingle();
inserter.createRelationship(currentNodeId, tempNodeId, partOfGoRel, null);
}
}
logger.log(Level.INFO, "'has_part' relationships....");
//-------------------'regulates' relationships----------------------
keys = hasPartMap.keySet();
for (String key : keys) {
long currentNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, key).getSingle();
ArrayList<String> tempArray = hasPartMap.get(key);
for (String string : tempArray) {
long tempNodeId = goTermIdIndex.get(GoTermNode.GO_TERM_ID_INDEX, string).getSingle();
inserter.createRelationship(currentNodeId, tempNodeId, hasPartGoRel, null);
}
}
logger.log(Level.INFO, "Done! :)");
} catch (Exception e) {
logger.log(Level.SEVERE, e.getMessage());
StackTraceElement[] trace = e.getStackTrace();
for (StackTraceElement stackTraceElement : trace) {
logger.log(Level.SEVERE, stackTraceElement.toString());
}
} finally {
try {
//closing logger file handler
fh.close();
logger.log(Level.INFO, "Closing up inserter and index service....");
// shutdown, makes sure all changes are written to disk
indexProvider.shutdown();
inserter.shutdown();
//-----------------writing stats file---------------------
long elapsedTime = System.nanoTime() - initTime;
long elapsedSeconds = Math.round((elapsedTime / 1000000000.0));
long hours = elapsedSeconds/3600;
long minutes = (elapsedSeconds % 3600)/60;
long seconds = (elapsedSeconds % 3600)%60;
statsBuff.write("Statistics for program ImportGeneOntology:\nInput file: " + inFile.getName()
+ "\nThere were " + termCounter + " terms inserted.\n" +
"The elapsed time was: " + hours + "h " + minutes + "m " + seconds + "s\n");
//---closing stats writer---
statsBuff.close();
} catch (Exception e) {
logger.log(Level.SEVERE, e.getMessage());
StackTraceElement[] trace = e.getStackTrace();
for (StackTraceElement stackTraceElement : trace) {
logger.log(Level.SEVERE, stackTraceElement.toString());
}
}
}
}
}
}