/*
* Copyright (C) 2010-2013 "Bio4j"
*
* This file is part of Bio4j
*
* Bio4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package com.bio4j.neo4jdb.programs;
import com.bio4j.neo4jdb.model.nodes.IsoformNode;
import com.bio4j.neo4jdb.model.nodes.ProteinNode;
import com.bio4j.neo4jdb.model.relationships.protein.ProteinIsoformInteractionRel;
import com.bio4j.neo4jdb.model.relationships.protein.ProteinProteinInteractionRel;
import com.bio4j.neo4jdb.model.util.UniprotStuff;
import com.ohnosequences.util.Executable;
import com.ohnosequences.xml.api.model.XMLElement;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import org.jdom2.Element;
import org.neo4j.graphdb.index.IndexHits;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider;
import org.neo4j.unsafe.batchinsert.*;
/**
 * Batch program that reads a Uniprot XML file entry by entry and, for every
 * interaction comment found, creates one of the following relationships with
 * the Neo4j batch inserter:
 * <ul>
 * <li>protein --> protein, when the second interactant id matches exactly one
 * node in the protein accession index</li>
 * <li>protein --> isoform, when it does not, but matches exactly one node in
 * the isoform id index</li>
 * </ul>
 * Interactions whose second interactant cannot be resolved either way are
 * skipped. Isoform <--> isoform interactions are not created by this class.
 * <p>
 * A log file and a statistics file, both named after the input file, are
 * written to the current working directory.
 *
 * @author Pablo Pareja Tobes <ppareja@era7.com>
 */
public class ImportProteinInteractions implements Executable {

    private static final Logger logger = Logger.getLogger("ImportProteinInteractions");
    //--------indexing API constants-----
    private static final String PROVIDER_ST = "provider";
    private static final String EXACT_ST = "exact";
    //private static String FULL_TEXT_ST = "fulltext";
    private static final String LUCENE_ST = "lucene";
    private static final String TYPE_ST = "type";
    //-----------------------------------
    /** Sentinel returned by {@link #findSingleNodeId} when the lookup does not yield exactly one node. */
    private static final long NODE_NOT_FOUND = -1;
    /** A progress message is logged every time this many proteins have been processed. */
    private static final int LIMIT_FOR_PRINTING_OUT = 10000;

    /**
     * Runs the import with the given parameters; see {@link #main(String[])}
     * for their meaning.
     *
     * @param array program parameters, in the same order {@code main} expects them
     */
    @Override
    public void execute(ArrayList<String> array) {
        String[] args = array.toArray(new String[array.size()]);
        try {
            main(args);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "I/O error running ImportProteinInteractions", e);
        }
    }

    /**
     * Imports the protein interactions found in a Uniprot XML file.
     * <p>
     * Entries whose accession does not resolve to exactly one protein node are
     * logged and skipped (and not counted) instead of aborting the import. Any
     * other exception raised while importing is logged and ends the import;
     * the index provider, the batch inserter, the input reader, the stats file
     * and the log file handler are released in every case.
     *
     * @param args exactly three values: the Uniprot XML file name, the Bio4j DB
     * folder and the batch inserter .properties file. With any other number of
     * values only a usage message is printed.
     * @throws IOException kept for compatibility with existing callers
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 3) {
            System.out.println("This program expects the following parameters: \n"
                    + "1. Uniprot xml filename \n"
                    + "2. Bio4j DB folder\n"
                    + "3. Batch inserter .properties file");
            return;
        }

        long initTime = System.nanoTime();
        File inFile = new File(args[0]);
        // Both output files are named after the input file name only, so that an
        // input path containing directories does not leak into their names.
        String inputBaseName = baseName(inFile.getName());

        BatchInserter inserter = null;
        BatchInserterIndexProvider indexProvider = null;
        BufferedWriter statsBuff = null;
        BufferedReader reader = null;
        FileHandler fh = null;
        String accessionSt = "";
        int proteinCounter = 0;

        try {
            // This block configures the logger with handler and formatter
            fh = new FileHandler("ImportProteinInteractions" + inputBaseName + ".log", false);
            fh.setFormatter(new SimpleFormatter());
            logger.addHandler(fh);
            logger.setLevel(Level.ALL);

            //---creating writer for stats file-----
            statsBuff = new BufferedWriter(new FileWriter(new File("ImportProteinInteractionsStats_" + inputBaseName + ".txt")));

            // create the batch inserter
            inserter = BatchInserters.inserter(args[1], MapUtil.load(new File(args[2])));
            // create the batch index service
            indexProvider = new LuceneBatchInserterIndexProvider(inserter);

            //------------------indexes creation----------------------------------
            BatchInserterIndex proteinAccessionIndex = indexProvider.nodeIndex(ProteinNode.PROTEIN_ACCESSION_INDEX,
                    MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
            BatchInserterIndex isoformIdIndex = indexProvider.nodeIndex(IsoformNode.ISOFORM_ID_INDEX,
                    MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
            //--------------------------------------------------------------------

            InteractionWriter interactionWriter = new InteractionWriter(inserter, proteinAccessionIndex, isoformIdIndex);

            reader = new BufferedReader(new FileReader(inFile));
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().startsWith("<" + UniprotStuff.ENTRY_TAG_NAME)) {
                    continue;
                }

                String entryXml = readEntry(reader, line);
                if (entryXml == null) {
                    // End of file reached before the closing entry tag
                    logger.log(Level.WARNING, "Input file ended inside an unterminated entry, the incomplete entry was skipped");
                    break;
                }

                Element entryElem = new XMLElement(entryXml).asJDomElement();
                accessionSt = entryElem.getChildText(UniprotStuff.ENTRY_ACCESSION_TAG_NAME);

                long currentProteinId = findSingleNodeId(proteinAccessionIndex, ProteinNode.PROTEIN_ACCESSION_INDEX, accessionSt);
                if (currentProteinId < 0) {
                    logger.log(Level.WARNING, "No single protein node found for accession " + accessionSt + ", entry skipped");
                    continue;
                }

                List<Element> comments = entryElem.getChildren(UniprotStuff.COMMENT_TAG_NAME);
                for (Element commentElem : comments) {
                    String commentTypeSt = commentElem.getAttributeValue(UniprotStuff.COMMENT_TYPE_ATTRIBUTE);
                    //----------interaction----------------
                    // Constant first: a comment without type attribute yields null here
                    if (ProteinProteinInteractionRel.UNIPROT_ATTRIBUTE_TYPE_VALUE.equals(commentTypeSt)) {
                        interactionWriter.write(currentProteinId, commentElem);
                    }
                }

                proteinCounter++;
                if ((proteinCounter % LIMIT_FOR_PRINTING_OUT) == 0) {
                    logger.log(Level.INFO, (proteinCounter + " proteins updated with interactions!!"));
                }
            }
        } catch (Exception e) {
            // Passing the exception itself keeps its class, stack trace and causes in the log
            logger.log(Level.SEVERE, ("Exception retrieving protein " + accessionSt), e);
        } finally {
            closeQuietly(reader, "input file reader");

            // shutdown, makes sure all changes are written to disk.
            // Every step is guarded on its own so that one failure (or a resource
            // that was never created) does not prevent the remaining cleanup.
            if (indexProvider != null) {
                try {
                    indexProvider.shutdown();
                } catch (Exception e) {
                    logger.log(Level.SEVERE, "Exception shutting down the batch index provider", e);
                }
            }
            if (inserter != null) {
                try {
                    inserter.shutdown();
                } catch (Exception e) {
                    logger.log(Level.SEVERE, "Exception shutting down the batch inserter", e);
                }
            }

            //-----------------writing stats file---------------------
            if (statsBuff != null) {
                try {
                    long elapsedTime = System.nanoTime() - initTime;
                    long elapsedSeconds = Math.round((elapsedTime / 1000000000.0));
                    long hours = elapsedSeconds / 3600;
                    long minutes = (elapsedSeconds % 3600) / 60;
                    long seconds = (elapsedSeconds % 3600) % 60;
                    statsBuff.write("Statistics for program ImportProteinInteractions:\nInput file: " + inFile.getName()
                            + "\nThere were " + proteinCounter + " proteins analyzed.\n"
                            + "The elapsed time was: " + hours + "h " + minutes + "m " + seconds + "s\n");
                } catch (Exception e) {
                    logger.log(Level.SEVERE, "Exception writing the stats file", e);
                } finally {
                    //---closing stats writer---
                    closeQuietly(statsBuff, "stats file writer");
                }
            }

            // closing logger file handler last, so every message above reaches the log file;
            // it is also detached so repeated runs do not accumulate handlers on the logger
            if (fh != null) {
                logger.removeHandler(fh);
                fh.close();
            }
        }
    }

    /**
     * Returns the part of a file name that precedes its first dot, or the whole
     * name when it contains no dot.
     */
    private static String baseName(String fileName) {
        int dotIndex = fileName.indexOf('.');
        return dotIndex < 0 ? fileName : fileName.substring(0, dotIndex);
    }

    /**
     * Collects the lines of one entry, from its opening line up to and including
     * the line that starts with the closing entry tag. Lines are concatenated
     * without any separator.
     *
     * @param reader reader positioned right after the opening line of the entry
     * @param firstLine opening line of the entry, already read by the caller
     * @return the concatenated entry text, or {@code null} if the input ended
     * before the closing entry tag was found
     * @throws IOException if reading from {@code reader} fails
     */
    private static String readEntry(BufferedReader reader, String firstLine) throws IOException {
        String closingTag = "</" + UniprotStuff.ENTRY_TAG_NAME + ">";
        StringBuilder entryStBuilder = new StringBuilder();
        String line = firstLine;
        while (line != null && !line.trim().startsWith(closingTag)) {
            entryStBuilder.append(line);
            line = reader.readLine();
        }
        if (line == null) {
            return null;
        }
        // closing line of the entry
        entryStBuilder.append(line);
        return entryStBuilder.toString();
    }

    /**
     * Looks a value up in a batch index and returns the matching node id only
     * when there is exactly one hit. The hits are always closed.
     *
     * @return the node id, or {@link #NODE_NOT_FOUND} when {@code value} is
     * {@code null} or the lookup yields zero or several hits
     */
    private static long findSingleNodeId(BatchInserterIndex index, String key, String value) {
        if (value == null) {
            return NODE_NOT_FOUND;
        }
        IndexHits<Long> hits = index.get(key, value);
        try {
            if (hits.hasNext() && hits.size() == 1) {
                Long nodeId = hits.next();
                if (nodeId != null) {
                    return nodeId.longValue();
                }
            }
            return NODE_NOT_FOUND;
        } finally {
            hits.close();
        }
    }

    /** Closes a resource if it was opened, logging (not propagating) any failure. */
    private static void closeQuietly(Closeable resource, String description) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.log(Level.WARNING, "Exception closing the " + description, e);
        }
    }

    /**
     * Creates the interaction relationships for the comments of one protein,
     * holding the batch inserter, the two lookup indexes and the reusable
     * relationship types and property maps.
     */
    private static final class InteractionWriter {

        private final BatchInserter inserter;
        private final BatchInserterIndex proteinAccessionIndex;
        private final BatchInserterIndex isoformIdIndex;
        //--------------------------------relationships------------------------------------------
        private final ProteinProteinInteractionRel proteinProteinInteractionRel = new ProteinProteinInteractionRel(null);
        private final ProteinIsoformInteractionRel proteinIsoformInteractionRel = new ProteinIsoformInteractionRel(null);
        //-------------------relationships properties maps--------------------------
        // Reused across calls: every key is overwritten before each relationship is created
        private final Map<String, Object> proteinProteinInteractionProperties = new HashMap<String, Object>();
        private final Map<String, Object> proteinIsoformInteractionProperties = new HashMap<String, Object>();

        InteractionWriter(BatchInserter inserter, BatchInserterIndex proteinAccessionIndex, BatchInserterIndex isoformIdIndex) {
            this.inserter = inserter;
            this.proteinAccessionIndex = proteinAccessionIndex;
            this.isoformIdIndex = isoformIdIndex;
        }

        /**
         * Creates the relationship described by one interaction comment, from
         * the given protein to the second interactant of the comment. Nothing
         * is created when the comment has fewer than two interactants, when the
         * second one has no id, or when that id resolves to neither a single
         * protein nor a single isoform.
         *
         * @param currentProteinId node id of the protein the comment belongs to
         * @param commentElem the interaction comment element
         */
        void write(long currentProteinId, Element commentElem) {
            List<Element> interactants = commentElem.getChildren("interactant");
            if (interactants.size() < 2) {
                logger.log(Level.WARNING, "Interaction comment with less than two interactants skipped");
                return;
            }
            Element interactant1 = interactants.get(0);
            Element interactant2 = interactants.get(1);

            //----now we try to retrieve the interactant 2 accession--
            String interactant2AccessionSt = interactant2.getChildText("id");
            if (interactant2AccessionSt == null) {
                return;
            }

            // Missing attributes and elements are stored as empty strings
            String intactId1St = textOrEmpty(interactant1.getAttributeValue("intactId"));
            String intactId2St = textOrEmpty(interactant2.getAttributeValue("intactId"));
            Element organismsDiffer = commentElem.getChild("organismsDiffer");
            Element experiments = commentElem.getChild("experiments");
            String organismsDifferSt = (organismsDiffer != null) ? organismsDiffer.getText() : "";
            String experimentsSt = (experiments != null) ? experiments.getText() : "";

            long protein2Id = findSingleNodeId(proteinAccessionIndex, ProteinNode.PROTEIN_ACCESSION_INDEX, interactant2AccessionSt);
            if (protein2Id >= 0) {
                proteinProteinInteractionProperties.put(ProteinProteinInteractionRel.EXPERIMENTS_PROPERTY, experimentsSt);
                proteinProteinInteractionProperties.put(ProteinProteinInteractionRel.ORGANISMS_DIFFER_PROPERTY, organismsDifferSt);
                proteinProteinInteractionProperties.put(ProteinProteinInteractionRel.INTACT_ID_1_PROPERTY, intactId1St);
                proteinProteinInteractionProperties.put(ProteinProteinInteractionRel.INTACT_ID_2_PROPERTY, intactId2St);
                inserter.createRelationship(currentProteinId, protein2Id, proteinProteinInteractionRel, proteinProteinInteractionProperties);
                return;
            }

            //Since we did not find the protein we try to find an isoform instead
            long isoformId = findSingleNodeId(isoformIdIndex, IsoformNode.ISOFORM_ID_INDEX, interactant2AccessionSt);
            if (isoformId >= 0) {
                proteinIsoformInteractionProperties.put(ProteinIsoformInteractionRel.EXPERIMENTS_PROPERTY, experimentsSt);
                proteinIsoformInteractionProperties.put(ProteinIsoformInteractionRel.ORGANISMS_DIFFER_PROPERTY, organismsDifferSt);
                proteinIsoformInteractionProperties.put(ProteinIsoformInteractionRel.INTACT_ID_1_PROPERTY, intactId1St);
                proteinIsoformInteractionProperties.put(ProteinIsoformInteractionRel.INTACT_ID_2_PROPERTY, intactId2St);
                inserter.createRelationship(currentProteinId, isoformId, proteinIsoformInteractionRel, proteinIsoformInteractionProperties);
            }
        }

        /** Returns the given text, or the empty string when it is {@code null}. */
        private static String textOrEmpty(String text) {
            return (text != null) ? text : "";
        }
    }
}