/*
* Copyright (C) 2010-2013 "Bio4j
*
* This file is part of Bio4j
*
* Bio4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package com.bio4j.neo4jdb.programs;
import com.bio4j.neo4jdb.model.nodes.IsoformNode;
import com.bio4j.neo4jdb.model.nodes.ProteinNode;
import com.bio4j.neo4jdb.model.relationships.uniref.UniRef100MemberRel;
import com.bio4j.neo4jdb.model.relationships.uniref.UniRef50MemberRel;
import com.bio4j.neo4jdb.model.relationships.uniref.UniRef90MemberRel;
import com.bio4j.neo4jdb.model.util.UniprotStuff;
import com.bio4j.neo4jdb.BasicRelationship;
import com.ohnosequences.util.Executable;
import com.ohnosequences.xml.api.model.XMLElement;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;
import org.jdom2.Element;
import org.neo4j.graphdb.index.IndexHits;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.index.lucene.unsafe.batchinsert.LuceneBatchInserterIndexProvider;
import org.neo4j.unsafe.batchinsert.*;
/**
* Imports uniref(100,90,50) clusters info into Bio4j
*
* @author Pablo Pareja Tobes <ppareja@era7.com>
*/
public class ImportUniref implements Executable {
private static final Logger logger = Logger.getLogger("ImportUniref");
private static FileHandler fh;
//--------indexing API constans-----
private static String PROVIDER_ST = "provider";
private static String EXACT_ST = "exact";
//private static String FULL_TEXT_ST = "fulltext";
private static String LUCENE_ST = "lucene";
private static String TYPE_ST = "type";
//-----------------------------------
@Override
public void execute(ArrayList<String> array) {
String[] args = new String[array.size()];
for (int i = 0; i < array.size(); i++) {
args[i] = array.get(i);
}
main(args);
}
public static void main(String[] args) {
if (args.length != 5) {
System.out.println("This program expects the following parameters: \n"
+ "1. Uniref 100 xml filename \n"
+ "2. Uniref 90 xml filename \n"
+ "3. Uniref 50 xml filename \n"
+ "4. Bio4j DB folder \n"
+ "5. batch inserter .properties file");
} else {
long initTime = System.nanoTime();
File uniref100File = new File(args[0]);
File uniref90File = new File(args[1]);
File uniref50File = new File(args[2]);
UniRef100MemberRel uniRef100MemberRel = new UniRef100MemberRel(null);
UniRef50MemberRel uniRef50MemberRel = new UniRef50MemberRel(null);
UniRef90MemberRel uniRef90MemberRel = new UniRef90MemberRel(null);
BatchInserter inserter = null;
BatchInserterIndexProvider indexProvider = null;
BufferedWriter statsBuff = null;
int uniref100EntryCounter =0, uniref90EntryCounter = 0, uniref50EntryCounter = 0;
try {
// This block configure the logger with handler and formatter
fh = new FileHandler("ImportUniref.log", true);
SimpleFormatter formatter = new SimpleFormatter();
fh.setFormatter(formatter);
logger.addHandler(fh);
logger.setLevel(Level.ALL);
//---creating writer for stats file-----
statsBuff = new BufferedWriter(new FileWriter(new File("ImportUnirefStats.txt")));
// create the batch inserter
inserter = BatchInserters.inserter(args[3], MapUtil.load(new File(args[4])));
// create the batch index service
indexProvider = new LuceneBatchInserterIndexProvider(inserter);
//------------------indexes creation----------------------------------
BatchInserterIndex proteinAccessionIndex = indexProvider.nodeIndex(ProteinNode.PROTEIN_ACCESSION_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
BatchInserterIndex isoformIdIndex = indexProvider.nodeIndex(IsoformNode.ISOFORM_ID_INDEX,
MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
//--------------------------------------------------------------------
//------------------- UNIREF 100----------------------------
System.out.println("Reading Uniref 100 file...");
uniref100EntryCounter = importUnirefFile(inserter, proteinAccessionIndex, isoformIdIndex, uniref100File, uniRef100MemberRel);
System.out.println("Done! :)");
System.out.println("Reading Uniref 90 file...");
uniref90EntryCounter = importUnirefFile(inserter, proteinAccessionIndex, isoformIdIndex, uniref90File, uniRef90MemberRel);
System.out.println("Done! :)");
System.out.println("Reading Uniref 50 file...");
uniref50EntryCounter = importUnirefFile(inserter, proteinAccessionIndex, isoformIdIndex, uniref50File, uniRef50MemberRel);
System.out.println("Done! :)");
} catch (Exception ex) {
logger.log(Level.SEVERE, ex.getMessage());
StackTraceElement[] trace = ex.getStackTrace();
for (StackTraceElement stackTraceElement : trace) {
logger.log(Level.SEVERE, stackTraceElement.toString());
}
} finally {
try {
//closing logger file handler
fh.close();
//closing neo4j managers
indexProvider.shutdown();
inserter.shutdown();
//-----------------writing stats file---------------------
long elapsedTime = System.nanoTime() - initTime;
long elapsedSeconds = Math.round((elapsedTime / 1000000000.0));
long hours = elapsedSeconds / 3600;
long minutes = (elapsedSeconds % 3600) / 60;
long seconds = (elapsedSeconds % 3600) % 60;
statsBuff.write("Statistics for program ImportUniref:\nInput files: " +
"\nUniref 100 file: " + uniref100File.getName() +
"\nUniref 90 file: " + uniref90File.getName() +
"\nUniref 50 file: " + uniref50File.getName()
+ "\nThe following number of entries was parsed:\n"
+ "Uniref 100 --> " + uniref100EntryCounter + " entries\n"
+ "Uniref 90 --> " + uniref90EntryCounter + " entries\n"
+ "Uniref 50 --> " + uniref50EntryCounter + " entries\n"
+ "The elapsed time was: " + hours + "h " + minutes + "m " + seconds + "s\n");
//---closing stats writer---
statsBuff.close();
} catch (Exception e) {
logger.log(Level.SEVERE, e.getMessage());
StackTraceElement[] trace = e.getStackTrace();
for (StackTraceElement stackTraceElement : trace) {
logger.log(Level.SEVERE, stackTraceElement.toString());
}
}
}
System.out.println("Program finished!! :D");
}
}
private static String getRepresentantAccession(Element elem) {
String result = null;
Element dbReference = elem.getChild("dbReference");
List<Element> properties = dbReference.getChildren("property");
for (Element prop : properties) {
if (prop.getAttributeValue("type").equals("UniProtKB accession")) {
result = prop.getAttributeValue("value");
}
}
return result;
}
private static int importUnirefFile(BatchInserter inserter,
BatchInserterIndex proteinAccessionIndex,
BatchInserterIndex isoformIdIndex,
File unirefFile,
BasicRelationship relationship) throws Exception {
StringBuilder entryStBuilder = new StringBuilder();
BufferedReader reader = new BufferedReader(new FileReader(unirefFile));
String line;
int entryCounter = 0;
int limitForPrintingOut = 10000;
while ((line = reader.readLine()) != null) {
//----we reached a entry line-----
if (line.trim().startsWith("<" + UniprotStuff.ENTRY_TAG_NAME)) {
while (!line.trim().startsWith("</" + UniprotStuff.ENTRY_TAG_NAME + ">")) {
entryStBuilder.append(line);
line = reader.readLine();
}
//organism last line
entryStBuilder.append(line);
XMLElement entryXMLElem = new XMLElement(entryStBuilder.toString());
entryStBuilder.delete(0, entryStBuilder.length());
ArrayList<String> membersAccessionList = new ArrayList<String>();
Element representativeMember = entryXMLElem.asJDomElement().getChild("representativeMember");
String representantAccession = getRepresentantAccession(representativeMember);
List<Element> members = entryXMLElem.asJDomElement().getChildren("member");
for (Element member : members) {
Element memberDbReference = member.getChild("dbReference");
List<Element> memberProperties = memberDbReference.getChildren("property");
for (Element prop : memberProperties) {
if (prop.getAttributeValue("type").equals("UniProtKB accession")) {
String memberAccession = prop.getAttributeValue("value");
membersAccessionList.add(memberAccession);
}
}
}
if (representantAccession != null) {
long representantId = -1;
//---The representant is an isoform----
if (representantAccession.contains("-")) {
IndexHits<Long> repIndexHits = isoformIdIndex.get(IsoformNode.ISOFORM_ID_INDEX, representantAccession);
if (repIndexHits.size() == 1) {
representantId = repIndexHits.getSingle();
}
repIndexHits.close();
} //---The representant is a protein
else {
IndexHits<Long> hits = proteinAccessionIndex.get(ProteinNode.PROTEIN_ACCESSION_INDEX, representantAccession);
if (hits.size() == 1) {
//System.out.println("representantAccession = " + representantAccession);
representantId = hits.getSingle();
}
hits.close();
}
//----we only create the relationships in the case where we found
// a valid representant id-----
if (representantId >= 0) {
for (String memberAccession : membersAccessionList) {
long memberId = -1;
if (memberAccession.contains("-")) {
IndexHits<Long> isoHits = isoformIdIndex.get(IsoformNode.ISOFORM_ID_INDEX, memberAccession);
if (isoHits.size() == 1) {
memberId = isoHits.getSingle();
}
isoHits.close();
} else {
IndexHits<Long> protHits = proteinAccessionIndex.get(ProteinNode.PROTEIN_ACCESSION_INDEX, memberAccession);
if (protHits.size() == 1) {
memberId = protHits.getSingle();
}
protHits.close();
}
if (memberId >= 0) {
inserter.createRelationship(representantId, memberId, relationship, null);
}
}
}
} else {
logger.log(Level.SEVERE, ("null representant accession for entry: " + entryXMLElem.asJDomElement().getAttributeValue("id")));
}
}
entryCounter++;
if ((entryCounter % limitForPrintingOut) == 0) {
logger.log(Level.INFO, (entryCounter + " entries parsed!!"));
}
}
reader.close();
return entryCounter;
}
}