package org.genedb.crawl.elasticsearch.index.cv;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.log4j.Logger;
import org.genedb.crawl.elasticsearch.index.NonDatabaseDataSourceIndexBuilder;
import org.genedb.crawl.model.Cv;
import org.genedb.crawl.model.Cvterm;
import org.genedb.crawl.model.CvtermRelationship;
import org.jgrapht.DirectedGraph;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import org.kohsuke.args4j.Option;
import org.obo.dataadapter.DefaultOBOParser;
import org.obo.dataadapter.OBOParseEngine;
import org.obo.datamodel.Link;
import org.obo.datamodel.OBOClass;
import org.obo.datamodel.OBOSession;
import org.obo.util.TermUtil;
public class CvIndexBuilder extends NonDatabaseDataSourceIndexBuilder {
private static Logger logger = Logger.getLogger(CvIndexBuilder.class);
@Option(name = "-cv", aliases = { "--controlled_vocabularies" }, usage = "The path(s) to the CV file(s)", required = true)
public List<String> cvFiles;
@Option(name = "-ns", aliases = { "--namespaces" }, usage = "The namespaces to be loaded", required = true)
public List<String> namespaces;
@Option(name = "-vn", aliases = { "--vocabulary_name" }, usage = "The name of the controlled vocabulary to load these files as", required = true)
public String vocabularyName;
@Option(name = "-re", aliases = { "--relationships" }, usage = "The relationship types to be loaded (by default, all are loaded)", required = false)
public List<String> relationships = Arrays.asList(new String[] { "*" });
/**
* We're going to use this to build the graph.
*/
private Map<String, Cvterm> allterms = new HashMap<String, Cvterm>();
@Override
public void run() throws Exception {
init();
Set<String> namespacesSet = new HashSet<String>(namespaces);
Set<String> relationshipsSet = new HashSet<String>(relationships);
/*
* Setup the OBO parser.
*/
DefaultOBOParser parser = new DefaultOBOParser();
OBOParseEngine engine = new OBOParseEngine(parser);
engine.setPaths(cvFiles);
engine.parse();
OBOSession session = parser.getSession();
/*
* Index and build graph.
*/
indexOntology(namespacesSet, session);
buildGraph(relationshipsSet, allterms);
}
/**
* Goes through the ontologies and drops them into elasticsearch.
*
* @param namespacesSet
* @param session
*/
private void indexOntology(Set<String> namespacesSet, OBOSession session) {
for (OBOClass term : TermUtil.getTerms(session)) {
if (term.getNamespace() != null) {
String nameSpace = term.getNamespace().getID();
if (namespacesSet.contains(nameSpace)) {
Cvterm cvterm = new Cvterm();
cvterm.name = term.getName();
cvterm.accession = term.getID();
cvterm.parents = parseLinks(term.getParents(), true);
cvterm.children = parseLinks(term.getChildren(), false);
cvterm.definition = term.getDefinition();
cvterm.cv = new Cv();
cvterm.cv.name = vocabularyName;
termsMapper.createOrUpdate(cvterm);
allterms.put(cvterm.accession, cvterm);
} else {
logger.debug(String.format("%s namespace not matching, skipping...", term.getName()));
}
} else {
logger.debug(String.format("%s has no namespace, skipping...", term.getName()));
}
}
}
/**
*
* Generates a list of children or parent links.
*
* @param links
* @param parent
* @return
*/
private List<CvtermRelationship> parseLinks(Collection<Link> links, boolean parent) {
List<CvtermRelationship> rels = new ArrayList<CvtermRelationship>();
for (Link link : links) {
CvtermRelationship cvr = new CvtermRelationship();
cvr.relationship = link.getType().getName();
if (parent)
cvr.link = link.getParent().getID();
else
cvr.link = link.getChild().getID();
rels.add(cvr);
}
return rels;
}
/**
*
* This is a quick test to generate the graph. In practice this would
* actually need to be run at load time, and rather than building off
* the allterms hash, it would be built from the elastic search indices
* generated above, and/or the controlled vocabularies stored in Chado.
*
* @param relationshipsSet
* @param allterms
*/
private void buildGraph(Set<String> relationshipsSet, Map<String, Cvterm> allterms) {
DirectedGraph<Cvterm, DefaultEdge> graph = new DefaultDirectedGraph<Cvterm, DefaultEdge>(DefaultEdge.class);
for (Map.Entry<String, Cvterm> cvTermEntry : allterms.entrySet()) {
Cvterm cvterm = cvTermEntry.getValue();
logger.info("Vertex " + cvterm.accession);
graph.addVertex(cvterm);
}
for (Map.Entry<String, Cvterm> cvTermEntry : allterms.entrySet()) {
Cvterm cvterm = cvTermEntry.getValue();
for (CvtermRelationship cvr : cvterm.children) {
if ((!relationshipsSet.contains("*")) && (!relationshipsSet.contains(cvr.relationship)))
continue;
Cvterm child = allterms.get(cvr.link);
assert (child != null);
logger.info("Edge " + cvterm.accession + "---(" + cvr.relationship + ")--->" + child.accession);
graph.addEdge(cvterm, child);
}
}
logger.info("Graph complete");
}
public static void main(String[] args) throws Exception {
new CvIndexBuilder().prerun(args).closeIndex();
}
}