package org.nextprot.api.core.utils.graph; import com.google.common.base.Preconditions; import grph.Grph; import grph.in_memory.InMemoryGrph; import grph.path.Path; import it.unimi.dsi.fastutil.longs.LongArrayList; import org.nextprot.api.commons.constants.TerminologyCv; import org.nextprot.api.core.domain.CvTerm; import org.nextprot.api.core.service.TerminologyService; import toools.collection.bigstuff.longset.LongCursor; import toools.collection.bigstuff.longset.LongHashSet; import toools.collection.bigstuff.longset.LongSet; import toools.math.MathsUtilities; import java.io.Serializable; import java.text.DecimalFormat; import java.util.*; import java.util.logging.Logger; import java.util.stream.Stream; /** * A hierarchy of {@code CvTerm} ids organised in a Directed Acyclic Graph. * * <h4>Warning</h4> * The graph data structure is backed by an instance of Grph that is a transient field. * * A call of a method depending on Grph field on a deserialized OntologyDAG instance (graph not accessible anymore) * will throw a NotFoundInternalGrphException. * * To help using this object, methods depending on this field were named with suffix "..FromTransientGraph()" and a * dedicated method {@code isTransientGraphAvailable()} was added to test for {@code transientGraph} eligibility. * * Created by fnikitin on 08.03.17. */ public class OntologyDAG implements Serializable { private static final long serialVersionUID = 1L; private final static Logger LOGGER = Logger.getLogger(OntologyDAG.class.getSimpleName()); private final static DecimalFormat DECIMAL_FORMAT = new DecimalFormat("######.##"); private transient final Grph transientGraph; private final TerminologyCv terminologyCv; private final Map<String, Long> cvTermIdByAccession; private final Map<Long, String> cvTermAccessionById; private final Map<Long, LongSet> cvTermIdAncestors; private final int allPathsSize; public OntologyDAG(TerminologyCv terminologyCv, TerminologyService service) { Preconditions.checkNotNull(terminologyCv); Preconditions.checkNotNull(service); List<CvTerm> cvTerms = service.findCvTermsByOntology(terminologyCv.name()); this.terminologyCv = terminologyCv; cvTermIdByAccession = new HashMap<>(cvTerms.size()); cvTermAccessionById = new HashMap<>(cvTerms.size()); cvTermIdAncestors = new HashMap<>(cvTerms.size()); transientGraph = new InMemoryGrph(); cvTerms.forEach(this::addCvTermNode); cvTerms.forEach(this::addCvTermEdges); allPathsSize = precomputeAllAncestors(); } private void addCvTermNode(CvTerm cvTerm) { cvTermIdByAccession.put(cvTerm.getAccession(), cvTerm.getId()); cvTermAccessionById.put(cvTerm.getId(), cvTerm.getAccession()); transientGraph.addVertex(cvTerm.getId()); cvTermIdAncestors.put(cvTerm.getId(), new LongHashSet()); } private void addCvTermEdges(CvTerm cvTerm) { List<String> parentAccessions = cvTerm.getAncestorAccession(); if (parentAccessions != null) { parentAccessions.forEach(parent -> { try { transientGraph.addDirectedSimpleEdge(getCvTermIdByAccession(parent), cvTerm.getId()); } catch (NotFoundNodeException e) { LOGGER.warning(cvTerm.getAccession()+" cannot connect to unknown node parent: "+e.getMessage()); } }); } } private int precomputeAllAncestors() { Collection<Path> paths = transientGraph.getAllPaths(); for (Path path : paths) { long dest = path.getDestination(); if (path.getNumberOfVertices() > 1) { for (long i = 0; i < path.getNumberOfVertices() - 1; i++) { cvTermIdAncestors.get(dest).add(path.getVertexAt(i)); } } } return paths.size(); } /** * @return the TerminologyCv of this graph of CvTerms */ public TerminologyCv getTerminologyCv() { return terminologyCv; } /** * @return a Stream of root(s) ids of the graph */ Stream<Long> getRoots() { return getAllNodes().filter(id -> transientGraph.getInEdges(id).isEmpty()); } /** * @return a Stream of all node ids */ public Stream<Long> getAllNodes() { return cvTermIdAncestors.keySet().stream(); } /** * @return the total number of graph nodes */ public long countNodes() { return cvTermIdAncestors.size(); } /** * @return the CvTerm with given id */ public String getCvTermAccessionById(long id) { if (!cvTermAccessionById.containsKey(id)) throw new IllegalStateException("cvterm id "+id+" was not found"); return cvTermAccessionById.get(id); } /** * @return the id of cvterm with given accession */ public long getCvTermIdByAccession(String accession) throws NotFoundNodeException { if (!cvTermIdByAccession.containsKey(accession)) throw new NotFoundNodeException(accession); return cvTermIdByAccession.get(accession); } /** * @return true if cvTermAccession was found */ public boolean hasCvTermAccession(String cvTermAccession) { return cvTermIdByAccession.containsKey(cvTermAccession); } /** * @return true if queryDescendant is a descendant of queryAncestor */ public boolean isAncestorOf(long queryAncestor, long queryDescendant) { return cvTermIdAncestors.get(queryDescendant).contains(queryAncestor); } /** * @return true if queryDescendant is a descendant of queryAncestor */ public boolean isChildOf(long queryDescendant, long queryAncestor) { return cvTermIdAncestors.get(queryDescendant).contains(queryAncestor); } // used for benchmarking only boolean isAncestorOfSlow(long queryAncestor, long queryDescendant) { return transientGraph.getShortestPath(queryAncestor, queryDescendant) != null; } /** * @return descendants of the given cvterm id */ public long[] getAncestors(long cvTermId) { return cvTermIdAncestors.get(cvTermId).toLongArray(); } /** * @return ancestors map of all cvterm nodes */ public Map<Long, LongSet> getCvTermIdAncestors() { return cvTermIdAncestors; } /** * @return the mappings of cv term accession to id */ public Map<String, Long> getCvTermIdByAccession() { return cvTermIdByAccession; } /** * @return true if grph instance exist (used to check if specific method name with suffix "FromGrph" are callable) */ public boolean isTransientGraphAvailable() { return transientGraph != null; } /** * @return the set of all path containing a cycle * @throws NotFoundInternalGraphException if internal graph is missing */ public Set<Path> getAllCyclesFromTransientGraph() throws NotFoundInternalGraphException { checkTransientGraphAvailability(); return transientGraph.getAllCycles(); } /** * @return a Stream of cvterm ids that are parents of cvTermId * @throws NotFoundInternalGraphException if internal graph is missing */ public long[] getParentsFromTransientGraph(long cvTermId) throws NotFoundInternalGraphException { checkTransientGraphAvailability(); return transientGraph.getInNeighbors(cvTermId).toLongArray(); } /** * @return a Stream of cvterm ids that are children of cvTermId * @throws NotFoundInternalGraphException if internal graph is missing */ public long[] getChildrenFromTransientGraph(long cvTermId) throws NotFoundInternalGraphException { checkTransientGraphAvailability(); return transientGraph.getOutNeighbors(cvTermId).toLongArray(); } /** * @return the total number of graph edges * @throws NotFoundInternalGraphException if internal graph is missing */ public long countEdgesFromTransientGraph() throws NotFoundInternalGraphException { checkTransientGraphAvailability(); return transientGraph.getSize(); } /** * @return all the paths of this graph * @throws NotFoundInternalGraphException if internal graph is missing */ public Collection<Path> getAllPathsFromTransientGraph() throws NotFoundInternalGraphException { checkTransientGraphAvailability(); return transientGraph.getAllPaths(); } /** * @return the connected components of the graph * @throws NotFoundInternalGraphException if internal graph is missing */ public Stream<LongSet> getConnectedComponentsFromTransientGraph() throws NotFoundInternalGraphException { checkTransientGraphAvailability(); return transientGraph.getConnectedComponents().stream(); } /** * @return the average degree of the graph * @throws NotFoundInternalGraphException if internal graph is missing */ // There is a deprecation in toools.math.MathsUtilities; grph.getAverageDegree() should call another method like below public double getAverageDegreeFromTransientGraph(Grph.TYPE type, Grph.DIRECTION direction) throws NotFoundInternalGraphException { LongArrayList l = new LongArrayList(); for (LongCursor c : transientGraph.getVertices()) { l.add(transientGraph.getVertexDegree(c.value, type, direction)); } return MathsUtilities.computeAverage(l.toLongArray()); } @Override public String toString() { StringBuilder sb = new StringBuilder("graph of "+terminologyCv); sb.append(": {nodes=").append(countNodes()); sb.append(", edges=").append(transientGraph.getSize()); Collection<LongSet> ccs = transientGraph.getConnectedComponents(); sb.append(", connected components=").append(ccs.size()); sb.append(", avg degree=").append(DECIMAL_FORMAT.format(transientGraph.getAverageDegree())); sb.append(", paths=").append(allPathsSize); sb.append("}"); return sb.toString(); } private void checkTransientGraphAvailability() throws NotFoundInternalGraphException { if (transientGraph == null) throw new NotFoundInternalGraphException(); } /** * Thrown if no nodes map the given cvterm accession */ public class NotFoundNodeException extends Exception { public NotFoundNodeException(String accession) { super("CvTerm node with accession "+accession+" was not found in "+terminologyCv + " graph"); } } /** * Thrown when transient graph is not available anymore from methods supposed to need it */ public class NotFoundInternalGraphException extends Exception { public NotFoundInternalGraphException() { super("This instance has been deserialized: the graph (and all associated methods) is not longer accessible for ontology "+terminologyCv); } } }