/* * Copyright (c) 2015 Data Harmonisation Panel * * All rights reserved. This program and the accompanying materials are made * available under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of the License, * or (at your option) any later version. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution. If not, see <http://www.gnu.org/licenses/>. * * Contributors: * Data Harmonisation Panel <http://www.dhpanel.eu> */ package eu.esdihumboldt.hale.common.instance.graph.reference; import java.text.MessageFormat; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import java.util.Queue; import java.util.Set; import java.util.stream.Collectors; import javax.annotation.Nullable; import com.tinkerpop.blueprints.Direction; import com.tinkerpop.blueprints.Vertex; import com.tinkerpop.blueprints.impls.tg.TinkerGraph; import com.tinkerpop.gremlin.java.GremlinPipeline; import com.tinkerpop.pipes.PipeFunction; import com.tinkerpop.pipes.branch.LoopPipe.LoopBundle; import de.fhg.igd.slf4jplus.ALogger; import de.fhg.igd.slf4jplus.ALoggerFactory; import eu.esdihumboldt.hale.common.instance.graph.reference.internal.ReferencesInstanceCollection; import eu.esdihumboldt.hale.common.instance.model.Instance; import eu.esdihumboldt.hale.common.instance.model.InstanceCollection; import eu.esdihumboldt.hale.common.instance.model.InstanceReference; import eu.esdihumboldt.hale.common.instance.model.ResourceIterator; /** * Graph with associations between instances. * * @author Simon Templer * @param <T> the identifier type, must have a sensible equals implementation */ public class ReferenceGraph<T> { /** * Custom tinker graph that allows fast access to a random contained vertex * and to the information if the graph has any vertices. * * This class was created because the call to * {@link TinkerGraph#getVertices()} is very expensive for large graphs. */ public class CustomTinkerGraph extends TinkerGraph { private static final long serialVersionUID = -6470218605426839887L; /** * @return a vertex in the graph */ @Nullable public Vertex someVertex() { if (isEmpty()) return null; return vertices.values().iterator().next(); } /** * @return if the graph is empty */ public boolean isEmpty() { return vertices.isEmpty(); } } private static final ALogger log = ALoggerFactory.getLogger(ReferenceGraph.class); /** * Iterator for instance partitions. */ public class PartitionIterator implements Iterator<InstanceCollection> { private final Queue<List<InstanceReference>> candidates = new LinkedList<>(); private final int maxObjects; private int partCount = 0; private int partSum = 0; private int biggestAtom = 0; /** * @param maxObjects the guiding value for the maximum number of objects * in a part */ public PartitionIterator(int maxObjects) { this.maxObjects = maxObjects; } @Override public boolean hasNext() { /* * There are additional parts if there are candidates in the queue * and/or vertices left in the graph. */ return !candidates.isEmpty() || verticesLeft(); } private boolean verticesLeft() { return !graph.isEmpty(); } @Override public InstanceCollection next() { List<InstanceReference> part = candidates.poll(); if (part == null) { part = new ArrayList<>(maxObjects); } Queue<List<InstanceReference>> nextCandidates = new LinkedList<>(); while (verticesLeft() && part.size() < maxObjects) { // add to part List<InstanceReference> instances = getNextAtomicPart(); biggestAtom = Math.max(biggestAtom, instances.size()); if (part.size() + instances.size() > maxObjects) { // add to part candidates for later use nextCandidates.add(instances); if (!verticesLeft()) { // we added everything to candidates and need to // terminate the loop break; } } else { // add to current part part.addAll(instances); } } // try to add parts from previous candidates while (!candidates.isEmpty() && part.size() < maxObjects) { List<InstanceReference> instances = candidates.poll(); if (part.size() + instances.size() > maxObjects) { // add to part candidates for later use nextCandidates.add(instances); } else { // add to current part part.addAll(instances); } } // collected candidates for next attempt candidates.addAll(nextCandidates); if (part.isEmpty()) { // no vertices left if (!candidates.isEmpty()) { // yield a previously stored candidate that was to big // to fit into a request part = candidates.poll(); } else { throw new NoSuchElementException("All parts were retrieved"); } } partCount++; partSum += part.size(); log.debug("Reference based partitioning - Part {} - {} instances", partCount, part.size()); if (!hasNext()) { log.info(MessageFormat.format( "Completed partitioning of {1} instances in {0} parts, biggest inseparable set of instances was of size {2}.", partCount, partSum, biggestAtom)); } return new ReferencesInstanceCollection(part, originalCollection); } /** * @return the next atomic part from the graph as instance references */ private List<InstanceReference> getNextAtomicPart() { // select an arbitrary vertex Vertex vtx = graph.someVertex(); if (vtx != null) { // get all vertices associated with that vertex final Set<Vertex> visited = new LinkedHashSet<>(); if (!vtx.getEdges(Direction.BOTH).iterator().hasNext()) { // no edges associated - no need to use gremlin // does not speed up the process by much though visited.add(vtx); } else { /** * Example for the Groovy console - getting all associated * vertices. <code> * g = TinkerGraphFactory.createTinkerGraph() * x = new LinkedHashSet() * g.v(1).as('ref').aggregate(x).both.loop('ref', { !x.contains(it.object) }) * x * </code> */ GremlinPipeline<Vertex, Vertex> pipe = new GremlinPipeline<>(); pipe.start(vtx).as("ref").aggregate(visited).both() .loop("ref", new PipeFunction<LoopBundle<Vertex>, Boolean>() { @Override public Boolean compute(LoopBundle<Vertex> loop) { return !visited.contains(loop.getObject()); } }).iterate(); } List<InstanceReference> result = new ArrayList<>(); for (Vertex associated : visited) { InstanceReference ref = associated.getProperty(P_INSTANCE_REFERENCE); if (ref != null) { result.add(ref); } else { Iterable<Vertex> referers = associated.getVertices(Direction.IN); Set<String> ids = new HashSet<>(); if (referers != null) { for (Vertex referer : referers) { Object ident = referer.getId(); if (ident != null) { ids.add(ident.toString()); } } } if (ids.isEmpty()) { log.warn("Encountered referenced object w/o associated instance: " + associated.getId()); } else { String enumIds = ids.stream().collect(Collectors.joining(", ")); log.warn("Encountered referenced object w/o associated instance: " + associated.getId() + " - referenced from " + enumIds); } } } // remove vertices from graph for (Vertex v : visited) { graph.removeVertex(v); } return result; } else { return Collections.emptyList(); } } @Override public void remove() { throw new UnsupportedOperationException(); } } private static final String P_INSTANCE_REFERENCE = "instanceRef"; private static final String E_REFERENCE = "refs"; private final CustomTinkerGraph graph; private final IdentityReferenceInspector<T> inspector; /** * Index for fast retrieval of vertices by identifier. */ private final Map<T, Vertex> identifiedVertices = new HashMap<>(); private final InstanceCollection originalCollection; private boolean partitioned = false; /** * Create a new reference graph from the given instance collection. * * @param inspector the instance inspector to use * @param instances the */ public ReferenceGraph(IdentityReferenceInspector<T> inspector, InstanceCollection instances) { this.graph = new CustomTinkerGraph(); this.inspector = inspector; this.originalCollection = instances; populate(instances); // identified vertices no longer needed after populate identifiedVertices.clear(); } /** * Populate the graph with the instances from the given collection. * * @param instances an instance collection */ protected void populate(InstanceCollection instances) { try (ResourceIterator<Instance> it = instances.iterator()) { while (it.hasNext()) { Instance instance = it.next(); addInstance(instance, instances.getReference(instance)); } } } /** * Add an instance to the reference graph. * * @param instance the instance to add * @param ref the reference that can be used to retrieve the instance */ protected void addInstance(Instance instance, InstanceReference ref) { // retrieve / create vertex T id = inspector.getIdentity(instance); Vertex vertex = getVertex(id); // store instance reference vertex.setProperty(P_INSTANCE_REFERENCE, ref); // create references Set<T> associations = inspector.getReferencedIdentities(instance); for (T idRef : associations) { Vertex assoc = getVertex(idRef); // add edge between vertices graph.addEdge(null, vertex, assoc, E_REFERENCE); } } /** * Partition the collected instances in parts that respectively contain all * referenced instances. * * @param maxObjects the guiding value for the maximum number of objects in * a part * @return an iterator of instance collections, each instance collection * represents a part */ public Iterator<InstanceCollection> partition(int maxObjects) { if (!partitioned) { partitioned = true; return new PartitionIterator(maxObjects); } throw new IllegalStateException( "Partitioning the instance collection can only be done once"); } private Vertex getVertex(T id) { Vertex vertex = null; if (id != null) { vertex = identifiedVertices.get(id); } if (vertex == null) { vertex = graph.addVertex(id); identifiedVertices.put(id, vertex); } return vertex; } }