/* XXL: The eXtensible and fleXible Library for data processing Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger Head of the Database Research Group Department of Mathematics and Computer Science University of Marburg Germany This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; If not, see <http://www.gnu.org/licenses/>. http://code.google.com/p/xxl/ */ package xxl.core.spatial.cursors; import java.util.Iterator; import xxl.core.collections.queues.ListQueue; import xxl.core.collections.queues.Queue; import xxl.core.cursors.AbstractCursor; import xxl.core.cursors.mappers.Mapper; import xxl.core.cursors.unions.Sequentializer; import xxl.core.cursors.wrappers.IteratorCursor; import xxl.core.functions.AbstractFunction; import xxl.core.functions.Function; import xxl.core.predicates.AbstractPredicate; import xxl.core.predicates.Predicate; import xxl.core.spatial.KPEzCode; import xxl.core.spatial.SpaceFillingCurves; import xxl.core.spatial.rectangles.FixedPointRectangle; /** * The Replicator provides the replication engine of GESS. * The Replicator maps each input-Object (of arbitrary type) to an Iterator of KPEzCode-Objects. * For a detailed description see * "GESS: a Scalable Similarity-Join Algorithm for Mining Large Data Sets in High Dimensional Spaces * by Jens-Peter Dittrich and Bernhard Seeger, ACM SIGKDD 2001. pages: 47-56." * * @see xxl.core.spatial.cursors.GESS * @see xxl.core.spatial.cursors.GESS.ReferencePointMethod * @see xxl.core.spatial.points.FixedPoint * @see xxl.core.spatial.rectangles.FixedPointRectangle * @see xxl.core.spatial.KPEzCode * */ public class Replicator extends IteratorCursor { //Split-Predicate class///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** Abstract Predicate used to decide whether further splits are allowed. * * @see xxl.core.predicates.Predicate */ public static abstract class Split extends AbstractPredicate { /** The bound that is checked by the predicate */ protected int bound; /** Creates a new Split-Predicate. * @param bound is the boundary used for splitting */ public Split(int bound){ this.bound = bound; } } /** Allows at most <code>bound</code> generations (2^(generation) replicates are created). */ public static class MaxGeneration extends Split{ /** Creates a new MaxGeneration-Predicate. * @param bound is the boundary of the predicate */ public MaxGeneration(int bound){ super(bound); } /** * @param object the object (of class QueueEl) where a split might be performed * @return true if a split is preformed */ public boolean invoke(Object object){ return ((QueueEl)object).splitGeneration < bound; //split at most <bound> times } } /** Allows at most <code>bound</code> splits per level. */ public static class MaxSplitsPerLevel extends Split{ /** Creates a new MaxSplitsPerLevel-Predicate. * @param bound limits the number of splits per level */ public MaxSplitsPerLevel(int bound){ super(bound); } /** * @param object The object for which a split might be performed * @return true if the object is split */ public boolean invoke(Object object){ QueueEl q = (QueueEl)object; if ( (q.flag == -1) || (q.dim == 0)) q.flag = q.replicate.numberOfSplitsPerLevel(q.bitIndex); //split at most <bound> times per level/**/ //use flag to store whether splits are allowed for this level //(the flag is like a cookie: we have to switch from local to global split decision here) return q.flag <= bound; } } /** Allows splits if the actual bitIndex is greater or equal than a given bound. */ public static class MaxSplitBit extends Split{ /** Creates a new MaxSplitBit-Predicate. @param bound max-split-bit for which a split is allowed, bound in [62,...,1] */ public MaxSplitBit(int bound){ super(bound); } /** * @param object The object for which a split might be performed * @return true if a split is performed */ public boolean invoke(Object object){ return ((QueueEl)object).bitIndex >= bound; //split if bitIndex >= bound } } /** Allows splits if the level is smaller or equal than a given bitIndex. The bound is reversed, i.e. this class passes <code>62-level</code> to its super-constructor. */ public static class MaxSplitLevel extends MaxSplitBit{ /** * @param level denotes the maximum level where splits are still allowed */ public MaxSplitLevel(int level){ super(62-level); } } /** Constrains replication to 'bound' replicates per input-element. */ public static class MaxReplicates extends Split{ /** Creates a new MaxReplicates-Predicate. * @param bound the maximum number of replicas per input element */ public MaxReplicates(int bound){ super(bound); } /** This method throws an UnsupportedOperationException * * @param object * @return true if a split is preformed */ public boolean invoke(Object object) { throw new UnsupportedOperationException(); } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //static-fields///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** table used to set all bits > bitIndex = split-bit to true */ public static final long[] m1 = new long[64]; /** table used to select the bit-prefix, i.e. all bits <= bitIndex = split-bit */ public static final long[] prefix = new long[64]; /** table used to select the bit at position 'index' */ public static final long[] bit = new long[64]; /** initializer for static protected fields */ static{ long bitMask = 0x1L << 63; //first bit true long prefixMask = bitMask; //first bit true for(int i=63; i>=0; i--){ prefix[i] = prefixMask; m1[i] = ~prefixMask; bit[i] = bitMask; bitMask >>>= 1; prefixMask += bitMask; } } /** split until sons */ public static final Split SPLIT_ONCE = new MaxGeneration(1); /** split until grandsons */ public static final Split SPLIT_TWICE = new MaxGeneration(2); /** split until grandgrandsons */ public static final Split SPLIT_THRICE = new MaxGeneration(3); /** split once per level */ public static final Split SPLIT_ONCE_PER_LEVEL = new MaxSplitsPerLevel(1); /** split twice per level */ public static final Split SPLIT_TWICE_PER_LEVEL = new MaxSplitsPerLevel(2); /** split thrice per level */ public static final Split SPLIT_THRICE_PER_LEVEL = new MaxSplitsPerLevel(3); /** Counter used to count elements that are created by the replicator (that are passed "out" to the next operator) */ public static long EL_OUT = 0; /** Counter used to count elements that are passed to the replicator (that are passed "in" to this operator) */ public static long EL_IN = 0; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //QueueEl class://////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** A Wrapper-class needed to unwind the recursive split. We do not use "real" recursion here for reasons of flexibility: the type of Queue used determines the traversing- strategy of recursive splits. */ protected static class QueueEl{ /** The source-object, needed for result-tuples */ protected Object source; /** split (replicate) rectangle */ protected FixedPointRectangle replicate; /** the bit actually treated (start=62) */ protected int bitIndex; /** the dimension actually treated */ protected int dim; /** internal flag needed to precompute the number of possible splits for a sequence of levels default-value: -1 */ protected int flag; /** The generation of the split: 0: no split occured for this El, 1: one split, etc. */ protected short splitGeneration; /** Top-level constructor, creates a new QueueEl. @param source the source data @param replicate the replicate-rectangle used internally to compute the replicate-set @param bitIndex the bit-position actually treated @param dim the dimension of the objects @param splitGeneration the generation of the split @param flag internal flag (should be set to -1, used by MaxSplitsPerLevel-predicate) */ public QueueEl(Object source, FixedPointRectangle replicate, int bitIndex, int dim, short splitGeneration, int flag){ this.source = source; this.replicate = replicate; this.bitIndex = bitIndex; this.dim = dim; this.splitGeneration = splitGeneration; this.flag = flag; } /** Creates a new QueueEl. @param source the source data @param replicate the replicate-rectangle used internally to compute the replicate-set */ public QueueEl(Object source, FixedPointRectangle replicate){ this(source, replicate, 62, 0, (short)0, -1); } /** Clones a new QueueEl from an existing queueEl. @param q the QueueEl to be cloned */ public QueueEl(QueueEl q){ this(q.source, (FixedPointRectangle)q.replicate.clone(), q.bitIndex, q.dim, q.splitGeneration, q.flag); } /** Splits this element. * @return the right replicate. 'this' QueueEl is the left replicate. */ public QueueEl split(){ QueueEl qRight = new QueueEl(this); //clone this QueueEl ((long[])qRight.replicate.getCorner(false).getPoint())[dim] = ((long[])replicate.getCorner(true).getPoint())[dim] & prefix[bitIndex] ; //keep prefix of the bits seen so far and set remaining bits to "false" ((long[])this.replicate.getCorner(true).getPoint())[dim] = (((long[])this.replicate.getCorner(false).getPoint())[dim] & prefix[bitIndex]) | m1[bitIndex]; //keep prefix of the bits seen so far and set remaining bits to "true" qRight.dim = ++dim; //increment dim-counter for next assessment qRight.splitGeneration = ++splitGeneration; //increment split-generation return qRight; //return right replicate } /** Checks whether we got a split here. @return true if the bits of component <dim> at bit-index <bitIndex> are different (i.e. we are facing a split at the current posiiton) */ public boolean checkForSplit(){ return replicate.bitsDiffer(dim,bitIndex); } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //Replicator: constructor/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** The top-level constructor for this class. * * @param inputMapping Mapper for the input to the Replicator (maps incoming object of arbitrary type to FixedPointRectangle) * @param input the input to be processed by this class * @param newResult a Function that maps the internally used QueueEl to the Object to be returned by the replication process * @param dimensions the dimensionality of the data * @param queue the queue used to process replicates * @param splitAllowed the Predicate used to determine whether a split is allowed * @param minBitIndex the minimal bit-index to be considered for the replication-process ( 0 <= bitIndex <= 62 ) */ public Replicator(final Function inputMapping, final Iterator input, final Function newResult, final int dimensions, final Queue queue, final Predicate splitAllowed, final int minBitIndex){ super( new Mapper( //maps QueueEl-Objects to newObject (e.g. KPEzCode) newResult, //map internally used QueueEl-Objects to user-determined Object new Sequentializer( new Mapper( new AbstractFunction() { //maps input-Points to Iterator of QueueEl-Objects (i.e. computes replicates using assessment) /** Assesses a given QueueEl. @return resulting queue **/ protected QueueEl assess(Object queueEl){ QueueEl h = (QueueEl)queueEl; for(; h.bitIndex > minBitIndex; h.bitIndex--){ //repeat outer loop if the actual bit-index is greater minimal bit-index for(; h.dim < dimensions; h.dim++){ //inner loop, considers each dimension of the input vectors if( h.checkForSplit() ){ //check whether h hits a split line for the actual bitIndex and dimension if( splitAllowed.invoke(h) ){ //check whether it is allowed to perform a split for element h queue.enqueue(h.split()); //split h and insert right replicate into queue queue.enqueue(h); //insert left replicate into queue return assess(queue.dequeue()); //recurse: access next-element of the queue } else //i.e. split is not allowed return h; //h is a result of the replication-process and can be returned by the Replicator, //i.e. <bitIndex> and <dim> point to a bit that hit a split-line } } h.dim = 0; //reset h.dim to 0 for next iteration of outer loop } h.bitIndex = minBitIndex; //minBitIndex was reached, i.e. h passes all split-lines until minBitIndex return h; //return h; } /** Precondition: the argument object is of type xxl.core.spatial.points.Point, the point must be inside [0;1)^dim (i.e. NOT 1) */ public Object invoke(final Object object){ //Function used to map incoming objects to (Iterator of QueueEl) return new AbstractCursor(){ { EL_IN++; //increment counter used for counting size of the input queue.enqueue(new QueueEl( object, (FixedPointRectangle) inputMapping.invoke(object)) ); //unwind recursion } public boolean hasNextObject(){ return !queue.isEmpty(); //Precondition: queue not empty } public Object nextObject() { return queue.dequeue(); //call assessment which computes next replicate } }; } } , input) ) ) ); } /** Creates a new Replicator. This constructors passes a ListQueue to the parameter 'queue' of the top-level constructor. * * @param inputMapping Mapper for the input to the Replicator (maps incoming object of arbitrary type to FixedPointRectangle) * @param input the input to be processed by this class * @param newResult a Function that maps the internally used QueueEl to the Object to be returned by the replication process * @param dimensions the dimensionality of the data * @param splitAllowed the Predicate used to determine whether a split is allowed * @param minBitIndex the minimal bit-index to be considered for the replication-process ( 0 <= bitIndex <= 62 ) * * @see xxl.core.collections.queues.ListQueue */ public Replicator(final Function inputMapping, final Iterator input, final Function newResult, final int dimensions, final Predicate splitAllowed, final int minBitIndex){ this(inputMapping, input, newResult, dimensions, new ListQueue(), splitAllowed, minBitIndex); } /** Creates a new Replicator. This constructors provides an implementation of the parameter-Function 'newResult'. * * @param inputMapping Mapper for the input to the Replicator (maps incoming object of arbitrary type to FixedPointRectangle) * @param input the input to be processed by this class * @param dimensions the dimensionality of the data * @param splitAllowed the Predicate used to determine whether a split is allowed * @param minBitIndex the minimal bit-index to be considered for the replication-process ( 0 <= bitIndex <= 62 ) * @param considerAdditionalBits determines whether kd-splitting should be applied (otherwise quadtree-splitting is performed) */ public Replicator(final Function inputMapping, final Iterator input, final int dimensions, final Predicate splitAllowed, final int minBitIndex, final boolean considerAdditionalBits){ this(inputMapping, input, new AbstractFunction(){ //= newResult-Function public Object invoke(Object queueEl){ QueueEl q = (QueueEl) queueEl; final int additionalBits = considerAdditionalBits ? q.dim : 0; //determines the additional bits to be considered for computing the z-code final int componentPrecision = Math.min(62-q.bitIndex, 62-minBitIndex); //computes the number of bits to be considered for computing the z-code EL_OUT++; //increment counter used for counting size of the output return new KPEzCode( q.source, //insert source point into KPEzCode SpaceFillingCurves.zCode( ((long[])q.replicate.getCorner(false).getPoint()), componentPrecision, additionalBits), q.splitGeneration > 0 //if > 0 holds, this element is a replicate ); //computes zCode and returns KPEzCode containg original input-data and z-code } }, dimensions, splitAllowed, minBitIndex ); } /** Creates a new Replicator. * * @param inputMapping Mapper for the input to the Replicator (maps incoming object of arbitrary type to FixedPointRectangle) * @param input the input to be processed by this class * @param dimensions the dimensionality of the data * @param splitAllowed the Predicate used to determine whether a split is allowed * @param minBitIndex the minimal bit-index to be considered for the replication-process ( 0 <= bitIndex <= 62 ) */ public Replicator(final Function inputMapping, final Iterator input, final int dimensions, final Predicate splitAllowed, final int minBitIndex){ this(inputMapping, input, dimensions, splitAllowed, minBitIndex, true); } }