/* XXL: The eXtensible and fleXible Library for data processing Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger Head of the Database Research Group Department of Mathematics and Computer Science University of Marburg Germany This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; If not, see <http://www.gnu.org/licenses/>. http://code.google.com/p/xxl/ */ package xxl.core.spatial.cursors; import java.util.Iterator; import java.util.List; import xxl.core.functions.Function; import xxl.core.predicates.AbstractPredicate; import xxl.core.predicates.Predicate; import xxl.core.spatial.KPEzCode; import xxl.core.spatial.SpaceFillingCurves; import xxl.core.spatial.points.Point; import xxl.core.util.BitSet; /** * This class provides the similarity-join algorithm "GESS: Generic External Space Sweep" * (see "[DS 01] GESS: a Scalable Algorithm for Mining Large Datasets in High Dimensional Spaces * by Jens-Peter Dittrich and Bernhard Seeger, ACM SIGKDD 2001." for a * detailed description of this method). * <br><br> * The most important component of this algorithm is the Replicator-Engine * ({@link xxl.core.spatial.cursors.Replicator Replicator}) * which determines the partition(s) for incoming points and maps * Points to KPEzCodes. * <br><br> * The use-case implemented in the main-method of this class reads 1 or * 2 inputs and computes the similarity-join using GESS. The use-case * provided with this class reads files containing FloatPoints. * <br><br> * Note that GESS works on arbitrary data as long as the user provides * a mapping to the internally used FixedPointRectangle-type (see * parameter "inputMapping"). * * @see xxl.core.spatial.cursors.Replicator * @see xxl.core.spatial.cursors.Orenstein * @see xxl.core.spatial.cursors.MSJ * @see xxl.core.spatial.points.FixedPoint * @see xxl.core.spatial.rectangles.FixedPointRectangle * @see xxl.core.spatial.KPEzCode * @see xxl.core.cursors.joins.SortMergeJoin * */ public class GESS extends Orenstein{ /** * This class provides the Reference Point Method of GESS. * Since GESS allows hypercubes to get replicated, we have to provide * a method to eliminate possible duplicates from the result set. * <br><br> * There are two principal approaches for eliminating duplicate * results. The first is to use a hash-table that stores the entire * set of result tuples. The memory requirements of this approach * however are O(n). A second approach is to apply external sorting * to the result set. This causes additional I/O cost. In * addition, the sorting operation could not report any result until * all results had been reported by the merging algorithm. * <br><br> * Instead of using these standard techniques * we propose an inexpensive on-line method termed * Reference Point Method (RPM). This method neither allocates * additional memory nor does it cause any additional I/O operations. * <br><br> * The basic idea of RPM is to define a reference point * which is contained in the section of two hypercubes. * <br><br> * See [DS 01] GESS: a Scalable Algorithm for Mining Large Datasets in High Dimensional Spaces * by Jens-Peter Dittrich and Bernhard Seeger, ACM SIGKDD 2001. San Francisco. pages: 47-56" for a * detailed description of this algorithm. * <br><br> * Usage: * The main-method of this class contains an elaborate similarity-join use-case. * <br><br> * To make GESS work using RPM simply modify the joinPredicate using the And-predicate: * <code><pre> * Predicate joinPredicate = * new And( * new FeaturePredicate( * new DistanceWithinMaximum(epsilon), * new AbstractFunction(){ * public Object invoke(Object object){ * return ((KPEzCode)object).getData(); * } * } * ), * new GESS.ReferencePointMethod(epsilonDiv2) //duplicate removal (modify GESS to work with reference point method) * ); * * * @see xxl.core.spatial.cursors.Replicator * @see xxl.core.predicates.Predicate * @see xxl.core.predicates.And * @see xxl.core.spatial.predicates.DistanceWithinMaximum * @see xxl.core.cursors.joins.SortMergeJoin */ public static class ReferencePointMethod extends AbstractPredicate<Object> { /** The epsilon (query) distance of the Similarity Join divided by 2. */ protected double epsilonDiv2; /** Constructs a new ReferencePointMethod instance. * * @param epsilonDiv2 the epsilon distance of the Similarity Join divided by 2 */ public ReferencePointMethod(double epsilonDiv2){ this.epsilonDiv2 = epsilonDiv2; } /** Takes a tuple containing two KPEzCodes as its input and * checks whether a certain reference point is contained * in the partition currently processed. * * Contains an optimization that applies RPM only in those cases * when at least one of the inputs is a replicate. * @param object is a two-dimensional array whith the two KPEzCodes * @return return true if a certain reference point is contained in the * cell of the two points given by the KPEzCodes (see Paper) */ public boolean invoke(List<? extends Object> object){ KPEzCode k0 = (KPEzCode)object.get(0); KPEzCode k1 = (KPEzCode)object.get(1); if( k0.getIsReplicate() || k1.getIsReplicate()){ //optimization: apply RPM only in case one of the inputs is a replicate final BitSet currentZCode = k0.getzCode(); float[] p1 = (float[]) ((Point)k0.getData()).getPoint(); float[] p2 = (float[]) ((Point)k1.getData()).getPoint(); long[] rp = new long[p1.length]; for(int i=0; i<p1.length; i++){ rp[i] = xxl.core.math.Maths.doubleToNormalizedLongBits( Math.max( p1[i], p2[i] ) - epsilonDiv2 ); } return SpaceFillingCurves.zCode2( rp, currentZCode.precision() ).compare(currentZCode) == 0; } else return true; } } /** Creates a new GESS-operator (Constructor for two inputs). * * @param input0 first (unsorted) input * @param input1 second (unsorted) input * @param inputMapping a Function used to map incoming objects of arbitrary type to a FixedPointRectangle (internally used by the replication engine) * @param joinPredicate the join predicate to be used by this join (e.g. DistanceWithin-predicate) * @param splitAllowed the replication strategy to be applied by GESS (see inner class Replicator.Split) * @param minBitIndex the minimal bit-index to be considered for the replication-process ( 0 <= bitIndex <= 62 ) * @param newSorter a factory-Function that returns a sorting-operator (e.g. {@link xxl.core.cursors.sorters.MergeSorter}) * @param newResult a factory-Function that is used to create the result-tuples that are returned by this operator (e.g. {@link xxl.core.functions.Tuplify}) * @param dimensions the dimensionality of the data * @param initialCapacity the maximum number of elements that can be stored inside main-memory (i.e. by the SweepArea) */ public GESS(Iterator input0, Iterator input1, Function inputMapping, Predicate joinPredicate, Predicate splitAllowed, int minBitIndex, Function newSorter, Function newResult, int dimensions, int initialCapacity){ super( new Replicator(inputMapping, input0, dimensions, splitAllowed, minBitIndex), new Replicator(inputMapping, input1, dimensions, splitAllowed, minBitIndex), joinPredicate, newSorter, newResult, initialCapacity ); } /** Creates a new GESS-operator (Constructor for a self-join). * * @param input (unsorted) input * @param inputMapping a Function used to map incoming objects of arbitrary type to a FixedPointRectangle (internally used by the replication engine) * @param joinPredicate the join predicate to be used by this join (e.g. DistanceWithin-predicate) * @param splitAllowed the replication strategy to be applied by GESS (see inner class Replicator.Split) * @param minBitIndex the minimal bit-index to be considered for the replication-process ( 0 <= bitIndex <= 62 ) * @param newSorter a factory-Function that returns a sorting-operator (e.g. {@link xxl.core.cursors.sorters.MergeSorter}) * @param newResult a factory-Function that is used to create the result-tuples that are returned by this operator (e.g. {@link xxl.core.functions.Tuplify}) * @param dimensions the dimensionality of the data * @param initialCapacity the maximum number of elements that can be stored inside main-memory (i.e. by the SweepArea) */ // public GESS(Iterator input, Function inputMapping, Predicate joinPredicate, Predicate splitAllowed, int minBitIndex, Function newSorter, Function newResult, int dimensions, int initialCapacity){ // super( // new Replicator(inputMapping, input, dimensions, splitAllowed, minBitIndex), // joinPredicate, // newSorter, // newResult, // initialCapacity // ); // } }