/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; If not, see <http://www.gnu.org/licenses/>.

    http://code.google.com/p/xxl/

*/

package xxl.core.cursors.joins;

import java.util.Comparator;
import java.util.Iterator;

import xxl.core.collections.sweepAreas.SweepArea;
import xxl.core.cursors.AbstractCursor;
import xxl.core.cursors.Cursor;
import xxl.core.cursors.Cursors;
import xxl.core.cursors.sources.EmptyCursor;
import xxl.core.functions.Function;

/**
 * A sort-merge implementation of the join operator. This class provides a
 * generic, untyped sort-merge join algorithm. The resulting tuples of any join
 * operation are generated by a user defined function realizing a kind of
 * factory method. The binary function <code>newResult</code> can be used to
 * map the result tuples to an arbitrary user defined type. The sweep-line
 * status structure, here called sweep-area, consists of a bag with an
 * additional method for reorganisation. The way the elements of the input
 * iterations are inserted into the according sweep-area is determined by a
 * given comparator. Depending on the result of the comparison of the two next
 * input elements of the input iterations, the left (<code>sortedInput0</code>)
 * or right (<code>sortedInput1</code>) input is processed. If the left input
 * is processed, the sweep-areas of <code>sortedInput0</code> and
 * <code>sortedInput1</code> are reorganized and the next element of
 * <code>sortedInput0</code> is inserted in <code>sweepArea0</code>. After that
 * <code>sweepArea1</code> is queried with this element, i.e., the specified
 * predicate is applied on the elements contained in <code>sweepArea1</code>
 * and the last inserted element of <code>sweepArea0</code>. In order to
 * perform an effective search in the sweep-area, the query method of the bags
 * should be overridden. If the binary predicate returns <code>true</code> the
 * evaluated tuple becomes a result of the join operation. After that query the
 * mapping function to create the result-tuples is applied on the results
 * detected by the predicate and after that they are returned to the user. The
 * right input is processed analogously. The implementation is a bit more
 * complex due to additional checks of join-types and the generation of
 * result-tuples where the evaluated join predicate returned
 * <code>false</code>.
 *
 * <p><b>Note:</b> When the given input iteration only implements the interface
 * {@link Iterator} it is wrapped to a cursor by a call to the static
 * method {@link Cursors#wrap(Iterator) wrap}.</p>
 *
 * <p><b>Example usage (1):</b>
 * <code><pre>
 *     LinkedList l1 = new LinkedList();
 *     final LinkedList l2 = new LinkedList();
 *     for (int i = 0; i &lt;= 10; i++) {
 *         // left: odd numbers or can be divided by 4
 *         if (i%2 != 0 || i%4 == 0)
 *             l1.add(new Integer(i));
 *         //right: even numbers
 *         if (i%2 == 0)
 *             l2.add(new Integer(i));
 *     }
 *
 *     SortMergeJoin join = new SortMergeJoin(
 *         l1.listIterator(),
 *         l2.listIterator(),
 *         new SortMergeEquiJoinSA(
 *             new ListSAImplementor(),
 *             0,
 *             2
 *         ),
 *         new SortMergeEquiJoinSA(
 *             new ListSAImplementor(),
 *             1,
 *             2
 *         ),
 *         ComparableComparator.DEFAULT_INSTANCE,
 *         AbstractFunction.IDENTITY
 *     );
 *
 *     join.open();
 *
 *     while (join.hasNext()) {
 *         Object[] result = (Object[])join.next();
 *         System.out.println("Tuple: (" + result[0] + ", " + result[1] + ")");
 *     }
 *
 *     join.close();
 * </pre></code>
 * The input iterations of this simple example are two list iterators. The
 * first one is based on all odd numbers and numbers that can be divided by 4
 * of the interval [0, 10]. The second input iterator contains all even numbers
 * of the same interval. The comparator, a default
 * {@link xxl.core.comparators.ComparableComparator#INTEGER_COMPARATOR comparator}
 * for integers, is used for comparing the elements stored in the sweep-areas
 * based on a {@link java.util.List list}. In this example the specified
 * function to create user defined result-tuples is a rather simple one. The
 * {@link xxl.core.functions.Tuplify tuplify} function delivers the
 * result-tuples in their original representation, namely as an array
 * containing the matching elements of the input iterations. This can simply be
 * seen when printing the result-tuples to the output stream. But in the
 * package {@link xxl.core.relational} there are different factory methods
 * creating a particular kind of tuple, e.g.,
 * {@link xxl.core.relational.tuples.ArrayTuple array-tuples},
 * {@link xxl.core.relational.tuples.ListTuple list-tuples}, that substitute the
 * tuplify function. So, let us consider the output of this join operation,
 * which looks as follows:
 * <pre>
 *     [0, 0]
 *     [4, 4]
 *     [8, 8]
 * </pre></p>
 *
 * @param <I> the type of the elements consumed by this iteration.
 * @param <E> the type of the elements returned by this join operation.
 * @see java.util.Iterator
 * @see xxl.core.cursors.Cursor
 * @see xxl.core.cursors.AbstractCursor
 * @see xxl.core.cursors.joins.NestedLoopsJoin
 * @see xxl.core.cursors.joins.SortMergeEquivalenceJoin
 * @see xxl.core.relational.cursors.SortMergeJoin
 * @see xxl.core.spatial.cursors.Orenstein
 * @see xxl.core.spatial.cursors.PlaneSweep
 */
// use switchable bags in those cases where the sweep area does not fit in
// main memory (external bags)
public class SortMergeJoin<I, E> extends AbstractCursor<E> {

	/**
	 * An enumeration of constants specifying the join types supported by this
	 * class.
	 */
	public static enum Type {

		/**
		 * A constant specifying a theta-join. Only the tuples for which the
		 * specified predicate is <code>true</code> will be returned.
		 */
		THETA_JOIN,

		/**
		 * A constant specifying a left outer-join. The tuples for which the
		 * specified predicate is <code>true</code> as well as all elements of
		 * <code>input0</code> not qualifying concerning the predicate will be
		 * returned. The function <code>newResult</code> is called with an
		 * element of <code>input0</code> and the <code>null</code> value.
		 */
		LEFT_OUTER_JOIN,

		/**
		 * A constant specifying a right outer-join. The tuples for which the
		 * specified predicate is <code>true</code> as well as all elements of
		 * <code>input1</code> not qualifying concerning the predicate will be
		 * returned. The function <code>newResult</code> is called with an
		 * element of <code>input1</code> and the <code>null</code> value.
		 */
		RIGHT_OUTER_JOIN,

		/**
		 * A constant specifying a full outer-join. The tuples for which the
		 * specified predicate is <code>true</code> as well as all tuples
		 * additionally returned by the left and right outer-join will be
		 * returned.
		 */
		OUTER_JOIN
	}

	/**
	 * The first (or left) sorted input iteration of the join operator.
	 */
	protected Cursor<? extends I> sortedInput0;

	/**
	 * The second (or right) sorted input iteration of the join operator.
	 */
	protected Cursor<? extends I> sortedInput1;

	/**
	 * The sweep-area that is used for storing the elements of the first input
	 * iteration (<code>sortedInput0</code>) and that is probed with elements
	 * of the second input iteration (<code>sortedInput1</code>).
	 */
	protected SweepArea<I,I> sweepArea0;

	/**
	 * The sweep-area that is used for storing the elements of the second input
	 * iteration (<code>sortedInput1</code>) and that is probed with elements
	 * of the first input iteration (<code>sortedInput0</code>).
	 */
	protected SweepArea<I,I> sweepArea1;

	/**
	 * An element of one of the input iterations that is currently used for
	 * querying the sweep-area of the other input iteration, i.e., the element
	 * most recently consumed by this operator.
	 */
	protected I queryObject;

	/**
	 * A boolean flag that determines whether the element stored in
	 * <code>queryObject</code> belongs to the first input iteration or not.
	 */
	protected boolean first = false;

	/**
	 * The comparator used to compare the elements of the two input iterations.
	 */
	protected Comparator<? super I> comparator;

	/**
	 * A function that is invoked on each qualifying tuple before it is
	 * returned to the caller concerning a call to the <code>next</code>
	 * method. This binary function works like a kind of factory method
	 * modelling the resulting object (tuple). Be aware that this function
	 * possibly has to handle <code>null</code> values in cases of outer joins.
	 */
	protected Function<? super I, ? extends E> newResult;

	/**
	 * The type of this sort-merge join operator. Determines whether it
	 * calculates a theta- or an outer-join. No constructor of this class
	 * changes the default {@link Type#THETA_JOIN}.
	 */
	protected Type type = Type.THETA_JOIN;

	/**
	 * An iterator holding the precomputed results of the join operator.
	 * Because the <code>query</code> method of sweep-area returns an iterator
	 * of elements that match the given query, this iterator must be stored to
	 * return the remaining elements later.
	 */
	protected Iterator<? extends I> results = new EmptyCursor<I>();

	/**
	 * Creates a new sort-merge join operator backed on two sorted input
	 * iterations using the given sweep-areas to store the input iterations'
	 * elements and probe for join results. Furthermore a function named
	 * <code>newResult</code> can be specified that is invoked on each
	 * qualifying tuple before it is returned to the caller concerning a call
	 * to the <code>next</code> method. This function is a kind of factory
	 * method to model the resulting object.
	 *
	 * <p><b>Precondition:</b> The input iterations have to be sorted!</p>
	 *
	 * @param sortedInput0 the first sorted input iteration to be joined.
	 * @param sortedInput1 the second sorted input iteration to be joined.
	 * @param sweepArea0 the sweep-area used for storing elements of the first
	 *        sorted input iteration (<code>sortedInput0</code>).
	 * @param sweepArea1 the sweep-area used for storing elements of the second
	 *        sorted input iteration (<code>sortedInput1</code>).
	 * @param comparator the comparator that is used for comparing elements of
	 *        the two sorted input iterations.
	 * @param newResult a factory method (function) that takes two parameters
	 *        as argument and is invoked on each tuple where the predicate's
	 *        evaluation result is <code>true</code>, i.e., on each qualifying
	 *        tuple before it is returned to the caller concerning a call to
	 *        the <code>next</code> method.
	 * @throws IllegalArgumentException if <code>comparator</code> or
	 *         <code>newResult</code> is <code>null</code>.
	 */
	public SortMergeJoin(Iterator<? extends I> sortedInput0, Iterator<? extends I> sortedInput1, SweepArea<I,I> sweepArea0, SweepArea<I,I> sweepArea1, Comparator<? super I> comparator, Function<? super I, ? extends E> newResult) {
		this.sortedInput0 = Cursors.wrap(sortedInput0);
		this.sortedInput1 = Cursors.wrap(sortedInput1);
		this.comparator = comparator;
		this.newResult = newResult;
		this.sweepArea0 = sweepArea0;
		this.sweepArea1 = sweepArea1;
		// only the comparator and the result factory are validated here
		if (comparator == null || newResult == null)
			throw new IllegalArgumentException("one of the input arguments was null!");
	}

	/**
	 * Creates a new sort-merge join operator backed on two input iterations
	 * using the given sweep-areas to store the input iterations' elements and
	 * probe for join results. The constructor does not require the two input
	 * iterations to be sorted. The two specified, unary functions
	 * <code>newSorter0</code> and <code>newSorter1</code> will be invoked on
	 * the corresponding input iteration in order to get a sorted input.
	 * Furthermore a function named <code>newResult</code> can be specified
	 * that is invoked on each qualifying tuple before it is returned to the
	 * caller concerning a call to the <code>next</code> method. This function
	 * is a kind of factory method to model the resulting object.
	 *
	 * @param input0 the first input iteration to be joined.
	 * @param input1 the second input iteration to be joined.
	 * @param newSorter0 an unary function that sorts the first input iteration
	 *        <code>input0</code>.
	 * @param newSorter1 an unary function that sorts the second input
	 *        iteration <code>input1</code>.
	 * @param sweepArea0 the sweep-area used for storing elements of the first
	 *        input iteration (<code>input0</code>).
	 * @param sweepArea1 the sweep-area used for storing elements of the second
	 *        input iteration (<code>input1</code>).
	 * @param comparator the comparator that is used for comparing elements of
	 *        the two input iterations.
	 * @param newResult a factory method (function) that takes two parameters
	 *        as argument and is invoked on each tuple where the predicate's
	 *        evaluation result is <code>true</code>, i.e., on each qualifying
	 *        tuple before it is returned to the caller concerning a call to
	 *        the <code>next</code> method.
	 * @throws IllegalArgumentException if <code>comparator</code> or
	 *         <code>newResult</code> is <code>null</code>.
	 */
	public SortMergeJoin(Iterator<? extends I> input0, Iterator<? extends I> input1, Function<? super Iterator<? extends I>, ? extends Iterator<? extends I>> newSorter0, Function<? super Iterator<? extends I>, ? extends Iterator<? extends I>> newSorter1, SweepArea<I,I> sweepArea0, SweepArea<I,I> sweepArea1, Comparator<? super I> comparator, Function<? super I, ? extends E> newResult) {
		this(newSorter0.invoke(input0), newSorter1.invoke(input1), sweepArea0, sweepArea1, comparator, newResult);
	}

	/**
	 * Creates a new sort-merge join operator backed on two sorted input
	 * iterations using a parameterless function to create the required
	 * sweep-areas that are used to store the input iterations' elements and
	 * probe for join results. The function is invoked twice, once per
	 * sweep-area. Furthermore a function named <code>newResult</code> can be
	 * specified that is invoked on each qualifying tuple before it is
	 * returned to the caller concerning a call to the <code>next</code>
	 * method. This function is a kind of factory method to model the
	 * resulting object.
	 *
	 * <p><b>Precondition:</b> The input iterations have to be sorted!</p>
	 *
	 * @param sortedInput0 the first sorted input iteration to be joined.
	 * @param sortedInput1 the second sorted input iteration to be joined.
	 * @param newSweepArea a parameterless function creating a new sweep-area
	 *        that is used for storing elements of the sorted input iterations.
	 * @param comparator the comparator that is used for comparing elements of
	 *        the two sorted input iterations.
	 * @param newResult a factory method (function) that takes two parameters
	 *        as argument and is invoked on each tuple where the predicate's
	 *        evaluation result is <code>true</code>, i.e., on each qualifying
	 *        tuple before it is returned to the caller concerning a call to
	 *        the <code>next</code> method.
	 * @throws IllegalArgumentException if <code>comparator</code> or
	 *         <code>newResult</code> is <code>null</code>.
	 */
	public SortMergeJoin(Iterator<? extends I> sortedInput0, Iterator<? extends I> sortedInput1, Function<?, ? extends SweepArea<I,I>> newSweepArea, Comparator<? super I> comparator, Function<? super I, ? extends E> newResult) {
		this(sortedInput0, sortedInput1, newSweepArea.invoke(), newSweepArea.invoke(), comparator, newResult);
	}

	/**
	 * Returns whether an element of the first input iteration is responsible
	 * for the actual element of the sort-based join operator, i.e., if an
	 * element of the first input iteration has been probed against the second
	 * sweep-area to produce the actual element. The implementation simply
	 * returns the field <code>first</code>.
	 *
	 * @return the value of the field <code>first</code>.
	 */
	protected boolean getFirst() {
		return first;
	}

	/**
	 * Opens the join operator, i.e., signals the cursor to reserve resources,
	 * open the input iterations, etc. Before a cursor has been opened calls to
	 * methods like <code>next</code> or <code>peek</code> are not guaranteed
	 * to yield proper results. Therefore <code>open</code> must be called
	 * before a cursor's data can be processed. Multiple calls to
	 * <code>open</code> do not have any effect, i.e., if <code>open</code> was
	 * called the cursor remains in the state <i>opened</i> until its
	 * <code>close</code> method is called.
	 *
	 * <p>Note, that a call to the <code>open</code> method of a closed cursor
	 * usually does not open it again because of the fact that its state
	 * generally cannot be restored when resources are released respectively
	 * files are closed.</p>
	 */
	public void open() {
		if (isOpened)
			return;
		super.open();
		sortedInput0.open();
		sortedInput1.open();
	}

	/**
	 * Closes the cursor, i.e., signals the cursor to clean up resources, close
	 * input iterations and sweep-areas, etc. When a cursor has been closed
	 * calls to methods like <code>next</code> or <code>peek</code> are not
	 * guaranteed to yield proper results. Multiple calls to <code>close</code>
	 * do not have any effect, i.e., if <code>close</code> was called the
	 * cursor remains in the state <i>closed</i>.
	 *
	 * <p>The inputs and sweep-areas are closed in the order
	 * <code>sortedInput0</code>, <code>sweepArea0</code>,
	 * <code>sortedInput1</code>, <code>sweepArea1</code>. Every one of them is
	 * closed even if closing an earlier one throws; the exception is still
	 * propagated to the caller.</p>
	 *
	 * <p>Note, that a closed cursor usually cannot be opened again because of
	 * the fact that its state generally cannot be restored when resources are
	 * released respectively files are closed.</p>
	 */
	public void close() {
		if (isClosed)
			return;
		super.close();
		// nested try/finally: a failing close() must not leak the remaining
		// resources
		try {
			sortedInput0.close();
		}
		finally {
			try {
				sweepArea0.close();
			}
			finally {
				try {
					sortedInput1.close();
				}
				finally {
					sweepArea1.close();
				}
			}
		}
	}

	/**
	 * Returns <code>true</code> if the iteration has more elements. (In other
	 * words, returns <code>true</code> if <code>next</code> or
	 * <code>peek</code> would return an element rather than throwing an
	 * exception.)
	 *
	 * <p>If the stored <code>results</code> iterator is exhausted, elements
	 * are consumed from the inputs until either a probe of the opposite
	 * sweep-area delivers matches (kept in <code>results</code>), an
	 * outer-join result is produced (stored directly in the inherited field
	 * <code>next</code>), or both inputs are exhausted.</p>
	 *
	 * @return <code>true</code> if the cursor has more elements.
	 */
	protected boolean hasNextObject() {
		// matches of the previous probe are still pending
		if (results.hasNext())
			return true;

		queryObject = null;
		// while one of the inputs is not empty
		while (sortedInput0.hasNext() || sortedInput1.hasNext()) {
			// if one of the inputs is empty: process the non-empty input,
			// else compare the two next elements (ties go to the left input)
			boolean inputEmpty = !(sortedInput0.hasNext() && sortedInput1.hasNext());
			first = inputEmpty ?
				sortedInput0.hasNext() :
				comparator.compare(sortedInput0.peek(), sortedInput1.peek()) <= 0;

			if (first) {
				// process LEFT input (passed to the sweep-areas with ID 0)
				queryObject = sortedInput0.next();
				// let the sweep-area that is going to be probed drop elements
				// that are "out-of-date" with respect to queryObject
				sweepArea1.reorganize(queryObject, 0);
				// reorganize the sweep-area queryObject is inserted into as
				// well; this is of HIGH IMPORTANCE for non-equi joins
				sweepArea0.reorganize(queryObject, 0);
				sweepArea0.insert(queryObject);
				results = sweepArea1.query(queryObject, 0);
				if (results.hasNext())
					return true;
				// No match found. For (left) outer joins emit
				// (queryObject, null), unless the next element of the right
				// input compares equal to queryObject.
				if ((type == Type.LEFT_OUTER_JOIN || type == Type.OUTER_JOIN) && !(sortedInput1.hasNext() && comparator.compare(queryObject, sortedInput1.peek()) == 0)) {
					next = newResult.invoke(queryObject, null);
					return true;
				}
			}
			else {
				// process RIGHT input (passed to the sweep-areas with ID 1)
				queryObject = sortedInput1.next();
				// let the sweep-area that is going to be probed drop elements
				// that are "out-of-date" with respect to queryObject
				sweepArea0.reorganize(queryObject, 1);
				// reorganize the sweep-area queryObject is inserted into as
				// well; this is of HIGH IMPORTANCE for non-equi joins
				sweepArea1.reorganize(queryObject, 1);
				sweepArea1.insert(queryObject);
				results = sweepArea0.query(queryObject, 1);
				if (results.hasNext())
					return true;
				// No match found. For (right) outer joins emit
				// (null, queryObject), unless the next element of the left
				// input compares equal to queryObject.
				if ((type == Type.RIGHT_OUTER_JOIN || type == Type.OUTER_JOIN) && !(sortedInput0.hasNext() && comparator.compare(queryObject, sortedInput0.peek()) == 0)) {
					next = newResult.invoke(null, queryObject);
					return true;
				}
			}
		}
		return false;
	}

	/**
	 * Returns the next element in the iteration. This element will be
	 * accessible by some of the cursor's methods, e.g., <code>update</code> or
	 * <code>remove</code>, until a call to <code>next</code> or
	 * <code>peek</code> occurs. This is because calling <code>next</code> or
	 * <code>peek</code> proceeds the iteration and therefore its previous
	 * element will not be accessible any more.
	 *
	 * <p>If <code>results</code> still holds a match, it is combined with
	 * <code>queryObject</code> by <code>newResult</code>; the element that
	 * stems from the first input is always passed as first argument.
	 * Otherwise the value already stored in the inherited field
	 * <code>next</code> by <code>hasNextObject</code> (an outer-join result)
	 * is returned.</p>
	 *
	 * @return the next element in the iteration.
	 */
	protected E nextObject() {
		if (!results.hasNext())
			return next;
		I match = results.next();
		if (first)
			return newResult.invoke(queryObject, match);
		return newResult.invoke(match, queryObject);
	}

	/**
	 * Resets the sort-merge join operator to its initial state such that the
	 * caller is able to traverse the join result again without constructing a
	 * new join operator (optional operation). Both inputs are reset, both
	 * sweep-areas are cleared and pending results are discarded.
	 *
	 * <p>Note, that this operation is optional and might not work for all
	 * cursors.</p>
	 *
	 * @throws UnsupportedOperationException if the <code>reset</code>
	 *         operation is not supported by the cursor.
	 */
	public void reset() throws UnsupportedOperationException {
		super.reset();
		sortedInput0.reset();
		sweepArea0.clear();
		sortedInput1.reset();
		sweepArea1.clear();
		results = new EmptyCursor<I>();
	}

	/**
	 * Returns <code>true</code> if the <code>reset</code> operation is
	 * supported by the sort-merge join operator. Otherwise it returns
	 * <code>false</code>. This implementation always returns
	 * <code>true</code>.
	 *
	 * @return <code>true</code> if the <code>reset</code> operation is
	 *         supported by the sort-merge join operator, otherwise
	 *         <code>false</code>.
	 */
	public boolean supportsReset() {
		return true;
	}

	/**
	 * Returns a string representation of the object, namely the current
	 * content of both sweep-areas: a header line per sweep-area followed by
	 * one line per contained element.
	 *
	 * @return string representation of the object.
	 */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		appendSweepArea(sb, "sweep area 0:\n", sweepArea0);
		appendSweepArea(sb, "sweep area 1:\n", sweepArea1);
		return sb.toString();
	}

	/**
	 * Appends the given header followed by one line per element of the given
	 * sweep-area to the string builder.
	 *
	 * @param sb the string builder the text is appended to.
	 * @param header the header text preceding the elements.
	 * @param sweepArea the sweep-area whose elements are listed.
	 */
	private void appendSweepArea(StringBuilder sb, String header, SweepArea<I,I> sweepArea) {
		sb.append(header);
		Iterator<?> content = sweepArea.iterator();
		while (content.hasNext())
			sb.append(content.next()).append("\n");
	}
}