/* XXL: The eXtensible and fleXible Library for data processing Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger Head of the Database Research Group Department of Mathematics and Computer Science University of Marburg Germany This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; If not, see <http://www.gnu.org/licenses/>. http://code.google.com/p/xxl/ */ package xxl.core.cursors.groupers; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import xxl.core.cursors.AbstractCursor; import xxl.core.cursors.Cursor; import xxl.core.cursors.Cursors; import xxl.core.cursors.wrappers.IteratorCursor; import xxl.core.functions.Function; import xxl.core.functions.Identity; /** * A hash grouper partitions input data into groups (strict evaluation). When * the hash grouper is initialized a new {@link java.util.HashMap hash-map} is * created. The input data is partitioned concerning this hash-map by invoking * the user defined, unary function on the given iteration's elements. The hash * grouper calls the <code>next</code> method on the input iteration until the * <code>hasNext</code> method returns <code>false</code>, i.e., the iteration * is completely consumed, so a strict evaluation is performed. Every entry in * the hash-map consists of a value and a linked list where the iteration's * elements are added to. This means that the hash grouper partitions the data * physically. A function is used to compute the hash value of an object. With * regard to this value a bucket in the hash-map is assigned, i.e., a new group * starts. * * <p>A call to the <code>next</code> method returns a cursor pointing to the * next group (the next bucket in the hash-map).</p> * * <p><b>Note:</b> If the input iteration is given by an object of the class * {@link java.util.Iterator}, i.e., it does not support the <code>peek</code> * operation, it is internally wrapped to a cursor.</p> * * <p><b>Example usage:</b> * <code><pre> * HashGrouper<Integer> hashGrouper = new HashGrouper<Integer>( * new xxl.core.cursors.sources.Enumerator(21), * new Function<Integer, Integer>() { * public Integer invoke(Integer next) { * return next % 5; * } * } * ); * * hashGrouper.open(); * * while (hashGrouper.hasNext()) { * Cursor<Integer> bucket = hashGrouper.next(); * // a cursor pointing to next group * while (bucket.hasNext()) * System.out.print(bucket.next() + "; "); * System.out.flush(); * System.out.println(); * } * * hashGrouper.close(); * </pre></code> * This example creates a new hash grouper by applying the function to the * objects of an enumerator. So a hash-map is created with the values 0,...,4 * and their corresponding buckets. Every entry in this hash-map consists of a * value and a linked list (bucket) where the iteration's elements are added * to. The implementation of the <code>init</code> method is as follows: * <code><pre> * HashMap<Object, List<E>> hashMap = new HashMap<Object, List<E>>(); * input.open(); * while (input.hasNext()) { * E object = input.next(); * Object value = function.invoke(object); * List<E> list = hashMap.get(value); * if (list == null) * hashMap.put(value, list = new LinkedList<E>()); * list.add(object); * } * groups = hashMap.values().iterator(); * </pre></code> * A collection is returned by applying the method <code>values</code> on the * hash map. This collection contains references to all linked lists (buckets) * used in the hash-map. The method <code>iterator</code> invoked on this * collection returns the elements as an iterator, namely the result-iteration * <code>groups</code>.<br /> * In the use case shown above, this iteration is printed to the output stream, * i.e., the elements contained in the buckets were printed out. The generated * output is as follows: * <pre> * 4; 9; 14; 19; * 3; 8; 13; 18; * 2; 7; 12; 17; * 1; 6; 11; 16; * 0; 5; 10; 15; 20; * </pre> * The partitions concerning the function (object modulo 5) are represented * correctly. * * @param <E> the type of the elements returned by the input iteration. * @see java.util.Iterator * @see xxl.core.cursors.Cursor * @see xxl.core.functions.Function * @see java.util.HashMap * @see xxl.core.cursors.groupers.NestedLoopsGrouper * @see xxl.core.cursors.groupers.SortBasedGrouper */ public class HashGrouper<E> extends AbstractCursor<Cursor<E>> { /** * The given input cursor providing the data to be partitioned. */ protected Cursor<? extends E> input; /** * The function used to partition the elements (hash function). */ protected Function<? super E, ? extends Object> function; /** * An iterator pointing to the resulting groups. */ protected Iterator<List<E>> groups; /** * Creates a new hash grouper backed on an iteration. If an iterator is * given to this constructor it is wrapped to a cursor. * * @param iterator the input iteration delivering the elements to be * partitioned. * @param function the unary function returning a (hash-)value for each * element. */ public HashGrouper(Iterator<? extends E> iterator, Function<? super E, ? extends Object> function) { this.input = Cursors.wrap(iterator); this.function = function; } /** * Creates a new hash grouper using the identity function to partition the * elements. If an iterator is given to this constructor it is wrapped to a * cursor. * * @param iterator the input iteration delivering the elements to be * partitioned. * @see xxl.core.functions.Identity */ public HashGrouper(Iterator<? extends E> iterator) { this(iterator, new Identity<E>()); } /** * Initializes the hash grouper by creating a new * {@link java.util.HashMap hash-map}. Therefore this method partitions the * input data concerning this hash-map by invoking the user defined, unary * function on the iteration's elements. Every entry in this hash-map * consists of a key (hash-value) and a linked list where the iteration's * elements are added to. * * <p>The implementation of this method is as follows: * <code><pre> * HashMap<Object, List<E>> hashMap = new HashMap<Object, List<E>>(); * input.open(); * while (input.hasNext()) { * E object = input.next(); * Object value = function.invoke(object); * List<E> list = hashMap.get(value); * if (list == null) * hashMap.put(value, list = new LinkedList<E>()); * list.add(object); * } * groups = hashMap.values().iterator(); * </pre></code> * A collection is returned by applying the method <tt>values</tt> on the * hash-map. This collection contains references to all linked lists * (buckets) used in the hash-map. The method <tt>iterator</tt> invoked on * this collection returns the elements as an iterator, namely the * result-iteration <tt>groups</tt>. */ protected void init() { HashMap<Object, List<E>> hashMap = new HashMap<Object, List<E>>(); input.open(); while (input.hasNext()) { E object = input.next(); Object value = function.invoke(object); List<E> list = hashMap.get(value); if (list == null) hashMap.put(value, list = new LinkedList<E>()); list.add(object); } groups = hashMap.values().iterator(); } /** * Opens the hash grouper, i.e., signals the cursor to reserve resources * and consume the input iteration to compute its partitions. Before a * cursor has been opened calls to methods like <code>next</code> or * <code>peek</code> are not guaranteed to yield proper results. Therefore * <code>open</code> must be called before a cursor's data can be * processed. Multiple calls to <code>open</code> do not have any effect, * i.e., if <code>open</code> was called the cursor remains in the state * <i>opened</i> until its <code>close</code> method is called. * * <p>Note, that a call to the <code>open</code> method of a closed cursor * usually does not open it again because of the fact that its state * generally cannot be restored when resources are released respectively * files are closed.</p> */ public void open() { if (!isOpened) init(); super.open(); } /** * Closes the hash grouper, i.e., signals the cursor to clean up resources, * close the input iteration, etc. When a cursor has been closed calls to * methods like <code>next</code> or <code>peek</code> are not guaranteed * to yield proper results. Multiple calls to <code>close</code> do not * have any effect, i.e., if <code>close</code> was called the cursor * remains in the state <i>closed</i>. * * <p>Note, that a closed cursor usually cannot be opened again because of * the fact that its state generally cannot be restored when resources are * released respectively files are closed.</p> */ public void close() { if (isClosed) return; super.close(); input.close(); } /** * Returns <code>true</code> if the iteration has more elements. (In other * words, returns <code>true</code> if <code>next</code> or * <code>peek</code> would return an element rather than throwing an * exception.) This happens when the result-iteration <code>groups</code> * has more elements. * * @return <code>true</code> if the hash grouper has more elements, * otherwise <code>false</code>. */ protected boolean hasNextObject() { return groups.hasNext(); } /** * Returns the next element in the iteration, in this case a cursor. A new * {@link xxl.core.cursors.wrappers.IteratorCursor iterator} cursor is * returned by calling the <code>next</code> method on the result-iteration * <code>groups</code>. * * <p>More detailed: The <code>next</code> method invoked on the * result-iteration <code>groups</code> returns a reference to a list, * because the result-iteration consists of references pointing to lists. * The method <code>iterator</code> is applied on this list and the * returned iterator instance is wrapped to a cursor. * * @return the next element in the iteration. */ protected Cursor<E> nextObject() { return new IteratorCursor<E>(groups.next().iterator()); } /** * Resets the hash grouper to its initial state (optional operation). That * means the <code>reset</code> method of the input iteration and the * <code>init</code> method of the hash grouper are called. So the caller * is able to traverse the partitions again. * * @throws UnsupportedOperationException if the <code>reset</code> method * is not supported by the input iteration. */ public void reset() throws UnsupportedOperationException { super.reset(); input.reset(); init(); } /** * Returns <code>true</code> if the <code>reset</code> operation is * supported by the hash grouper. Otherwise it returns <code>false</code>. * * @return <code>true</code> if the <code>reset</code> operation is * supported by the hash grouper, otherwise <code>false</code>. */ public boolean supportsReset() { return input.supportsReset(); } }