/* XXL: The eXtensible and fleXible Library for data processing Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger Head of the Database Research Group Department of Mathematics and Computer Science University of Marburg Germany This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; If not, see <http://www.gnu.org/licenses/>. http://code.google.com/p/xxl/ */ package xxl.core.cursors.groupers; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import xxl.core.cursors.AbstractCursor; import xxl.core.functions.AbstractFunction; import xxl.core.functions.Function; import xxl.core.functions.Identity; import xxl.core.math.functions.AggregationFunction; import xxl.core.math.statistics.parametric.aggregates.Count; import xxl.core.predicates.Equal; import xxl.core.predicates.Predicate; /** * An aggregate grouper provides a grouping and an aggregation in only one step. * This could be useful in the case of a lack of memory if all data objects must * have been seen to provide a grouping meaning the grouping is data-driven. * * <p>An instance of this class could easily be used to construct an equi-width * histogram of numerical data. To do so one must provide a function to identify * the corresponding histogram bucket similar to a hashing function and a * function for aggregation normally an average function. The corresponding code * could be like this: * <pre> * Cursor histogram = new AggregateGrouper( * new DiscreteRandomNumber(new JavaDiscreteRandomWrapper(10), 500), * new AbstractFunction() { * int start = 0; * int binWidth = 3; * public Object invoke(Object integer) { * int v = ((Integer)integer).intValue(); * int bin = v / binWidth; * return "[ " + Integer.toString(bin*binWidth) + ", " + Integer.toString((bin+1)*binWidth) + ")"; * } * }, * new AbstractFunction() { * public Object invoke() { * return new StatefulAverage(); * } * } * ); * </pre></p> * * <p><b>Further examples:</b><br /> * cumulative frequency distribution: * <pre> * Cursor cfd = new CFDCursor( * new DiscreteRandomNumber(new JavaDiscreteRandomWrapper(10), 100) * ); * * cfd.open(); * * System.out.println ("cumulative frequency distribution (unsorted):"); * while (cfd.hasNext()) { * Object[] next = (Object[])cfd.next(); * System.out.println(next[0] + " has occured " + next[1] + " times"); * } * * cfd.close(); * </pre> * remove duplicates: * <pre> * Cursor noDubs = new Mapper( * new DuplicatesRemover( * new DiscreteRandomNumber(new JavaDiscreteRandomWrapper(10), 100) * ), * new AbstractFunction() { * public Object invoke(Object o) { * return ((Object[])o)[0]; * } * } * ); * * noDubs.open(); * * System.out.println("remove any duplicates"); * while (noDubs.hasNext()) * System.out.println(noDubs.next()); * * noDubs.close(); * </pre> * cumulative frequency distribution (sorted by value): * <pre> * Cursor cfdSort = new ReplacementSelection( * new CFDCursor( * new DiscreteRandomNumber(new JavaDiscreteRandomWrapper(10), 100) * ), * 100, * new Comparator() { * public int compare(Object o1, Object o2) { * return ((Integer)((Object[])o1)[0]).intValue() - ((Integer)((Object[])o2)[0]).intValue(); * } * } * ); * * cfdSort.open(); * * System.out.println("cumulative frequency distribution (sorted by value):"); * while (cfdSort.hasNext()) { * Object[] next = (Object[])cfdSort.next(); * System.out.println(next[0] + " has occured " + next[1] + " times"); * } * * cfdSort.close(); * </pre> * cumulative frequency distribution (sorted by frequency) * <pre> * Cursor cfdSort2 = new ReplacementSelection( * new CFDCursor( * new DiscreteRandomNumber(new JavaDiscreteRandomWrapper(10), 100) * ), * 100, * new Comparator() { * public int compare(Object o1, Object o2) { * return ((Long)((Object[])o1)[1]).intValue() - ((Long)((Object[])o2)[1]).intValue(); * } * } * ); * * cfdSort2.open(); * * System.out.println("cumulative frequency distribution (sorted by frequency):"); * while (cfdSort2.hasNext()) { * Object[] next = (Object[])cfdSort2.next(); * System.out.println(next[0] + " has occured " + next[1] + " times"); * } * * cfdSort2.close(); * </pre></p> * * @see java.util.Iterator * @see xxl.core.cursors.Cursor * @see xxl.core.functions.Function */ public class AggregateGrouper extends AbstractCursor { /** * The input iteration providing the data to be grouped and aggregated. */ protected Iterator iterator; /** * A function providing representatives for objects (the identity as a rule). */ protected Function representatives; /** * A factory for the aggregation functions used for aggregating the data * (e.g., count). */ protected Function aggregationFunctionFactory; /** * The number of already delivered data. */ protected int pos; /** * A list storing the aggregated tuples internally. */ protected List cumulative; /** * A predicate providing the equality of the representatives of the objects. */ protected Predicate equals; /* * Cumulative Frequency Distribution (CFD) */ /** * This class provides a cumulative frequency distribution (CFD) of given * data. To do this all equal data will be stored and counted. Calling the * <tt>next</tt> method delivers an <tt>Object[]</tt> containing an object * and the number of occurences of this object or equal objects. */ public static class CFDCursor extends AggregateGrouper { /** * Creates a new CFD cursor. * * @param iterator the input iteration prodiving the data. */ public CFDCursor(Iterator iterator) { super( iterator, new AbstractFunction() { public Object invoke() { return new Count(); } } ); } } /** * This class provides a duplicate remover. */ public static class DuplicatesRemover extends AggregateGrouper { /** * Creates a new duplicate remover. * * @param iterator the input iteration providing the data. */ public DuplicatesRemover(Iterator iterator) { super( iterator, new AbstractFunction() { public Object invoke() { return new AggregationFunction() { public Object invoke(Object old, Object next) { return null; } }; } } ); } } /** * Constructs a new aggregate grouper. * * @param iterator the input iteration delivering the data to aggregate. * @param representatives this function provides to any given object a * representative to identify the corresponding objects. Complies to a * hashing function but unlike determining the hash bucket this * function returns a substitute for each object. This substitute will * be returned combined with the computed aggregation value by calling * the <tt>next</tt> method. * @param aggregationFunctionFactory a factory method providing new * aggregation functions. * @param equals equality predicate to determine the equality (resp. * inequaltity) of two given representatives. Using this predicate one * is able to use different representatives for the same group. */ public AggregateGrouper(Iterator iterator, Function representatives, Function aggregationFunctionFactory, Predicate equals) { this.iterator = iterator; this.representatives = representatives; this.aggregationFunctionFactory = aggregationFunctionFactory; this.equals = equals; cumulative = new LinkedList(); } /** * Constructs a new aggregate grouper using an * {@link xxl.core.predicates.Equal equality} predicate as default. * * @param iterator the input iteration delivering the data to aggregate. * @param representatives this function provides to any given object a * representative to identify the corresponding objects. Complies to a * hashing function but unlike determining the hash bucket this * function returns a substitute for each object. This substitute will * be returned combined with the computed aggregation value by calling * the <tt>next</tt> method. * @param aggregationFunctionFactory a factory method providing new * aggregation functions. */ public AggregateGrouper(Iterator iterator, Function representatives, Function aggregationFunctionFactory) { this( iterator, representatives, aggregationFunctionFactory, Equal.DEFAULT_INSTANCE ); } /** * Constructs a new aggregate grouper using the default an * {@link xxl.core.predicates.Equal equality} predicate and an * {@link xxl.core.functions.Identity#DEFAULT_INSTANCE identity} function for the * representative mapping as default. * * @param iterator the input iteration delivering the data to aggregate. * @param aggregationFunctionFactory a factory method providing new * aggregation functions. */ public AggregateGrouper(Iterator iterator, Function aggregationFunctionFactory) { this( iterator, Identity.DEFAULT_INSTANCE, aggregationFunctionFactory ); } /** * Consumes every given object. I.e., determines the representive of the * given object and computes the aggregation. * * @param next the next object to consume. */ private void consume(Object next) { Object value = representatives.invoke(next); Iterator it = cumulative.iterator(); boolean notFound = true; while (it.hasNext() && notFound) { Object[] rep = (Object[])it.next(); if (equals.invoke(value, rep[0])) { notFound = false; rep[1] = ((AggregationFunction)rep[2]).invoke(rep[1], next); } } if (notFound) { Object[] c = new Object[3]; c[2] = aggregationFunctionFactory.invoke(); c[0] = value; c[1] = ((AggregationFunction)c[2]).invoke(null, next); cumulative.add(c); } } /** * Opens the aggregate grouper, i.e., signals the cursor to reserve * resources and consume the input iteration to compute its groups. Before a * cursor has been opened calls to methods like <tt>next</tt> or * <tt>peek</tt> are not guaranteed to yield proper results. Therefore * <tt>open</tt> must be called before a cursor's data can be processed. * Multiple calls to <tt>open</tt> do not have any effect, i.e., if * <tt>open</tt> was called the cursor remains in the state <i>opened</i> * until its <tt>close</tt> method is called. * * <p>Note, that a call to the <tt>open</tt> method of a closed cursor * usually does not open it again because of the fact that its state * generally cannot be restored when resources are released respectively * files are closed.</p> */ public void open() { if (!isOpened) { pos = 0; while (iterator.hasNext()) consume(iterator.next()); } super.open(); } /** * Returns <tt>true</tt> if the iteration has more elements. (In other * words, returns <tt>true</tt> if <tt>next</tt> or <tt>peek</tt> would * return an element rather than throwing an exception.) * * @return <tt>true</tt> if the cursor has more elements. */ protected boolean hasNextObject() { return pos < cumulative.size(); } /** * Returns the next element in the iteration. This element will be * accessible by some of the cursor's methods, e.g., <tt>update</tt> or * <tt>remove</tt>, until a call to <tt>next</tt> or <tt>peek</tt> occurs. * This is calling <tt>next</tt> or <tt>peek</tt> proceeds the iteration and * therefore its previous element will not be accessible any more. * * @return the next element in the iteration. */ protected Object nextObject() { Object[] n = new Object[2]; n[0] = ((Object[])cumulative.get(pos))[0]; n[1] = ((Object[])cumulative.get(pos))[1]; pos++; return n; } }