package com.scaleunlimited.cascading; import java.beans.ConstructorProperties; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; import cascading.flow.FlowProcess; import cascading.operation.BaseOperation; import cascading.operation.Buffer; import cascading.operation.BufferCall; import cascading.operation.Filter; import cascading.operation.FilterCall; import cascading.operation.OperationCall; import cascading.pipe.Each; import cascading.pipe.Every; import cascading.pipe.GroupBy; import cascading.pipe.Pipe; import cascading.pipe.SubAssembly; import cascading.pipe.assembly.Unique; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; @SuppressWarnings({"serial", "rawtypes"}) public class UniqueCount extends SubAssembly { /** * Class FilterPartialDuplicates is a {@link cascading.operation.Filter} * that is used to remove observed duplicates from the tuple stream. * <p/> * Use this class typically in tandem with a * {@link cascading.operation.aggregator.First} * {@link cascading.operation.Aggregator} in order to improve de-duping * performance by removing as many values as possible before the * intermediate {@link cascading.pipe.GroupBy} operator. * <p/> * The {@code threshold} value is used to maintain a LRU of a constant size. * If more than threshold unique values are seen, the oldest cached values * will be removed from the cache. * * @see Unique */ public static class FilterPartialDuplicates extends BaseOperation<LinkedHashMap<Tuple, Object>> implements Filter<LinkedHashMap<Tuple, Object>> { private int threshold = 10000; /** * Constructor FilterPartialDuplicates creates a new * FilterPartialDuplicates instance. */ public FilterPartialDuplicates() { } /** * Constructor FilterPartialDuplicates creates a new * FilterPartialDuplicates instance. * * @param threshold * of type int */ @ConstructorProperties({ "threshold" }) public FilterPartialDuplicates(int threshold) { this.threshold = threshold; } @Override public void prepare(FlowProcess flowProcess, OperationCall<LinkedHashMap<Tuple, Object>> operationCall) { operationCall.setContext(new LinkedHashMap<Tuple, Object>(threshold, 0.75f, true) { @Override protected boolean removeEldestEntry(Map.Entry eldest) { return size() > threshold; } }); } @Override public boolean isRemove(FlowProcess flowProcess, FilterCall<LinkedHashMap<Tuple, Object>> filterCall) { // we assume its more painful to create lots of tuple copies vs // comparisons Tuple args = filterCall.getArguments().getTuple(); if (filterCall.getContext().containsKey(args)) return true; filterCall.getContext().put(filterCall.getArguments().getTupleCopy(), null); return false; } @Override public void cleanup(FlowProcess flowProcess, OperationCall<LinkedHashMap<Tuple, Object>> operationCall) { operationCall.setContext(null); } @Override public boolean equals(Object object) { if (this == object) return true; if (!(object instanceof FilterPartialDuplicates)) return false; if (!super.equals(object)) return false; FilterPartialDuplicates that = (FilterPartialDuplicates) object; if (threshold != that.threshold) return false; return true; } @Override public int hashCode() { int result = super.hashCode(); result = 31 * result + threshold; return result; } } private static class CountUniques extends BaseOperation<NullContext> implements Buffer<NullContext> { private Fields _uniqueFields; private transient Tuple _result;; public CountUniques(Fields uniqueFields, Fields countField) { super(countField); _uniqueFields = uniqueFields; } @Override public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) { super.prepare(flowProcess, operationCall); _result = new Tuple(0); } @Override public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) { // We are being called with all entries for the target group, sorted // by _uniqueFields // Keep track of the current value(s) for the unique fields, and // when it changes increment the count. At the end we can emit the result. Iterator<TupleEntry> iter = bufferCall.getArgumentsIterator(); int count = 1; Tuple oldGroupValue = null; while (iter.hasNext()) { Tuple curGroupValue = iter.next().selectTuple(_uniqueFields); if (oldGroupValue == null) { oldGroupValue = curGroupValue; } if (!oldGroupValue.equals(curGroupValue)) { count += 1; oldGroupValue = curGroupValue; } } _result.set(0, count); bufferCall.getOutputCollector().add(_result); } } /** * Constructor Unique creates a new Unique instance. * * @param pipe * of type Pipe * @param uniqueFields * of type Fields */ @ConstructorProperties({ "pipe", "uniqueFields" }) public UniqueCount(Pipe pipe, Fields groupFields, Fields uniqueFields, Fields countField) { this(null, pipe, groupFields, uniqueFields, countField); } /** * Constructor Unique creates a new Unique instance. * * @param pipe * of type Pipe * @param uniqueFields * of type Fields * @param threshold * of type int */ @ConstructorProperties({ "pipe", "uniqueFields", "threshold" }) public UniqueCount(Pipe pipe, Fields groupFields, Fields uniqueFields, Fields countField, int threshold) { this(null, pipe, groupFields, uniqueFields, countField, threshold); } /** * Constructor UniqueCount creates a new UniqueCount instance. * * @param name * of type String * @param pipe * of type Pipe * @param uniqueFields * of type Fields */ @ConstructorProperties({ "name", "pipe", "uniqueFields" }) public UniqueCount(String name, Pipe pipe, Fields groupFields, Fields uniqueFields, Fields countField) { this(name, pipe, groupFields, uniqueFields, countField, 10000); } /** * Constructor UniqueCount creates a new UniqueCount instance. * * @param name * of type String * @param pipe * of type Pipe * @param uniqueFields * of type Fields * @param threshold * of type int */ @ConstructorProperties({ "name", "pipe", "uniqueFields", "threshold" }) public UniqueCount(String name, Pipe pipe, Fields groupFields, Fields uniqueFields, Fields countField, int threshold) { this(name, Pipe.pipes(pipe), groupFields, uniqueFields, countField, threshold); } /** * Constructor UniqueCount creates a new UniqueCount instance. * * @param pipes * of type Pipe[] * @param uniqueFields * of type Fields */ @ConstructorProperties({ "pipes", "uniqueFields" }) public UniqueCount(Pipe[] pipes, Fields groupFields, Fields uniqueFields, Fields countField) { this(null, pipes, groupFields, uniqueFields, countField, 10000); } /** * Constructor UniqueCount creates a new UniqueCount instance. * * @param pipes * of type Pipe[] * @param uniqueFields * of type Fields * @param threshold * of type int */ @ConstructorProperties({ "pipes", "uniqueFields", "threshold" }) public UniqueCount(Pipe[] pipes, Fields groupFields, Fields uniqueFields, Fields countField, int threshold) { this(null, pipes, groupFields, uniqueFields, countField, threshold); } /** * Constructor UniqueCount creates a new UniqueCount instance. * * @param name * of type String * @param pipes * of type Pipe[] * @param uniqueFields * of type Fields */ @ConstructorProperties({ "name", "pipes", "uniqueFields" }) public UniqueCount(String name, Pipe[] pipes, Fields groupFields, Fields uniqueFields, Fields countField) { this(name, pipes, groupFields, uniqueFields, countField, 10000); } /** * Constructor UniqueCount creates a new UniqueCount instance. This will * count the number of unique values found in uniqueFields, for each group * defined by groupFields, and put the resulting count into countField. * * @param name * of type String * @param pipes * of type Pipe[] * @param uniqueFields * of type Fields * @param threshold * of type int */ @ConstructorProperties({ "name", "pipes", "uniqueFields", "threshold" }) public UniqueCount(String name, Pipe[] pipes, Fields groupFields, Fields uniqueFields, Fields countField, int threshold) { super(pipes); Fields joinedFields = Fields.join(groupFields, uniqueFields); Pipe[] filters = new Pipe[pipes.length]; FilterPartialDuplicates partialDuplicates = new FilterPartialDuplicates(threshold); for (int i = 0; i < filters.length; i++) { filters[i] = new Each(pipes[i], joinedFields, partialDuplicates); } // At this point we need to group by the groupFields, sort by // uniqueFields, and then use a special CountUnique // buffer that uses the sort order to generate unique counts. // The output should be a tuple with the groupFields and the countField Pipe pipe = new GroupBy(name, filters, groupFields, uniqueFields); pipe = new Every(pipe, uniqueFields, new CountUniques(uniqueFields, countField), Fields.SWAP); setTails(pipe); } public Pipe getTailPipe() { Pipe[] tails = getTails(); return tails[0]; } }