/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.util;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;

/**
 * A HashingBatchCollector is a {@link Collector} that tries to collect batches of a specific size before sending them
 * to a consumer.
 *
 * <p>
 * Please be aware that input objects (of type T) are used as keys of a hash map while they wait for their batch. Two
 * pending objects that are equal according to equals()/hashCode() therefore collapse into a single entry, which means
 * <b>values may be lost</b>. It is fine to use this collector with classes that do not override equals() and
 * hashCode() but only have the standard identity implementations provided by {@link Object}.
 *
 * <p>
 * This collector can be used in a multi-threaded way; calls to the provided consumer are serialized through a lock
 * that belongs to this collector instance.
 *
 * <p>
 * It is not guaranteed that batches of exact batchSize are provided to the Consumer.
 *
 * @param <T>
 *          The type of input objects.
 *
 * @author Bastian Gloeckle
 */
public final class HashingBatchCollector<T> implements Collector<T, ConcurrentHashMap<T, Object>, Void> {
  /** Placeholder map value; only the keys of the accumulation map carry information. */
  private static final Object PRESENT = new Object();

  /** Shared, unmodifiable characteristics set, as requested by the {@link Collector} contract. */
  private static final Set<Characteristics> CHARACTERISTICS = Collections.unmodifiableSet(
      new HashSet<Characteristics>(Arrays.asList(Characteristics.CONCURRENT, Characteristics.UNORDERED)));

  /** Lock held while a batch is snapshotted and handed to the consumer. */
  private final Object sync = new Object();

  private final int batchSize;

  private final Consumer<T[]> consumer;

  private final Factory<T> factory;

  /**
   * Create a new {@link HashingBatchCollector}.
   *
   * @param batchSize
   *          The size of batches that should be tried to be used when calling the consumer.
   * @param factory
   *          Factory that creates an array of the correct result type with the given length. The arrays created by
   *          this factory will be provided to the consumer.
   * @param consumer
   *          The consumer that will be called as soon as a batch is approximately full (or the finisher identifies
   *          some objects are left).
   */
  public HashingBatchCollector(int batchSize, Factory<T> factory, Consumer<T[]> consumer) {
    this.batchSize = batchSize;
    this.consumer = consumer;
    this.factory = factory;
  }

  /**
   * @return A supplier creating a new, empty accumulation map.
   */
  @Override
  public Supplier<ConcurrentHashMap<T, Object>> supplier() {
    return () -> new ConcurrentHashMap<T, Object>();
  }

  /**
   * @return An accumulator that first flushes the pending elements to the consumer if at least batchSize of them are
   *         waiting, and then records the new element.
   */
  @Override
  public BiConsumer<ConcurrentHashMap<T, Object>, T> accumulator() {
    return (map, element) -> {
      // Cheap unlocked pre-check. flushBatch validates the size again under the lock, so threads that raced past this
      // check after another thread already flushed do not emit tiny leftover batches.
      if (map.size() >= batchSize) {
        flushBatch(map, batchSize, true);
      }
      map.put(element, PRESENT);
    };
  }

  /**
   * @return A combiner that merges the second map into the first one. If the merged map holds at least batchSize
   *         elements, they are sent to the consumer and a new empty map is returned, otherwise the merged map is
   *         returned.
   */
  @Override
  public BinaryOperator<ConcurrentHashMap<T, Object>> combiner() {
    return (left, right) -> {
      left.putAll(right);
      if (left.size() < batchSize) {
        return left;
      }
      // The merged map is dropped afterwards, therefore everything it holds has to be flushed (minimum size 1).
      flushBatch(left, 1, false);
      return supplier().get();
    };
  }

  /**
   * @return A finisher that sends all elements that are still pending to the consumer and returns <code>null</code>.
   */
  @Override
  public Function<ConcurrentHashMap<T, Object>, Void> finisher() {
    return (map) -> {
      flushBatch(map, 1, false);
      return null;
    };
  }

  /**
   * @return An unmodifiable set containing {@link Collector.Characteristics#CONCURRENT} and
   *         {@link Collector.Characteristics#UNORDERED}.
   */
  @Override
  public Set<Characteristics> characteristics() {
    return CHARACTERISTICS;
  }

  /**
   * Sends a snapshot of the keys of the given map to the consumer as a single batch while holding the lock of this
   * collector. Nothing happens if the map is empty or holds fewer than minSize keys at the time the lock is held.
   *
   * @param map
   *          The accumulation map whose keys should be sent.
   * @param minSize
   *          Minimum number of keys the map needs to hold for the batch to be sent.
   * @param removeFlushed
   *          If <code>true</code>, exactly those keys that were sent are removed from the map afterwards. Keys added
   *          concurrently after the snapshot was taken stay in the map.
   */
  private void flushBatch(ConcurrentHashMap<T, Object> map, int minSize, boolean removeFlushed) {
    synchronized (sync) {
      int size = map.size();
      if (size == 0 || size < minSize) {
        return;
      }

      Set<T> batch = new HashSet<T>(map.keySet());
      consumer.accept(createDataArray(batch));

      if (removeFlushed) {
        for (T key : batch) {
          map.remove(key);
        }
      }
    }
  }

  /**
   * Transform the given values into a result array.
   *
   * <p>
   * Relies on {@link Set#toArray(Object[])}: all values end up in the result even if the factory provides an array
   * whose length differs from the requested one.
   */
  private T[] createDataArray(Set<T> values) {
    return values.toArray(factory.create(values.size()));
  }

  /**
   * Creates the arrays that are handed to the consumer.
   *
   * @param <T>
   *          The component type of the created arrays.
   */
  @FunctionalInterface
  public static interface Factory<T> {
    /**
     * Creates and returns an array of the result type with the given length.
     */
    public T[] create(int len);
  }
}