/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.jena.atlas.data; import java.util.Comparator ; import java.util.HashSet ; import java.util.Iterator ; import org.apache.jena.atlas.iterator.Iter ; import org.apache.jena.atlas.iterator.PeekIterator ; import org.apache.jena.atlas.lib.Closeable ; /** * <p> * This data bag will gather distinct items in memory until a size threshold is passed, at which point it will write * out all of the items to disk using the supplied serializer. * </p> * <p> * After adding is finished, call {@link #iterator()} to set up the data bag for reading back items and iterating over them. * The iterator will retrieve only distinct items. * </p> * <p> * IMPORTANT: You may not add any more items after this call. You may subsequently call {@link #iterator()} multiple * times which will give you a new iterator for each invocation. If you do not consume the entire iterator, you should * call {@link Iter#close(Iterator)} to close any FileInputStreams associated with the iterator. * </p> * <p> * Additionally, make sure to call {@link #close()} when you are finished to free any system resources (preferably in a finally block). 
* </p>
 * <p>
 * Implementation Notes: Incoming data is held, free of duplicates, in a HashSet. When it is time to spill,
 * that data is sorted and written to disk. On read-back, an iterator that drops adjacent duplicates is
 * layered on top of the SortedDataBag's iterator.
 * </p>
 */
public class DistinctDataBag<E> extends SortedDataBag<E> {

    /**
     * Creates the bag by delegating to the {@link SortedDataBag} constructor and then
     * installing a {@link HashSet} as the in-memory store.
     *
     * @param policy            passed through to the superclass constructor
     * @param serializerFactory passed through to the superclass constructor
     * @param comparator        passed through to the superclass constructor
     */
    public DistinctDataBag(ThresholdPolicy<E> policy,
                           SerializationFactory<E> serializerFactory,
                           Comparator<E> comparator) {
        super(policy, serializerFactory, comparator);
        // Swap in a HashSet as the in-memory collection (see the implementation notes on the class).
        this.memory = new HashSet<>();
    }

    /**
     * Always reports {@code false}: when nothing has been spilled the items are
     * handed back straight from the in-memory collection, so no ordering is promised.
     *
     * @return {@code false}
     */
    @Override
    public boolean isSorted() {
        return false;
    }

    /**
     * Always reports {@code true}.
     *
     * @return {@code true}
     */
    @Override
    public boolean isDistinct() {
        return true;
    }

    /**
     * Returns an iterator over the items of this bag.
     * <p>
     * Once data has been spilled, the superclass iterator is wrapped in a
     * {@link DistinctReducedIterator} so that runs of equal adjacent items are collapsed.
     * When nothing has been spilled the sort is skipped altogether: the bag is checked
     * for closure, flagged as finished with adding, and the in-memory collection is
     * iterated directly (or {@link Iter#nullIterator()} is returned when it holds nothing).
     * </p>
     *
     * @return an iterator over the items of this bag
     */
    @Override
    public Iterator<E> iterator() {
        if (spilled) {
            return new DistinctReducedIterator<>(super.iterator());
        }

        // Delegating to super.iterator() would also work here, but it would sort for no benefit.
        checkClosed();
        finishedAdding = true;

        if (memory.isEmpty()) {
            return Iter.nullIterator();
        }
        return memory.iterator();
    }

    /**
     * A {@link PeekIterator} that returns one item for each run of adjacent equal items
     * (two {@code null}s count as equal) produced by the wrapped iterator.
     *
     * @param <T> the item type
     */
    protected static class DistinctReducedIterator<T> extends PeekIterator<T> implements Closeable {

        /** The wrapped iterator, retained so that {@link #close()} can release it. */
        private final Iterator<T> underlying;

        /**
         * @param iter the iterator to wrap
         */
        public DistinctReducedIterator(Iterator<T> iter) {
            super(iter);
            this.underlying = iter;
        }

        /**
         * Fetches the next item and then consumes every immediately following item
         * that is equal to it.
         *
         * @return the last item of the consumed run
         */
        @Override
        public T next() {
            T current = super.next();

            // Swallow the rest of the run: advance while the upcoming item matches the one in hand.
            while (hasNext() && sameItem(current, peek())) {
                current = super.next();
            }
            return current;
        }

        /** Passes the wrapped iterator to {@link Iter#close(Iterator)}. */
        @Override
        public void close() {
            Iter.close(underlying);
        }

        /** Null-tolerant equality: two nulls match; otherwise {@code held.equals(upcoming)} decides. */
        private static boolean sameItem(Object held, Object upcoming) {
            if (held == null) {
                return upcoming == null;
            }
            return held.equals(upcoming);
        }
    }
}