/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.jena.atlas.data; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; import java.util.PriorityQueue; import org.apache.jena.atlas.AtlasException; import org.apache.jena.atlas.data.AbortableComparator.Finish; import org.apache.jena.atlas.iterator.Iter; import org.apache.jena.atlas.iterator.IteratorResourceClosing; import org.apache.jena.atlas.lib.Closeable; import org.apache.jena.atlas.lib.Sink; /** * <p> * This data bag will gather items in memory until a size threshold is passed, * at which point it will write out all of the items to disk using the supplied * serializer. * </p> * <p> * After adding is finished, call {@link #iterator()} to set up the data bag for * reading back items and iterating over them. The iterator will retrieve the * items in sorted order using the supplied comparator. * </p> * <p> * IMPORTANT: You may not add any more items after this call. You may * subsequently call {@link #iterator()} multiple times which will give you a * new iterator for each invocation. If you do not consume the entire iterator, * you should call {@link Iter#close(Iterator)} to close any FileInputStreams * associated with the iterator. * </p> * <p> * Additionally, make sure to call {@link #close()} when you are finished to * free any system resources (preferably in a finally block). * </p> * <p> * Implementation Notes: Data is stored in an ArrayList as it comes in. When it * is time to spill, that data is sorted and written to disk. An iterator will * read in each file and perform a merge-sort as the results are returned. * </p> */ public class SortedDataBag<E> extends AbstractDataBag<E> { /** * The the maximum number of files to merge at the same time. Without this, * you can run out of file handles and other bad things. */ protected static int MAX_SPILL_FILES = 100; protected final ThresholdPolicy<E> policy; protected final SerializationFactory<E> serializationFactory; protected final AbortableComparator<E> comparator; protected boolean finishedAdding = false; protected boolean spilled = false; protected boolean closed = false; public SortedDataBag(ThresholdPolicy<E> policy, SerializationFactory<E> serializerFactory, Comparator<? super E> comparator) { this.policy = policy; this.serializationFactory = serializerFactory; this.comparator = new AbortableComparator<E>(comparator); } /** * cancel arranges that further comparisons using the supplied comparator * will abandon the sort in progress. */ public void cancel() { comparator.cancel(); } /** * isCancelled is true iff cancel has been called on this bags comparator. * (Used in testing.) */ public boolean isCancelled() { return comparator.cancelled; } /** * isClosed returns true iff this bag has been closed. * (Used in testing.) */ public boolean isClosed() { return closed; } protected void checkClosed() { if (closed) throw new AtlasException("SortedDataBag is closed, no operations can be performed on it."); } @Override public boolean isSorted() { return true; } @Override public boolean isDistinct() { return false; } @Override public void add(E item) { checkClosed(); if (finishedAdding) throw new AtlasException("SortedDataBag: Cannot add any more items after the writing phase is complete."); if (policy.isThresholdExceeded()) { spill(); } if (memory.add(item)) { policy.increment(item); size++; } } @SuppressWarnings({ "unchecked" }) protected void spill() { // Make sure we have something to spill. if (memory.size() > 0) { OutputStream out; try { out = getSpillStream(); } catch (IOException e) { throw new AtlasException(e); } // Sort the tuples as an array. The CanAbortComparator will sort // that // array using Arrays.sort. The cast to E[] is safe. If the sort is // aborted, don't bother messing around with the serialisation. // We'll // never get around to using it anyway. E[] array = (E[]) memory.toArray(); if (comparator.abortableSort(array) == Finish.COMPLETED) { Sink<E> serializer = serializationFactory.createSerializer(out); try { for (Object tuple : array) { serializer.send((E) tuple); } } finally { serializer.close(); } } spilled = true; policy.reset(); memory.clear(); } } @Override public void flush() { spill(); } protected Iterator<E> getInputIterator(File spillFile) throws FileNotFoundException { InputStream in = getInputStream(spillFile); Iterator<E> deserializer = serializationFactory.createDeserializer(in); return new IteratorResourceClosing<>(deserializer, in); } /** * Returns an iterator over a set of elements of type E. If you do not * exhaust the iterator, you should call * {@link org.apache.jena.atlas.iterator.Iter#close(Iterator)} to be sure * any open file handles are closed. * * @return an Iterator */ @Override public Iterator<E> iterator() { preMerge(); return iterator(getSpillFiles().size()); } @SuppressWarnings({ "unchecked" }) private Iterator<E> iterator(int size) { checkClosed(); int memSize = memory.size(); // Constructing an iterator from this class is not thread-safe (just // like all the the other methods) if (!finishedAdding && memSize > 1) { E[] array = (E[]) memory.toArray(); comparator.abortableSort(array); // don't care if we aborted or not memory = Arrays.asList(array); } finishedAdding = true; if (spilled) { List<Iterator<E>> inputs = new ArrayList<>(size + (memSize > 0 ? 1 : 0)); if (memSize > 0) { inputs.add(memory.iterator()); } for (int i = 0; i < size; i++) { File spillFile = getSpillFiles().get(i); try { Iterator<E> irc = getInputIterator(spillFile); inputs.add(irc); } catch (FileNotFoundException e) { // Close any open streams before we throw an exception for (Iterator<E> it : inputs) { Iter.close(it); } throw new AtlasException("Cannot find one of the spill files", e); } } SpillSortIterator<E> ssi = new SpillSortIterator<>(inputs, comparator); registerCloseableIterator(ssi); return ssi; } else { if (memSize > 0) { return memory.iterator(); } else { return Iter.nullIterator(); } } } private void preMerge() { if (getSpillFiles() == null || getSpillFiles().size() <= MAX_SPILL_FILES) { return; } try { while (getSpillFiles().size() > MAX_SPILL_FILES) { Sink<E> sink = serializationFactory.createSerializer(getSpillStream()); Iterator<E> ssi = iterator(MAX_SPILL_FILES); try { while (ssi.hasNext()) { sink.send(ssi.next()); } } finally { Iter.close(ssi); sink.close(); } List<File> toRemove = new ArrayList<>(MAX_SPILL_FILES); for (int i = 0; i < MAX_SPILL_FILES; i++) { File file = getSpillFiles().get(i); file.delete(); toRemove.add(file); } getSpillFiles().removeAll(toRemove); memory = new ArrayList<>(); } } catch (IOException e) { throw new AtlasException(e); } } @Override public void close() { if (!closed) { closeIterators(); deleteSpillFiles(); memory = null; closed = true; } } /** * An iterator that handles getting the next tuple from the bag. */ protected static class SpillSortIterator<T> implements Iterator<T>, Closeable { private final List<Iterator<T>> inputs; private final Comparator<? super T> comp; private final PriorityQueue<Item<T>> minHeap; public SpillSortIterator(List<Iterator<T>> inputs, Comparator<? super T> comp) { this.inputs = inputs; this.comp = comp; this.minHeap = new PriorityQueue<>(inputs.size()); // Prime the heap for (int i = 0; i < inputs.size(); i++) { replaceItem(i); } } private void replaceItem(int index) { Iterator<T> it = inputs.get(index); if (it.hasNext()) { T tuple = it.next(); minHeap.add(new Item<>(index, tuple, comp)); } } @Override public boolean hasNext() { return (minHeap.peek() != null); } @Override public T next() { if (!hasNext()) { throw new NoSuchElementException(); } Item<T> curr = minHeap.poll(); // Read replacement item replaceItem(curr.getIndex()); return curr.getTuple(); } @Override public void remove() { throw new UnsupportedOperationException("SpillSortIterator.remove"); } @Override public void close() { for (Iterator<T> it : inputs) { Iter.close(it); } } private final class Item<U> implements Comparable<Item<U>> { private final int index; private final U tuple; private final Comparator<? super U> c; public Item(int index, U tuple, Comparator<? super U> c) { this.index = index; this.tuple = tuple; this.c = c; } public int getIndex() { return index; } public U getTuple() { return tuple; } @Override @SuppressWarnings("unchecked") public int compareTo(Item<U> o) { return (null != c) ? c.compare(tuple, o.getTuple()) : ((Comparable<U>) tuple).compareTo(o.getTuple()); } @SuppressWarnings("unchecked") @Override public boolean equals(Object obj) { if (obj instanceof Item) { return compareTo((Item<U>) obj) == 0; } return false; } @Override public int hashCode() { return tuple.hashCode(); } } } }