// This software is released into the Public Domain. See copying.txt for details.
package org.openstreetmap.osmosis.core.sort.common;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.openstreetmap.osmosis.core.lifecycle.Closeable;
import org.openstreetmap.osmosis.core.lifecycle.ReleasableIterator;
import org.openstreetmap.osmosis.core.store.ChunkedObjectStore;
import org.openstreetmap.osmosis.core.store.ObjectSerializationFactory;
import org.openstreetmap.osmosis.core.store.PersistentIterator;
import org.openstreetmap.osmosis.core.store.Storeable;
/**
 * Sorts a large number of objects by writing them to disk in individually
 * sorted chunks and then combining the chunks with a merge sort.
 * <p>
 * Objects are supplied through {@link #add(Storeable)}, the sorted result is
 * obtained through {@link #iterate()}, and {@link #close()} must be called
 * when the sorter is no longer required.
 *
 * @param <T>
 *            The object type to be sorted.
 * @author Brett Henderson
 */
public class FileBasedSort<T extends Storeable> implements Closeable {

    /**
     * The maximum number of entities to sort in memory at once. Larger inputs
     * are split into chunks of this size, each chunk is sorted in memory
     * before being written to file, and the chunks are then combined using
     * the merge sort algorithm.
     */
    private static final int MAX_MEMORY_SORT_COUNT = 16384;

    /**
     * The maximum number of sources to merge together at a single level of the
     * merge sort hierarchy. Must be 2 or higher. A standard merge sort is 2.
     */
    private static final int MAX_MERGE_SOURCE_COUNT = 2;

    /**
     * The number of levels in the merge sort hierarchy to perform in memory
     * before persisting to a file. By persisting to file at regular hierarchy
     * levels, the number of file handles is minimised. File handle count is
     * likely to be an issue before memory usage due to the small number of
     * in-flight objects at any point in time.
     * <p>
     * The number of file handles will be MAX_MERGE_SOURCE_COUNT raised to the
     * power of MAX_MEMORY_SORT_DEPTH.
     */
    private static final int MAX_MEMORY_SORT_DEPTH = 8;

    private final ObjectSerializationFactory serializationFactory;
    private final Comparator<T> comparator;
    private final ChunkedObjectStore<T> chunkedEntityStore;
    private final List<T> addBuffer;
    private final boolean useCompression;

    /**
     * Creates a new instance.
     *
     * @param serializationFactory
     *            The factory defining the object serialisation implementation.
     * @param comparator
     *            The comparator to be used for sorting the results.
     * @param useCompression
     *            If true, the storage files will be compressed.
     */
    public FileBasedSort(
            ObjectSerializationFactory serializationFactory, Comparator<T> comparator, boolean useCompression) {
        this.serializationFactory = serializationFactory;
        this.comparator = comparator;
        this.useCompression = useCompression;

        chunkedEntityStore = new ChunkedObjectStore<T>(serializationFactory, "emta", "idx", useCompression);
        addBuffer = new ArrayList<T>(MAX_MEMORY_SORT_COUNT);
    }

    /**
     * Sorts the data currently in the add buffer, writes it to the object
     * store as a single chunk, and clears the buffer. Does nothing when the
     * buffer is empty, so no chunk is closed without any data having been
     * written to it.
     */
    private void flushAddBuffer() {
        // An empty buffer has nothing to sort or store. This is routinely the
        // case when iterate() is called straight after add() has flushed a
        // full buffer.
        if (addBuffer.isEmpty()) {
            return;
        }

        // Sort the chunk prior to writing.
        Collections.sort(addBuffer, comparator);

        // Write all entities in the buffer to entity storage.
        for (T entity : addBuffer) {
            chunkedEntityStore.add(entity);
        }
        addBuffer.clear();

        // Close the chunk in the underlying data store so that it can be
        // read separately.
        chunkedEntityStore.closeChunk();
    }

    /**
     * Adds a new object to be sorted.
     *
     * @param value
     *            The data object.
     */
    public void add(T value) {
        addBuffer.add(value);

        // Once the add buffer is full it must be sorted and written to entity
        // storage.
        if (addBuffer.size() >= MAX_MEMORY_SORT_COUNT) {
            flushAddBuffer();
        }
    }

    /**
     * Wraps the iterate method with the same argument list, persisting the
     * sort results prior to returning. This forces all sorting by nested
     * recursive method calls to be performed so that all associated memory
     * can be freed.
     *
     * @param nestLevel
     *            The current recursive nesting level of the merge sort
     *            operation.
     * @param beginChunkIndex
     *            The initial chunk to begin sorting from.
     * @param chunkCount
     *            The number of chunks to sort.
     * @return An iterator providing access to the sort result.
     */
    private ReleasableIterator<T> iteratePersisted(int nestLevel, long beginChunkIndex, long chunkCount) {
        // Create a persistent iterator based on the requested underlying
        // chunk iterator.
        ReleasableIterator<T> persistentIterator = new PersistentIterator<T>(
                serializationFactory,
                iterate(nestLevel, beginChunkIndex, chunkCount),
                "emtb",
                useCompression);

        boolean primed = false;
        try {
            // Prime the persistent iterator. This will cause all data to be
            // read from the underlying iterator into the persistent store.
            persistentIterator.hasNext();
            primed = true;

            return persistentIterator;
        } finally {
            // Release the persistent iterator and its underlying source
            // iterator if the persistence operation failed.
            if (!primed) {
                persistentIterator.close();
            }
        }
    }

    /**
     * Sorts the specified sub-section of the overall storage contents. This
     * result is not backed by a file and should be persisted prior to being
     * incorporated into a higher level merge operation.
     *
     * @param nestLevel
     *            The current recursive nesting level of the merge sort
     *            operation.
     * @param beginChunkIndex
     *            The initial chunk to begin sorting from.
     * @param chunkCount
     *            The number of chunks to sort.
     * @return An iterator providing access to the sort result.
     */
    private ReleasableIterator<T> iterate(int nestLevel, long beginChunkIndex, long chunkCount) {
        List<ReleasableIterator<T>> sources = new ArrayList<ReleasableIterator<T>>();

        try {
            // If we are down to a small number of chunks, we retrieve each
            // source from file. Otherwise we recurse and split the chunk
            // range down into smaller sub-ranges.
            if (chunkCount <= MAX_MERGE_SOURCE_COUNT) {
                for (long i = 0; i < chunkCount; i++) {
                    sources.add(chunkedEntityStore.iterate(beginChunkIndex + i));
                }
            } else {
                /*
                 * The current chunk count is divided by
                 * MAX_MERGE_SOURCE_COUNT and we recurse for each sub-range of
                 * that size. Where the division isn't exact, the remainder is
                 * added to the sub-range size so that the range never splits
                 * into more than MAX_MERGE_SOURCE_COUNT sources.
                 */
                long subChunkCount = chunkCount / MAX_MERGE_SOURCE_COUNT;
                subChunkCount += chunkCount % MAX_MERGE_SOURCE_COUNT;

                // We can never pass beyond the chunk boundaries specified for
                // this function.
                long maxChunkIndex = beginChunkIndex + chunkCount;

                for (
                        long subFirstChunk = beginChunkIndex;
                        subFirstChunk < maxChunkIndex;
                        subFirstChunk += subChunkCount) {
                    // The chunk count passed to the nested function should
                    // not make the nested function exceed this function's
                    // boundaries.
                    if (subFirstChunk + subChunkCount > maxChunkIndex) {
                        subChunkCount = maxChunkIndex - subFirstChunk;
                    }

                    /*
                     * Either call the persistent or standard version of the
                     * recursive iterate based on whether this nesting level
                     * requires persistence. If we only have one chunk left at
                     * this point we make an exception and skip persistence
                     * because it will only result in a single file being
                     * opened anyway.
                     */
                    if (((nestLevel + 1) % MAX_MEMORY_SORT_DEPTH) == 0 && subChunkCount > 1) {
                        sources.add(iteratePersisted(nestLevel + 1, subFirstChunk, subChunkCount));
                    } else {
                        sources.add(iterate(nestLevel + 1, subFirstChunk, subChunkCount));
                    }
                }
            }

            // Create a merging iterator to merge all of the sources.
            MergingIterator<T> mergingIterator = new MergingIterator<T>(sources, comparator);

            // The merging iterator owns the sources now, so we clear our copy
            // of them to prevent them being released on method exit.
            sources.clear();

            return mergingIterator;
        } finally {
            // Only reached with remaining sources if an error occurred before
            // ownership was handed to the merging iterator.
            for (ReleasableIterator<T> source : sources) {
                source.close();
            }
        }
    }

    /**
     * Sorts and returns the contents of the sorter. Any objects still held in
     * the add buffer are flushed to the object store first.
     *
     * @return An iterator providing access to the sorted entities.
     */
    public ReleasableIterator<T> iterate() {
        flushAddBuffer();

        return iterate(0, 0, chunkedEntityStore.getChunkCount());
    }

    /**
     * {@inheritDoc}
     */
    public void close() {
        chunkedEntityStore.close();
    }
}