/**
* AnalyzerBeans
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.eobjects.analyzer.util.sort;
import java.io.Closeable;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import org.eobjects.analyzer.util.ImmutableEntry;
import org.apache.metamodel.util.FileHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Sorter, deduplicator and writer that uses temporary files as storage to
 * support high volume sorted data.
 *
 * Appended rows are collected (together with their frequency) in a sorted
 * in-memory buffer. When a new distinct row arrives while the buffer already
 * holds <code>bufferSize</code> distinct rows, the buffer is serialized to a
 * temporary file and emptied. {@link #write(File)} then either writes the
 * buffer directly (if nothing was spilled to disk) or merges all temporary
 * files into the destination, summing the frequencies of rows that the
 * comparator considers equal.
 *
 * Note: This class is NOT thread-safe.
 *
 * @param <R>
 *            the row type, HAS to be serializable
 * @param <W>
 *            the writer type used when writing a row to the final destination
 *            file.
 */
public abstract class SortMergeWriter<R extends Serializable, W extends Closeable> {

    private static final Logger logger = LoggerFactory.getLogger(SortMergeWriter.class);

    /**
     * Size of the "records in memory" buffer
     */
    private final int _bufferSize;

    /**
     * Comparator for row sorting
     */
    private final Comparator<? super R> _comparator;

    /**
     * List of temporary files containing values
     */
    private final List<File> _tempFiles;

    /**
     * Buffer containing sorted rows in memory, mapped to their frequency
     */
    private final Map<R, Integer> _buffer;

    /**
     * Accumulated frequency of appended null rows (kept outside of the
     * buffer)
     */
    private int _nullCount;

    /**
     * Creates a writer with a buffer size of 50000 distinct rows.
     *
     * @param comparator
     *            the comparator used for sorting and deduplicating rows
     */
    public SortMergeWriter(Comparator<? super R> comparator) {
        this(50000, comparator);
    }

    /**
     * @param bufferSize
     *            the number of distinct rows to hold in memory before the
     *            buffer is flushed to a temporary file
     * @param comparator
     *            the comparator used for sorting and deduplicating rows
     */
    public SortMergeWriter(int bufferSize, Comparator<? super R> comparator) {
        _bufferSize = bufferSize;
        _tempFiles = new ArrayList<File>();
        _buffer = new TreeMap<R, Integer>(comparator);
        _comparator = comparator;
    }

    /**
     * Appends a single occurrence of a row.
     *
     * @param line
     *            the row to append, may be null
     */
    public void append(R line) {
        append(line, 1);
    }

    /**
     * Appends a row with a given frequency. Rows that are equal according to
     * the comparator have their frequencies summed.
     *
     * @param line
     *            the row to append, may be null
     * @param frequency
     *            the number of occurrences to register for the row
     */
    public void append(R line, int frequency) {
        if (line == null) {
            // special handling of null
            _nullCount += frequency;
            return;
        }

        Integer count = _buffer.get(line);
        if (count == null) {
            // a new distinct row: make room for it if the buffer is full
            if (_buffer.size() == _bufferSize) {
                flushBuffer();
            }
            count = 0;
        }
        count += frequency;
        _buffer.put(line, count);
    }

    /**
     * Serializes the current buffer (in sorted order) to a new temporary file
     * and empties the buffer. If writing fails the partially written file is
     * removed and the buffer is left untouched.
     *
     * @throws IllegalStateException
     *             if the temporary file could not be created or written
     */
    private void flushBuffer() {
        logger.debug("flushBuffer()");
        File file = null;
        ObjectOutputStream oos = null;
        boolean completed = false;
        try {
            file = createTempFile();
            // log the actual number of rows; the final flush issued by
            // write(...) typically holds fewer rows than the buffer size
            logger.info("Writing {} rows to temporary file: {}", _buffer.size(), file);
            oos = new ObjectOutputStream(new FileOutputStream(file));
            for (Entry<R, Integer> entry : _buffer.entrySet()) {
                oos.writeObject(entry.getKey());
                oos.writeInt(entry.getValue());
            }
            // flush explicitly so that write errors surface here instead of
            // possibly being hidden by the closing helper in the finally block
            oos.flush();
            _tempFiles.add(file);
            _buffer.clear();
            completed = true;
        } catch (IOException e) {
            throw new IllegalStateException(e);
        } finally {
            FileHelper.safeClose(oos);
            if (!completed && file != null && file.exists() && !file.delete()) {
                logger.warn("Could not delete incomplete temporary file: {}", file);
            }
        }
    }

    /**
     * Creates a temporary file for holding a flushed buffer. The file is
     * registered for deletion on JVM exit as a safety net.
     *
     * @return a new temporary file
     * @throws IOException
     *             if the file could not be created
     */
    protected File createTempFile() throws IOException {
        File file = File.createTempFile("sort_merge", ".dat");
        file.deleteOnExit();
        return file;
    }

    /**
     * Should null rows (if any) be written in the beginning or in the end of
     * the written file? Subclasses can overwrite this method to define that
     * behaviour.
     *
     * @return true if the null row should be written before all other rows,
     *         false if it should be written after them
     */
    protected boolean writeNullsFirst() {
        return true;
    }

    protected abstract void writeHeader(W writer) throws IOException;

    protected abstract void writeRow(W writer, R row, int count) throws IOException;

    protected abstract W createWriter(File file);

    /**
     * Writes the null row. The default implementation delegates to
     * {@link #writeRow(Closeable, Serializable, int)} with a null row.
     *
     * @param writer
     *            the writer to write to
     * @param nullCount
     *            the accumulated frequency of null rows
     * @throws IOException
     */
    protected void writeNull(W writer, int nullCount) throws IOException {
        writeRow(writer, null, nullCount);
    }

    /**
     * Writes all rows to the file with the given name.
     *
     * @param filename
     *            path of the destination file
     * @return the destination file
     */
    public File write(String filename) {
        File file = new File(filename);
        write(file);
        return file;
    }

    /**
     * Writes the header, the null row (if any) and all distinct rows in
     * sorted order to the destination file. After a successful write the
     * in-memory buffer is empty and any temporary files have been deleted.
     *
     * @param file
     *            the destination file
     * @return the written count of rows
     * @throws IllegalStateException
     *             wrapping any exception that occurs while writing
     */
    public int write(final File file) {
        W writer = null;
        ObjectInputStream[] tempFileObjectInputStreams = null;
        boolean succeeded = false;
        try {
            writer = createWriter(file);
            writeHeader(writer);

            int rowCount = 0;
            final boolean writeNullsFirst = writeNullsFirst();
            final boolean hasNulls = _nullCount > 0;

            if (hasNulls && writeNullsFirst) {
                writeNull(writer, _nullCount);
                rowCount++;
            }

            if (_tempFiles.isEmpty()) {
                logger.info("No temp files created yet, flushing buffer directly to target file: {}", file);
                rowCount += writeBuffer(writer);
            } else {
                if (!_buffer.isEmpty()) {
                    // make the remaining in-memory rows part of the merge
                    flushBuffer();
                }
                tempFileObjectInputStreams = createTempFileObjectInputStreams();
                rowCount += mergeTempFiles(writer, tempFileObjectInputStreams);
            }

            if (hasNulls && !writeNullsFirst) {
                writeNull(writer, _nullCount);
                rowCount++;
            }

            succeeded = true;
            return rowCount;
        } catch (Exception e) {
            throw new IllegalStateException(e);
        } finally {
            FileHelper.safeClose(writer);
            closeAll(tempFileObjectInputStreams);
            if (succeeded) {
                // only after the streams are closed; open files cannot be
                // deleted on all platforms
                deleteTempFiles();
            }
        }
    }

    /**
     * Writes the in-memory buffer directly to the writer and empties it.
     *
     * @return the number of rows written
     */
    private int writeBuffer(W writer) throws IOException {
        int rowCount = 0;
        for (Entry<R, Integer> entry : _buffer.entrySet()) {
            writeRow(writer, entry.getKey(), entry.getValue());
            rowCount++;
        }
        _buffer.clear();
        return rowCount;
    }

    /**
     * Merges the (individually sorted) temporary files into the writer,
     * summing the counts of rows that are equal according to the comparator.
     *
     * @return the number of rows written
     */
    private int mergeTempFiles(W writer, ObjectInputStream[] inputStreams) throws Exception {
        final int fileCount = inputStreams.length;

        // one candidate slot per temp file; null means "needs a new row"
        final List<Entry<R, Integer>> rowCandidates = new ArrayList<Entry<R, Integer>>(fileCount);
        for (int i = 0; i < fileCount; i++) {
            rowCandidates.add(null);
        }

        int rowCount = 0;
        while (true) {
            readNextRows(rowCandidates, inputStreams);

            final Entry<R, Integer> smallest = findSmallest(rowCandidates);
            if (smallest == null) {
                // the writing is done!
                break;
            }

            // consume every candidate equal to the smallest row and sum up
            // their counts
            final R row = smallest.getKey();
            int count = 0;
            for (int i = 0; i < fileCount; i++) {
                final Entry<R, Integer> candidate = rowCandidates.get(i);
                if (candidate != null && _comparator.compare(candidate.getKey(), row) == 0) {
                    count += candidate.getValue().intValue();
                    rowCandidates.set(i, null);
                }
            }

            writeRow(writer, row, count);
            rowCount++;
        }
        return rowCount;
    }

    /**
     * Finds the candidate that sorts first according to the comparator.
     *
     * @return the first-sorting non-null candidate, or null if all candidate
     *         slots are empty
     */
    private Entry<R, Integer> findSmallest(List<Entry<R, Integer>> rowCandidates) {
        Entry<R, Integer> smallest = null;
        for (Entry<R, Integer> candidate : rowCandidates) {
            if (candidate == null) {
                continue;
            }
            if (smallest == null || _comparator.compare(candidate.getKey(), smallest.getKey()) < 0) {
                smallest = candidate;
            }
        }
        return smallest;
    }

    /**
     * Fills every empty candidate slot with the next row of the
     * corresponding temporary file. A stream that reaches its end is closed
     * and its array element is set to null.
     */
    private void readNextRows(List<Entry<R, Integer>> nextRows, ObjectInputStream[] tempFileObjectInputStreams)
            throws Exception {
        for (int i = 0; i < tempFileObjectInputStreams.length; i++) {
            final ObjectInputStream in = tempFileObjectInputStreams[i];
            if (in == null || nextRows.get(i) != null) {
                // file exhausted, or its current row is not yet consumed
                continue;
            }
            try {
                @SuppressWarnings("unchecked")
                final R row = (R) in.readObject();
                final int count = in.readInt();
                final Entry<R, Integer> entry = new ImmutableEntry<R, Integer>(row, count);
                nextRows.set(i, entry);
            } catch (EOFException e) {
                FileHelper.safeClose(in);
                tempFileObjectInputStreams[i] = null;
            }
        }
    }

    /**
     * Opens an object input stream for every temporary file. If any file
     * fails to open, the streams opened so far are closed before the
     * exception propagates.
     */
    private ObjectInputStream[] createTempFileObjectInputStreams() throws IOException {
        final ObjectInputStream[] streams = new ObjectInputStream[_tempFiles.size()];
        boolean allOpened = false;
        try {
            for (int i = 0; i < streams.length; i++) {
                streams[i] = openTempFile(_tempFiles.get(i));
            }
            allOpened = true;
            return streams;
        } finally {
            if (!allOpened) {
                closeAll(streams);
            }
        }
    }

    /**
     * Opens a single temporary file for reading, making sure the underlying
     * file stream is closed if the object stream cannot be created.
     */
    @SuppressWarnings("resource")
    private ObjectInputStream openTempFile(File tempFile) throws IOException {
        final FileInputStream fileInputStream = new FileInputStream(tempFile);
        boolean wrapped = false;
        try {
            final ObjectInputStream ois = new ObjectInputStream(fileInputStream);
            wrapped = true;
            return ois;
        } finally {
            if (!wrapped) {
                FileHelper.safeClose(fileInputStream);
            }
        }
    }

    /**
     * Closes all non-null streams of the array (which may itself be null).
     */
    private void closeAll(ObjectInputStream[] streams) {
        if (streams == null) {
            return;
        }
        for (int i = 0; i < streams.length; i++) {
            if (streams[i] != null) {
                FileHelper.safeClose(streams[i]);
            }
        }
    }

    /**
     * Deletes all temporary files and forgets about them. Files that cannot
     * be deleted are logged and remain registered for deletion on JVM exit
     * when created by the default {@link #createTempFile()}.
     */
    private void deleteTempFiles() {
        for (File tempFile : _tempFiles) {
            if (tempFile.exists() && !tempFile.delete()) {
                logger.warn("Could not delete temporary file: {}", tempFile);
            }
        }
        _tempFiles.clear();
    }
}