/*******************************************************************************
* GenPlay, Einstein Genome Analyzer
* Copyright (C) 2009, 2014 Albert Einstein College of Medicine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu>
* Nicolas Fourel <nicolas.fourel@einstein.yu.edu>
* Eric Bouhassira <eric.bouhassira@einstein.yu.edu>
*
* Website: <http://genplay.einstein.yu.edu>
******************************************************************************/
package edu.yu.einstein.genplay.core.IO.fileSorter;
// filename: ExternalSort.java
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.zip.Deflater;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
/**
 * Line-oriented view over a {@link BufferedReader} that always keeps the
 * next unread line cached, so it can be inspected with {@link #peek()}
 * without being consumed.
 */
class BinaryFileBuffer {
    /** The wrapped reader. */
    public BufferedReader fbr;
    /** The next unread line, or <code>null</code> once the reader is exhausted. */
    private String cache;
    /** <code>true</code> once the reader has no more lines to offer. */
    private boolean empty;

    /**
     * Wraps the given reader and immediately reads its first line.
     *
     * @param r
     *                the reader to take lines from
     * @throws IOException
     *                 if reading the first line fails
     */
    public BinaryFileBuffer(BufferedReader r) throws IOException {
        this.fbr = r;
        reload();
    }

    /**
     * Closes the wrapped reader.
     */
    public void close() throws IOException {
        this.fbr.close();
    }

    /**
     * @return <code>true</code> if there is no line left to read
     */
    public boolean empty() {
        return this.empty;
    }

    /**
     * @return the next line without consuming it, or <code>null</code> if
     *         the buffer is empty
     */
    public String peek() {
        return empty() ? null : this.cache;
    }

    /**
     * Returns the next line and advances to the following one.
     *
     * @return the line that was at the head of the buffer, or
     *         <code>null</code> if the buffer was already empty
     */
    public String pop() throws IOException {
        final String head = peek();
        reload();
        return head;
    }

    /**
     * Reads the next line into the cache. Both a <code>null</code> line and
     * an {@link EOFException} mark the buffer as empty.
     */
    private void reload() throws IOException {
        String next;
        try {
            next = this.fbr.readLine();
        } catch (EOFException eof) {
            next = null;
        }
        this.cache = next;
        this.empty = (next == null);
    }
}
/**
* Goal: offer a generic external-memory sorting program in Java.
*
* It must be : - hackable (easy to adapt) - scalable to large files - sensibly
* efficient.
*
* This software is in the public domain.
*
* Usage: java com/google/code/externalsorting/ExternalSort somefile.txt out.txt
*
* You can change the default maximal number of temporary files with the -t
* flag: java com/google/code/externalsorting/ExternalSort somefile.txt out.txt
* -t 3
*
* For very large files, you might want to use an appropriate flag to allocate
* more memory to the Java VM: java -Xms2G
* com/google/code/externalsorting/ExternalSort somefile.txt out.txt
*
* By (in alphabetical order) Philippe Beaudoin, Eleftherios Chetzakis, Jon
* Elsas, Christan Grant, Daniel Haran, Daniel Lemire, Sugumaran Harikrishnan,
* Jerry Yang, First published: April 2010 originally posted at
* http://lemire.me/blog/archives/2010/04/01/external-memory-sorting-in-java/
*/
public class ExternalSort {
/** Number of temporary files used as the bound when the caller does not give one. */
static int DEFAULTMAXTEMPFILES = 1024;
/** Comparator ordering lines with {@link String#compareTo(String)}. */
public static Comparator<String> defaultcomparator = new Comparator<String>() {
    @Override
    public int compare(String left, String right) {
        final int order = left.compareTo(right);
        return order;
    }
};
/**
 * Prints the command line usage and the list of supported flags to the
 * standard output.
 */
public static void displayUsage() {
    final String[] usageLines = {
        "java com.google.externalsorting.ExternalSort inputfile outputfile",
        "Flags are:",
        "-v or --verbose: verbose output",
        "-d or --distinct: prune duplicate lines",
        "-t or --maxtmpfiles (followed by an integer): specify an upper bound on the number of temporary files",
        "-c or --charset (followed by a charset code): specify the character set to use (for sorting)",
        "-z or --gzip: use compression for the temporary files",
        "-H or --header (followed by an integer): ignore the first few lines",
        "-s or --store (following by a path): where to store the temporary files",
        "-h or --help: display this message"
    };
    for (String usageLine : usageLines) {
        System.out.println(usageLine);
    }
}
/**
 * Estimates the size, in bytes, of the blocks the input file is split
 * into. Blocks that are too small create too many temporary files; blocks
 * that are too big use too much memory.
 *
 * @param filetobesorted
 *                the file whose length drives the estimate
 * @param maxtmpfiles
 *                the number of temporary files the doubled file size is
 *                divided by
 * @return the block size in bytes: the per-file share of the doubled file
 *         size (rounded up), or half of the currently free memory when
 *         that is larger
 */
public static long estimateBestSizeOfBlocks(File filetobesorted,
        int maxtmpfiles) {
    // The file length is doubled because memory usage is modelled as two
    // bytes per character.
    final long estimatedBytes = 2 * filetobesorted.length();
    // Share of the input per temporary file, rounded up so that no more
    // than maxtmpfiles blocks are needed.
    long perFileBytes = estimatedBytes / maxtmpfiles;
    if ((estimatedBytes % maxtmpfiles) != 0) {
        perFileBytes++;
    }
    // There is no point in creating many small temporary files when
    // memory is available: never go below half of the free memory.
    final long halfFreeMemory = Runtime.getRuntime().freeMemory() / 2;
    return Math.max(perFileBytes, halfFreeMemory);
}
/**
 * Command line entry point: parses the flags listed by
 * {@link #displayUsage()}, sorts the input file into temporary files and
 * merges them into the output file.
 *
 * @param args
 *                the flags plus the input and output file names; the first
 *                argument that is not a recognized flag is the input file,
 *                the second one the output file
 * @throws IOException
 *                 if reading the input or writing the output fails
 */
public static void main(String[] args) throws IOException {
    boolean verbose = false;
    boolean distinct = false;
    int maxtmpfiles = DEFAULTMAXTEMPFILES;
    Charset cs = Charset.defaultCharset();
    String inputfile = null, outputfile = null;
    File tempFileStore = null;
    boolean usegzip = false;
    int headersize = 0;
    for (int param = 0; param < args.length; ++param) {
        final String arg = args[param];
        // flags taking a value are only recognized when a value follows
        final boolean hasValue = args.length > (param + 1);
        if (arg.equals("-v") || arg.equals("--verbose")) {
            verbose = true;
        } else if (arg.equals("-h") || arg.equals("--help")) {
            displayUsage();
            return;
        } else if (arg.equals("-d") || arg.equals("--distinct")) {
            distinct = true;
        } else if ((arg.equals("-t") || arg.equals("--maxtmpfiles"))
                && hasValue) {
            param++;
            maxtmpfiles = Integer.parseInt(args[param]);
            // The original code tested headersize here, so an invalid
            // value was never reported and zero later caused a division
            // by zero when the block size was estimated.
            if (maxtmpfiles <= 0) {
                System.err
                        .println("maxtmpfiles should be positive");
                maxtmpfiles = DEFAULTMAXTEMPFILES;
            }
        } else if ((arg.equals("-c") || arg.equals("--charset"))
                && hasValue) {
            param++;
            cs = Charset.forName(args[param]);
        } else if (arg.equals("-z") || arg.equals("--gzip")) {
            usegzip = true;
        } else if ((arg.equals("-H") || arg.equals("--header"))
                && hasValue) {
            param++;
            headersize = Integer.parseInt(args[param]);
            if (headersize < 0) {
                System.err
                        .println("headersize should be positive");
            }
        } else if ((arg.equals("-s") || arg.equals("--store"))
                && hasValue) {
            param++;
            tempFileStore = new File(args[param]);
        } else {
            if (inputfile == null) {
                inputfile = arg;
            } else if (outputfile == null) {
                outputfile = arg;
            } else {
                System.out.println("Unparsed: "
                        + arg);
            }
        }
    }
    // outputfile is only set after inputfile, so this covers both names
    if (outputfile == null) {
        System.out
                .println("please provide input and output file names");
        displayUsage();
        return;
    }
    Comparator<String> comparator = defaultcomparator;
    List<File> l = sortInBatch(new File(inputfile), comparator,
            maxtmpfiles, cs, tempFileStore, distinct, headersize,
            usegzip);
    if (verbose) {
        System.out
                .println("created " + l.size() + " tmp files");
    }
    mergeSortedFiles(l, new File(outputfile), comparator, cs,
            distinct, false, usegzip);
}
/**
 * This merges several BinaryFileBuffer to an output writer. The writer
 * and every buffer of the list are closed before this method returns,
 * whether the merge succeeds or fails.
 *
 * @param fbw
 *                A buffer where we write the data.
 * @param cmp
 *                A comparator object that tells us how to sort the lines.
 * @param distinct
 *                Pass <code>true</code> if duplicate lines should be
 *                discarded. (elchetz@gmail.com)
 * @param buffers
 *                Where the data should be read.
 * @return The number of lines read from the buffers, including the
 *         duplicates that were not written. (P. Beaudoin)
 * @throws IOException
 *                 if reading a buffer, writing the output or closing a
 *                 stream fails
 */
public static int merge(BufferedWriter fbw, final Comparator<String> cmp, boolean distinct, List<BinaryFileBuffer> buffers) throws IOException {
    // the queue is ordered on the next unread line of each buffer
    PriorityQueue<BinaryFileBuffer> pq = new PriorityQueue<BinaryFileBuffer>(
            11, new Comparator<BinaryFileBuffer>() {
                @Override
                public int compare(BinaryFileBuffer i,
                        BinaryFileBuffer j) {
                    return cmp.compare(i.peek(), j.peek());
                }
            });
    int rowcounter = 0;
    try {
        for (BinaryFileBuffer bfb : buffers) {
            if (!bfb.empty()) {
                pq.add(bfb);
            }
        }
        String lastLine = null;
        while (!pq.isEmpty()) {
            BinaryFileBuffer bfb = pq.poll();
            String r = bfb.pop();
            // Skip duplicate lines
            if (!distinct || !r.equals(lastLine)) {
                fbw.write(r);
                fbw.newLine();
                lastLine = r;
            }
            ++rowcounter;
            if (bfb.empty()) {
                // release the file handle as soon as the buffer is drained
                bfb.close();
            } else {
                pq.add(bfb); // add it back
            }
        }
    } finally {
        try {
            fbw.close();
        } finally {
            // Close every buffer, not only the ones still queued: buffers
            // that were empty from the start never enter the queue, and a
            // buffer polled just before a failure is not in it either.
            // Closing an already closed reader has no effect.
            IOException closeFailure = null;
            for (BinaryFileBuffer bfb : buffers) {
                try {
                    bfb.close();
                } catch (IOException e) {
                    if (closeFailure == null) {
                        closeFailure = e;
                    }
                }
            }
            if (closeFailure != null) {
                throw closeFailure;
            }
        }
    }
    return rowcounter;
}
/**
 * Merges a bunch of temporary flat files using the
 * {@link #defaultcomparator} and the default charset.
 *
 * @param files
 *                the sorted files to merge
 * @param outputfile
 *                the file the merged lines are written to
 * @return The number of lines sorted. (P. Beaudoin)
 */
public static int mergeSortedFiles(List<File> files, File outputfile) throws IOException {
    final Charset platformCharset = Charset.defaultCharset();
    return mergeSortedFiles(files, outputfile, defaultcomparator,
            platformCharset);
}
/**
 * Merges a bunch of temporary flat files using the given comparator and
 * the default charset.
 *
 * @param files
 *                the sorted files to merge
 * @param outputfile
 *                the file the merged lines are written to
 * @param cmp
 *                the comparator used to order the lines
 * @return The number of lines sorted. (P. Beaudoin)
 */
public static int mergeSortedFiles(List<File> files, File outputfile,
        final Comparator<String> cmp) throws IOException {
    final Charset platformCharset = Charset.defaultCharset();
    return mergeSortedFiles(files, outputfile, cmp, platformCharset);
}
/**
 * Merges a bunch of temporary flat files using the given comparator and
 * the default charset, optionally dropping duplicate lines.
 *
 * @param files
 *                the sorted files to merge
 * @param outputfile
 *                the file the merged lines are written to
 * @param cmp
 *                the comparator used to order the lines
 * @param distinct
 *                pass <code>true</code> if duplicate lines should be
 *                discarded
 * @return The number of lines sorted. (P. Beaudoin)
 */
public static int mergeSortedFiles(List<File> files, File outputfile,
        final Comparator<String> cmp, boolean distinct)
        throws IOException {
    final Charset platformCharset = Charset.defaultCharset();
    return mergeSortedFiles(files, outputfile, cmp, platformCharset,
            distinct);
}
/**
 * Merges a bunch of temporary flat files using the given comparator and
 * charset, keeping duplicate lines.
 *
 * @param files
 *                the sorted files to merge
 * @param outputfile
 *                the file the merged lines are written to
 * @param cmp
 *                the comparator used to order the lines
 * @param cs
 *                character set to use to load the strings
 * @return The number of lines sorted. (P. Beaudoin)
 */
public static int mergeSortedFiles(List<File> files, File outputfile,
        final Comparator<String> cmp, Charset cs) throws IOException {
    final boolean discardDuplicates = false;
    return mergeSortedFiles(files, outputfile, cmp, cs, discardDuplicates);
}
/**
 * Merges a bunch of temporary flat files. The output file is overwritten
 * and the temporary files are read as uncompressed text.
 *
 * @param files
 *                The {@link List} of sorted {@link File}s to be merged.
 * @param outputfile
 *                The output {@link File} to merge the results to.
 * @param cmp
 *                The {@link Comparator} to use to compare
 *                {@link String}s.
 * @param cs
 *                The {@link Charset} to be used for the byte to
 *                character conversion.
 * @param distinct
 *                Pass <code>true</code> if duplicate lines should be
 *                discarded. (elchetz@gmail.com)
 * @return The number of lines sorted. (P. Beaudoin)
 * @since v0.1.2
 */
public static int mergeSortedFiles(List<File> files, File outputfile,
        final Comparator<String> cmp, Charset cs, boolean distinct)
        throws IOException {
    final boolean append = false;
    final boolean usegzip = false;
    return mergeSortedFiles(files, outputfile, cmp, cs, distinct,
            append, usegzip);
}
/**
 * This merges a bunch of temporary flat files into the output file and
 * deletes the temporary files once the merge has completed.
 *
 * @param files
 *                The {@link List} of sorted {@link File}s to be merged.
 * @param outputfile
 *                The output {@link File} to merge the results to.
 * @param cmp
 *                The {@link Comparator} to use to compare
 *                {@link String}s.
 * @param cs
 *                The {@link Charset} to be used for the byte to
 *                character conversion.
 * @param distinct
 *                Pass <code>true</code> if duplicate lines should be
 *                discarded. (elchetz@gmail.com)
 * @param append
 *                Pass <code>true</code> if result should append to
 *                {@link File} instead of overwrite. Default to be false
 *                for overloading methods.
 * @param usegzip
 *                assumes we used gzip compression for temporary files
 * @return The number of lines sorted. (P. Beaudoin)
 * @throws IOException
 *                 if a temporary file cannot be read or the output file
 *                 cannot be written
 * @since v0.1.4
 */
public static int mergeSortedFiles(List<File> files, File outputfile,
        final Comparator<String> cmp, Charset cs, boolean distinct,
        boolean append, boolean usegzip) throws IOException {
    final int BUFFERSIZE = 2048;
    List<BinaryFileBuffer> bfbs = new ArrayList<BinaryFileBuffer>();
    BufferedWriter fbw = null;
    try {
        for (File f : files) {
            InputStream in = new FileInputStream(f);
            boolean buffered = false;
            try {
                if (usegzip) {
                    in = new GZIPInputStream(in, BUFFERSIZE);
                }
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(in, cs));
                bfbs.add(new BinaryFileBuffer(br));
                buffered = true;
            } finally {
                if (!buffered) {
                    // wrapping the stream or reading its first line failed
                    in.close();
                }
            }
        }
        fbw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(outputfile, append), cs));
    } finally {
        if (fbw == null) {
            // A temporary file or the output file could not be opened:
            // release the readers opened so far instead of leaking them.
            for (BinaryFileBuffer bfb : bfbs) {
                try {
                    bfb.close();
                } catch (IOException ignored) {
                    // the failure that brought us here is the one to report
                }
            }
        }
    }
    int rowcounter = merge(fbw, cmp, distinct, bfbs);
    for (File f : files) {
        f.delete();
    }
    return rowcounter;
}
/**
 * Sorts a file (input) to an output file (output) using default
 * parameters.
 *
 * @param input
 *                source file
 * @param output
 *                output file
 */
public static void sort(File input, File output) throws IOException {
    final List<File> sortedChunks = ExternalSort.sortInBatch(input);
    ExternalSort.mergeSortedFiles(sortedChunks, output);
}
/**
 * Sorts a list and saves it to an uncompressed temporary file, keeping
 * duplicate lines.
 *
 * @param tmplist
 *                data to be sorted
 * @param cmp
 *                string comparator
 * @param cs
 *                charset to use for output (can use
 *                Charset.defaultCharset())
 * @param tmpdirectory
 *                location of the temporary files (set to null for
 *                default location)
 * @return the file containing the sorted data
 */
public static File sortAndSave(List<String> tmplist,
        Comparator<String> cmp, Charset cs, File tmpdirectory)
        throws IOException {
    final boolean distinct = false;
    final boolean usegzip = false;
    return sortAndSave(tmplist, cmp, cs, tmpdirectory, distinct, usegzip);
}
/**
 * Sorts a list in place and saves it, one element per line, to a newly
 * created temporary file that is registered for deletion on exit.
 *
 * @param tmplist
 *                data to be sorted
 * @param cmp
 *                string comparator
 * @param cs
 *                charset to use for output (can use
 *                Charset.defaultCharset())
 * @param tmpdirectory
 *                location of the temporary files (set to null for
 *                default location)
 * @param distinct
 *                Pass <code>true</code> if duplicate lines should be
 *                discarded.
 * @param usegzip
 *                Pass <code>true</code> to gzip the temporary file.
 * @return the file containing the sorted data
 */
public static File sortAndSave(List<String> tmplist,
        Comparator<String> cmp, Charset cs, File tmpdirectory,
        boolean distinct, boolean usegzip) throws IOException {
    final int gzipBufferSize = 2048;
    Collections.sort(tmplist, cmp);
    final File sortedChunk = File.createTempFile("sortInBatch",
            "flatfile", tmpdirectory);
    sortedChunk.deleteOnExit();
    OutputStream sink = new FileOutputStream(sortedChunk);
    if (usegzip) {
        // the initializer block sets the deflater to its fastest level
        sink = new GZIPOutputStream(sink, gzipBufferSize) {
            {
                def.setLevel(Deflater.BEST_SPEED);
            }
        };
    }
    final BufferedWriter writer = new BufferedWriter(
            new OutputStreamWriter(sink, cs));
    try {
        String previous = null;
        for (String current : tmplist) {
            // the list is sorted, so equal lines are next to each other
            if (distinct && current.equals(previous)) {
                continue;
            }
            writer.write(current);
            writer.newLine();
            previous = current;
        }
    } finally {
        writer.close();
    }
    return sortedChunk;
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. The {@link #defaultcomparator}, the default charset, the
 * default temporary directory and {@link #DEFAULTMAXTEMPFILES} are used,
 * and duplicate lines are kept.
 *
 * @param file
 *                some flat file
 * @return a list of temporary flat files
 */
public static List<File> sortInBatch(File file)
        throws IOException {
    final Charset platformCharset = Charset.defaultCharset();
    final File defaultTmpDirectory = null;
    final boolean distinct = false;
    return sortInBatch(file, defaultcomparator, DEFAULTMAXTEMPFILES,
            platformCharset, defaultTmpDirectory, distinct);
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. The default charset, the default temporary directory and
 * {@link #DEFAULTMAXTEMPFILES} are used, and duplicate lines are kept.
 *
 * @param file
 *                some flat file
 * @param cmp
 *                string comparator
 * @return a list of temporary flat files
 */
public static List<File> sortInBatch(File file, Comparator<String> cmp)
        throws IOException {
    final Charset platformCharset = Charset.defaultCharset();
    final File defaultTmpDirectory = null;
    final boolean distinct = false;
    return sortInBatch(file, cmp, DEFAULTMAXTEMPFILES, platformCharset,
            defaultTmpDirectory, distinct);
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. The default charset, the default temporary directory and
 * {@link #DEFAULTMAXTEMPFILES} are used.
 *
 * @param file
 *                some flat file
 * @param cmp
 *                string comparator
 * @param distinct
 *                Pass <code>true</code> if duplicate lines should be
 *                discarded.
 * @return a list of temporary flat files
 */
public static List<File> sortInBatch(File file, Comparator<String> cmp,
        boolean distinct) throws IOException {
    final Charset platformCharset = Charset.defaultCharset();
    final File defaultTmpDirectory = null;
    return sortInBatch(file, cmp, DEFAULTMAXTEMPFILES, platformCharset,
            defaultTmpDirectory, distinct);
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. You can specify a bound on the number of temporary
 * files that will be created. No header line is skipped and the
 * temporary files are not compressed.
 *
 * @param file
 *                some flat file
 * @param cmp
 *                string comparator
 * @param maxtmpfiles
 *                maximal number of temporary files
 * @param cs
 *                character set to use (can use
 *                Charset.defaultCharset())
 * @param tmpdirectory
 *                location of the temporary files (set to null for
 *                default location)
 * @param distinct
 *                Pass <code>true</code> if duplicate lines should be
 *                discarded.
 * @return a list of temporary flat files
 */
public static List<File> sortInBatch(File file, Comparator<String> cmp,
        int maxtmpfiles, Charset cs, File tmpdirectory, boolean distinct)
        throws IOException {
    final int numHeader = 0;
    final boolean usegzip = false;
    return sortInBatch(file, cmp, maxtmpfiles, cs, tmpdirectory,
            distinct, numHeader, usegzip);
}
/**
 * This will simply load the file by blocks of lines, then sort them
 * in-memory, and write the result to temporary files that have to be
 * merged later. You can specify a bound on the number of temporary
 * files that will be created.
 *
 * @param file
 *                some flat file
 * @param cmp
 *                string comparator
 * @param maxtmpfiles
 *                maximal number of temporary files
 * @param cs
 *                character set to use (can use
 *                Charset.defaultCharset())
 * @param tmpdirectory
 *                location of the temporary files (set to null for
 *                default location)
 * @param distinct
 *                Pass <code>true</code> if duplicate lines should be
 *                discarded.
 * @param numHeader
 *                number of lines to skip at the top of the file; these
 *                lines are not written to any temporary file
 * @param usegzip
 *                use gzip compression for the temporary files
 * @return a list of temporary flat files; it always holds at least one
 *         file, which is empty when the input has no line to sort
 * @throws IOException
 *                 if the input cannot be read or a temporary file cannot
 *                 be written
 */
public static List<File> sortInBatch(File file, Comparator<String> cmp,
        int maxtmpfiles, Charset cs, File tmpdirectory,
        boolean distinct, int numHeader, boolean usegzip)
        throws IOException {
    // Estimated before the input is opened: the estimation throws when
    // maxtmpfiles is zero and would otherwise leave the reader open.
    long blocksize = estimateBestSizeOfBlocks(file, maxtmpfiles);// in
                                                                 // bytes
    List<File> files = new ArrayList<File>();
    BufferedReader fbr = new BufferedReader(new InputStreamReader(
            new FileInputStream(file), cs));
    try {
        List<String> tmplist = new ArrayList<String>();
        String line = "";
        try {
            int counter = 0;
            while (line != null) {
                long currentblocksize = 0;// in bytes
                while ((currentblocksize < blocksize)
                        && ((line = fbr.readLine()) != null)) {
                    // as long as you have enough memory
                    if (counter < numHeader) {
                        counter++;
                        continue;
                    }
                    tmplist.add(line);
                    // ram usage estimation, not
                    // very accurate, still more
                    // realistic that the simple 2 *
                    // String.length
                    currentblocksize += StringSizeEstimator
                            .estimatedSizeOf(line);
                }
                // When the end of the input falls exactly on a block
                // boundary the last pass reads nothing: do not write a
                // useless empty temporary file, but keep returning at
                // least one file as before.
                if (!tmplist.isEmpty() || files.isEmpty()) {
                    files.add(sortAndSave(tmplist, cmp, cs,
                            tmpdirectory, distinct, usegzip));
                    tmplist.clear();
                }
            }
        } catch (EOFException oef) {
            if (tmplist.size() > 0) {
                files.add(sortAndSave(tmplist, cmp, cs,
                        tmpdirectory, distinct, usegzip));
                tmplist.clear();
            }
        }
    } finally {
        fbr.close();
    }
    return files;
}
}