package ch.rgw.io; // filename: ExternalSort.java import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.EOFException; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.PriorityQueue; import java.util.Vector; /** * Goal: offer a generic external-memory sorting program in Java. * * It must be : - hackable (easy to adapt) - scalable to large files - sensibly efficient. * * This software is in the public domain. * * By Daniel Lemire, April 2010 http://www.daniel-lemire.com/ */ public class ExternalSort { /** * This will simply load the file by blocks of x rows, then sort them in-memory, and write the * result to a bunch of temporary files that have to be merged later. * * @param file * some flat file * @return a list of temporary flat files */ public static List<File> sortInBatch(File file, Comparator<String> cmp) throws IOException{ List<File> files = new Vector<File>(); BufferedReader fbr = new BufferedReader(new FileReader(file)); long totalrowread = 0; try { List<String> tmplist = new Vector<String>(); String line = ""; try { while (line != null) { tmplist = new Vector<String>(); while ((Runtime.getRuntime().freeMemory() > 2097152) && ((line = fbr.readLine()) != null)) { // as long // as you // have 2MB tmplist.add(line); } files.add(sortAndSave(tmplist, cmp)); tmplist.clear(); } } catch (EOFException oef) { if (tmplist.size() > 0) { files.add(sortAndSave(tmplist, cmp)); tmplist.clear(); } } } finally { fbr.close(); } return files; } public static File sortAndSave(List<String> tmplist, Comparator<String> cmp) throws IOException{ Collections.sort(tmplist, cmp); File newtmpfile = File.createTempFile("sortInBatch", "flatfile"); newtmpfile.deleteOnExit(); BufferedWriter fbw = new BufferedWriter(new FileWriter(newtmpfile)); try { for (String r : tmplist) { fbw.write(r); fbw.newLine(); } } finally { fbw.close(); } return newtmpfile; } /** * This merges a bunch of temporary flat files * * @param files * @param output * file */ public static int mergeSortedFiles(List<File> files, File outputfile, Comparator<String> cmp) throws IOException{ PriorityQueue<BinaryFileBuffer> pq = new PriorityQueue<BinaryFileBuffer>(); for (File f : files) { BinaryFileBuffer bfb = new BinaryFileBuffer(f, cmp); pq.add(bfb); } BufferedWriter fbw = new BufferedWriter(new FileWriter(outputfile)); int rowcounter = 0; try { while (pq.size() > 0) { BinaryFileBuffer bfb = pq.poll(); String r = bfb.pop(); fbw.write(r); fbw.newLine(); ++rowcounter; if (bfb.empty()) { bfb.fbr.close(); bfb.originalfile.delete();// we don't need you anymore } else { pq.add(bfb); // add it back } } } finally { fbw.close(); } return rowcounter; } public static void main(String[] args) throws IOException{ if (args.length < 2) { System.out.println("please provide input and output file names"); return; } String inputfile = args[0]; String outputfile = args[1]; Comparator<String> comparator = new Comparator<String>() { public int compare(String r1, String r2){ return r1.compareTo(r2); } }; List<File> l = sortInBatch(new File(inputfile), comparator); mergeSortedFiles(l, new File(outputfile), comparator); } public void sortFile(File in, File out, Comparator<String> cmp) throws IOException{ List<File> l = sortInBatch(in, cmp); mergeSortedFiles(l, out, cmp); } static class BinaryFileBuffer implements Comparable<BinaryFileBuffer> { public static int BUFFERSIZE = 512; public BufferedReader fbr; private List<String> buf = new Vector<String>(); int currentpointer = 0; Comparator<String> mCMP; public File originalfile; public BinaryFileBuffer(File f, Comparator<String> cmp) throws IOException{ originalfile = f; mCMP = cmp; fbr = new BufferedReader(new FileReader(f)); reload(); } public boolean empty(){ return buf.size() == 0; } private void reload() throws IOException{ buf.clear(); try { String line; while ((buf.size() < BUFFERSIZE) && ((line = fbr.readLine()) != null)) buf.add(line); } catch (EOFException oef) {} } public String peek(){ if (empty()) return null; return buf.get(currentpointer); } public String pop() throws IOException{ String answer = peek(); ++currentpointer; if (currentpointer == buf.size()) { reload(); currentpointer = 0; } return answer; } public int compareTo(BinaryFileBuffer b){ return mCMP.compare(peek(), b.peek()); } } }