package gr.ntua.ivml.mint.util; // filename: ExternalSort.java import java.util.*; import java.io.*; /** * Goal: offer a generic external-memory sorting program in Java. * * It must be : * - hackable (easy to adapt) * - scalable to large files * - sensibly efficient. * * This software is in the public domain. * * By (in alphabetical order) * Philippe Beaudoin, Jon Elsas, Christan Grant, Daniel Haran, Daniel Lemire, * April 2010 * originally posted at * http://www.daniel-lemire.com/blog/archives/2010/04/01/external-memory-sorting-in-java/ */ public class ExternalSort { // how many files to merge maximally public static int NWAYMERGE = 16; // buffer to use in the n-way-merge public static int NWAYBUFFER = (1<<20); // how much memory we want to use public static long estimateBestSizeOfBlocks() { Runtime.getRuntime().gc(); long freemem = Runtime.getRuntime().freeMemory(); return (long) (freemem*0.7f); } /** * This will simply load the file by blocks of x rows, then * sort them in-memory, and write the result to a bunch of * temporary files that have to be merged later. * * If the sort can fit into memory it just writes the output straight away * @param file some flat file * @return a list of temporary flat files */ public static List<File> sortInBatch(Reader inFile, Writer outFile, Comparator<String> cmp) throws IOException { List<File> files = new ArrayList<File>(); BufferedReader fbr = new BufferedReader( inFile ); long blocksize = estimateBestSizeOfBlocks();// in bytes System.out.println( "Using " + blocksize + " bytes of memory"); try{ List<String> tmplist = new ArrayList<String>(); List<String> pass = new ArrayList<String>(); String line = ""; try { while(line != null) { long currentblocksize = 0;// in bytes while((currentblocksize < blocksize) &&( (line = fbr.readLine()) != null) ){ // as long as you have 2MB if( line.startsWith("#")) // pass through to outfile pass.add( line ); else { tmplist.add(line); currentblocksize += line.length() * 2; // java uses 16 bits per character? currentblocksize += 16; // plus some object overhead in the list } } BufferedWriter bw = new BufferedWriter( outFile ); for( String comment: pass ) { bw.write( comment ); bw.newLine(); } pass.clear(); bw.close(); if(( line == null ) && files.isEmpty()) { sortAndSave( tmplist, cmp, outFile ); } else { File tmpFile = File.createTempFile("sortInBatch", "flatfile"); tmpFile.deleteOnExit(); Writer tmpWriter = new FileWriter( tmpFile ); sortAndSave(tmplist,cmp, tmpWriter); files.add( tmpFile ); tmplist.clear(); } } } catch(EOFException oef) { if(tmplist.size()>0) { File tmpFile = File.createTempFile("sortInBatch", "flatfile"); tmpFile.deleteOnExit(); Writer tmpWriter = new FileWriter( tmpFile ); sortAndSave(tmplist,cmp, tmpWriter); files.add( tmpFile ); tmplist.clear(); } } } finally { fbr.close(); } return files; } public static void sortAndSave(List<String> tmplist, Comparator<String> cmp, Writer outFile ) throws IOException { Collections.sort(tmplist,cmp); // BufferedWriter fbw = new BufferedWriter(outFile ); try { for(String r : tmplist) { fbw.write(r); fbw.newLine(); } } finally { fbw.close(); } } /** * This merges a bunch of temporary flat files. This might be a bit chaotic if there are a thousand files. * Sending the head around a lot. Better do n way merges with big buffers. * @param files * @param output file * @return The number of lines sorted. (P. Beaudoin) */ public static int mergeSortedFiles(List<File> files, Writer outputWriter, final Comparator<String> cmp) throws IOException { PriorityQueue<BinaryFileBuffer> pq = new PriorityQueue<BinaryFileBuffer>(11, new Comparator<BinaryFileBuffer>() { public int compare(BinaryFileBuffer i, BinaryFileBuffer j) { return cmp.compare(i.peek(), j.peek()); } } ); ArrayDeque<File> toDoFiles = new ArrayDeque<File>(); toDoFiles.addAll( files ); int rowCounter = 0; while( ! toDoFiles.isEmpty() ) { while( !toDoFiles.isEmpty() && pq.size()<NWAYMERGE) { File f = toDoFiles.pollFirst(); BinaryFileBuffer bfb = new BinaryFileBuffer(f, NWAYBUFFER); pq.add(bfb); } BufferedWriter fbw; if( toDoFiles.isEmpty()) { fbw = new BufferedWriter( outputWriter ); rowCounter=0; } else { File tmpFile = File.createTempFile("sortInBatch", "flatfile"); tmpFile.deleteOnExit(); fbw = new BufferedWriter(new FileWriter(tmpFile)); toDoFiles.addLast( tmpFile ); } try { while(pq.size()>0) { BinaryFileBuffer bfb = pq.poll(); String r = bfb.pop(); fbw.write(r); fbw.newLine(); ++rowCounter; if(bfb.empty()) { bfb.fbr.close(); bfb.originalfile.delete();// we don't need you anymore } else { pq.add(bfb); // add it back } } } finally { fbw.close(); for(BinaryFileBuffer bfb : pq ) bfb.close(); // make sure everything is deleted in case of exception for( File f: files ) { try { if(( f != null) && ( f.exists())) f.delete(); } catch( Exception e ) { // ignore them, its just the garbage } } } } return rowCounter; } /** * A File version of the Reader Writer sort. * @param in * @param out * @param c * @throws IOException */ public static void sort( File in, File out, Comparator<String> c ) throws IOException { Reader r = new FileReader( in ); Writer w = new FileWriter( out ); sort( r,w,c); } /** * Uses external memory (tmp files) to sort the reader into the writer. * If everything fits in the memory, doesnt use tmp-files (obviously). * Your line Comparator needs to be efficient (most critical part). * @param in * @param out * @param c - Compares lines, as elaborate or as simple as you like. * @throws IOException */ public static void sort( Reader in, Writer out, Comparator<String> c ) throws IOException { List<File> l = sortInBatch(in, out, c) ; if( !l.isEmpty()) mergeSortedFiles(l, out, c); } public static void main( String[] args ) { long start = System.currentTimeMillis(); File in = new File( "testfile" ); File out = new File( "testfile.sort" ); TabbedStringComparator c = new TabbedStringComparator(); c.addKey( 2, true, false ); try { sort( in, out, c ); } catch( Exception e ) { e.printStackTrace(); } System.out.printf("%5.3f secs", (System.currentTimeMillis()-start)/1000d ); } } class BinaryFileBuffer { public BufferedReader fbr; public File originalfile; private String cache; private boolean empty; public BinaryFileBuffer(File f, int bufferSize ) throws IOException { originalfile = f; fbr = new BufferedReader(new FileReader(f), bufferSize ); reload(); } public boolean empty() { return empty; } private void reload() throws IOException { try { if((this.cache = fbr.readLine()) == null){ empty = true; cache = null; } else{ empty = false; } } catch(EOFException oef) { empty = true; cache = null; } } public void close() throws IOException { fbr.close(); } public String peek() { if(empty()) return null; return cache.toString(); } public String pop() throws IOException { String answer = peek(); reload(); return answer; } }