package de.tudarmstadt.ukp.dkpro.core.io.web1t.util; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.EOFException; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.PriorityQueue; /** * Goal: offer a generic external-memory sorting program in Java. * * It must be : * - hackable (easy to adapt) * - scalable to large files * - sensibly efficient. * * This software is in the public domain. * * Usage: * java com/google/code/externalsorting/ExternalSort somefile.txt out.txt * * You can change the default maximal number of temporary files with the -t flag: * java com/google/code/externalsorting/ExternalSort somefile.txt out.txt -t 3 * * For very large files, you might want to use an appropriate flag to allocate * more memory to the Java VM: * java -Xms2G com/google/code/externalsorting/ExternalSort somefile.txt out.txt * * By (in alphabetical order) * Philippe Beaudoin, Jon Elsas, Christan Grant, Daniel Haran, Daniel Lemire, * April 2010 * originally posted at * http://www.daniel-lemire.com/blog/archives/2010/04/01/external-memory-sorting-in-java/ */ public class ExternalSort { static int DEFAULTMAXTEMPFILES = 1024; // we divide the file into small blocks. If the blocks // are too small, we shall create too many temporary files. // If they are too big, we shall be using too much memory. public static long estimateBestSizeOfBlocks(File filetobesorted, int maxtmpfiles) { long sizeoffile = filetobesorted.length() * 2; /** * We multiply by two because later on someone insisted on counting the memory * usage as 2 bytes per character. By this model, loading a file with 1 character * will use 2 bytes. */ // we don't want to open up much more than maxtmpfiles temporary files, better run // out of memory first. long blocksize = sizeoffile / maxtmpfiles + (sizeoffile % maxtmpfiles == 0 ? 0 : 1) ; // on the other hand, we don't want to create many temporary files // for naught. If blocksize is smaller than half the free memory, grow it. long freemem = Runtime.getRuntime().freeMemory(); if( blocksize < freemem/2) { blocksize = freemem/2; } return blocksize; } /** * This will simply load the file by blocks of x rows, then sort them in-memory, and write the * result to temporary files that have to be merged later. * * @param file * some flat file * @param cmp * string comparator * @return a list of temporary flat files * @throws IOException * if an I/O problem occurs. */ public static List<File> sortInBatch(File file, Comparator<String> cmp) throws IOException { return sortInBatch(file, cmp,DEFAULTMAXTEMPFILES); } /** * This will simply load the file by blocks of x rows, then sort them in-memory, and write the * result to temporary files that have to be merged later. You can specify a bound on the number * of temporary files that will be created. * * @param file * some flat file * @param cmp * string comparator * @param maxtmpfiles * maximum number of temporary files * @return a list of temporary flat files * @throws IOException * if an I/O problem occurs. */ public static List<File> sortInBatch(File file, Comparator<String> cmp, int maxtmpfiles) throws IOException { List<File> files = new ArrayList<File>(); BufferedReader fbr = new BufferedReader(new FileReader(file)); long blocksize = estimateBestSizeOfBlocks(file,maxtmpfiles);// in bytes try{ List<String> tmplist = new ArrayList<String>(); String line = ""; try { while(line != null) { long currentblocksize = 0;// in bytes while((currentblocksize < blocksize) &&( (line = fbr.readLine()) != null) ){ // as long as you have enough memory tmplist.add(line); currentblocksize += line.length() * 2; // java uses 16 bits per character? } files.add(sortAndSave(tmplist,cmp)); tmplist.clear(); } } catch(EOFException oef) { if(tmplist.size()>0) { files.add(sortAndSave(tmplist,cmp)); tmplist.clear(); } } } finally { fbr.close(); } return files; } public static File sortAndSave(List<String> tmplist, Comparator<String> cmp) throws IOException { Collections.sort(tmplist,cmp); File newtmpfile = File.createTempFile("sortInBatch", "flatfile"); newtmpfile.deleteOnExit(); BufferedWriter fbw = new BufferedWriter(new FileWriter(newtmpfile)); try { for(String r : tmplist) { fbw.write(r); fbw.newLine(); } } finally { fbw.close(); } return newtmpfile; } /** * This merges a bunch of temporary flat files * * @param files * the files to merge. * @param outputfile * the target file. * @param cmp * the comprarator. * @return The number of lines sorted. (P. Beaudoin) * @throws IOException * if an I/O problem occurs. */ public static int mergeSortedFiles(List<File> files, File outputfile, final Comparator<String> cmp) throws IOException { PriorityQueue<BinaryFileBuffer> pq = new PriorityQueue<BinaryFileBuffer>(11, new Comparator<BinaryFileBuffer>() { @Override public int compare(BinaryFileBuffer i, BinaryFileBuffer j) { return cmp.compare(i.peek(), j.peek()); } } ); for (File f : files) { BinaryFileBuffer bfb = new BinaryFileBuffer(f); pq.add(bfb); } BufferedWriter fbw = new BufferedWriter(new FileWriter(outputfile)); int rowcounter = 0; try { while(pq.size()>0) { BinaryFileBuffer bfb = pq.poll(); String r = bfb.pop(); fbw.write(r); fbw.newLine(); ++rowcounter; if(bfb.empty()) { bfb.fbr.close(); bfb.originalfile.delete();// we don't need you anymore } else { pq.add(bfb); // add it back } } } finally { fbw.close(); for(BinaryFileBuffer bfb : pq ) bfb.close(); } return rowcounter; } public static void main(String[] args) throws IOException { boolean verbose = false; int maxtmpfiles = DEFAULTMAXTEMPFILES; String inputfile=null, outputfile=null; for(int param = 0; param<args.length; ++param) { if(args[param].equals("-v") || args[param].equals("--verbose")) verbose = true; else if ((args[param].equals("-t") || args[param].equals("--maxtmpfiles")) && args.length>param+1) { param++; maxtmpfiles = Integer.parseInt(args[param]); } else { if(inputfile == null) inputfile = args[param]; else if (outputfile == null) outputfile = args[param]; else System.out.println("Unparsed: "+args[param]); } } if(outputfile == null) { System.out.println("please provide input and output file names"); return; } Comparator<String> comparator = new Comparator<String>() { @Override public int compare(String r1, String r2){ return r1.compareTo(r2);}}; List<File> l = sortInBatch(new File(inputfile), comparator, maxtmpfiles) ; if(verbose) System.out.println("created "+l.size()+" tmp files"); mergeSortedFiles(l, new File(outputfile), comparator); } } class BinaryFileBuffer { public static int BUFFERSIZE = 2048; public BufferedReader fbr; public File originalfile; private String cache; private boolean empty; public BinaryFileBuffer(File f) throws IOException { originalfile = f; fbr = new BufferedReader(new FileReader(f), BUFFERSIZE); reload(); } public boolean empty() { return empty; } private void reload() throws IOException { try { if((this.cache = fbr.readLine()) == null){ empty = true; cache = null; } else{ empty = false; } } catch(EOFException oef) { empty = true; cache = null; } } public void close() throws IOException { fbr.close(); } public String peek() { if(empty()) return null; return cache.toString(); } public String pop() throws IOException { String answer = peek(); reload(); return answer; } }