package weka.classifiers.rules.sortinghandler; /** * This code is from the book: * * Winder, R and Roberts, G (1998) <em>Developing Java * Software</em>, John Wiley & Sons. * * It is copyright (c) 1997 Russel Winder and Graham Roberts. */ import java.io.File ; import java.io.FileNotFoundException ; import java.io.FileWriter ; import java.io.IOException ; import java.util.Vector ; /** * A function object delivering a balanced merge sort of a file on * the filestore. * * @see Record * @see RecordReader * @see RecordWriter * @see RecordInformation * @see RecordCopyFile * @version 1.0 19.5.97 * @author Russel Winder */ public final class BalancedMergeSort implements FileSort { /** * A per object sort operation. * * @param fileName the <code>String</code> giving the name of the * file to be sorted. * * @param r the <code>RecordInformation</code> factory object for * creating <code>RecordReader</code>s and * <code>RecordWriters</code>s and able to deliver a * <code>Comparator</code>. */ public final void sort(final String fileName, final RecordInformation r) throws FileNotFoundException, IOException { execute(fileName, 20, 2, r) ; } /** * A per object sort operation. * * @param fileName the <code>String</code> giving the name of the * file to be sorted. * * @param blockSize the number of data items in the initial * sorted blocks. * * @param numberOfFiles the number of files to use for initial * dispersion. * * @param r the <code>RecordInformation</code> factory object for * creating <code>RecordReader</code>s and * <code>RecordWriters</code>s and able to deliver a * <code>Comparator</code>. */ public final void sort(final String fileName, final int blockSize, final int numberOfFiles, final RecordInformation r) throws FileNotFoundException, IOException { execute(fileName, blockSize, numberOfFiles, r) ; } /** * A statically accessible sort operation. * * @param fileName the <code>String</code> giving the name of the * file to be sorted. * * @param r the <code>RecordInformation</code> factory object for * creating <code>RecordReader</code>s and * <code>RecordWriters</code>s and able to deliver a * <code>Comparator</code>. */ public static void execute(final String fileName, final RecordInformation r) throws FileNotFoundException, IOException { execute(fileName, 20, 2, r) ; } /** * A statically accessible sort operation. * * @param fileName the <code>String</code> giving the name of the * file to be sorted. * * @param blockSize the number of data items in the initial * sorted blocks. * * @param numberOfFiles the number of files to use for initial * dispersion. * * @param r the <code>RecordInformation</code> factory object for * creating <code>RecordReader</code>s and * <code>RecordWriters</code>s and able to deliver a * <code>Comparator</code>. */ public static void execute(final String fileName, final int blockSize, final int numberOfFiles, final RecordInformation rInfo) throws FileNotFoundException, IOException { // // Create all the files needed for the sorting. // File file = new File (fileName) ; File[] f_A = new File [numberOfFiles] ; File[] f_B = new File [numberOfFiles] ; for (int i = 0 ; i < numberOfFiles ; i++) { f_A[i] = new File ("tmp_A_"+i) ; f_B[i] = new File ("tmp_B_"+i) ; } // // Perform the initial dispersion into the A files. // distributeSortedBlocks(file, f_A, blockSize, rInfo) ; // // Undertake the number of merge loops required to guarantee // that everything is sorted. Remember whether we ended up // with A_0 or B_0 containing the final sorted data. // File[] from = f_A ; File[] to = f_B ; boolean B_isFinal = true ; for (int i = 0 ; merge(from, to, (int)Math.pow(2, i)*blockSize, rInfo) ; i++) { File[] temp = from ; from = to ; to = temp ; B_isFinal = ! B_isFinal ; } // // Copy the data to the final destination. // File fileToCopy = B_isFinal ? f_B[0] : f_A[0] ; RecordCopyFile.execute(fileToCopy, file, rInfo) ; // // Delete all the files. // for (int i = 0 ; i < numberOfFiles ; i++) { f_A[i].delete() ; f_B[i].delete() ; } } /** * Perform the initial dispersion of the data. */ private static void distributeSortedBlocks(final File from, final File[] to, final int blockSize, final RecordInformation rInfo) throws FileNotFoundException, IOException { // // Create a Reader for the original data and a set of Writers // for the A files. // RecordReader reader = rInfo.newRecordReader(from) ; RecordWriter[] writers = new RecordWriter[to.length] ; for (int i = 0 ; i < to.length ; i++) { writers[i] = rInfo.newRecordWriter(to[i]) ; } boolean allDone = false ; while(! allDone) { for (int i = 0 ; ! allDone && (i < writers.length) ; i++) { // // Pull in a few records, put them into the Vector // that is where we are performing the internal sort // that creates the sorted blocks. // Vector v = new Vector () ; for (int j = 0 ; j < blockSize ; j++) { Record r = reader.readRecord() ; if (r == null) { //System.out.println("I read a null record\n"); // // If we cannot read a record then we have // reached the end of the file, so we must be // finished. Well except that we have to // sort and write out this incomplete block // first. // allDone = true ; break ; } /**** my updating ****/ // if (r != null) /**** end of my updating ****/ v.addElement(r) ; } // // Sort the Vector then write it out to the // appropriate A file. // QuicksortVector.execute(v, rInfo.getComparator()) ; for (int j = 0 ; j < v.size() ; j++) { writers[i].writeRecord((Record)v.elementAt(j)) ; } } } // // Be tidy and close all the files. Actually this is // essential to ensure we get a flush. // for (int i = 0 ; i < writers.length ; i++) { writers[i].close() ; } reader.close() ; } /** * Undertake a round of merging. */ private static boolean merge(final File[] from, final File[] to, final int currentBlockSize, final RecordInformation rInfo) throws FileNotFoundException, IOException { // // Open up the set of Readers and the set of Writers. // RecordReader[] readers = new RecordReader[from.length] ; for (int i = 0 ; i < readers.length ; i++) { readers[i] = rInfo.newRecordReader(from[i]) ; } RecordWriter[] writers = new RecordWriter [to.length] ; for (int i = 0 ; i < writers.length ; i++) { writers[i] = rInfo.newRecordWriter(to[i]) ; } // // We make us of an array which hold the next Record for each // of the files -- we need to have the record in memory in // order to compare the keys and so decide which Record to // write to the output file. // // Have another array which is keeping count of how many // Records we take from each of the files so that we can // cease drawing from a given file when we have taken the // appropriate number yet there are still records left. // boolean returnValue = false ; boolean allDone = false ; Record[] items = new Record[readers.length] ; int[] counts = new int[readers.length] ; while (! allDone) { for (int i = 0 ; i < writers.length ; i++) { // // Initialize the array holding the next record from // each of the files. Determine whether we are // finished or not by whether there are any records // left or not. // allDone = true ; for (int j = 0 ; j < readers.length ; j++) { counts[j] = 0 ; items[j] = readers[j].readRecord() ; if (items[j] != null) { counts[j] = 1 ; allDone = false ; } } if (allDone) break ; while (true) { // // Determine which is the next Record to add to // the output stream. If there isn't one then we // get a negative index and we can terminate the // loop. If we do not terminate then there was a // Record and we must write it out. // int index = findAppropriate(items, rInfo.getComparator()) ; if (index < 0) break ; writers[i].writeRecord(items[index]) ; if (i > 0) { // // We have not yet reduced the problem to // only a single file so there must be at // least one more iteration -- we know when // we are finished when everything goes into // a single file. // returnValue = true ; } // // Draw a new Record from the file whose Record // was chosen -- unless of course we have // finished our quota from that file. // if (counts[index] < currentBlockSize) { items[index] = readers[index].readRecord() ; if (items[index] != null) { counts[index]++ ; } } else { items[index] = null ; } } } } // // Be tidy, close all the files. Actually this is essentialy // to ensure there is a flush. // for (int i = 0 ; i < writers.length ; i++) { writers[i].close() ; } for (int i = 0 ; i < readers.length ; i++) { readers[i].close() ; } return returnValue ; } /** * Determine which Record is the one to be output next. * * @param items the array of <code>Records</code> from which to * select the next according to the order relation defined bu * <code>c</code>. * * @param c the <code>Comparator</code> defining the required * order relation on the <code>Record</code>s. * * @return the index in the array of the item that should be * chosen next. */ private static int findAppropriate(final Record[] items, final Comparator c) { // // Assume no output is to be done and then find the first // non-empty entry. // int index = -1 ; for (int i = 0 ; i < items.length ; i++) { if (items[i] != null) { index = i ; break ; } } // // If there were no non-empty entries then do nothing, we are // finshied. Otherwise... // if (index >= 0) { // // ...do a linear search through the items to see which // is the next one to select. // Record value = items[index] ; for (int i = index+1 ; i < items.length ; i++) { if (items[i] != null) { if (c.relation(items[i], value)) { index = i ; value = items[i] ; } } } } return index ; } }