package weka.classifiers.rules.sortinghandler;
/**
* This code is from the book:
*
* Winder, R and Roberts, G (1998) <em>Developing Java
* Software</em>, John Wiley & Sons.
*
* It is copyright (c) 1997 Russel Winder and Graham Roberts.
*/
import java.io.File ;
import java.io.FileNotFoundException ;
import java.io.FileWriter ;
import java.io.IOException ;
import java.util.Vector ;
/**
* A function object delivering a polyphase merge sort of a file on
* the filestore.
*
* @see Record
* @see RecordReader
* @see RecordWriter
* @see RecordInformation
* @see RecordCopyFile
* @version 1.0 19.5.97
* @author Russel Winder
*/
public final class PolyphaseMergeSort implements FileSort
{
  /** Approximate block size used by the two-argument entry points. */
  private static final int DEFAULT_BLOCK_SIZE = 20 ;

  /**
   * File count handed on by the two-argument entry points.  The sort
   * itself ignores it, see {@link #execute(String, int, int,
   * RecordInformation)}.
   */
  private static final int DEFAULT_NUMBER_OF_FILES = 2 ;

  /**
   * Number of working files: in every merge round one of them is the
   * target and the other two are the sources.
   */
  private static final int WORKING_FILES = 3 ;

  /**
   * A per object sort operation using the default block size.
   *
   * @param fileName the <code>String</code> giving the name of the
   * file to be sorted.
   *
   * @param r the <code>RecordInformation</code> factory object for
   * creating <code>RecordReader</code>s and
   * <code>RecordWriter</code>s and able to deliver a
   * <code>Comparator</code>.
   *
   * @throws FileNotFoundException declared for the underlying file
   * operations.
   *
   * @throws IOException declared for the underlying file operations.
   */
  public final void sort(final String fileName,
                         final RecordInformation r)
    throws FileNotFoundException, IOException
  {
    execute(fileName, DEFAULT_BLOCK_SIZE, DEFAULT_NUMBER_OF_FILES, r) ;
  }

  /**
   * A per object sort operation.
   *
   * @param fileName the <code>String</code> giving the name of the
   * file to be sorted.
   *
   * @param blockSize the approximate number of data items in the
   * initial sorted blocks.
   *
   * @param numberOfFiles the number of files to use for initial
   * dispersion.  Currently ignored: three working files are always
   * used.
   *
   * @param r the <code>RecordInformation</code> factory object for
   * creating <code>RecordReader</code>s and
   * <code>RecordWriter</code>s and able to deliver a
   * <code>Comparator</code>.
   *
   * @throws FileNotFoundException declared for the underlying file
   * operations.
   *
   * @throws IOException declared for the underlying file operations.
   */
  public final void sort(final String fileName,
                         final int blockSize,
                         final int numberOfFiles,
                         final RecordInformation r)
    throws FileNotFoundException, IOException
  {
    execute(fileName, blockSize, numberOfFiles, r) ;
  }

  /**
   * A statically accessible sort operation using the default block
   * size.
   *
   * @param fileName the <code>String</code> giving the name of the
   * file to be sorted.
   *
   * @param r the <code>RecordInformation</code> factory object for
   * creating <code>RecordReader</code>s and
   * <code>RecordWriter</code>s and able to deliver a
   * <code>Comparator</code>.
   *
   * @throws FileNotFoundException declared for the underlying file
   * operations.
   *
   * @throws IOException declared for the underlying file operations.
   */
  public static void execute(final String fileName,
                             final RecordInformation r)
    throws FileNotFoundException, IOException
  {
    execute(fileName, DEFAULT_BLOCK_SIZE, DEFAULT_NUMBER_OF_FILES, r) ;
  }

  /**
   * A statically accessible sort operation.
   *
   * <p>The file is copied to a temporary file, split into sorted
   * runs ("blocks") that are dispersed over two further temporary
   * files, and the runs are then merged round by round until a single
   * run is left, which is copied back over the original file.  The
   * number of initial runs is taken from the series 2, 3, 5, 8, ...
   * so that every merge round empties exactly one source file.</p>
   *
   * <p>The length of every run is recorded when it is written, so a
   * run may be shorter than the nominal block size or even empty.
   * Files with any number of records, including none, are therefore
   * handled.</p>
   *
   * <p>The temporary files are created with
   * <code>File.createTempFile</code> and are deleted again before
   * this method returns, whether it succeeds or fails.</p>
   *
   * @param fileName the <code>String</code> giving the name of the
   * file to be sorted.
   *
   * @param approximateBlockSize the approximate number of data items
   * in the initial sorted blocks.  Values below 1 are treated as 1.
   * The block size actually used is chosen so that the selected
   * number of blocks can hold every record.
   *
   * @param numberOfFiles the number of files to use for initial
   * dispersion.  Currently ignored: three working files are always
   * used.
   *
   * @param rInfo the <code>RecordInformation</code> factory object
   * for creating <code>RecordReader</code>s and
   * <code>RecordWriter</code>s and able to deliver a
   * <code>Comparator</code>.
   *
   * @throws FileNotFoundException declared for the underlying file
   * operations.
   *
   * @throws IOException if a temporary file cannot be created, if a
   * temporary file ends before all recorded data has been read back,
   * or if one of the underlying file operations fails.
   */
  public static void execute(final String fileName,
                             final int approximateBlockSize,
                             int numberOfFiles,
                             final RecordInformation rInfo)
    throws FileNotFoundException, IOException
  {
    final File file = new File (fileName) ;
    final File[] temp = new File [WORKING_FILES] ;
    final RecordReader[] readers = new RecordReader [WORKING_FILES] ;
    try
    {
      //
      //  Uniquely named working files, so that concurrent sorts (or
      //  unrelated files in the working directory) cannot clash.
      //
      for (int i = 0 ; i < WORKING_FILES ; i++)
      {
        temp[i] = File.createTempFile("tmp_", null) ;
      }
      //
      //  We have to run through the file to find the number of
      //  records.  We need a working copy anyway so this is not a
      //  wasted activity.
      //
      final int numberOfRecords =
        RecordCopyFile.execute(file, temp[0], rInfo) ;
      //
      //  Choose how many runs go to each of the two dispersion
      //  files.  The pair steps through (1,1), (2,1), (3,2), (5,3),
      //  ... and we take the last pair for which a run would still
      //  hold at least the requested number of records.  The pair
      //  (1,1) is the minimum so there are always two runs to merge.
      //
      final long minimumBlockSize = Math.max(approximateBlockSize, 1) ;
      int runsOnFirst = 1 ;
      int runsOnSecond = 1 ;
      while (true)
      {
        //  long arithmetic: the candidate total may exceed int range.
        final long nextTotal = 2L * runsOnFirst + runsOnSecond ;
        if (numberOfRecords / nextTotal < minimumBlockSize)
        {
          break ;
        }
        final int previousFirst = runsOnFirst ;
        runsOnFirst += runsOnSecond ;
        runsOnSecond = previousFirst ;
      }
      //
      //  Smallest block size for which all the runs together have
      //  room for more than numberOfRecords records.
      //
      final int totalRuns = runsOnFirst + runsOnSecond ;
      final int initialBlockSize = numberOfRecords / totalRuns + 1 ;
      //
      //  Per file book-keeping: the length of every run in the
      //  file, the index of the next unread run and the number of
      //  unread runs.  File 0 holds the unsorted copy and so has no
      //  runs yet.
      //
      final int[][] runLengths = new int [WORKING_FILES][] ;
      final int[] runHeads = new int [WORKING_FILES] ;
      final int[] runCounts = new int [WORKING_FILES] ;
      runLengths[0] = new int [0] ;
      runLengths[1] = new int [runsOnFirst] ;
      runLengths[2] = new int [runsOnSecond] ;
      runCounts[1] = runsOnFirst ;
      runCounts[2] = runsOnSecond ;
      //
      //  Create the files of sorted runs.
      //
      distributeSortedBlocks(temp,
                             0,
                             initialBlockSize,
                             runLengths,
                             rInfo) ;
      //
      //  Set up the readers for the two files that now hold runs.
      //
      for (int i = 1 ; i < WORKING_FILES ; i++)
      {
        readers[i] = rInfo.newRecordReader(temp[i]) ;
      }
      int resultIndex = -1 ;
      while (true)
      {
        //
        //  Check what work there is to do: find the file without
        //  runs (the next target) and count the files with runs.
        //
        int toIndex = -1 ;
        int numberOfNonEmptyFiles = 0 ;
        int indexOfNonEmptyFile = -1 ;
        for (int i = 0 ; i < WORKING_FILES ; i++)
        {
          if (runCounts[i] == 0)
          {
            toIndex = i ;
          }
          else
          {
            indexOfNonEmptyFile = i ;
            numberOfNonEmptyFiles++ ;
          }
        }
        if (numberOfNonEmptyFiles <= 1)
        {
          resultIndex = indexOfNonEmptyFile ;
          break ;
        }
        //
        //  Perform the next round of merging.  The target is
        //  rewritten from scratch so any reader on it goes first.
        //
        if (readers[toIndex] != null)
        {
          readers[toIndex].close() ;
          readers[toIndex] = null ;
        }
        final RecordWriter writer =
          rInfo.newRecordWriter(temp[toIndex]) ;
        try
        {
          merge(readers,
                writer,
                toIndex,
                runLengths,
                runHeads,
                runCounts,
                rInfo) ;
        }
        finally
        {
          writer.close() ;
        }
        readers[toIndex] = rInfo.newRecordReader(temp[toIndex]) ;
      }
      //
      //  Everything is merged into one run: close the readers and
      //  copy the result back over the original file.
      //
      for (int i = 0 ; i < WORKING_FILES ; i++)
      {
        if (readers[i] != null)
        {
          final RecordReader finished = readers[i] ;
          readers[i] = null ;
          finished.close() ;
        }
      }
      if (resultIndex >= 0)
      {
        RecordCopyFile.execute(temp[resultIndex], file, rInfo) ;
      }
    }
    finally
    {
      //
      //  Only does real work after a failure; on the normal path
      //  all the readers are already closed.  The temporary files
      //  are removed in either case.
      //
      try
      {
        for (int i = 0 ; i < WORKING_FILES ; i++)
        {
          if (readers[i] != null)
          {
            final RecordReader leftOver = readers[i] ;
            readers[i] = null ;
            leftOver.close() ;
          }
        }
      }
      finally
      {
        for (int i = 0 ; i < WORKING_FILES ; i++)
        {
          if (temp[i] != null)
          {
            temp[i].delete() ;
          }
        }
      }
    }
  }

  /**
   * Perform the initial dispersion of the data: read the source file
   * block by block, sort each block in memory and write it out as a
   * run.
   *
   * @param files the working files.
   *
   * @param fromIndex the index of the file holding the unsorted
   * records; every other file receives runs.
   *
   * @param blockSize the maximum number of records per run.
   *
   * @param runLengths for each receiving file an array with one
   * slot per run to create; on return each slot holds the number of
   * records actually written for that run (less than
   * <code>blockSize</code>, possibly zero, once the source is
   * exhausted).
   *
   * @param rInfo the factory for readers, writers and the
   * <code>Comparator</code>.
   */
  private static void distributeSortedBlocks(File[] files,
                                             int fromIndex,
                                             int blockSize,
                                             int[][] runLengths,
                                             RecordInformation rInfo)
    throws FileNotFoundException, IOException
  {
    final Comparator order = rInfo.getComparator() ;
    final RecordReader reader = rInfo.newRecordReader(files[fromIndex]) ;
    final RecordWriter[] writers = new RecordWriter [files.length] ;
    try
    {
      for (int i = 0 ; i < files.length ; i++)
      {
        if (i != fromIndex)
        {
          writers[i] = rInfo.newRecordWriter(files[i]) ;
        }
      }
      //
      //  The Vector is where the internal sort that creates a
      //  sorted run is performed; it is reused for every run.
      //
      final Vector block = new Vector () ;
      boolean exhausted = false ;
      for (int i = 0 ; i < files.length ; i++)
      {
        if (i == fromIndex)
        {
          continue ;
        }
        for (int j = 0 ; j < runLengths[i].length ; j++)
        {
          block.removeAllElements() ;
          while (!exhausted && block.size() < blockSize)
          {
            final Record r = reader.readRecord() ;
            if (r == null)
            {
              exhausted = true ;
            }
            else
            {
              block.addElement(r) ;
            }
          }
          if (block.size() > 1)
          {
            QuicksortVector.execute(block, order) ;
          }
          for (int k = 0 ; k < block.size() ; k++)
          {
            writers[i].writeRecord((Record)block.elementAt(k)) ;
          }
          runLengths[i][j] = block.size() ;
        }
      }
    }
    finally
    {
      //
      //  Be tidy and close all the files.  For the writers this is
      //  essential to ensure we get a flush.
      //
      try
      {
        for (int i = 0 ; i < writers.length ; i++)
        {
          if (writers[i] != null)
          {
            writers[i].close() ;
          }
        }
      }
      finally
      {
        reader.close() ;
      }
    }
  }

  /**
   * Undertake a round of merging: as many runs as the source file
   * with the fewest unread runs has are merged, one run from each
   * source at a time, onto the target.
   *
   * @param readers the readers on the working files, each positioned
   * at the start of its next unread run.  The entry at
   * <code>toIndex</code> is not used.
   *
   * @param writer the writer on the target file.
   *
   * @param toIndex the index of the target file.
   *
   * @param runLengths the run lengths per file; the entry for the
   * target is replaced by the lengths of the merged runs.
   *
   * @param runHeads per file the index of the next unread run;
   * advanced for the sources and reset to zero for the target.
   *
   * @param runCounts per file the number of unread runs; reduced for
   * the sources and set for the target.
   *
   * @param rInfo the source of the <code>Comparator</code>.
   */
  private static void merge(RecordReader[] readers,
                            RecordWriter writer,
                            int toIndex,
                            int[][] runLengths,
                            int[] runHeads,
                            int[] runCounts,
                            RecordInformation rInfo)
    throws FileNotFoundException, IOException
  {
    int runsToMerge = Integer.MAX_VALUE ;
    for (int i = 0 ; i < readers.length ; i++)
    {
      if (i != toIndex)
      {
        runsToMerge = Math.min(runsToMerge, runCounts[i]) ;
      }
    }
    final Comparator order = rInfo.getComparator() ;
    final int[] mergedLengths = new int [runsToMerge] ;
    final Record[] items = new Record [readers.length] ;
    final int[] remaining = new int [readers.length] ;
    for (int run = 0 ; run < runsToMerge ; run++)
    {
      //
      //  Start the next run on every source.  A run of length zero
      //  simply contributes nothing.
      //
      for (int i = 0 ; i < readers.length ; i++)
      {
        if (i == toIndex)
        {
          remaining[i] = 0 ;
          items[i] = null ;
        }
        else
        {
          remaining[i] = runLengths[i][runHeads[i]++] ;
          items[i] = readFromRun(readers[i], remaining, i) ;
        }
      }
      int written = 0 ;
      while (true)
      {
        final int chosen = findAppropriate(items, toIndex, order) ;
        if (chosen < 0)
        {
          break ;
        }
        writer.writeRecord(items[chosen]) ;
        written++ ;
        items[chosen] = readFromRun(readers[chosen], remaining, chosen) ;
      }
      mergedLengths[run] = written ;
    }
    for (int i = 0 ; i < readers.length ; i++)
    {
      if (i != toIndex)
      {
        runCounts[i] -= runsToMerge ;
      }
    }
    runLengths[toIndex] = mergedLengths ;
    runHeads[toIndex] = 0 ;
    runCounts[toIndex] = runsToMerge ;
  }

  /**
   * Deliver the next record of the current run of a source file.
   *
   * @param reader the reader on the source file.
   *
   * @param remaining per file the number of records of the current
   * run not yet read; the entry at <code>index</code> is decremented
   * when a record is delivered.
   *
   * @param index the index of the source file.
   *
   * @return the next record, or <code>null</code> if the current run
   * has no more records.
   *
   * @throws IOException if the file ends although the run should
   * have more records.
   */
  private static Record readFromRun(RecordReader reader,
                                    int[] remaining,
                                    int index)
    throws IOException
  {
    if (remaining[index] <= 0)
    {
      return null ;
    }
    final Record next = reader.readRecord() ;
    if (next == null)
    {
      throw new IOException
        ("Unexpected end of temporary file " + index
         + " during merge: " + remaining[index]
         + " more record(s) expected in the current run") ;
    }
    remaining[index]-- ;
    return next ;
  }

  /**
   * Determine which Record is the one to be output next.
   *
   * @param items the array of <code>Record</code>s from which to
   * select the next according to the order relation defined by
   * <code>c</code>.  A <code>null</code> entry means that source has
   * nothing to offer.
   *
   * @param toIndex the index into the array of the target.  The
   * others are assumed to be sources.
   *
   * @param c the <code>Comparator</code> defining the required
   * order relation on the <code>Record</code>s.
   *
   * @return the index in the array of the item that should be
   * chosen next, or -1 if no source has an item.
   */
  private static int findAppropriate(Record[] items,
                                     int toIndex,
                                     Comparator c)
  {
    //
    //  Linear search: the first available item is the initial
    //  candidate and a later item replaces it only when the order
    //  relation holds between that item and the candidate.
    //
    int index = -1 ;
    for (int i = 0 ; i < items.length ; i++)
    {
      if (i == toIndex || items[i] == null)
      {
        continue ;
      }
      if (index < 0 || c.relation(items[i], items[index]))
      {
        index = i ;
      }
    }
    return index ;
  }
}