package com.neocoretechs.bigsack.io.cluster; import java.io.IOException; import java.util.concurrent.CountDownLatch; import com.neocoretechs.bigsack.DBPhysicalConstants; import com.neocoretechs.bigsack.io.IOWorker; import com.neocoretechs.bigsack.io.IoInterface; import com.neocoretechs.bigsack.io.MultithreadedIOManager; import com.neocoretechs.bigsack.io.RecoveryLogManager; import com.neocoretechs.bigsack.io.ThreadPoolManager; import com.neocoretechs.bigsack.io.pooled.BlockAccessIndex; import com.neocoretechs.bigsack.io.pooled.BlockStream; import com.neocoretechs.bigsack.io.pooled.Datablock; import com.neocoretechs.bigsack.io.pooled.GlobalDBIO; import com.neocoretechs.bigsack.io.pooled.MappedBlockBuffer; import com.neocoretechs.bigsack.io.pooled.ObjectDBIO; import com.neocoretechs.bigsack.io.request.cluster.AbstractClusterWork; import com.neocoretechs.bigsack.io.request.cluster.CommitRequest; import com.neocoretechs.bigsack.io.request.cluster.CompletionLatchInterface; import com.neocoretechs.bigsack.io.request.cluster.FSeekAndReadFullyRequest; import com.neocoretechs.bigsack.io.request.cluster.FSeekAndReadRequest; import com.neocoretechs.bigsack.io.request.cluster.FSeekAndWriteFullyRequest; import com.neocoretechs.bigsack.io.request.cluster.FSeekAndWriteRequest; import com.neocoretechs.bigsack.io.request.cluster.FSizeRequest; import com.neocoretechs.bigsack.io.request.cluster.GetNextFreeBlockRequest; import com.neocoretechs.bigsack.io.request.cluster.GetNextFreeBlocksRequest; import com.neocoretechs.bigsack.io.request.cluster.FSyncRequest; import com.neocoretechs.bigsack.io.request.cluster.IsNewRequest; import com.neocoretechs.bigsack.io.request.cluster.RemoteCommitRequest; import com.neocoretechs.bigsack.io.request.iomanager.AddBlockAccessNoReadRequest; import com.neocoretechs.bigsack.io.request.iomanager.DirectBufferWriteRequest; import com.neocoretechs.bigsack.io.request.iomanager.FindOrAddBlockAccessRequest; import com.neocoretechs.bigsack.io.request.iomanager.ForceBufferClearRequest; import com.neocoretechs.bigsack.io.request.iomanager.GetUsedBlockRequest; import com.neocoretechs.bigsack.io.request.IoRequestInterface; /** * Handles the aggregation of the IO worker threads of which there is one for each tablespace. * In this incarnation the IOWorkers are subclassed to UDPMaster which handles traffic down to * UDPWorkers through the command to start via the WorkBoot node * When we need to cast a global operation which requires all tablespaces to coordinate a response we use * the CyclicBarrier class to set up the rendezvous with each IOworker and its particular request to the * set of all IO workers * Copyright (C) NeoCoreTechs 2014 * @author jg * */ public final class ClusterIOManager extends MultithreadedIOManager { private static final boolean DEBUG = false; protected int L3cache = 0; private static int currentPort = 10000; // starting UDP port, increments as assigned private static int messageSeq = 0; // monotonically increasing request id /** * Instantiate our master node array per database that communicate with our worker nodes * @throws IOException */ public ClusterIOManager(ObjectDBIO globalIO) throws IOException { super(globalIO); } protected void assignIoWorker() { ioWorker = new DistributedIOWorker[DBPhysicalConstants.DTABLESPACES]; } /** * Return the first available block that can be acquired for write * queue the request to the proper ioworker * @param tblsp The tablespace * @return The block available as a real, not virtual block * @exception IOException if IO problem */ public long getNextFreeBlock(int tblsp) throws IOException { if( DEBUG ) System.out.println("ClusterIOManager.getNextFreeBlock "+tblsp); CountDownLatch barrierCount = new CountDownLatch(1); IoRequestInterface iori = new GetNextFreeBlockRequest(barrierCount, nextFree[tblsp]); ioWorker[tblsp].queueRequest(iori); try { barrierCount.await(); } catch (InterruptedException e) {} // remove old request ((DistributedIOWorker)ioWorker[tblsp]).removeRequest((AbstractClusterWork) iori); nextFree[tblsp] = iori.getLongReturn(); return nextFree[tblsp]; } /** * Return the reverse scan of the first free block of each tablespace * queue the request to the proper ioworker, they wait at barrier synch, * then activate countdown latch to signal main. Result in placed in class level nextFree * @exception IOException if IO problem */ private void getNextFreeBlocks() throws IOException { if( DEBUG ) System.out.println("ClusterIOManager.getNextFreeBlocks "); CountDownLatch barrierCount = new CountDownLatch(DBPhysicalConstants.DTABLESPACES); IoRequestInterface[] iori = new IoRequestInterface[DBPhysicalConstants.DTABLESPACES]; // queue to each tablespace for (int i = 0; i < DBPhysicalConstants.DTABLESPACES; i++) { iori[i] = new GetNextFreeBlocksRequest(barrierCount); ioWorker[i].queueRequest(iori[i]); } // Wait for barrier synchronization from UDP master nodes if request demands it try { barrierCount.await(); } catch (InterruptedException e) {} for (int i = 0; i < DBPhysicalConstants.DTABLESPACES; i++) { nextFree[i] = iori[i].getLongReturn(); // remove old requests ((DistributedIOWorker)ioWorker[i]).removeRequest((AbstractClusterWork) iori[i]); } } /** * Send the request to write the given block at the given location, with * the number of bytes used written */ public void FseekAndWrite(long toffset, Datablock tblk) throws IOException { if( DEBUG ) System.out.println("ClusterIOManager.FseekAndWrite "+toffset); int tblsp = GlobalDBIO.getTablespace(toffset); long offset = GlobalDBIO.getBlock(toffset); CountDownLatch barrierCount = new CountDownLatch(1); IoRequestInterface iori = new FSeekAndWriteRequest(barrierCount, offset, tblk); ioWorker[tblsp].queueRequest(iori); try { barrierCount.await(); } catch (InterruptedException e) {} ((DistributedIOWorker)ioWorker[tblsp]).removeRequest((AbstractClusterWork) iori); } /** * Send the request to write the entire contents of the given block at the location specified * Presents a guaranteed write of full block for file extension or other spacing operations */ public void FseekAndWriteFully(long toffset, Datablock tblk) throws IOException { if( DEBUG ) System.out.println("ClusterIOManager.FseekAndWriteFully "+toffset); int tblsp = GlobalDBIO.getTablespace(toffset); long offset = GlobalDBIO.getBlock(toffset); CountDownLatch barrierCount = new CountDownLatch(1); IoRequestInterface iori = new FSeekAndWriteFullyRequest(barrierCount, offset, tblk); ioWorker[tblsp].queueRequest(iori); try { barrierCount.await(); } catch (InterruptedException e) {} ((DistributedIOWorker)ioWorker[tblsp]).removeRequest((AbstractClusterWork) iori); } /** * Queue a request to read int the passed block buffer * @param toffset The virtual block to read * @param tblk The Datablock buffer to read into * @throws IOException */ public void FseekAndRead(long toffset, Datablock tblk) throws IOException { if( DEBUG ) System.out.println("ClusterIOManager.FseekAndRead "+toffset); int tblsp = GlobalDBIO.getTablespace(toffset); long offset = GlobalDBIO.getBlock(toffset); CountDownLatch barrierCount = new CountDownLatch(1); IoRequestInterface iori = new FSeekAndReadRequest(barrierCount, offset, tblk); ioWorker[tblsp].queueRequest(iori); try { barrierCount.await(); } catch (InterruptedException e) {} // original request should contain object from response from remote worker Datablock rblock = (Datablock) iori.getObjectReturn(); rblock.doClone(tblk); // remove old requests ((DistributedIOWorker)ioWorker[tblsp]).removeRequest((AbstractClusterWork) iori); } /** * Queue a request to read int the passed block buffer * @param toffset The virtual block to read * @param tblk The Datablock buffer to read into * @throws IOException */ public void FseekAndReadFully(long toffset, Datablock tblk) throws IOException { if( DEBUG ) System.out.println("ClusterIOManager.FseekAndReadFully "+toffset); int tblsp = GlobalDBIO.getTablespace(toffset); long offset = GlobalDBIO.getBlock(toffset); CountDownLatch barrierCount = new CountDownLatch(1); CompletionLatchInterface iori = new FSeekAndReadFullyRequest(barrierCount, offset, tblk); ioWorker[tblsp].queueRequest(iori); try { barrierCount.await(); } catch (InterruptedException e) {} // original request should contain object from response from remote worker Datablock rblock = (Datablock) iori.getObjectReturn(); rblock.doClone(tblk); // remove old requests, this signals we are done ((DistributedIOWorker)ioWorker[tblsp]).removeRequest((AbstractClusterWork) iori); } /** * Find the smallest tablespace for storage balance, we will always favor creating one * over extending an old one * @return tablespace * @exception IOException if seeking new tablespace or creating fails */ public int findSmallestTablespace() throws IOException { if( DEBUG ) System.out.println("ClusterIOManager.findSmallestTablespace "); synchronized(nextFree) { // always make sure we have primary long primarySize = Fsize(0); int smallestTablespace = 0; // default main long smallestSize = primarySize; getNextFreeBlocks(); for (int i = 0; i < nextFree.length; i++) { if(nextFree[i] != -1 && nextFree[i] < smallestSize) { smallestSize = nextFree[i]; smallestTablespace = i; } } return smallestTablespace; } } public long Fsize(int tblsp) throws IOException { if( DEBUG ) System.out.println("ClusterIOManager.Fsize "); CountDownLatch barrierCount = new CountDownLatch(1); CompletionLatchInterface iori = new FSizeRequest(barrierCount); ioWorker[tblsp].queueRequest(iori); try { barrierCount.await(); } catch (InterruptedException e) {} long retVal = iori.getLongReturn(); // remove old requests ((DistributedIOWorker)ioWorker[tblsp]).removeRequest((AbstractClusterWork) iori); return retVal; } /** * Invoke each tablespace open request by creating buffers and spinning workers. * @see com.neocoretechs.bigsack.io.IoManagerInterface#Fopen(java.lang.String, int, boolean) */ @Override public synchronized boolean Fopen(String fname, int L3cache, boolean create) throws IOException { this.L3cache = L3cache; for (int i = 0; i < DBPhysicalConstants.DTABLESPACES; i++) { if( globalIO.getWorkerNodes() != null ) ioWorker[i] = new DistributedIOWorker(fname, i, ++currentPort, ++currentPort, globalIO.getWorkerNodes()[i][0], Integer.valueOf(globalIO.getWorkerNodes()[i][1]) ); else ioWorker[i] = new DistributedIOWorker(fname, i, ++currentPort, ++currentPort, null, 0); blockBuffer[i] = new MappedBlockBuffer(this, i); lbai[i] = new BlockStream(i, blockBuffer[i]); ulog[i] = new RecoveryLogManager(globalIO,i); ThreadPoolManager.getInstance().spin((Runnable)ioWorker[i],"IOWORKER"); ThreadPoolManager.getInstance().spin(blockBuffer[i], "BLOCKPOOL"); // allow the workers to come up try { Thread.sleep(500); } catch (InterruptedException e) {} // attempt recovery if needed ulog[i].getLogToFile().recover(); } // fill in the next free block indicators and set the smallest tablespace findSmallestTablespace(); return true; } /** (non-Javadoc) * @see com.neocoretechs.bigsack.io.IoManagerInterface#Fopen(java.lang.String, int, boolean) * * This is where the recovery logs are initialized because the logs operate at the block (database page) level. * When this module is instantiated the RecoveryLogManager is assigned to 'ulog' and a roll forward recovery * is started. If there are any records in the log file they will scanned for low water marks and * checkpoints etc and the determination is made based on the type of log record encountered. * Our log granularity is the page level. We store DB blocks and their original mirrors to use in * recovery. At the end of recovery we restore the logs to their initial state, as we do on a commit. * There is a simple paradigm at work here, we carry a single block access index in another class and use it * to cursor through the blocks as we access them. The BlockStream class has the BlockAccessIndex and DBStream * for each tablespace. The cursor window block and read and written from seep store and buffer pool. */ @Override public synchronized boolean Fopen(String fname, String remote, int L3cache, boolean create) throws IOException { this.L3cache = L3cache; String bootNode; int bootPort; for (int i = 0; i < DBPhysicalConstants.DTABLESPACES; i++) { if( globalIO.getWorkerNodes() != null ) { bootNode = globalIO.getWorkerNodes()[i][0]; bootPort = Integer.valueOf(globalIO.getWorkerNodes()[i][1]); } else { bootNode = null; bootPort = 0; } if( remote == null ) ioWorker[i] = new DistributedIOWorker(fname, i, ++currentPort, ++currentPort, bootNode, bootPort); else ioWorker[i] = new DistributedIOWorker(fname, remote, i, ++currentPort, ++currentPort, bootNode, bootPort); blockBuffer[i] = new MappedBlockBuffer(this, i); lbai[i] = new BlockStream(i, blockBuffer[i]); ulog[i] = new RecoveryLogManager(globalIO,i); ThreadPoolManager.getInstance().spin((Runnable)ioWorker[i], "IOWORKER"); ThreadPoolManager.getInstance().spin(blockBuffer[i], "BLOCKPOOL"); // allow the workers to come up try { Thread.sleep(500); } catch (InterruptedException e) {} // attempt recovery if needed ulog[i].getLogToFile().recover(); } // fill in the next free block indicators and set the smallest tablespace findSmallestTablespace(); return true; } public void Fopen() throws IOException { } public void Fclose() throws IOException { for (int i = 0; i < ioWorker.length; i++) if (ioWorker[i] != null ) { if( ioWorker[i].getRequestQueueLength() != 0 ) System.out.println("WARNING: closing tablespace "+i+" with "+ ioWorker[i].getRequestQueueLength()+" outstanding requests"); } // just sync in cluster mode Fforce(); } public void Fforce() throws IOException { if( DEBUG ) { System.out.println("ClusterIOManager.Fforce "); } CountDownLatch barrierCount = new CountDownLatch(DBPhysicalConstants.DTABLESPACES); IoRequestInterface[] iori = new IoRequestInterface[DBPhysicalConstants.DTABLESPACES]; // queue to each tablespace for (int i = 0; i < DBPhysicalConstants.DTABLESPACES; i++) { iori[i] = new FSyncRequest(barrierCount); ioWorker[i].queueRequest(iori[i]); } try { barrierCount.await(); } catch (InterruptedException e) {} for (int i = 0; i < DBPhysicalConstants.DTABLESPACES; i++) { // remove old requests ((DistributedIOWorker)ioWorker[i]).removeRequest((AbstractClusterWork) iori[i]); } } public boolean isNew() { try { return FisNew(0); } catch (IOException e) { } return false; } private boolean FisNew(int tblsp) throws IOException { if( DEBUG ) System.out.println("ClusterIOManager.FisNew for tablespace "+tblsp); CountDownLatch barrierCount = new CountDownLatch(1); CompletionLatchInterface iori = new IsNewRequest(barrierCount); ioWorker[tblsp].queueRequest(iori); try { barrierCount.await(); } catch (InterruptedException e) {} boolean retVal = (Boolean) iori.getObjectReturn(); // remove old requests ((DistributedIOWorker)ioWorker[tblsp]).removeRequest((AbstractClusterWork) iori); return retVal; } public static int getNextUUID() { return ++messageSeq; } @Override public void forceBufferClear() { CountDownLatch cdl = new CountDownLatch(DBPhysicalConstants.DTABLESPACES); synchronized(blockBuffer) { for(int i = 0; i < DBPhysicalConstants.DTABLESPACES; i++) { ForceBufferClearRequest fbcr = new ForceBufferClearRequest(blockBuffer[i], cdl, forceBarrierSynch); blockBuffer[i].queueRequest(fbcr); //blockBuffer[i].forceBufferClear(); } try { cdl.await();// wait for completion } catch (InterruptedException e) { // executor requested thread shutdown return; } } } /** * Load up a block from the freelist with the assumption that it will be filled in later. Do not * check for whether it should be logged,etc. As part of the 'acquireblock' process, this takes place. Latch it * as soon as possible though */ @Override public BlockAccessIndex addBlockAccessNoRead(Long Lbn) throws IOException { int tblsp = GlobalDBIO.getTablespace(Lbn); //return blockBuffer[tblsp].addBlockAccessNoRead(Lbn); CountDownLatch cdl = new CountDownLatch(1); AddBlockAccessNoReadRequest abanrr = new AddBlockAccessNoReadRequest(blockBuffer[tblsp], cdl, Lbn); blockBuffer[tblsp].queueRequest(abanrr); try { cdl.await(); return (BlockAccessIndex) abanrr.getObjectReturn(); } catch (InterruptedException e) { // shutdown waiting for return return null; } } @Override public BlockAccessIndex findOrAddBlockAccess(long bn) throws IOException { int tblsp = GlobalDBIO.getTablespace(bn); //return blockBuffer[tblsp].findOrAddBlockAccess(bn); CountDownLatch cdl = new CountDownLatch(1); FindOrAddBlockAccessRequest abanrr = new FindOrAddBlockAccessRequest(blockBuffer[tblsp], cdl, bn); blockBuffer[tblsp].queueRequest(abanrr); try { cdl.await(); lbai[tblsp].setLbai(((BlockAccessIndex) abanrr.getObjectReturn())); return (BlockAccessIndex) abanrr.getObjectReturn(); } catch (InterruptedException e) { // shutdown waiting for return return null; } } @Override public BlockAccessIndex getUsedBlock(long loc) { int tblsp = GlobalDBIO.getTablespace(loc); //return blockBuffer[tblsp].getUsedBlock(loc); CountDownLatch cdl = new CountDownLatch(1); GetUsedBlockRequest abanrr = new GetUsedBlockRequest(blockBuffer[tblsp], cdl, loc); blockBuffer[tblsp].queueRequest(abanrr); try { cdl.await(); return (BlockAccessIndex) abanrr.getObjectReturn(); } catch (InterruptedException e) { // shutdown waiting for return return null; } } /** * When something comes through the TCPWorker or UDPWorker the ioInterface is set to the TCPWorker * or UDPWorker, which also implement NodeBlockBufferInterface, so we have access to the node block buffer * through the ioInterface if the request traverses those classes. If we are in standalone the MultiThreadedIoManager * uses an alternate request method. * We queue a request to the local block buffer to commit after the preceding requests finish. * We queue a request to the master to forward to the workers to commit their blocks. */ @Override public void commitBufferFlush() throws IOException { if( DEBUG ) { System.out.println("ClusterIOManager.commitBufferFlush"); } CountDownLatch cdl = new CountDownLatch( DBPhysicalConstants.DTABLESPACES); for(int i = 0; i < DBPhysicalConstants.DTABLESPACES; i++) { //blockBuffer[i].commitBufferFlush(); CommitRequest cbfr = new CommitRequest(blockBuffer[i], globalIO.getIOManager().getUlog(i), commitBarrierSynch, cdl); ioWorker[i].queueRequest(cbfr); } try { cdl.await(); } catch (InterruptedException e) { return; // executor shutdown } if( DEBUG ) { System.out.println("ClusterIOManager.commitBufferFlush local buffers synched, messaging remote workers"); } // local buffers are flushed, queue request outbound to flush remote buffers, possibly updated by // our commit of local buffers pushing blocks out. cdl = new CountDownLatch( DBPhysicalConstants.DTABLESPACES); IoRequestInterface[] iori = new IoRequestInterface[DBPhysicalConstants.DTABLESPACES]; for (int i = 0; i < DBPhysicalConstants.DTABLESPACES; i++) { iori[i] = new RemoteCommitRequest(cdl); ioWorker[i].queueRequest(iori[i]); } try { cdl.await(); } catch (InterruptedException e) {} for (int i = 0; i < DBPhysicalConstants.DTABLESPACES; i++) { // remove old requests ((DistributedIOWorker)ioWorker[i]).removeRequest((AbstractClusterWork) iori[i]); } if( DEBUG ) { System.out.println("ClusterIOManager.commitBufferFlush exiting."); } } @Override public void directBufferWrite() throws IOException { CountDownLatch cdl = new CountDownLatch( DBPhysicalConstants.DTABLESPACES); synchronized(blockBuffer) { for(int i = 0; i < DBPhysicalConstants.DTABLESPACES; i++) { //blockBuffer[i].directBufferWrite(); DirectBufferWriteRequest dbwr = new DirectBufferWriteRequest(blockBuffer[i], cdl, directWriteBarrierSynch); blockBuffer[i].queueRequest(dbwr); } try { cdl.await(); } catch (InterruptedException e) { return; } } } @Override public void writeDirect(int tblsp, long blkn, Datablock blkV2) throws IOException { synchronized(ioWorker[tblsp]) { CountDownLatch barrierCount = new CountDownLatch(1); IoRequestInterface iori = new FSeekAndWriteRequest(barrierCount, blkn, blkV2); ioWorker[tblsp].queueRequest(iori); try { barrierCount.await(); } catch (InterruptedException e) {} ((DistributedIOWorker)ioWorker[tblsp]).removeRequest((AbstractClusterWork) iori); } } @Override public void readDirect(int tblsp, long blkn, Datablock blkV2) throws IOException { synchronized(ioWorker[tblsp]) { CountDownLatch barrierCount = new CountDownLatch(1); IoRequestInterface iori = new FSeekAndReadRequest(barrierCount, blkn, blkV2); ioWorker[tblsp].queueRequest(iori); try { barrierCount.await(); } catch (InterruptedException e) {} // original request should contain object from response from remote worker Datablock rblock = (Datablock) iori.getObjectReturn(); rblock.doClone(blkV2); // remove old requests ((DistributedIOWorker)ioWorker[tblsp]).removeRequest((AbstractClusterWork) iori); } } }