package com.neocoretechs.bigsack.io.cluster.mpi;

import mpi.*;

import java.io.IOException;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.Socket;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;

import com.neocoretechs.arieslogger.core.impl.LogToFile;
import com.neocoretechs.bigsack.io.cluster.MasterInterface;
import com.neocoretechs.bigsack.io.cluster.WorkBootCommand;
import com.neocoretechs.bigsack.io.pooled.GlobalDBIO;
import com.neocoretechs.bigsack.io.request.IoRequestInterface;
import com.neocoretechs.bigsack.io.request.IoResponseInterface;
import com.neocoretechs.bigsack.io.request.cluster.CompletionLatchInterface;

/**
 * MPI implementation.
 * This node functions as the master, in effect a layer between MultiThreadedIOManager, in its incarnation
 * as ClusterIOManager, and each IOWorker thread located on a remote node.
 * There is one of these for each tablespace of each database, so 8 per DB, each with its own port.
 * The naming convention for the remote nodes is the constant 'remoteWorker' with the tablespace number appended.
 * The 'WorkBoot' process on the remote node is responsible for spinning up the workers that communicate with the master.
 * A TCP back-channel to the WorkBoot (see Fopen) initiates the process.
 * To test in local cluster mode set the boolean 'TEST' value true. This replaces the remote worker references
 * ('AMI' + tablespace) with a localhost IP address. TCPworker likewise uses 'AMIMASTER' as its remote master,
 * and those references are also replaced with localhost. In general the remote directory is
 * 'database path' + 'tablespace' + tablespace# + tablename, where tablename is DBname + class + '.' + tablespace#,
 * so if your remote db path is /home/relatrix/AMI as passed to WorkBoot, then its translation is
 * /home/relatrix/tablespace0/AMIcom.yourpack.yourclass.0
 * for the remote node 'AMI0'; for the other tablespaces replace '0' with '1', etc.
 * So to test the cluster locally, use one WorkBoot and different directories on localhost called tablespace0-7
 * under the same directory as 'log', the recovery log location. This directory also needs the .properties file.
 * On a true cluster a WorkBoot runs on each node, and /home/relatrix/tablespace0,1,2..., the properties file,
 * etc. must be present on each node. The master contains the recovery logs and distributes IO requests to each
 * worker node based on tablespace.
 * @author jg
 * Copyright (C) NeoCoreTechs 2014,2015
 */
public class MPIMaster implements Runnable, MasterInterface {
	private static final boolean DEBUG = true;
	public static final boolean TEST = false; // true to run in local cluster test mode
	private String masterPort;
	private int WORKBOOTPORT = 8000;
	private static String remoteWorker = "AMI";
	private Intercomm intercomm;
	ByteBuffer bout = MPI.newByteBuffer(LogToFile.DEFAULT_LOG_BUFFER_SIZE);
	ByteBuffer bin = MPI.newByteBuffer(LogToFile.DEFAULT_LOG_BUFFER_SIZE);
	//ByteBuffer b = ByteBuffer.allocate(LogToFile.DEFAULT_LOG_BUFFER_SIZE);
	private String DBName;
	private int tablespace;
	private String remoteDBName = null; // if not null, alternate database name for remote worker nodes with a specific directory
	private boolean shouldRun = true;
	private ConcurrentHashMap<Integer, IoRequestInterface> requestContext;
	private int MAXLEN = 10000;
	private InetAddress IPAddress;
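
	/**
	 * Illustrative sketch only, not referenced by the original code: the class comment above describes
	 * the remote directory convention (database path + 'tablespace' + tablespace# + DBname + class + '.' + tablespace#).
	 * A hypothetical helper following that convention could look like this; the method name and its presence
	 * here are assumptions added for clarity, not part of the original class.
	 */
	private static String translateRemotePath(String remoteDbPath, String className, int tablespace) {
		int slash = remoteDbPath.lastIndexOf('/');
		String dir = remoteDbPath.substring(0, slash + 1);  // e.g. /home/relatrix/
		String dbName = remoteDbPath.substring(slash + 1);  // e.g. AMI
		// e.g. /home/relatrix/tablespace0/AMIcom.yourpack.yourclass.0 for tablespace 0
		return dir + "tablespace" + tablespace + "/" + dbName + className + "." + tablespace;
	}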
	/**
	 * Start a master cluster node. The database and tablespace are assigned by the respective IO manager,
	 * and the MPI master port is opened here. The mapping from request id to original request is also
	 * passed in by the respective IO manager. These masters are one-to-one with a tablespace, database,
	 * and worker on the remote node. The masters all run on the main cluster node.
	 * @param dbName
	 * @param tablespace
	 * @param requestContext
	 * @throws IOException
	 */
	public MPIMaster(String dbName, int tablespace, ConcurrentHashMap<Integer, IoRequestInterface> requestContext) throws IOException {
		this.DBName = dbName;
		this.tablespace = tablespace;
		this.requestContext = requestContext;
		try {
			masterPort = Intracomm.openPort();
			if( DEBUG ) {
				System.out.println("MPIMaster constructed with "+DBName+" tablespace:"+tablespace+" MPI master port:"+masterPort);
			}
			// We have to contact the remote TCP backchannel associated with this WorkBoot to spin up remote threads.
			// If TEST, we are all local; this flag appears in all the network options.
			if( TEST ) {
				IPAddress = InetAddress.getLocalHost();
			} else {
				IPAddress = InetAddress.getByName(remoteWorker+String.valueOf(tablespace));
			}
		} catch (MPIException e) {
			throw new IOException(e);
		}
	}
	/**
	 * Specify an alternate remote DB name and directory for the current database.
	 * Primary usage is for nodes with OSs different from the master.
	 * @param dbName
	 * @param remoteDBName
	 * @param tablespace
	 * @param requestContext
	 * @throws IOException
	 */
	public MPIMaster(String dbName, String remoteDBName, int tablespace, ConcurrentHashMap<Integer, IoRequestInterface> requestContext) throws IOException {
		this(dbName, tablespace, requestContext);
		this.remoteDBName = remoteDBName;
		if( DEBUG )
			System.out.println("MPIMaster constructed with "+dbName+" using remote DB:"+remoteDBName+" tablespace:"+tablespace);
	}
	/**
	 * Set the prefix name of the remote worker node that this master communicates with.
	 * This name plus the tablespace identifies each individual worker node.
	 * In test mode, the local host is used for workers and master.
	 * @param rname
	 */
	public void setRemoteWorkerName(String rname) {
		remoteWorker = rname;
	}
	/**
	 * Look for messages coming back from the workers. Extract the UUID of the returned packet
	 * and get the real request from the ConcurrentHashMap buffer.
	 */
	@Override
	public void run() {
		if( DEBUG ) {
			System.out.println("MPIMaster connection ");
		}
		try {
			// accept the worker connection on the MPI port opened in the constructor
			intercomm = MPI.COMM_SELF.accept(masterPort, 0);
		} catch (MPIException e) {
			System.out.println("MPI Master connection fault "+e+", returning from MPIMaster thread");
			return;
		}
		if( DEBUG ) {
			System.out.println("MPIMaster got connection:"+intercomm+" db:"+DBName+" tablespace:"+tablespace+" MPI master port:"+masterPort);
		}
		try {
			while( shouldRun ) {
				bin.clear();
				intercomm.recv(bin, bin.capacity(), MPI.BYTE, 0, 99);
				IoResponseInterface iori = (IoResponseInterface) GlobalDBIO.deserializeObject(bin);
				// get the original request from the stored table
				IoRequestInterface ior = requestContext.get(iori.getUUID());
				if( DEBUG )
					System.out.println("MPIMaster FROM Remote, response:"+iori);
				//
				// If we detect a response that has no correspondence in the table of requests issued,
				// then the request is a duplicate or some sort of corruption has occurred. If in debug,
				// log, dump the table of current requests, and ignore.
				//
				if( DEBUG ) {
					System.out.println("MPIMaster Extracting latch from original request:"+ior);
					if( ior == null ) {
						Set<Entry<Integer, IoRequestInterface>> e = requestContext.entrySet();
						System.out.println("MPIMaster ******* INBOUND REQUEST DOES NOT VERIFY *******\r\nDump context table, size:"+requestContext.size());
						Iterator<Entry<Integer, IoRequestInterface>> ei = e.iterator();
						while(ei.hasNext()) {
							Entry<Integer, IoRequestInterface> ein = ei.next();
							System.out.println("Request #: "+ein.getKey()+" val:"+ein.getValue());
						}
						break;
					}
				}
				// set the return values in the original request to our values from remote workers
				((CompletionLatchInterface)ior).setLongReturn(iori.getLongReturn());
				Object o = iori.getObjectReturn();
				if( o instanceof Exception ) {
					System.out.println("MPIMaster: ******** REMOTE EXCEPTION ******** "+o);
				}
				((CompletionLatchInterface)ior).setObjectReturn(o);
				if( DEBUG ) {
					System.out.println("MPIMaster ready to count down latch with "+ior);
				}
				// now release any latches awaiting this response
				CountDownLatch cdl = ((CompletionLatchInterface)ior).getCountDownLatch();
				cdl.countDown();
			}
		} catch (IOException | MPIException e) {
			// we lost the remote; close the port and reset MPI, then wait for reconnect
			System.out.println("MPIMaster receive IO error "+e);
			try {
				Intracomm.closePort(masterPort);
				MPI.Finalize();
				MPI.Init(null);
			} catch (MPIException e3) {}
		}
	}
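	/**
	 * Illustrative sketch only, not called anywhere in the original source: an assumption about how the
	 * owning IO manager pairs a request with the latch counted down in run() above. The request id source
	 * and the blocking caller are hypothetical; only calls already visible in this file are used.
	 */
	private void exampleRoundTrip(int requestId, IoRequestInterface request) throws InterruptedException {
		requestContext.put(requestId, request);                            // run() looks the request up by this id when the response returns
		send(request);                                                     // serialize and push to the remote worker over the intercomm
		((CompletionLatchInterface)request).getCountDownLatch().await();   // released by run() after the return values are set
	}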
	/**
	 * Send a request to the remote worker.
	 * @param iori
	 */
	public void send(IoRequestInterface iori) {
		byte[] sendData;
		try {
			sendData = GlobalDBIO.getObjectAsBytes(iori);
			// size the outbound buffer to the serialized request
			bout = MPI.newByteBuffer(sendData.length);
			bout.put(sendData);
			bout.flip();
			intercomm.send(bout, sendData.length, MPI.BYTE, 1, 99);
		} catch (IOException | MPIException e) {
			System.out.println("MPI send error "+e);
		}
	}
	/**
	 * Open a socket to the WorkBoot backchannel on the remote worker node named 'remoteWorker' with the
	 * tablespace appended, so each node is named [remoteWorker]0, [remoteWorker]1, etc., and ask it to
	 * start the worker for this database and tablespace.
	 * @param fname
	 * @param create
	 * @return
	 * @throws IOException
	 */
	public boolean Fopen(String fname, boolean create) throws IOException {
		// send a remote Fopen request to the node
		// this consists of sending the running WorkBoot a message to start the worker for a particular
		// database and tablespace and the node we hand down
		Socket s = new Socket(IPAddress, WORKBOOTPORT);
		OutputStream os = s.getOutputStream();
		WorkBootCommand cpi = new WorkBootCommand();
		if( remoteDBName != null )
			cpi.setDatabase(remoteDBName);
		else
			cpi.setDatabase(DBName);
		cpi.setTablespace(tablespace);
		cpi.setTransport("MPI");
		cpi.setMasterPort(masterPort);
		cpi.setSlavePort(masterPort);
		os.write(GlobalDBIO.getObjectAsBytes(cpi));
		os.flush();
		os.close();
		s.close();
		return true;
	}

	@Override
	public void setMasterPort(String port) {
		// no-op: the MPI master port is obtained from Intracomm.openPort() in the constructor
	}

	@Override
	public void setSlavePort(String port) {
		// no-op: the MPI master port is obtained from Intracomm.openPort() in the constructor
	}
}
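/*
 * Minimal usage sketch; this is an assumption about how the owning IO manager wires the class up,
 * and the ordering and names below are hypothetical, not taken from the original source:
 *
 *   ConcurrentHashMap<Integer, IoRequestInterface> context = new ConcurrentHashMap<Integer, IoRequestInterface>();
 *   MPIMaster master = new MPIMaster("/home/relatrix/AMI", 0, context);
 *   new Thread(master).start();                // blocks in accept() until the remote worker connects
 *   master.Fopen("/home/relatrix/AMI", true);  // ask the remote WorkBoot to spin up the worker for tablespace 0
 */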