package com.neocoretechs.bigsack.io.cluster.mpi;

import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketAddress;
import java.net.SocketException;
import java.net.StandardSocketOptions;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.nio.channels.ServerSocketChannel;
import java.nio.channels.SocketChannel;

import mpi.Intercomm;
import mpi.Intracomm;
import mpi.MPI;
import mpi.MPIException;

import com.neocoretechs.arieslogger.core.impl.LogToFile;
import com.neocoretechs.bigsack.io.IOWorker;
import com.neocoretechs.bigsack.io.ThreadPoolManager;
import com.neocoretechs.bigsack.io.cluster.DistributedWorkerResponseInterface;
import com.neocoretechs.bigsack.io.cluster.NodeBlockBuffer;
import com.neocoretechs.bigsack.io.cluster.NodeBlockBufferInterface;
import com.neocoretechs.bigsack.io.cluster.WorkerRequestProcessor;
import com.neocoretechs.bigsack.io.pooled.GlobalDBIO;
import com.neocoretechs.bigsack.io.request.IoRequestInterface;
import com.neocoretechs.bigsack.io.request.IoResponseInterface;
import com.neocoretechs.bigsack.io.request.cluster.CompletionLatchInterface;

/**
 * This class functions as the remote IOWorker. Two unidirectional channels provide full duplex
 * communication; each 'master' and 'worker' connect via MPI.
 * Multiple threads run on each node: one set of master/worker/worker processor threads is spun up
 * for each database. Each node maintains a specific tablespace for all databases.
 * An Fopen spawns additional instances of these threads.
 * Presumably there is an instance of this present on each of the 8 tablespace worker nodes, but
 * any combination of nodes can be used as long as the target directory has the proper 'tablespace#'
 * subdirectories. The design has the target database path concatenated with the tablespace in cluster mode.
 * Actual operation is simple: when a block comes down it gets written, when a block comes up it gets read.
 * A request comes down as a serialized object similar to a standalone request, but with additional network garnish.
 * The network requests are interpreted as standard requests once they reach the IOWorker.
 * Instances of these MPIWorkers are started by the WorkBoot controller node in response to
 * backchannel TCPServer requests. Existing threads are shut down and sockets closed, and a new batch of threads
 * is spun up if necessary.
 * @author jg
 * Copyright (C) NeoCoreTechs 2014,2015
 */
public class MPIWorker extends IOWorker implements DistributedWorkerResponseInterface, NodeBlockBufferInterface {
    private static final boolean DEBUG = true;
    boolean shouldRun = true;
    public String MASTERPORT = "tcp://amimaster";
    public String SLAVEPORT = "tcp://ami0";
    public static String remoteMaster = "AMIMASTER";
    private String remoteDb;
    private byte[] sendData;
    private Intercomm master;
    private WorkerRequestProcessor workerRequestProcessor;
    // ByteBuffer for NIO socket read/write, currently broken under arm
    //private ByteBuffer b = ByteBuffer.allocate(LogToFile.DEFAULT_LOG_BUFFER_SIZE);
    private NodeBlockBuffer blockBuffer;
    private ByteBuffer bout;
    private ByteBuffer bin;
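
    /*
     * Wire format note: each direction moves one serialized object per MPI message over the
     * intercommunicator obtained from connect(). bin receives a serialized cluster request
     * (a CompletionLatchInterface) from the master in run(); bout carries a serialized
     * IoResponseInterface back to the master in queueResponse(). Both directions use MPI tag 99
     * and address rank 1 of the remote group.
     */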

    /*
     * The constructor establishes communication with a server specified by port_name.
     * MPI_COMM_CONNECT is collective over the calling communicator and returns an intercommunicator
     * in which the remote group participated in an MPI_COMM_ACCEPT.
     * If the named port does not exist (or has been closed), MPI_COMM_CONNECT raises an error of class MPI_ERR_PORT.
     * If the port exists but does not have a pending MPI_COMM_ACCEPT, the connection attempt will eventually time
     * out after an implementation-defined interval, or succeed when the server calls MPI_COMM_ACCEPT. In the case of a
     * timeout, MPI_COMM_CONNECT raises an error of class MPI_ERR_PORT.
     */
    public MPIWorker(String dbname, int tablespace, String masterPort, String slavePort, int L3Cache) throws IOException {
        super(dbname, tablespace, L3Cache);
        MASTERPORT = masterPort;
        SLAVEPORT = slavePort;
        /*
         * Java binding of MPI_COMM_CONNECT.
         * port: port name; root: rank in comm of root node.
         * Returns an intercommunicator with the server as the remote group; throws MPIException on failure.
         */
        try {
            master = MPI.COMM_WORLD.connect(MASTERPORT, 1);
        } catch (MPIException e) {
            throw new IOException(e);
        }
        // allocate the inbound buffer used by the receive loop in run()
        bin = MPI.newByteBuffer(LogToFile.DEFAULT_LOG_BUFFER_SIZE);
        // spin the request processor thread for the worker
        workerRequestProcessor = new WorkerRequestProcessor(this);
        ThreadPoolManager.getInstance().spin(workerRequestProcessor);
        blockBuffer = new NodeBlockBuffer(this);
        if( DEBUG )
            System.out.println("Worker on port "+SLAVEPORT+" with master "+MASTERPORT+" database:"+dbname+" tablespace "+tablespace);
    }

    public MPIWorker(String dbname, String remotedb, int tablespace, String masterPort, String slavePort, int l3Cache) throws IOException {
        this(dbname, tablespace, masterPort, slavePort, l3Cache);
        this.remoteDb = remotedb;
    }

    public NodeBlockBuffer getBlockBuffer() {
        return blockBuffer;
    }

    /**
     * Queue a request on this worker; the request is assumed to be for this tablespace.
     * Instead of queuing to a running thread's request queue, queue this as an outbound message
     * back to the master. The type is IoResponseInterface and contains the Id and the payload.
     * @param irf
     */
    public void queueResponse(IoResponseInterface irf) {
        if( DEBUG )
            System.out.println("Adding response "+irf+" to outbound from worker to port:"+MASTERPORT);
        try {
            // serialize the response and send it back to the master over the intercommunicator
            sendData = GlobalDBIO.getObjectAsBytes(irf);
            //ByteBuffer srcs = ByteBuffer.wrap(sendData);
            //int rank = -1;
            bout = MPI.newByteBuffer(sendData.length);
            bout.put(sendData);
            bout.flip();
            master.send(bout, sendData.length, MPI.BYTE, 1, 99);
        } catch (IOException | MPIException e) {
            System.out.println("Send error "+e+" on port "+MASTERPORT);
            throw new RuntimeException(e);
        }
    }
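
    /*
     * For orientation only: a hypothetical sketch of how the master side could consume what
     * queueResponse() sends, assuming it holds the matching Intercomm ('worker') from an
     * MPI_COMM_ACCEPT and a buffer sized like the commented field above. The source rank (0)
     * and buffer size are assumptions; the real master logic lives in the cluster master classes.
     *
     *   ByteBuffer in = MPI.newByteBuffer(LogToFile.DEFAULT_LOG_BUFFER_SIZE);
     *   worker.recv(in, in.capacity(), MPI.BYTE, 0, 99); // tag 99 matches the send above
     *   IoResponseInterface resp = (IoResponseInterface) GlobalDBIO.deserializeObject(in);
     */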

    /**
     * Spin the worker; get the tablespace from the command line parameters.
     * @param args
     * @throws Exception
     */
    public static void main(String args[]) throws Exception {
        if( args.length < 5 ) {
            System.out.println("Usage: java com.neocoretechs.bigsack.io.cluster.mpi.MPIWorker [database] [remotedb] [tablespace] [master port] [slave port]");
            return;
        }
        // Use mmap mode 0
        ThreadPoolManager.getInstance().spin(new MPIWorker(args[0], args[1], Integer.valueOf(args[2]), args[3], args[4], 0));
    }

    @Override
    public void run() {
        while(shouldRun) {
            try {
                bin.clear();
                master.recv(bin, bin.capacity(), MPI.BYTE, 1, 99);
                //IoResponseInterface iori = (IoResponseInterface) GlobalDBIO.deserializeObject(bin);
                // get the original request from the stored table
                //IoRequestInterface ior = requestContext.get(iori.getUUID());
                CompletionLatchInterface iori = (CompletionLatchInterface) GlobalDBIO.deserializeObject(bin);
                if( DEBUG )
                    System.out.println("MPIWorker FROM REMOTE on port:"+SLAVEPORT+" "+iori);
                // Hook the request up to a real IoWorker
                iori.setIoInterface(this);
                // put the received request on the processing stack
                getRequestQueue().put(iori);
            } catch(IOException ioe) {
                System.out.println("MPIWorker receive exception "+ioe+" on port "+SLAVEPORT);
                break;
            } catch (InterruptedException e) {
                // The blocking request queue was waiting on a 'put' because it was at maximum capacity
                // and the ExecutorService requested a shutdown during that time; bail from the thread and exit.
                break;
            } catch (MPIException e) {
                System.out.println("MPIWorker MPI error "+e+" on port "+SLAVEPORT);
                break;
            }
        }
        // thread has been stopped by WorkBoot or by error
        try {
            Intracomm.closePort(MASTERPORT);
            MPI.Finalize();
            MPI.Init(null);
        } catch (MPIException e3) {}
    }

    @Override
    public String getMasterPort() {
        return MASTERPORT;
    }

    @Override
    public String getSlavePort() {
        return SLAVEPORT;
    }

    public void stopWorker() {
        // thread has been stopped by WorkBoot
        shouldRun = false;
    }
}
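
/*
 * Connection sketch (illustration only): the constructor above is the MPI_COMM_CONNECT side of the
 * handshake. A master/WorkBoot side would publish the port and accept the connection before pushing
 * serialized requests that run() picks up. The openPort()/accept() calls below are assumed to exist
 * in these Java bindings as counterparts of the closePort()/connect() calls used in this class;
 * the ranks and the request object ('someClusterRequest') are hypothetical.
 *
 *   MPI.Init(args);
 *   String masterPort = Intracomm.openPort();                     // MPI_OPEN_PORT (assumed binding)
 *   Intercomm worker = MPI.COMM_WORLD.accept(masterPort, 1);      // MPI_COMM_ACCEPT (assumed binding)
 *   byte[] req = GlobalDBIO.getObjectAsBytes(someClusterRequest); // a CompletionLatchInterface impl
 *   ByteBuffer out = MPI.newByteBuffer(req.length);
 *   out.put(req);
 *   out.flip();
 *   worker.send(out, req.length, MPI.BYTE, 1, 99);                // received by run() above
 */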