package io.nextop.client.node.nextop;
import com.google.common.io.ByteStreams;
import io.nextop.Id;
import io.nextop.Message;
import io.nextop.Wire;
import io.nextop.WireValue;
import io.nextop.client.MessageContext;
import io.nextop.client.MessageControl;
import io.nextop.client.MessageControlNode;
import io.nextop.client.MessageControlState;
import io.nextop.client.node.AbstractMessageControlNode;
import io.nextop.log.NL;
import io.nextop.util.NoCopyByteArrayOutputStream;
import javax.annotation.Nullable;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
/** Nextop is a symmetric protocol, so the client and server both use an instance
* of this class to communicate. The difference between instances is the
* Wire.Factory, which is responsible for a secure connection.
* The nextop protocol is optimized to pipeline messages up/down. There is a tradeoff
* in ordering in the case the endpoint crashes. Assuming a reliable endpoint, order is maintained. */
public class NextopNode extends AbstractMessageControlNode {
/** Immutable tuning parameters for a {@link NextopNode}. */
public static final class Config {
    /** Number of bytes per chunk when a serialized message is split for writing. */
    public final int chunkBytes;

    /** @param chunkBytes the chunk size, in bytes, used to split outgoing messages */
    public Config(int chunkBytes) {
        this.chunkBytes = chunkBytes;
    }
}
/** Configuration used by the no-arg constructor: 4 KiB chunks. */
public static final Config DEFAULT_CONFIG = new Config(/* aim for one packet per chunk */ 4 * 1024);
/** Strategy that compresses every message except those whose content type is IMAGE or BLOB.
 * A message without content is compressed. */
public static final CompressionStrategy COMPRESS_NON_BINARY = new CompressionStrategy() {
    @Override
    public boolean isCompress(Message message) {
        @Nullable WireValue content = message.getContent();
        if (null != content) {
            switch (content.getType()) {
                case IMAGE:
                case BLOB:
                    // binary payloads are sent as-is
                    return false;
                default:
                    break;
            }
        }
        return true;
    }
};
/** Default hold time (ms) for upstream active at init; see {@link #initSelf}. */
private static final int DEFAULT_T_STARTUP_MS = 3000;
/** Default delay (ms) before upstream active is dropped after a disconnect. */
private static final int DEFAULT_T_DROP_MS = 2 * DEFAULT_T_STARTUP_MS;
final Config config;
@Nullable
Wire.Factory wireFactory;
/** Read by the control looper thread each time a wire is created, hence volatile. */
@Nullable
volatile Wire.Adapter wireAdapter = null;
/** Written in {@link #onActive} and polled by the control looper thread as its loop condition.
 * Volatile so the looper is guaranteed to observe a deactivation even when it does not
 * notice the accompanying interrupt. */
volatile boolean active = false;
@Nullable
ControlLooper controlLooper = null;
final SharedTransferState sts;
CompressionStrategy compressionStrategy = COMPRESS_NON_BINARY;
final UpstreamActive upstreamActive;
final int startupMs = DEFAULT_T_STARTUP_MS;
final int dropTimeoutMs = DEFAULT_T_DROP_MS;
/** Creates a node with {@link #DEFAULT_CONFIG}. */
public NextopNode() {
    this(DEFAULT_CONFIG);
}

/** Creates a node with the given configuration.
 * @param config chunking parameters used when writing messages */
public NextopNode(Config config) {
    this.config = config;
    this.sts = new SharedTransferState(this);
    this.upstreamActive = new UpstreamActive();
}
/** Sets the factory used to create wires to the peer. Call before #init.
 * @param wireFactory may also be a MessageControlNode; in that case it is
 * initialized as a sub-node in {@link #initDownstream} and receives the
 * {@link #onActive} calls. This is useful if the wire factory needs to
 * maintain its own network stack. */
public void setWireFactory(final Wire.Factory wireFactory) {
    this.wireFactory = wireFactory;
}
/** Sets an adapter that wraps each wire created by the control looper
 * before the transfer state is synced over it.
 * @param wireAdapter the adapter to apply to new wires */
public void setWireAdapter(final Wire.Adapter wireAdapter) {
    this.wireAdapter = wireAdapter;
}
/////// NODE ///////
/** Initializes the wire factory as a sub-node of this node when it is itself a MessageControlNode. */
@Override
protected void initDownstream(Bundle savedState) {
    final Wire.Factory factory = wireFactory;
    if (!(factory instanceof MessageControlNode)) {
        // a plain factory has no node lifecycle
        return;
    }
    ((MessageControlNode) factory).init(this, savedState);
}
/** Marks upstream active for the estimated startup window. */
@Override
protected void initSelf(@Nullable Bundle savedState) {
    // Upstream active is raised by the control looper on a successful connection to the peer,
    // and dropped by it after #dropTimeoutMs without a successful connection.
    // Until the first connection attempt resolves, hold active for the estimated #startupMs:
    // a request sent inside this window is expected to be net faster/better over nextop
    // than if it were diverted to another route.
    upstreamActive.up(startupMs);
}
/** Starts or stops the control looper on an active transition.
 * A wire factory that is a MessageControlNode is activated before this node
 * and deactivated after it. */
@Override
public void onActive(boolean active) {
    // activation: the wire factory goes first
    if (active && wireFactory instanceof MessageControlNode) {
        ((MessageControlNode) wireFactory).onActive(true);
    }

    final boolean changed = this.active != active;
    if (changed) {
        this.active = active;
        if (!active) {
            // stop the current looper; it observes the interrupt and the cleared flag
            assert null != controlLooper;
            controlLooper.interrupt();
            controlLooper = null;
        } else {
            assert null == controlLooper;
            controlLooper = new ControlLooper();
            controlLooper.start();
        }
    }

    // deactivation: the wire factory goes last
    if (!active && wireFactory instanceof MessageControlNode) {
        ((MessageControlNode) wireFactory).onActive(false);
    }
}
/** Accepts an outgoing (SEND) message control while the node is active.
 * The control is first offered to the message control state as an active control;
 * if that does not handle it, it is added to the state for the write looper. */
@Override
public void onMessageControl(MessageControl mc) {
    assert MessageControl.Direction.SEND.equals(mc.dir);
    assert active;
    if (!active) {
        // TODO else send back upstream?
        return;
    }
    final MessageControlState state = getMessageControlState();
    final boolean handled = state.onActiveMessageControl(mc, upstream);
    if (!handled) {
        state.add(mc);
    }
}
/////// CONNECTIVITY ///////
/** Raises upstream active; posted by the control looper after a successful state sync. */
private void onConnected() {
    // also cancels any pending delayed drop
    upstreamActive.up();
}
/** Schedules upstream active to drop after {@link #dropTimeoutMs} unless a connection is made first. */
private void onDisconnected() {
    final int delayMs = dropTimeoutMs;
    upstreamActive.down(delayMs);
}
/** Posted by the control looper so that {@link #onConnected} runs via {@code post}. */
private final Runnable ON_CONNECTED = new Runnable() {
    @Override
    public void run() {
        NextopNode.this.onConnected();
    }
};
/** Posted by the control looper so that {@link #onDisconnected} runs via {@code post}. */
private final Runnable ON_DISCONNECTED = new Runnable() {
    @Override
    public void run() {
        NextopNode.this.onDisconnected();
    }
};
/** Tracks the active flag reported upstream, with support for a delayed drop.
 * At most one delayed drop is live at a time; it is identified by {@link #pendingDown}. */
final class UpstreamActive {
    boolean active = false;
    /** Wall-clock time (ms) of the live delayed drop, or 0 when there is none. */
    long pendingDownTime = 0L;
    @Nullable
    Runnable pendingDown = null;

    /** Cancels any delayed drop and reports active upstream if not already active. */
    void up() {
        clearPendingDown();
        if (active) {
            return;
        }
        active = true;
        upstream.onActive(true);
    }

    /** Goes active now and schedules a drop after {@code holdMs}. */
    void up(int holdMs) {
        up();
        down(holdMs);
    }

    /** Cancels any delayed drop and reports inactive upstream if currently active. */
    void down() {
        clearPendingDown();
        if (!active) {
            return;
        }
        active = false;
        upstream.onActive(false);
    }

    /** Schedules a drop after {@code delayMs}. Ignored unless it would fire
     * later than the delayed drop that is already pending. */
    void down(int delayMs) {
        final long downTime = System.currentTimeMillis() + delayMs;
        if (downTime <= pendingDownTime) {
            return;
        }
        final Runnable task = new Runnable() {
            @Override
            public void run() {
                // only the most recently scheduled task may take effect
                if (pendingDown == this) {
                    down();
                }
            }
        };
        pendingDownTime = downTime;
        pendingDown = task;
        postDelayed(task, delayMs);
    }

    private void clearPendingDown() {
        pendingDown = null;
        pendingDownTime = 0L;
    }
}
/** Connection control thread. While the node is active it repeatedly:
 * creates a wire, syncs the shared transfer state with the peer, starts a
 * write looper and a read looper on the wire, and waits for them to end. */
final class ControlLooper extends Thread {
    /** Scratch buffer for sync frames. Used only by this thread. */
    final byte[] controlBuffer = new byte[4 * 1024];

    @Override
    public void run() {
        @Nullable SerializationState ss = null;
        @Nullable SharedWireState sws = null;

        top:
        while (active) {
            try {
                if (null == sws || !sws.active) {
                    Wire wire;
                    try {
                        // the previous wire (if any) is handed to the factory
                        wire = wireFactory.create(null != sws ? sws.wire : null);
                    } catch (NoSuchElementException e) {
                        sws = null;
                        continue top;
                    }

                    Wire.Adapter wireAdapter = NextopNode.this.wireAdapter;
                    if (null != wireAdapter) {
                        wire = wireAdapter.adapt(wire);
                    }

                    {
                        long startNanos = System.nanoTime();
                        try {
                            syncTransferState(wire);
                        } catch (IOException e) {
                            // FIXME log
                            e.printStackTrace();
                            continue top;
                        }
                        NL.nl.metric("node.nextop.control.sync", System.nanoTime() - startNanos, TimeUnit.NANOSECONDS);
                    }

                    post(ON_CONNECTED);

                    sws = new SharedWireState(wire);
                    if (null == ss) {
                        // the serialization buffers are reused across wires
                        ss = new SerializationState();
                    }

                    WriteLooper writeLooper = new WriteLooper(sws, ss);
                    ReadLooper readLooper = new ReadLooper(sws);
                    sws.writeLooper = writeLooper;
                    sws.readLooper = readLooper;
                    writeLooper.start();
                    readLooper.start();
                } // else it was just an interruption

                try {
                    sws.awaitEnd();
                } catch (InterruptedException e) {
                    continue top;
                }
            } catch (Exception e) {
                // FIXME log
                e.printStackTrace();
                continue top;
            } finally {
                // runs at the end of every iteration, including the continue paths above
                post(ON_DISCONNECTED);
            }
        }

        // no longer active: shut down the loopers of the last wire and wait for them
        if (null != sws) {
            sws.end();
            while (true) {
                try {
                    sws.awaitEnd();
                    break;
                } catch (InterruptedException e) {
                    continue;
                }
            }
        }
    }

    /** Exchanges the shared transfer state with the peer over a fresh wire.
     * Sends the ids of the local write states, receives the peer's ids, and
     * drops every local read state whose id the peer did not send.
     * If this thread is interrupted while waiting for the writer, the wait is
     * completed and the interrupt status is restored before returning.
     * @throws IOException on a wire failure or an unexpected sync frame */
    // FIXME see notes in SharedTransferState
    void syncTransferState(final Wire wire) throws IOException {
        sts.membar();

        // each side sends SharedTransferState (id->transferred chunks)
        // each side removes parts of the shared transfer state that the other side does not have

        // FIXME
        // send write state header
        // receive other side write state header
        // loop where write and read interleave
        // make changes to read state using other side write state
        // make changes to write state using other side read state

        // TODO what are the most important changes to make?
        // TODO this only matters when ACK is moved to complete not receive (there is a "hanging" message that both sides acknowledge)
        // - if any in pendingWrite that are not in pendingRead, move from pendingWrite to write
        // - if any are in pendingRead that are not in pendingWrite,

        // TODO for now, just remove any readState that is not in the other writeState. this frees up memory

        // F_SYNC_WRITE_STATE [frame count] [frame+]
        // frame := [id]

        final int n = sts.writeStates.size();
        final int m;
        {
            int c = 0;
            {
                controlBuffer[c] = F_SYNC_WRITE_STATE;
                c += 1;
                WireValue.putint(controlBuffer, c, n);
                c += 4;
                wire.write(controlBuffer, 0, c, 0);
                wire.flush();
            }
            // the peer's header has the same length as the one just written
            wire.read(controlBuffer, 0, c, 0);
            c = 0;
            if (F_SYNC_WRITE_STATE != controlBuffer[c]) {
                // FIXME log
                throw new IOException("Bad sync header.");
            }
            c += 1;
            m = WireValue.getint(controlBuffer, c);
        }

        final int bytesPerFrame = Id.LENGTH;

        // write (on a separate thread so both sides can write and read at the same time)
        class Writer extends Thread {
            // The writer needs its own buffer: the read loop below fills controlBuffer
            // from the wire while this thread is still encoding ids.
            final byte[] writeBuffer = new byte[controlBuffer.length];
            int i = 0;
            Iterator<MessageWriteState> itr = sts.writeStates.values().iterator();
            @Nullable IOException e = null;

            @Override
            public void run() {
                try {
                    for (int writeCount; 0 < (writeCount = Math.min(n - i, writeBuffer.length / bytesPerFrame)); ) {
                        for (int k = 0; k < writeCount; ++k) {
                            MessageWriteState writeState = itr.next();
                            Id.toBytes(writeState.id, writeBuffer, k * bytesPerFrame);
                        }
                        wire.write(writeBuffer, 0, writeCount * bytesPerFrame, 0);
                        i += writeCount;
                    }
                    wire.flush();
                } catch (IOException e) {
                    // FIXME log
                    e.printStackTrace();
                    this.e = e;
                }
            }
        }
        Writer writer = new Writer();
        writer.start();

        // read
        int j = 0;
        Id[] pairs = new Id[m];
        for (int readCount; 0 < (readCount = Math.min(m - j, controlBuffer.length / bytesPerFrame)); ) {
            wire.read(controlBuffer, 0, readCount * bytesPerFrame, 0);
            for (int k = 0; k < readCount; ++k) {
                Id id = Id.fromBytes(controlBuffer, k * bytesPerFrame);
                pairs[j + k] = id;
            }
            j += readCount;
        }

        // process read pairs
        // remove any read state that does not have a pair id
        // (hash set so the retain is linear rather than a list scan per key)
        sts.readStates.keySet().retainAll(new HashSet<Id>(Arrays.asList(pairs)));

        boolean interrupted = false;
        try {
            while (true) {
                try {
                    writer.join();
                    break;
                } catch (InterruptedException e) {
                    // can't interrupt io; finish the wait and restore the interrupt below
                    interrupted = true;
                }
            }
            if (null != writer.e) {
                throw writer.e;
            }

            // end
            {
                int c = 0;
                {
                    controlBuffer[c] = F_SYNC_END;
                    c += 1;
                    controlBuffer[c] = SYNC_STATUS_OK;
                    c += 1;
                    wire.write(controlBuffer, 0, c, 0);
                    wire.flush();
                }
                wire.read(controlBuffer, 0, c, 0);
                c = 0;
                if (F_SYNC_END != controlBuffer[c]) {
                    // FIXME log
                    throw new IOException("Bad sync end.");
                }
                c += 1;
                if (SYNC_STATUS_OK != controlBuffer[c]) {
                    // FIXME log
                    throw new IOException("Bad sync status.");
                }
            }

            sts.membar();
        } finally {
            if (interrupted) {
                // Do not lose the interrupt: onActive(false) uses it to stop this looper.
                Thread.currentThread().interrupt();
            }
        }
    }
}
/* nextop framed format:
* [byte type][next bytes depend on type] */
// FIXME finish
/** State shared by the write looper and read looper of one wire.
 * {@link #active} goes false exactly once, when either looper or the control looper calls {@link #end}. */
static final class SharedWireState {
    final Wire wire;
    volatile boolean active = true;
    // assigned by the control looper before the loopers are started
    WriteLooper writeLooper;
    ReadLooper readLooper;

    SharedWireState(Wire wire) {
        this.wire = wire;
    }

    /** Marks the wire state ended, wakes waiters, and interrupts both loopers. */
    void end() {
        markEnded();
        writeLooper.interrupt();
        readLooper.interrupt();
    }

    /** Blocks until {@link #end} has been called and both loopers have terminated.
     * @throws InterruptedException if interrupted while waiting or joining */
    void awaitEnd() throws InterruptedException {
        awaitInactive();
        writeLooper.join();
        readLooper.join();
    }

    private synchronized void markEnded() {
        active = false;
        notifyAll();
    }

    private synchronized void awaitInactive() throws InterruptedException {
        while (active) {
            wait();
        }
    }
}
/** Scratch space for serializing outgoing messages: an 8 MiB array and a buffer view over it. */
static final class SerializationState {
    // FIXME need to work more on memory footprint.
    final byte[] serBytes;
    /** Wraps {@link #serBytes}; writes to the buffer land in the array. */
    final ByteBuffer serBuffer;

    SerializationState() {
        serBytes = new byte[8 * 1024 * 1024];
        serBuffer = ByteBuffer.wrap(serBytes);
    }
}
/** Writer thread for one wire. Takes entries from the message control state,
 * serializes (and optionally gzips) each message, and writes it as
 * F_MESSAGE_START, one F_MESSAGE_CHUNK per chunk, then F_MESSAGE_END.
 * Urgent frames queued in sts.writeUrgentMessages are written between frames.
 * After each chunk the looper checks for an entry to preempt the current one.
 * An IOException is fatal for the wire: the shared wire state is ended.
 * NOTE(review): only IOException is caught in run(); an unchecked exception
 * (e.g. from serialization) would end this thread without calling sws.end() - confirm. */
final class WriteLooper extends Thread {
final SharedWireState sws;
// shared with the control looper; reused across wires
final SerializationState ss;
final MessageControlState mcs = getMessageControlState();
// scratch buffer for frame headers
final byte[] controlBuffer = new byte[1024];
WriteLooper(SharedWireState sws, SerializationState ss) {
this.sws = sws;
this.ss = ss;
}
@Override
public void run() {
sts.membar();
// take top
// write
// after every chunk, check if there is a more important entry before writing the next chunk
// if so put the current one back
// the entry currently being written; null when the next one must be taken
@Nullable MessageControlState.Entry entry = null;
try {
NL.nl.message("node.nextop.write", "Start write loop");
top:
while (sws.active) {
pollUrgent();
if (null == entry) {
// non-blocking take first; if nothing is available, flush and then block
entry = mcs.takeFirstAvailable(NextopNode.this);
if (null == entry) {
{
sws.wire.flush();
}
try {
entry = mcs.takeFirstAvailable(NextopNode.this, Integer.MAX_VALUE, TimeUnit.MILLISECONDS);
assert null != entry;
if (null == entry) {
continue;
}
} catch (InterruptedException e) {
// the read looper interrupts this thread after queueing an urgent message,
// and sws.end() interrupts it on shutdown; loop to re-check both
continue;
}
}
}
// NOTE(review): nothing in this class puts into sts.writeStates, so unless it is
// populated elsewhere this lookup always misses and the message is re-serialized - confirm
@Nullable MessageWriteState writeState = sts.writeStates.get(entry.id);
if (null == writeState) {
{
long startNanos = System.nanoTime();
final ByteBuffer serBuffer = ss.serBuffer;
final byte[] serBytes = ss.serBytes;
// create it
byte[] bytes;
boolean compressed;
try {
// serialize into the shared scratch buffer; n is the serialized length
pkg(entry.mc).toBytes(serBuffer);
serBuffer.flip();
int n = serBuffer.remaining();
assert pkg(entry.mc).equals(WireValue.valueOf(serBytes));
if (compressionStrategy.isCompress(entry.message)) {
try {
// the output stream is constructed over serBytes with n, the end of the serialized input
// NOTE(review): presumably output is placed after offset n to avoid a second buffer - verify against NoCopyByteArrayOutputStream
NoCopyByteArrayOutputStream os = new NoCopyByteArrayOutputStream(serBytes, n);
GZIPOutputStream gzos = new GZIPOutputStream(os);
try {
gzos.write(serBytes, 0, n);
gzos.finish();
} finally {
gzos.close();
}
bytes = os.toByteArray();
compressed = true;
} catch (OutOfMemoryError e) {
// Don't compress - too much temp space needed
bytes = new byte[n];
System.arraycopy(serBytes, 0, bytes, 0, n);
compressed = false;
}
} else {
bytes = new byte[n];
System.arraycopy(serBytes, 0, bytes, 0, n);
compressed = false;
}
} finally {
// always reset the shared buffer for the next message
serBuffer.clear();
}
assert 0 < bytes.length;
// split into ceil(length / chunkBytes) chunks; chunk i starts at i * chunkBytes
int chunkCount = (bytes.length + config.chunkBytes - 1) / config.chunkBytes;
int[] chunkOffsets = new int[chunkCount];
chunkOffsets[0] = 0;
for (int i = 1; i < chunkCount; ++i) {
chunkOffsets[i] = chunkOffsets[i - 1] + config.chunkBytes;
}
writeState = new MessageWriteState(entry.id, bytes, chunkOffsets, compressed);
NL.nl.metric("node.nextop.write.state", System.nanoTime() - startNanos, TimeUnit.NANOSECONDS);
NL.nl.count("node.nextop.write.%s", entry.mc.type);
}
}
// n is the chunk count from here on
final int n = writeState.chunkOffsets.length;
// F_MESSAGE_START [id][total length][total chunks][compressed]
{
long startNanos = System.nanoTime();
{
int c = 0;
controlBuffer[c] = F_MESSAGE_START;
c += 1;
Id.toBytes(entry.id, controlBuffer, c);
c += Id.LENGTH;
WireValue.putint(controlBuffer, c, writeState.bytes.length);
c += 4;
WireValue.putint(controlBuffer, c, n);
c += 4;
controlBuffer[c] = writeState.compressed ? (byte) 0x01 : (byte) 0x00;
c += 1;
sws.wire.write(controlBuffer, 0, c, 0);
}
NL.nl.metric("node.nextop.write.start", System.nanoTime() - startNanos, TimeUnit.NANOSECONDS);
}
for (int i = 0; i < n; ++i) {
pollUrgent();
if (!writeState.chunkWrites[i]) {
if (null != entry.end) {
// ended
// the entry was ended while being written; abandon it without F_MESSAGE_END
entry = null;
continue top;
}
// write it
int start = writeState.chunkOffsets[i];
// the last chunk runs to the end of the bytes
int end = i + 1 < n ? writeState.chunkOffsets[i + 1] : writeState.bytes.length;
assert start < end;
// F_MESSAGE_CHUNK [chunk index][chunk offset][chunk length][data]
{
long startNanos = System.nanoTime();
{
{
int c = 0;
controlBuffer[c] = F_MESSAGE_CHUNK;
c += 1;
WireValue.putint(controlBuffer, c, i);
c += 4;
WireValue.putint(controlBuffer, c, start);
c += 4;
WireValue.putint(controlBuffer, c, end - start);
c += 4;
sws.wire.write(controlBuffer, 0, c, 0);
}
sws.wire.write(writeState.bytes, start, end - start, 0);
}
NL.nl.metric("node.nextop.write.chunk", System.nanoTime() - startNanos, TimeUnit.NANOSECONDS);
}
writeState.chunkWrites[i] = true;
// preemption check: if another entry is returned, release the current one and switch to it
@Nullable MessageControlState.Entry preemptEntry = mcs.takeFirstAvailable(entry.id, NextopNode.this);
if (null != preemptEntry) {
mcs.release(entry.id, NextopNode.this);
entry = preemptEntry;
continue top;
}
}
}
// F_MESSAGE_END
{
long startNanos = System.nanoTime();
{
int c = 0;
controlBuffer[c] = F_MESSAGE_END;
c += 1;
sws.wire.write(controlBuffer, 0, c, 0);
}
NL.nl.metric("node.nextop.write.end", System.nanoTime() - startNanos, TimeUnit.NANOSECONDS);
}
// done with entry, transfer to pending ack
mcs.remove(entry.id, MessageControlState.End.COMPLETED);
sts.writePendingAck.add(entry.mc);
entry = null;
}
{
sws.wire.flush();
}
} catch (IOException e) {
// FIXME log
e.printStackTrace();
// fatal
sws.end();
}
// give back an entry that was taken but not fully written
if (null != entry) {
mcs.release(entry.id, NextopNode.this);
entry = null;
}
NL.nl.message("node.nextop.write", "End write loop");
sts.membar();
}
/** Writes every queued urgent frame (see ack/nack) to the wire, in queue order.
 * Records a metric only when at least one frame was written. */
private void pollUrgent() throws IOException {
{
// u counts the frames written in this call
int u = 0;
long startNanos = System.nanoTime();
for (byte[] urgentMessage; null != (urgentMessage = sts.writeUrgentMessages.poll()); ) {
sws.wire.write(urgentMessage, 0, urgentMessage.length, 0);
u += 1;
}
if (0 < u) {
NL.nl.metric("node.nextop.write.urgent", System.nanoTime() - startNanos, TimeUnit.NANOSECONDS);
}
}
}
}
/** Reader thread for one wire. Reads frames one at a time and dispatches on the
 * frame type: assembles incoming messages chunk by chunk, acks or nacks them on
 * F_MESSAGE_END, and applies acks/nacks from the peer to the pending writes.
 * An IOException (including a protocol error) is fatal for the wire: the shared
 * wire state is ended. */
final class ReadLooper extends Thread {
    final SharedWireState sws;
    final MessageControlState mcs = getMessageControlState();
    /** Scratch buffer for frame headers. */
    final byte[] controlBuffer = new byte[1024];

    ReadLooper(SharedWireState sws) {
        this.sws = sws;
    }

    @Override
    public void run() {
        // FIXME
        // as soon as get a COMPLETE, send an ACK (this is not resilient to crash, but works for now to keep the client buffer limited)
        // on F_MESSAGE_COMPLETE or F_MESSAGE_CHUNK, if there is a verification error, send back a NACK
        // if read NACK, move message from pendingWrite back to mcs

        sts.membar();

        // id and read state of the message most recently started with F_MESSAGE_START
        @Nullable Id id = null;
        @Nullable MessageReadState readState = null;

        try {
            NL.nl.message("node.nextop.read", "Start read loop");
            top:
            while (sws.active) {
                sws.wire.read(controlBuffer, 0, 1, 0);
                {
                    long startNanos = System.nanoTime();
                    byte type = controlBuffer[0];
                    switch (type) {
                        case F_MESSAGE_START: {
                            // F_MESSAGE_START [id][total length][total chunks][compressed]
                            int c = Id.LENGTH + 4 + 4 + 1;
                            sws.wire.read(controlBuffer, 0, c, 0);
                            c = 0;
                            id = Id.fromBytes(controlBuffer, c);
                            c += Id.LENGTH;
                            int length = WireValue.getint(controlBuffer, c);
                            c += 4;
                            int chunkCount = WireValue.getint(controlBuffer, c);
                            c += 4;
                            boolean compressed = (0xFF & controlBuffer[c]) != 0;

                            // resume an existing read state for this id if there is one
                            readState = sts.readStates.get(id);
                            if (null == readState) {
                                // create it
                                try {
                                    readState = new MessageReadState(id, length, chunkCount, compressed);
                                } catch (RuntimeException e) {
                                    // The header values come from the peer. Treat values the read state
                                    // rejects as a protocol error so the wire is ended, instead of
                                    // letting an unchecked exception end this thread silently.
                                    throw new IOException("Bad message start.", e);
                                }
                                sts.readStates.put(id, readState);
                            }
                            break;
                        }
                        case F_MESSAGE_CHUNK: {
                            // F_MESSAGE_CHUNK [chunk index][chunk offset][chunk length][data]
                            // Always consume the header, even when the chunk will be dropped,
                            // so the next read starts on a frame boundary.
                            int c = 4 + 4 + 4;
                            sws.wire.read(controlBuffer, 0, c, 0);
                            c = 0;
                            int chunkIndex = WireValue.getint(controlBuffer, c);
                            c += 4;
                            int start = WireValue.getint(controlBuffer, c);
                            c += 4;
                            int chunkLength = WireValue.getint(controlBuffer, c);
                            if (chunkLength < 0) {
                                // the payload length is unknown, so the stream cannot be resynchronized
                                throw new IOException("Protocol error.");
                            }

                            if (null == readState) {
                                // no message in progress: discard the payload
                                sws.wire.skip(chunkLength, 0);
                                continue top;
                            }

                            int end = start + chunkLength;

                            // verify that the values do not conflict with existing values
                            // designed so that each index passing verification implies that the entire read state is valid
                            boolean conflict = false;
                            try {
                                if (start < 0 || end < start || readState.bytes.length < end) {
                                    // the range does not fit in the message bytes (end < start covers int overflow)
                                    conflict = true;
                                } else if (readState.chunkReads[chunkIndex]) {
                                    // already read
                                    conflict = false;
                                } else {
                                    if (0 <= chunkIndex - 1 && readState.chunkReads[chunkIndex - 1]) {
                                        // the previous chunk was read and set the index of the current chunk
                                        if (start != readState.chunkOffsets[chunkIndex]) {
                                            // index does not match value set in previous chunk
                                            conflict = true;
                                        }
                                    }
                                    if (chunkIndex + 1 < readState.chunkOffsets.length) {
                                        // the start of the next chunk is only known once the next chunk
                                        // itself has been read (this chunk has not been read yet)
                                        if (readState.chunkReads[chunkIndex + 1] && end != readState.chunkOffsets[chunkIndex + 1]) {
                                            // end does not match known
                                            conflict = true;
                                        }
                                    } else {
                                        if (end != readState.bytes.length) {
                                            // end does not match known
                                            conflict = true;
                                        }
                                    }
                                }
                            } catch (Exception e) {
                                // FIXME log
                                e.printStackTrace();
                                // index out of bounds, etc
                                conflict = true;
                            }

                            if (conflict) {
                                NL.nl.count("node.nextop.read.conflict");
                                // discard chunk content
                                sws.wire.skip(chunkLength, 0);
                                // FIXME log this
                                sts.writeUrgentMessages.add(nack(id));
                                sws.writeLooper.interrupt();
                                // discard the read state
                                // NOTE(review): readState itself is left set, so the remaining chunks of this
                                // message are still consumed and F_MESSAGE_END nacks again - confirm intended
                                sts.readStates.remove(id);
                                continue top;
                            }

                            // read chunk content
                            sws.wire.read(readState.bytes, start, chunkLength, 0);
                            readState.chunkReads[chunkIndex] = true;
                            readState.chunkOffsets[chunkIndex] = start;
                            if (chunkIndex + 1 < readState.chunkOffsets.length) {
                                // set the next start, used for conflict detection (see above)
                                readState.chunkOffsets[chunkIndex + 1] = end;
                            }
                            break;
                        }
                        case F_MESSAGE_END: {
                            if (null == readState) {
                                continue top;
                            }
                            // F_MESSAGE_END
                            // nothing to read

                            // a missing chunk means the message is incomplete: nack and drop it
                            for (int i = 0, n = readState.chunkOffsets.length; i < n; ++i) {
                                if (!readState.chunkReads[i]) {
                                    sts.writeUrgentMessages.add(nack(id));
                                    sws.writeLooper.interrupt();
                                    // discard the read state
                                    sts.readStates.remove(id);
                                    readState = null;
                                    continue top;
                                }
                            }

                            // received
                            // TODO move this to where the message is actually completed (ack on complete not receive)
                            // TODO when ack changed, move message to readPending
                            sts.writeUrgentMessages.offer(ack(id));
                            sws.writeLooper.interrupt();
                            sts.readStates.remove(id);

                            // defer the parsing to the context thread
                            // TODO is this better than inline? (get numbers)
                            post(new Dispatch(id, readState));
                            readState = null;
                            break;
                        }
                        case F_ACK: {
                            // F_ACK [id]
                            int c = Id.LENGTH;
                            sws.wire.read(controlBuffer, 0, c, 0);
                            c = 0;
                            Id uid = Id.fromBytes(controlBuffer, c);

                            // remove from pending
                            sts.writePendingAck.remove(uid, MessageControlState.End.COMPLETED);
                            break;
                        }
                        case F_NACK: {
                            NL.nl.count("node.nextop.read.nack");
                            // F_NACK [id]
                            int c = Id.LENGTH;
                            sws.wire.read(controlBuffer, 0, c, 0);
                            c = 0;
                            Id uid = Id.fromBytes(controlBuffer, c);

                            // move from pending to active
                            @Nullable MessageControl mc = sts.writePendingAck.remove(uid, MessageControlState.End.ERROR);
                            if (null != mc) {
                                mcs.add(mc);
                            } else {
                                // this would be a bug in sync state - one node thought the other had something it doesn't
                                assert false;
                            }
                            break;
                        }
                        default:
                            // protocol error
                            throw new IOException("Protocol error.");
                    }
                    NL.nl.metric("node.nextop.read.%s", System.nanoTime() - startNanos, TimeUnit.NANOSECONDS, type);
                }
            }
        } catch (IOException e) {
            // FIXME log
            e.printStackTrace();
            // fatal
            sws.end();
        }
        NL.nl.message("node.nextop.read", "End read loop");

        sts.membar();
    }

    /** Posted for a fully received message: decompresses if needed, unpackages the
     * message control, and delivers it upstream as a received message.
     * Any failure queues a nack for the message id. */
    final class Dispatch implements Runnable {
        final Id id;
        final MessageReadState readState;

        Dispatch(Id id, MessageReadState readState) {
            this.id = id;
            this.readState = readState;
        }

        @Override
        public void run() {
            try {
                WireValue pkg;
                if (readState.compressed) {
                    NoCopyByteArrayOutputStream os = new NoCopyByteArrayOutputStream(1024);
                    GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(readState.bytes));
                    try {
                        ByteStreams.copy(gzis, os);
                    } finally {
                        // release the inflater promptly instead of waiting for finalization
                        gzis.close();
                    }
                    // TODO copy this if the overhead is too large (the byte buffer has padding at the end)
                    pkg = WireValue.valueOf(os.getBytes(), os.getOffset());
                } else {
                    pkg = WireValue.valueOf(readState.bytes);
                }
                MessageControl mc = unpkg(pkg);
                NL.nl.count("node.nextop.read.%s", mc.type);
                upstream.onMessageControl(MessageControl.receive(mc.type, mc.message));
            } catch (Exception e) {
                // FIXME the nack might create an infinite retry here; think about something better
                // FIXME possibly just ban the session since it's running an incompatible version
                sts.writeUrgentMessages.add(nack(id));
                sws.writeLooper.interrupt();
                NL.nl.unhandled("node.nextop.read", e);
            }
        }
    }
}
// urgent messages
/** Builds an urgent F_NACK frame for the given message id.
 * Layout: F_NACK [id] */
static byte[] nack(Id id) {
    final byte[] frame = new byte[1 + Id.LENGTH];
    frame[0] = F_NACK;
    // the id follows the one-byte frame type
    Id.toBytes(id, frame, 1);
    return frame;
}
/** Builds an urgent F_ACK frame for the given message id.
 * Layout: F_ACK [id] */
static byte[] ack(Id id) {
    final byte[] frame = new byte[1 + Id.LENGTH];
    frame[0] = F_ACK;
    // the id follows the one-byte frame type
    Id.toBytes(id, frame, 1);
    return frame;
}
// message packaging
/** Packages a message control as the wire value that is serialized for transfer. */
static WireValue pkg(MessageControl mc) {
    final WireValue packaged = MessageControl.toWireValue(mc);
    return packaged;
}
/** Inverse of {@link #pkg}: restores a message control from its wire value. */
static MessageControl unpkg(WireValue value) {
    final MessageControl unpackaged = MessageControl.fromWireValue(value);
    return unpackaged;
}
// FIXME relied on new threads being a membar. all this state is shared across 1+1 (writer+reader) threads in sequence
/** Transfer state that outlives individual wires and is synced with the peer on each connect. */
static final class SharedTransferState {
    // when a message is removed from the shared mcs on write, it goes here
    // these messages are pending ack
    // sync state establishes which of these are still valid. if any are not valid, the client immediately retransmits at the front of the line
    // the nextop node holds these even if the node goes active->false. the protocol is set up so that on reconnect they will get sent.
    // even if a billing outage, getting these sent is an exception - they will always get sent even if the account is in bad standing etc.
    MessageControlState writePendingAck;

    // TODO this matters when the node reads and dispatches, waiting for a complete back
    // TODO store here until the complete/ack (so the message isn't lost)
    // MessageControlState readPendingAck;

    /** single-thread */
    Map<Id, MessageWriteState> writeStates;
    /** single-thread */
    Map<Id, MessageReadState> readStates;

    /** thread-safe */
    Queue<byte[]> writeUrgentMessages;

    SharedTransferState(MessageContext context) {
        this.writePendingAck = new MessageControlState(context);
        // readPendingAck = new MessageControlState(context);

        this.writeStates = new HashMap<Id, MessageWriteState>(32);
        this.readStates = new HashMap<Id, MessageReadState>(32);

        this.writeUrgentMessages = new ConcurrentLinkedQueue<byte[]>();
    }

    /** Empty synchronized method; callers invoke it at thread hand-off points as a memory barrier. */
    synchronized void membar() {
    }
}
/** Write-side state of one message: its serialized bytes, how they are split
 * into chunks, and which chunks have been written so far. */
static final class MessageWriteState {
    final Id id;
    final byte[] bytes;
    final boolean compressed;

    // [0] is the start of the first chunk
    final int[] chunkOffsets;
    /** One flag per chunk; all false until the chunk is written. */
    final boolean[] chunkWrites;

    MessageWriteState(Id id, byte[] bytes, int[] chunkOffsets, boolean compressed) {
        this.id = id;
        this.bytes = bytes;
        this.compressed = compressed;
        this.chunkOffsets = chunkOffsets;
        // a new boolean array starts out all false
        this.chunkWrites = new boolean[chunkOffsets.length];
    }
}
/** Read-side state of one message: the buffer its chunks are read into, the
 * chunk offsets learned so far, and which chunks have been read. */
static final class MessageReadState {
    final Id id;
    final boolean compressed;
    final byte[] bytes;

    // [0] is the start of the first chunk
    final int[] chunkOffsets;
    /** One flag per chunk; all false until the chunk is read. */
    final boolean[] chunkReads;

    /** @param id the message id
     * @param length total number of message bytes
     * @param chunkCount number of chunks the bytes are split into
     * @param compressed whether the bytes are gzip compressed
     * @throws IllegalArgumentException if length or chunkCount is negative,
     * or there are more chunks than bytes */
    MessageReadState(Id id, int length, int chunkCount, boolean compressed) {
        // The sizes come from a frame header sent by the peer. Reject negative values here
        // rather than failing later with a NegativeArraySizeException.
        if (length < 0 || chunkCount < 0 || length < chunkCount) {
            throw new IllegalArgumentException("Invalid read state: length=" + length
                    + ", chunkCount=" + chunkCount);
        }
        this.id = id;
        this.compressed = compressed;
        bytes = new byte[length];
        chunkOffsets = new int[chunkCount];
        chunkReads = new boolean[chunkCount];
    }
}
/////// NEXTOP PROTOCOL ///////
// FIXME be able to transfer MessageControl not just message
// Frame type bytes. Each frame on the wire starts with one of these,
// followed by the fields listed in the constant's comment.
/** [id][total length][total chunks][compressed] */
public static final byte F_MESSAGE_START = 0x01;
/** [chunk index][chunk offset][chunk length][data] */
public static final byte F_MESSAGE_CHUNK = 0x02;
/** TODO checksum */
public static final byte F_MESSAGE_END = 0x03;
/** [id]
* ack indicates the node can remove its copy of the message. */
static final byte F_ACK = 0x04;
/** [id]
* nack indicates the node should resend its copy of the message */
static final byte F_NACK = 0x05;
/** [frame count][frame+]
* frame := [id] */
static final byte F_SYNC_WRITE_STATE = 0x70;
/** [status]
* status is a single byte, SYNC_STATUS_OK, SYNC_STATUS_ERROR */
// NOTE(review): F_SYNC_END has the same value as F_SYNC_WRITE_STATE (0x70).
// The sync exchange reads each frame at a fixed position, so the two are never
// told apart by value there, but this looks unintended. Changing it alters the
// wire format, so both peers would have to be updated together - confirm.
static final byte F_SYNC_END = 0x70;
static final byte SYNC_STATUS_OK = 0x00;
static final byte SYNC_STATUS_ERROR = 0x01;
/** Decides, per message, whether the serialized bytes are gzip compressed before writing. */
public static interface CompressionStrategy {
    /** @param message the message about to be written
     * @return true to compress the message, false to send it uncompressed */
    boolean isCompress(Message message);
}
// TODO work out a more robust fallback
// big assumption for ordering: nextop endpoint will not crash
// compromise: maintain order and never lose a message if this is true
// if not true, at least never lose a message (but order will be lost)
// two phase dev:
// (current) phase 1: just get it working, buggy in some cases, no reordering, etc
// phase 2: correctness (never lose), reordering, etc, focus on perf
// shared transfer state:
// id -> bytes, sent index in bytes
// socket control flow:
// - retake timeout (use take state, time since last take, elapsed)
// - create wire (socket) (on timeout, go to [0])
// - initial handshakes
// - initial state sync (sync the shared transfer state)
// - start loopers
// - when any loopers fails, shut down all, go to [0]
//
// WriteLooper
// take off the top of mcs and write
// have a parallel thread that peeks at the next
// every yieldQ write, surface progress, check if there is a more urgent message
// if so, shelve the current and switch
// ReadLooper
// wire format:
// [type][length]
// types:
// - message start [ID]
// - message data [bytes]
// - message end [MD5]
// - (verify error) (ack) (on ack, delete from shared transfer state)
}