package the8472.bencode;

import java.nio.ByteBuffer;
import java.util.stream.IntStream;

/**
 * Event-style tokenizer for bencoded data held in a {@link ByteBuffer}.
 *
 * <p>The tokenizer walks the input once and reports the start and end of every
 * list, dictionary, integer and string to a {@link TokenConsumer}. Tokens are
 * preallocated and reused, so a {@link Token} handed to the consumer is only
 * valid for the duration of the callback.
 *
 * <p>Handled edge-cases:
 * <ul>
 * <li>negative length strings (treated as zero-length)</li>
 * <li>deep dictionary/list nesting</li>
 * <li>zero-prefixed numbers (illegal)</li>
 * <li>dictionary-exit while expecting a value</li>
 * <li>non-string while expecting dict key</li>
 * <li>end of input reached</li>
 * <li>numbers that do not fit into a signed 64bit integer</li>
 * <li>a list/dictionary terminator without an open list/dictionary</li>
 * </ul>
 *
 * <p>TODO: verify correct sorting of dictionary keys.
 *
 * <p>Handled downstream: duplicate dict keys.
 */
public class Tokenizer {

    /** Thrown whenever the input is not well-formed bencoding. */
    public static class BDecodingException extends RuntimeException {

        public BDecodingException(String msg) {
            super(msg);
        }

        public BDecodingException(String msg, Throwable cause) {
            super(msg, cause);
        }
    }

    /** Receives a callback when a token is opened and another when it is closed. */
    public interface TokenConsumer {

        /** Called after {@code st} was placed on the stack; only its start offset and type are set. */
        void push(Token st);

        /** Called just before {@code st} is removed from the stack; its end offset is set. */
        void pop(Token st);
    }

    public enum TokenType {
        LIST, DICT, PREFIXED_STRING, STRING, LONG;
    }

    enum DictState {
        NoExpectation, ExpectKeyOrEnd, ExpectValue
    }

    /**
     * A reusable stack slot describing one token. The type and dictionary
     * state are stored as enum ordinals in single bytes; a {@code state} of
     * {@code -1} marks a slot that currently holds no token.
     */
    public static final class Token {

        private static final TokenType[] tokenEnums = TokenType.values();
        private static final DictState[] stateEnums = DictState.values();

        int start;
        int end;
        byte state;
        byte dictExpect;

        public Token() {
            reset();
        }

        /** Marks the slot as unused: no offsets, no type, no dictionary expectation. */
        void reset() {
            start = -1;
            end = -1;
            state = -1;
            expect(DictState.NoExpectation);
        }

        /** Returns the token type; must not be called on a slot that has been reset. */
        public TokenType type() {
            return tokenEnums[state];
        }

        void type(TokenType t) {
            state = (byte) t.ordinal();
        }

        public DictState expect() {
            return stateEnums[dictExpect];
        }

        void expect(DictState d) {
            dictExpect = (byte) d.ordinal();
        }

        @Override
        public String toString() {
            // a reset slot has no type; printing it must not blow up
            if (state < 0) {
                return "Token: <unset> [" + start + "," + end + "]";
            }
            String st = "Token: " + type();
            if (type() == TokenType.DICT) {
                st += " " + expect();
            }
            st += " [" + start + "," + end + "]";
            return st;
        }
    }

    /** Index of the innermost open token; 0 is the root slot, which never holds a real token. */
    int stackIdx = 0;
    Token[] stack = new Token[256];

    ByteBuffer buf;
    TokenConsumer consumer;

    long lastDecodedNum;
    ByteBuffer lastString;

    public Tokenizer() {
        IntStream.range(0, stack.length).forEach(i -> stack[i] = new Token());
    }

    /** Clears every stack slot and drops the references to the input buffer and last string. */
    void reset() {
        for (Token t : stack) {
            t.reset();
        }
        stackIdx = 0;
        lastString = null;
        buf = null;
    }

    TokenType current() {
        return stack[stackIdx].type();
    }

    Token currentToken() {
        return stack[stackIdx];
    }

    /**
     * Returns the stack slot {@code offset} positions away from the current
     * one, or {@code null} when that position lies below the root slot.
     */
    Token atStackOffset(int offset) {
        int depth = stackIdx + offset;
        if (depth < 0) {
            return null;
        }
        return stack[depth];
    }

    /**
     * Opens a new token of type {@code t} starting at {@code pos} and notifies the consumer.
     *
     * @throws BDecodingException if the enclosing dictionary expects a key and
     *         {@code t} is not a string, or if the nesting limit is exceeded
     */
    void push(TokenType t, int pos) {
        Token parent = currentToken();
        if (parent.expect() == DictState.ExpectKeyOrEnd && t != TokenType.PREFIXED_STRING) {
            throw new BDecodingException("encountered " + t.toString() + " at offset " + pos + " while expecting a dictionary key");
        }

        // check before incrementing so that a failed push leaves the stack index valid
        if (stackIdx + 1 >= stack.length) {
            throw new BDecodingException("nesting too deep");
        }
        stackIdx++;

        Token opened = stack[stackIdx];
        opened.start = pos;
        opened.type(t);
        if (t == TokenType.DICT) {
            opened.expect(DictState.ExpectKeyOrEnd);
        }

        consumer.push(opened);
    }

    /**
     * Closes the innermost token at {@code pos}, notifies the consumer and
     * advances the key/value expectation of the enclosing dictionary, if any.
     *
     * @throws BDecodingException if no token is open, or if a dictionary is
     *         closed directly after a key
     */
    void pop(int pos) {
        // the root slot holds no token; without this guard a stray terminator
        // would surface as an ArrayIndexOutOfBoundsException from Token.type()
        if (stackIdx <= 0) {
            throw new BDecodingException("encountered 'e' (offset: " + (pos - 1) + ") without an open list or dictionary");
        }

        Token closing = stack[stackIdx];

        if (closing.type() == TokenType.DICT && closing.expect() == DictState.ExpectValue) {
            throw new BDecodingException("encountered 'e' (offset: " + buf.position() + ") after dictionary key, expected a value");
        }

        closing.end = pos;
        consumer.pop(closing);

        // the decoded number is only meaningful during the consumer callback above
        lastDecodedNum = -1;
        closing.reset();

        stackIdx--;

        Token parent = currentToken();
        switch (parent.expect()) {
            case ExpectKeyOrEnd:
                parent.expect(DictState.ExpectValue);
                break;
            case ExpectValue:
                parent.expect(DictState.ExpectKeyOrEnd);
                break;
            default:
                break;
        }
    }

    /**
     * Reads a length prefix at the current buffer position and emits a
     * {@link TokenType#STRING} token covering the payload that follows it.
     * A negative length is treated as zero.
     *
     * @throws BDecodingException if the prefix is malformed or the payload
     *         would extend beyond the end of the buffer
     */
    void decodeString() {
        long length = this.parseNum(buf, (byte) ':');
        if (length < 0) {
            length = 0;
        }

        push(TokenType.STRING, buf.position());

        if (length > buf.remaining()) {
            throw new BDecodingException("string (offset: " + buf.position() + " + length: " + length + ") points beyond end of message (length: " + buf.limit() + ")");
        }

        // safe cast: length was just checked against remaining(), which is an int
        buf.position((int) (buf.position() + length));
        pop(buf.position());
    }

    /**
     * Returns a slice of the input buffer covering {@code t.start} (inclusive)
     * to {@code t.end} (exclusive). The position of the input buffer is restored.
     */
    ByteBuffer getSlice(Token t) {
        int oldPos = buf.position();
        buf.position(t.start);
        ByteBuffer slice = buf.slice();
        slice.limit(t.end - t.start);
        buf.position(oldPos);
        return slice;
    }

    /**
     * Returns the value of the integer token that is currently being closed.
     * It is set before the consumer's {@code pop} callback for a
     * {@link TokenType#LONG} and reset to {@code -1} right after any pop.
     */
    public long lastDecodedNum() {
        return lastDecodedNum;
    }

    public int stackIdx() {
        return stackIdx;
    }

    public void inputBuffer(ByteBuffer buf) {
        this.buf = buf;
    }

    public void consumer(TokenConsumer c) {
        this.consumer = c;
    }

    /**
     * Tokenizes one top-level value from the input buffer, starting at its
     * current position. Processing stops once that value is complete; any
     * bytes after it are left unread. An empty buffer produces no tokens.
     *
     * @throws BDecodingException if the input is malformed or ends while
     *         lists/dictionaries are still open
     */
    public void tokenize() {
        while (buf.remaining() > 0) {
            int pos = buf.position();
            byte current = buf.get();

            switch (current) {
                case 'd':
                    push(TokenType.DICT, pos);
                    break;
                case 'i':
                    push(TokenType.LONG, pos);
                    lastDecodedNum = this.parseNum(buf, (byte) 'e');
                    pop(buf.position());
                    break;
                case 'l':
                    push(TokenType.LIST, pos);
                    break;
                case 'e':
                    pop(buf.position());
                    break;
                case '-':
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                    push(TokenType.PREFIXED_STRING, pos);
                    // rewind so that decodeString sees the first byte of the length prefix
                    buf.position(pos);
                    decodeString();
                    pop(buf.position());
                    break;
                default:
                    throw new BDecodingException("unexpected character 0x" + hex(current) + " at offset " + (buf.position() - 1));
            }

            // back at the root: the top-level value is complete
            if (stackIdx <= 0) {
                break;
            }
        }

        if (stackIdx != 0) {
            throw new BDecodingException("reached end of data with unterminated lists/dictionaries on the stack");
        }
    }

    /**
     * Parses an optionally negative decimal number from {@code buf}, consuming
     * everything up to and including {@code terminator}.
     *
     * @param buf        buffer positioned at the first character of the number
     * @param terminator byte that ends the number
     * @return the parsed value
     * @throws BDecodingException if the buffer ends prematurely, a non-digit is
     *         encountered, the number has a leading zero, or the value does not
     *         fit into a signed 64bit integer
     */
    public long parseNum(ByteBuffer buf, byte terminator) {
        // accumulated as a negative value so that Long.MIN_VALUE stays representable
        long negated = 0;
        boolean neg = false;

        byte current = nextPrefixByte(buf);
        if (current == '-') {
            neg = true;
            current = nextPrefixByte(buf);
        }

        int iter = 0;

        // do-while since we expect at least one digit
        do {
            // do zero-check on 2nd character, since 0 itself is a valid length
            if (iter > 0 && negated == 0) {
                throw new BDecodingException("encountered a leading zero at offset " + (buf.position() - 1) + " while decoding a number/string length prefix");
            }

            if (current < '0' || current > '9') {
                throw new BDecodingException("encountered invalid character 0x" + hex(current) + " (offset:" + (buf.position() - 1) + ") while decoding a number/string length prefix, expected 0-9 or " + (char) terminator);
            }

            int digit = current - '0';
            try {
                negated = Math.subtractExact(Math.multiplyExact(negated, 10L), digit);
            } catch (ArithmeticException e) {
                throw overflow(buf, e);
            }

            current = nextPrefixByte(buf);
            iter++;
        } while (current != terminator);

        if (neg) {
            return negated;
        }
        // -Long.MIN_VALUE has no positive counterpart
        if (negated == Long.MIN_VALUE) {
            throw overflow(buf, null);
        }
        return -negated;
    }

    /** Reads the next byte of a number/length prefix, failing if the buffer is exhausted. */
    private static byte nextPrefixByte(ByteBuffer buf) {
        if (buf.remaining() < 1) {
            throw new BDecodingException("end of message reached while decoding a number/string length prefix. offset:" + buf.position());
        }
        return buf.get();
    }

    /** Builds the exception reported when a number exceeds the range of a long. */
    private static BDecodingException overflow(ByteBuffer buf, Throwable cause) {
        return new BDecodingException("number/string length prefix exceeds the range of a signed 64bit integer (offset:" + (buf.position() - 1) + ")", cause);
    }

    /** Formats a single byte via {@code Utils.toHex} for use in error messages. */
    private static String hex(byte b) {
        StringBuilder sb = new StringBuilder();
        Utils.toHex(new byte[] { b }, sb, 1);
        return sb.toString();
    }
}