// Token.java example
//
// Explorer
// basex-master
package org.basex.util;

import java.text.*;
import java.util.*;

/**
 * <p>This class provides convenience operations for handling 'Tokens'.
 * A token is a UTF-8 encoded string. It is represented as a byte array.</p>
 *
 * <p>In order to ensure a consistent representation of tokens in the project, all string
 * conversions should be done via the methods of this class.</p>
 *
 * @author BaseX Team 2005-17, BSD License
 * @author Christian Gruen
 */
public final class Token {
  /** Empty token. */
  public static final byte[] EMPTY = {};
  /** XML token. */
  public static final byte[] XML = token("xml");
  /** XML token with colon. */
  public static final byte[] XMLC = token("xml:");
  /** XMLNS token. */
  public static final byte[] XMLNS = token("xmlns");
  /** XMLNS token with colon. */
  public static final byte[] XMLNSC = token("xmlns:");
  /** ID token. */
  public static final byte[] ID = token("id");
  /** IDRef token. */
  public static final byte[] IDREF = token("ref");
  /** Token 'true'. */
  public static final byte[] TRUE = token("true");
  /** Token 'false'. */
  public static final byte[] FALSE = token("false");
  /** Token 'NaN'. */
  public static final byte[] NAN = token("NaN");
  /** Token 'INF'. */
  public static final byte[] INF = token("INF");
  /** Token '-INF'. */
  public static final byte[] NINF = token("-INF");
  /** Minimum long value. */
  public static final byte[] MINLONG = token("-9223372036854775808");
  /** Space. */
  public static final byte[] SPACE = { ' ' };
  /** Number '0'. */
  public static final byte[] ZERO = { '0' };
  /** Number '-0'. */
  private static final byte[] MZERO = { '-', '0' };
  /** Number '1'. */
  public static final byte[] ONE = { '1' };
  /** Slash. */
  public static final byte[] SLASH = { '/' };
  /** Colon. */
  public static final byte[] COLON = { ':' };

  /** Unicode replacement character. */
  public static final char REPLACEMENT = '\uFFFD';

  /** Maximum number of leading bytes that {@link #hash} includes in a hash value. */
  private static final byte MAXLENGTH = 96;
  /**
   * Overflow threshold for converting tokens to integer values: largest value that can
   * be multiplied by 10 without exceeding {@link Integer#MAX_VALUE}.
   */
  private static final int MAXINT = Integer.MAX_VALUE / 10;
  /**
   * Overflow threshold for converting tokens to long values: largest value that can
   * be multiplied by 10 without exceeding {@link Long#MAX_VALUE}.
   */
  private static final long MAXLONG = Long.MAX_VALUE / 10;

  /** Hex codes. */
  public static final byte[] HEX = token("0123456789ABCDEF");
  /** Reserved characters. */
  private static final byte[] IRIRES = token("!#$%&*'()+,-./:;=?@[]~_");
  /** Reserved characters. */
  private static final byte[] RES = token("-._~");

  /** Comparator for byte arrays. */
  public static final Comparator<byte[]> COMP = new Comparator<byte[]>() {
    @Override
    public int compare(final byte[] o1, final byte[] o2) {
      // delegates to the unsigned, byte-wise comparison of diff()
      return diff(o1, o2);
    }
  };
  /** Case-insensitive comparator for byte arrays. */
  public static final Comparator<byte[]> LC_COMP = new Comparator<byte[]>() {
    @Override
    public int compare(final byte[] o1, final byte[] o2) {
      // both operands are passed through lc() before they are compared with diff()
      // NOTE(review): lc() is not visible here; presumably it returns lower-case tokens
      return diff(lc(o1), lc(o2));
    }
  };

  /** Hidden constructor. */
  // private and empty: the class is not instantiated from outside
  private Token() { }

  /**
   * Returns the specified token as string.
   * @param token token
   * @return string
   */
  public static String string(final byte[] token) {
    // convert the complete byte array
    final int length = token.length;
    return string(token, 0, length);
  }

  /**
   * Returns the specified token as string.
   * @param token token
   * @param start start position
   * @param length length
   * @return string
   */
  public static String string(final byte[] token, final int start, final int length) {
    if(length <= 0) return "";
    // a negative byte indicates a non-ascii character: use the UTF8 decoder
    final int end = start + length;
    int p = start;
    while(p < end) {
      if(token[p++] < 0) return utf8(token, start, length);
    }
    // ascii only: every byte is mapped to a single character
    final char[] chars = new char[length];
    for(int c = 0, t = start; c < length; ++c, ++t) chars[c] = (char) token[t];
    return String.valueOf(chars);
  }

  /**
   * Returns a string of the specified UTF8 token.
   * @param token token
   * @param start start position
   * @param length length
   * @return string
   */
  private static String utf8(final byte[] token, final int start, final int length) {
    // input is assumed to be correct UTF8. if input contains codepoints
    // larger than Character.MAX_CODE_POINT, results might be unexpected.

    final StringBuilder sb = new StringBuilder(length << 1);
    final int end = Math.min(start + length, token.length);
    int pos = start;
    while(pos < end) {
      final int cp = cp(token, pos);
      if(cp >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
        // supplementary codepoint: append high and low surrogate
        final int offset = cp - Character.MIN_SUPPLEMENTARY_CODE_POINT;
        final char high = (char) ((offset >>> 10) + Character.MIN_HIGH_SURROGATE);
        final char low = (char) ((offset & 0x3ff) + Character.MIN_LOW_SURROGATE);
        sb.append(high).append(low);
      } else {
        sb.append((char) cp);
      }
      // proceed with the next character
      pos += cl(token, pos);
    }
    return sb.toString();
  }

  /**
   * Checks if the specified token only consists of ASCII characters.
   * @param token token
   * @return result of check
   */
  public static boolean ascii(final byte[] token) {
    // bytes with the highest bit set are negative
    final int tl = token.length;
    for(int t = 0; t < tl; ++t) {
      if(token[t] < 0) return false;
    }
    return true;
  }

  /**
   * Converts a string to a byte array.
   * All strings should be converted by this function to guarantee
   * a consistent character conversion.
   * @param string string to be converted
   * @return byte array
   */
  public static byte[] token(final String string) {
    final int sl = string.length();
    if(sl == 0) return EMPTY;
    final byte[] bytes = new byte[sl];
    int i = 0;
    while(i < sl) {
      final char ch = string.charAt(i);
      // first non-ascii character: discard the buffer, encode the whole string as UTF8
      if(ch > 0x7F) return utf8(string);
      bytes[i++] = (byte) ch;
    }
    return bytes;
  }

  /**
   * Converts the specified strings to tokens.
   * @param strings strings
   * @return tokens
   */
  public static byte[][] tokens(final String... strings) {
    final int sl = strings.length;
    final byte[][] result = new byte[sl][];
    // convert the strings one by one, preserving their order
    int s = 0;
    for(final String string : strings) result[s++] = token(string);
    return result;
  }

  /**
   * Converts a string to a UTF8 byte array.
   * @param string string to be converted
   * @return byte array
   */
  private static byte[] utf8(final String string) {
    final char[] chars = string.toCharArray();
    final int len = chars.length;
    final TokenBuilder tb = new TokenBuilder(len << 1);
    int c = 0;
    while(c < len) {
      final char ch = chars[c++];
      final int cp;
      if(Character.isHighSurrogate(ch) && c < len && Character.isLowSurrogate(chars[c])) {
        // valid surrogate pair: merge both characters into a single codepoint
        cp = Character.toCodePoint(ch, chars[c++]);
      } else {
        // all other characters (including unpaired surrogates) are added as is
        cp = ch;
      }
      tb.add(cp);
    }
    return tb.finish();
  }

  /**
   * Converts a token from the input encoding to UTF8.
   * @param token token to be converted
   * @param encoding input encoding
   * @return byte array
   */
  public static byte[] utf8(final byte[] token, final String encoding) {
    // UTF8 (comparison by ref.) or no special characters: return input
    final boolean utf8 = encoding == Strings.UTF8;
    if(utf8 || ascii(token)) return token;

    // decode the bytes with the input encoding and re-encode the string as UTF8.
    // if errors occur while converting, an empty token is returned.
    byte[] result;
    try {
      result = token(new String(token, encoding));
    } catch(final Exception ex) {
      Util.debug(ex);
      result = EMPTY;
    }
    return result;
  }

  /**
   * Returns the codepoint (unicode value) of the specified token, starting at
   * the specified position. Returns a unicode replacement character for invalid values.
   * @param token token
   * @param pos character position
   * @return current character
   */
  public static int cp(final byte[] token, final int pos) {
    final byte first = token[pos];
    final int value = first & 0xFF;
    // 0xxxxxxx (and bytes of the form 10xxxxxx): the unsigned byte value is returned
    if(value < 192) return value;
    // number of bytes to be read
    final int bytes = cl(first);
    // incomplete character at the end of the token
    if(pos + bytes > token.length) return REPLACEMENT;

    // payload bits of the second byte
    final int b1 = token[pos + 1] & 0x3F;
    // 110xxxxx 10xxxxxx
    if(bytes == 2) return (first & 0x1F) << 6 | b1;
    final int b2 = token[pos + 2] & 0x3F;
    // 1110xxxx 10xxxxxx 10xxxxxx
    if(bytes == 3) return (first & 0x0F) << 12 | b1 << 6 | b2;
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    final int b3 = token[pos + 3] & 0x3F;
    return (first & 0x07) << 18 | b1 << 12 | b2 << 6 | b3;
  }

  /**
   * Character lengths, indexed by bits 4-6 of the first byte of a non-ASCII
   * character (see {@link #cl(byte)}): bytes starting with {@code 10} yield 1,
   * {@code 110} yields 2, {@code 1110} yields 3 and {@code 1111} yields 4.
   */
  private static final int[] CHLEN = { 1, 1, 1, 1, 2, 2, 3, 4 };

  /**
   * Returns the length of a UTF8 character, based on its first byte.
   * @param cp first byte of the character
   * @return character length
   */
  public static int cl(final byte cp) {
    // ascii bytes are non-negative and have a length of 1
    if(cp >= 0) return 1;
    // bits 4-6 of the byte select the entry of the length table
    final int index = cp >> 4 & 0x7;
    return CHLEN[index];
  }

  /**
   * Returns the length of a UTF8 character at the specified position.
   * @param token token
   * @param pos position
   * @return character length
   */
  public static int cl(final byte[] token, final int pos) {
    // the length is derived from the first byte of the character
    final byte first = token[pos];
    return cl(first);
  }

  /**
   * Converts a token to a sequence of codepoints.
   * @param token token
   * @return codepoints
   */
  public static int[] cps(final byte[] token) {
    final int tl = token.length;
    // upper bound: a token never has more codepoints than bytes
    final int[] cps = new int[tl];
    int count = 0, t = 0;
    while(t < tl) {
      cps[count++] = cp(token, t);
      t += cl(token, t);
    }
    // shrink the array if multi-byte characters were found
    return count < tl ? Arrays.copyOf(cps, count) : cps;
  }

  /**
   * Returns the number of codepoints in the token.
   * @param token token
   * @return number of codepoints
   */
  public static int length(final byte[] token) {
    final int tl = token.length;
    // ascii token: every byte is a codepoint
    if(ascii(token)) return tl;
    // otherwise, jump from character to character
    int count = 0, t = 0;
    while(t < tl) {
      t += cl(token, t);
      ++count;
    }
    return count;
  }

  /**
   * Creates a byte array representation of the specified boolean value.
   * @param bool boolean value to be converted
   * @return boolean value in byte array
   */
  public static byte[] token(final boolean bool) {
    // the shared constants are returned; no new array is created
    if(bool) return TRUE;
    return FALSE;
  }

  /**
   * Creates a byte array representation of the specified integer value.
   * @param integer int value to be converted
   * @return integer value in byte array
   */
  public static byte[] token(final int integer) {
    if(integer == 0) return ZERO;
    // the minimum value cannot be negated: return the predefined token
    if(integer == Integer.MIN_VALUE) return MININT;

    int n = integer;
    final boolean m = n < 0;
    if(m) n = -n;
    // size of the result: number of digits, plus one byte for the minus sign
    int j = numDigits(n);
    if(m) ++j;
    final byte[] num = new byte[j];

    // digits are written from right to left. large values (> 81919) are divided
    // by 10; the remainder n - q * 10 is computed with shifts
    while(n > 81919) {
      final int q = n / 10;
      num[--j] = (byte) (n - (q << 3) - (q << 1) + '0');
      n = q;
    }
    // faster division by 10 for values < 81920 (see Integer.getChars)
    while(n != 0) {
      final int q = n * 52429 >>> 19;
      num[--j] = (byte) (n - (q << 3) - (q << 1) + '0');
      n = q;
    }
    if(m) num[--j] = '-';
    return num;
  }

  /**
   * Checks number of digits of the specified integer.
   * @param integer number to be checked
   * @return number of digits
   */
  public static int numDigits(final int integer) {
    // find the first upper bound that is not exceeded by the value;
    // the last table entry (maximum integer) terminates the search
    int i = 0;
    while(integer > INTSIZE[i]) ++i;
    return i + 1;
  }

  /**
   * Token of the minimum integer; returned by {@link #token(int)}, which cannot
   * process this value by negating it.
   */
  private static final byte[] MININT = token("-2147483648");
  /**
   * Table with integer sizes: the entry at index {@code i} is the upper bound for
   * values with {@code i + 1} digits (see {@link #numDigits}).
   */
  private static final int[] INTSIZE = { 9, 99, 999, 9999, 99999, 999999,
      9999999, 99999999, 999999999, Integer.MAX_VALUE };

  /**
   * Creates a byte array representation from the specified long value,
   * using Java's standard method.
   * @param integer value to be converted
   * @return byte array
   */
  public static byte[] token(final long integer) {
    // values in the integer range are processed by the int variant
    final boolean isInt = integer >= Integer.MIN_VALUE && integer <= Integer.MAX_VALUE;
    if(isInt) return token((int) integer);
    // larger values: use Java's standard conversion
    return token(Long.toString(integer));
  }

  /** Format symbols of the US locale; used by all number formats of this class. */
  private static final DecimalFormatSymbols LOC =
    new DecimalFormatSymbols(Locale.US);
  // the following formatters are shared instances; the methods of this class
  // synchronize on the respective instance before formatting a value
  /** Scientific double output. */
  private static final DecimalFormat SD = new DecimalFormat("0.0##################E0", LOC);
  /** Decimal double output. */
  private static final DecimalFormat DD = new DecimalFormat("#####0.0################", LOC);
  /** Scientific float output. */
  private static final DecimalFormat SF = new DecimalFormat("0.0######E0", LOC);
  /** Decimal float output. */
  private static final DecimalFormat DF = new DecimalFormat("#####0.0######", LOC);

  /**
   * Creates a byte array representation from the specified double value.
   * @param dbl double value to be converted
   * @return byte array
   */
  public static byte[] token(final double dbl) {
    // special values and small integers
    final byte[] simple = tok(dbl);
    if(simple != null) return simple;

    // decimal notation for 1e-6 <= |dbl| < 1e6, scientific notation otherwise
    final double abs = Math.abs(dbl);
    final DecimalFormat format = abs >= 1.0e-6 && abs < 1.0e6 ? DD : SD;
    final String string;
    // the formatter is a shared instance: synchronize the format call
    synchronized(format) { string = format.format(dbl); }
    return chopNumber(token(string));
  }

  /**
   * Creates a byte array representation from the specified float value.
   * @param flt float value to be converted
   * @return byte array
   */
  public static byte[] token(final float flt) {
    // special values and small integers
    final byte[] simple = tok(flt);
    if(simple != null) return simple;

    // values with a predefined token
    for(int f = 0; f < FLT.length; ++f) {
      if(flt == FLT[f]) return FLTSTR[f];
    }
    // decimal notation for 1e-6 <= |flt| < 1e6, scientific notation otherwise
    final float abs = Math.abs(flt);
    final boolean small = abs >= 1.0e-6f && abs < 1.0e6f;
    final DecimalFormat format = small ? DF : SF;
    final String formatted;
    synchronized(format) { formatted = format.format(flt); }
    // Java's representation is preferred if it is shorter; a string with
    // exponent is only accepted if scientific notation was chosen
    final String plain = Float.toString(flt);
    final boolean shorter = plain.length() < formatted.length();
    final String result = shorter && (!plain.contains("E") || !small) ? plain : formatted;
    return chopNumber(token(result));
  }

  /**
   * Tries to create a byte array representation from a floating point.
   * @param value value to be converted
   * @return byte array, or {@code null}
   */
  private static byte[] tok(final double value) {
    if(Double.isNaN(value)) return NAN;
    if(Double.isInfinite(value)) return value > 0 ? INF : NINF;
    // positive and negative zero differ in the sign of their reciprocal
    if(value == 0) return 1 / value < 0 ? MZERO : ZERO;
    // integral values with an absolute value below one million: output as integer
    if(Math.abs(value) < 1.0e6) {
      final int i = (int) value;
      if(i == value) return token(i);
    }
    // no simple representation available
    return null;
  }

  /**
   * Finishes the numeric token, removing trailing zeroes.
   * @param token token to be modified
   * @return token
   */
  public static byte[] chopNumber(final byte[] token) {
    // only numbers with decimal point and without exponent are modified
    if(!contains(token, '.')) return token;
    if(contains(token, 'e') || contains(token, 'E')) return token;
    // skip trailing zeroes
    int last = token.length - 1;
    while(last > 0 && token[last] == '0') --last;
    // remove the decimal point as well if no fractional digits are left
    final int end = token[last] == '.' ? last : last + 1;
    return substring(token, 0, end);
  }

  /** Float values for which {@link #token(float)} returns a predefined token. */
  private static final float[] FLT = { 1.0E17f, 1.0E15f, 1.0E13f, 1.0E11f,
    -1.0E17f, -1.0E15f, -1.0E13f, -1.0E11f };
  /** Token representations of the float values (same order as in {@link #FLT}). */
  private static final byte[][] FLTSTR = tokens("1.0E17", "1.0E15",
    "1.0E13", "1.0E11", "-1.0E17", "-1.0E15", "-1.0E13", "-1.0E11");

  /**
   * Converts the specified token into a double value.
   * @param token token to be converted
   * @return resulting double value, or {@link Double#NaN} if the input is invalid
   */
  public static double toDouble(final byte[] token) {
    final int tl = token.length;
    // s: position of the first non-whitespace byte
    int s = -1;
    while(++s < tl && ws(token[s]));
    // empty input or whitespaces only: invalid
    if(s == tl) return Double.NaN;

    // e: end of the number (exclusive). as long as it equals s,
    // no whitespace has been found after the number
    int e = s;
    // f: input contains characters that are left to the double parser
    boolean f = false;
    for(int p = s; p < tl; ++p) {
      final byte b = token[p];
      if(e == s) {
        // inside the number: bytes accepted by digit() and plus signs are skipped
        if(digit(b) || b == '+') continue;
        if(ws(b)) {
          // first whitespace after the number: remember the end position
          // (the whitespace byte itself is included in the range)
          e = p + 1;
        } else {
          // exponent, decimal point and minus sign require the double parser;
          // all other characters are invalid
          f = b == 'e' || b == 'E' || b == '.' || b == '-';
          if(!f) return Double.NaN;
        }
      } else if(!ws(b)) {
        // only whitespaces may follow the number
        return Double.NaN;
      }
    }
    // no trailing whitespace found: the number spans the rest of the token
    if(e == s) e = tl;
    // special characters or more than 9 bytes: parse the range as double
    if(f || e - s > 9) return toDouble(token, s, e);

    // short input without special characters: parse the range as integer
    final int d = toInt(token, s, e);
    return d == Integer.MIN_VALUE ? Double.NaN : d;
  }

  /**
   * Converts the specified token into a double value.
   * {@link Double#NaN} is returned if the input is invalid.
   * @param token token to be converted
   * @param start first byte to be parsed
   * @param end last byte to be parsed - exclusive
   * @return resulting double value
   */
  private static double toDouble(final byte[] token, final int start, final int end) {
    // the byte range is converted to a string and parsed by Java's double parser
    final String string = string(token, start, end - start);
    double value;
    try {
      value = Double.parseDouble(string);
    } catch(final NumberFormatException ex) {
      // invalid lexical representation
      value = Double.NaN;
    }
    return value;
  }

  /**
   * Converts the specified token into a long value.
   * {@link Long#MIN_VALUE} is returned if the input is invalid.
   * Note that this may also be the actual value ({@link #MINLONG}).
   * @param token token to be converted
   * @return resulting long value
   */
  public static long toLong(final byte[] token) {
    // parse the complete token
    final int end = token.length;
    return toLong(token, 0, end);
  }

  /**
   * Converts the specified token into a long value.
   * {@link Long#MIN_VALUE} is returned if the input is invalid.
   * Note that this may also be the actual value ({@link #MINLONG}).
   * @param token token to be converted
   * @param start first byte to be parsed
   * @param end last byte to be parsed - exclusive
   * @return resulting long value
   */
  public static long toLong(final byte[] token, final int start, final int end) {
    // skip leading whitespaces; empty and whitespace-only ranges are invalid
    int p = start;
    while(p < end && ws(token[p])) ++p;
    if(p == end) return Long.MIN_VALUE;
    // optional sign
    boolean m = false;
    if(token[p] == '-' || token[p] == '+') m = token[p++] == '-';
    // parse digits
    final int first = p;
    long v = 0;
    for(; p < end; ++p) {
      final byte b = token[p];
      if(b < '0' || b > '9') break;
      // reject input that would exceed Long.MAX_VALUE after adding the next digit
      if(v >= MAXLONG && (b > '7' || v > MAXLONG)) return Long.MIN_VALUE;
      v = (v << 3) + (v << 1) + b - '0';
    }
    // at least one digit is required: a sign that is only followed by whitespaces
    // (e.g. "- ") is invalid and must not be interpreted as 0
    if(p == first) return Long.MIN_VALUE;
    // skip trailing whitespaces; any remaining byte invalidates the input
    while(p < end && ws(token[p])) ++p;
    return p < end ? Long.MIN_VALUE : m ? -v : v;
  }

  /**
   * Converts the specified token into an integer value.
   * {@link Integer#MIN_VALUE} is returned if the input is invalid.
   * @param token token to be converted
   * @return resulting integer value
   */
  public static int toInt(final byte[] token) {
    // parse the complete token
    final int end = token.length;
    return toInt(token, 0, end);
  }

  /**
   * Converts the specified token into an integer value.
   * {@link Integer#MIN_VALUE} is returned if the input is invalid.
   * @param token token to be converted
   * @param start first byte to be parsed
   * @param end last byte to be parsed (exclusive)
   * @return resulting integer value
   */
  private static int toInt(final byte[] token, final int start, final int end) {
    // skip leading whitespaces; empty and whitespace-only ranges are invalid
    int p = start;
    while(p < end && ws(token[p])) ++p;
    if(p == end) return Integer.MIN_VALUE;
    // optional sign
    boolean m = false;
    if(token[p] == '-' || token[p] == '+') m = token[p++] == '-';
    // parse digits
    final int first = p;
    int v = 0;
    for(; p < end; ++p) {
      final byte b = token[p];
      if(b < '0' || b > '9') break;
      // reject input that would exceed Integer.MAX_VALUE after adding the next digit
      if(v >= MAXINT && (b > '7' || v > MAXINT)) return Integer.MIN_VALUE;
      v = (v << 3) + (v << 1) + b - '0';
    }
    // at least one digit is required: a sign that is only followed by whitespaces
    // (e.g. "+ ") is invalid and must not be interpreted as 0
    if(p == first) return Integer.MIN_VALUE;
    // skip trailing whitespaces; any remaining byte invalidates the input
    while(p < end && ws(token[p])) ++p;
    return p < end || v < 0 ? Integer.MIN_VALUE : m ? -v : v;
  }

  /**
   * Converts the specified token into a positive integer value.
   * {@link Integer#MIN_VALUE} is returned if the input is empty, has leading zeroes,
   * contains non-digits or is longer than nine characters.
   * @param token token to be converted
   * @return resulting integer value
   */
  public static int toSimpleInt(final byte[] token) {
    final int tl = token.length;
    // reject empty input and input with ten or more bytes
    if(tl == 0 || tl >= 10) return Integer.MIN_VALUE;
    // a leading zero is only accepted if it is the only digit
    if(token[0] == '0') return tl == 1 ? 0 : Integer.MIN_VALUE;

    int value = 0;
    for(int t = 0; t < tl; ++t) {
      final int digit = token[t] - '0';
      if(digit < 0 || digit > 9) return Integer.MIN_VALUE;
      // at most nine digits are processed: no overflow possible
      value = value * 10 + digit;
    }
    return value;
  }

  /**
   * Calculates a hash code for the specified token.
   * @param token specified token
   * @return hash code
   */
  public static int hash(final byte[] token) {
    // only the first MAXLENGTH bytes contribute to the hash value
    final int limit = Math.min(token.length, MAXLENGTH);
    int hash = 0;
    // multiplication by 31 is equivalent to (hash << 5) - hash
    for(int t = 0; t < limit; ++t) hash = 31 * hash + token[t];
    return hash;
  }

  /**
   * Compares two tokens for equality.
   * @param token1 first token
   * @param token2 token to be compared
   * @return true if the arrays are equal
   */
  public static boolean eq(final byte[] token1, final byte[] token2) {
    final int length = token2.length;
    // tokens of different length are never equal
    if(token1.length != length) return false;
    int t = 0;
    while(t < length) {
      if(token1[t] != token2[t]) return false;
      ++t;
    }
    return true;
  }

  /**
   * Compares several tokens for equality.
   * @param token token
   * @param tokens tokens to be compared
   * @return true if one test is successful
   */
  public static boolean eq(final byte[] token, final byte[]... tokens) {
    // stop after the first successful comparison
    final int tl = tokens.length;
    for(int t = 0; t < tl; ++t) {
      if(eq(token, tokens[t])) return true;
    }
    return false;
  }

  /**
   * Compares two tokens lexicographically.
   * @param token first token
   * @param compare token to be compared
   * @return 0 if tokens are equal, negative if first token is smaller,
   *         positive if first token is bigger
   */
  public static int diff(final byte[] token, final byte[] compare) {
    final int tl = token.length, cl = compare.length;
    // compare the common prefix; bytes are treated as unsigned values
    for(int i = 0; i < tl && i < cl; ++i) {
      final int t = token[i] & 0xFF, c = compare[i] & 0xFF;
      if(t != c) return t - c;
    }
    // identical prefix: the shorter token is the smaller one
    return tl - cl;
  }

  /**
   * Returns the smaller token.
   * @param token first token
   * @param compare token to be compared
   * @return smaller token
   */
  public static byte[] min(final byte[] token, final byte[] compare) {
    // the second token is returned if both tokens are equal
    if(diff(token, compare) < 0) return token;
    return compare;
  }

  /**
   * Returns the bigger token.
   * @param token first token
   * @param compare token to be compared
   * @return bigger token
   */
  public static byte[] max(final byte[] token, final byte[] compare) {
    // the second token is returned if both tokens are equal
    if(diff(token, compare) > 0) return token;
    return compare;
  }

  /**
   * Checks if the first token contains the second token.
   * @param token token
   * @param sub token to be found
   * @return result of test
   */
  public static boolean contains(final byte[] token, final byte[] sub) {
    // start the search at the first byte
    final int pos = 0;
    return contains(token, sub, pos);
  }

  /**
   * Checks if the first token contains the second token.
   * @param token token
   * @param sub token to be found
   * @param pos start position
   * @return result of test
   */
  public static boolean contains(final byte[] token, final byte[] sub, final int pos) {
    // -1 indicates that the sub token was not found
    final int index = indexOf(token, sub, pos);
    return index != -1;
  }

  /**
   * Checks if the first token contains the specified character.
   * @param token token
   * @param ch character to be found
   * @return result of test
   */
  public static boolean contains(final byte[] token, final int ch) {
    // -1 indicates that the character was not found
    final int index = indexOf(token, ch);
    return index != -1;
  }

  /**
   * Returns the position of the specified character or -1.
   * @param token token
   * @param ch character to be found
   * @return position or {@code -1}
   */
  public static int indexOf(final byte[] token, final int ch) {
    final int tl = token.length;
    int t = 0;
    if(ch < 0x80) {
      // ascii character: compare single bytes
      while(t < tl) {
        if(token[t] == ch) return t;
        ++t;
      }
    } else {
      // other characters: decode and compare codepoints
      while(t < tl) {
        if(cp(token, t) == ch) return t;
        t += cl(token, t);
      }
    }
    return -1;
  }

  /**
   * Returns the last position of the specified character or -1.
   * @param token token
   * @param ch character to be found
   * @return position or {@code -1}
   */
  public static int lastIndexOf(final byte[] token, final int ch) {
    final int tl = token.length;
    if(ch < 128) {
      // ascii character: scan the bytes backwards
      int t = tl;
      while(--t >= 0) {
        if(token[t] == ch) return t;
      }
      return -1;
    }
    // other characters: scan forwards and remember the latest hit
    int last = -1;
    for(int t = 0; t < tl; t += cl(token, t)) {
      if(cp(token, t) == ch) last = t;
    }
    return last;
  }

  /**
   * Returns the position of the specified token or -1.
   * @param token token
   * @param sub token to be found
   * @return position or {@code -1}
   */
  public static int indexOf(final byte[] token, final byte[] sub) {
    // start the search at the first byte
    final int pos = 0;
    return indexOf(token, sub, pos);
  }

  /**
   * Returns the position of the specified token or -1.
   * @param token token
   * @param sub token to be found
   * @param pos start position
   * @return position or {@code -1}
   */
  public static int indexOf(final byte[] token, final byte[] sub, final int pos) {
    final int sl = sub.length;
    // an empty sub token is found at the start position
    if(sl == 0) return pos;
    // last position at which the sub token may start
    final int last = token.length - sl;

    // compare tokens byte by byte
    for(int t = pos; t <= last; ++t) {
      int s = 0;
      while(s < sl && sub[s] == token[t + s]) ++s;
      if(s == sl) return t;
    }
    return -1;
  }

  /**
   * Checks if the first token starts with the specified character.
   * @param token token
   * @param ch character to be found
   * @return result of test
   */
  public static boolean startsWith(final byte[] token, final int ch) {
    // test the first byte of the token
    final int pos = 0;
    return startsWith(token, ch, pos);
  }

  /**
   * Checks if the first token starts with the specified character.
   * @param token token
   * @param ch character to be found
   * @param pos start position
   * @return result of test
   */
  private static boolean startsWith(final byte[] token, final int ch, final int pos) {
    // positions after the end of the token never match
    if(pos >= token.length) return false;
    return token[pos] == ch;
  }

  /**
   * Checks if the first token starts with the second token.
   * @param token token
   * @param sub token to be found
   * @return result of test
   */
  public static boolean startsWith(final byte[] token, final byte[] sub) {
    // compare from the first byte of the token
    final int pos = 0;
    return startsWith(token, sub, pos);
  }

  /**
   * Checks if the first token starts with the second token.
   * @param token token
   * @param sub token to be found
   * @param pos start position
   * @return result of test
   */
  public static boolean startsWith(final byte[] token, final byte[] sub, final int pos) {
    final int sl = sub.length;
    // not enough bytes left for the sub token
    if(token.length - pos < sl) return false;
    int s = 0;
    while(s < sl) {
      if(token[pos + s] != sub[s]) return false;
      ++s;
    }
    return true;
  }

  /**
   * Checks if the first token ends with the specified character.
   * @param token token
   * @param ch character to be found
   * @return result of test
   */
  public static boolean endsWith(final byte[] token, final int ch) {
    final int tl = token.length;
    // an empty token has no last byte
    if(tl == 0) return false;
    return token[tl - 1] == ch;
  }

  /**
   * Checks if the first token ends with the second token.
   * @param token token
   * @param sub token to be found
   * @return result of test
   */
  public static boolean endsWith(final byte[] token, final byte[] sub) {
    final int sl = sub.length;
    final int tl = token.length;
    if(sl > tl) return false;
    // align the sub token with the end of the token
    final int offset = tl - sl;
    for(int s = 0; s < sl; ++s) {
      if(token[offset + s] != sub[s]) return false;
    }
    return true;
  }

  /**
   * Returns a substring of the specified token.
   * Note that this method ignores Unicode codepoints; use {@link #subtoken} instead.
   * @param token input token
   * @param start start position
   * @return substring
   */
  public static byte[] substring(final byte[] token, final int start) {
    // the substring ends with the last byte of the token
    final int end = token.length;
    return substring(token, start, end);
  }

  /**
   * Returns a substring of the specified token.
   * Note that this method ignores Unicode codepoints; use {@link #subtoken} instead.
   * @param token input token
   * @param start start position
   * @param end end position
   * @return substring
   */
  public static byte[] substring(final byte[] token, final int start, final int end) {
    final int tl = token.length;
    // restrict the positions to the bounds of the token
    final int from = start < 0 ? 0 : start;
    final int to = end > tl ? tl : end;
    // complete token requested: no copy required
    if(from == 0 && to == tl) return token;
    if(from >= to) return EMPTY;
    return Arrays.copyOfRange(token, from, to);
  }

  /**
   * Returns a partial token.
   * @param token input token
   * @param start start position
   * @return resulting text
   */
  public static byte[] subtoken(final byte[] token, final int start) {
    // the partial token ends with the last byte of the token
    final int end = token.length;
    return subtoken(token, start, end);
  }

  /**
   * Returns a partial token.
   * @param token input text
   * @param start start position
   * @param end end position
   * @return resulting text
   */
  public static byte[] subtoken(final byte[] token, final int start, final int end) {
    final int tl = token.length;
    int s = Math.max(0, start);
    final int e = Math.min(end, tl);
    // complete token requested: no copy required
    if(s == 0 && e == tl) return token;
    if(s >= e) return EMPTY;

    // align the start position with a character boundary: step back by up to 4 bytes
    // (the maximum length of a character) and advance character by character
    int t = Math.max(0, s - 4);
    while(t < s) t += cl(token, t);
    // t is the first boundary at or after the requested start. the start position is
    // assigned exactly once; reassigning it while scanning would drop all characters
    // but the last one whenever the requested start lies inside a character
    s = Math.min(t, tl);
    // align the end position: advance to the first boundary at or after the end
    while(t < e) t += cl(token, t);
    // an incomplete trailing character (invalid UTF8) must not exceed the token
    final int to = Math.min(t, tl);
    return s < to ? Arrays.copyOfRange(token, s, to) : EMPTY;
  }

  /**
   * Splits a token around matches of the given separator.
   * @param token token to be split
   * @param sep separation character
   * @return array
   */
  public static byte[][] split(final byte[] token, final int sep) {
    final int tl = token.length;
    // upper bound: there cannot be more parts than bytes
    final byte[][] parts = new byte[tl][];
    int count = 0;

    final TokenBuilder tb = new TokenBuilder();
    int t = 0;
    while(t < tl) {
      final int cp = cp(token, t);
      t += cl(token, t);
      if(cp != sep) {
        tb.add(cp);
      } else if(!tb.isEmpty()) {
        // separator found: store the collected part; empty parts are skipped
        parts[count++] = tb.next();
      }
    }
    // store the last part
    if(!tb.isEmpty()) parts[count++] = tb.finish();
    return Array.copyOf(parts, count);
  }

  /**
   * Normalizes the specified input and returns its distinct tokens.
   * Optimized for small number of tokens.
   * @param token token
   * @return distinct tokens
   */
  public static byte[][] distinctTokens(final byte[] token) {
    final byte[][] tokens = split(normalize(token), ' ');
    int size = tokens.length;
    // compare each token with all of its successors
    for(int i = 0; i < size - 1; i++) {
      int j = i + 1;
      while(j < size) {
        if(eq(tokens[i], tokens[j])) {
          // duplicate found: remove it by moving the subsequent tokens,
          // and test the same position again
          Array.move(tokens, j + 1, -1, size - j - 1);
          size--;
        } else {
          j++;
        }
      }
    }
    return Array.copyOf(tokens, size);
  }

  /**
   * Checks if the specified token has only whitespaces.
   * @param token token
   * @return true if all characters are whitespaces
   */
  public static boolean ws(final byte[] token) {
    // an empty token yields true
    final int tl = token.length;
    for(int t = 0; t < tl; ++t) {
      if(!ws(token[t])) return false;
    }
    return true;
  }

  /**
   * Replaces the specified character and returns the result token.
   * @param token token to be checked
   * @param search the character to be replaced
   * @param replace the new character
   * @return resulting token
   */
  public static byte[] replace(final byte[] token, final int search, final int replace) {
    // nothing to replace: return the original token
    if(!contains(token, search)) return token;

    final int tl = token.length;
    final TokenBuilder tb = new TokenBuilder(tl);
    int t = 0;
    while(t < tl) {
      // decode the current codepoint and add it or its replacement
      final int cp = cp(token, t);
      if(cp == search) tb.add(replace);
      else tb.add(cp);
      t += cl(token, t);
    }
    return tb.finish();
  }

  /**
   * Removes leading and trailing whitespaces from the specified token.
   * @param token token to be trimmed
   * @return trimmed token
   */
  public static byte[] trim(final byte[] token) {
    final int tl = token.length;
    // position of the first non-whitespace byte
    int start = 0;
    while(start < tl && ws(token[start])) ++start;
    // position after the last non-whitespace byte
    int end = tl;
    while(end > start + 1 && ws(token[end - 1])) --end;
    // nothing to trim: return the original token
    if(end == tl && start == 0) return token;
    return start == end ? EMPTY : Arrays.copyOfRange(token, start, end);
  }

  /**
   * Chops a token to the specified length and adds dots.
   * @param token token to be chopped
   * @param max maximum length
   * @return chopped token
   */
  public static byte[] chop(final byte[] token, final int max) {
    // short tokens are returned unchanged
    if(token.length <= max) return token;
    // a negative maximum is treated as 0 and yields an empty token
    final int size = Math.max(0, max);
    // up to three dots are added; if less than three bytes are available, only dots are returned
    final int dots = Math.min(3, size);
    // number of bytes that are adopted from the original token
    int cut = size - dots;
    // a continuation byte (10xxxxxx) at the cut position indicates that a multi-byte
    // character would be split: remove the incomplete character. in this case, the
    // result is shorter than the maximum, but it remains valid UTF8
    while(cut > 0 && (token[cut] & 0xC0) == 0x80) --cut;
    final byte[] tt = Arrays.copyOf(token, cut + dots);
    Arrays.fill(tt, cut, cut + dots, (byte) '.');
    return tt;
  }

  /**
   * Concatenates two tokens.
   * @param token1 first token
   * @param token2 second token
   * @return resulting array
   */
  public static byte[] concat(final byte[] token1, final byte[] token2) {
    final int l1 = token1.length;
    final int l2 = token2.length;
    // copy of the first token, enlarged by the size of the second token
    final byte[] result = Arrays.copyOf(token1, l1 + l2);
    System.arraycopy(token2, 0, result, l1, l2);
    return result;
  }

  /**
   * Concatenates three tokens. A {@link TokenBuilder} instance can be used to
   * concatenate more than three tokens.
   * @param token1 first token
   * @param token2 second token
   * @param token3 third token
   * @return resulting array
   */
  public static byte[] concat(final byte[] token1, final byte[] token2, final byte[] token3) {
    final int l1 = token1.length;
    final int l2 = token2.length;
    final int l3 = token3.length;
    // copy of the first token, enlarged by the size of the other tokens
    final byte[] result = Arrays.copyOf(token1, l1 + l2 + l3);
    System.arraycopy(token2, 0, result, l1, l2);
    System.arraycopy(token3, 0, result, l1 + l2, l3);
    return result;
  }

  /**
   * Removes all occurrences of a character from the token.
   * @param token token
   * @param ch character to be removed (codepoint)
   * @return resulting token. If the character is smaller than {@code 0x80} and not
   *   contained in the token, the passed on token is returned
   */
  public static byte[] delete(final byte[] token, final int ch) {
    final int tl = token.length;
    if(ch >= 0x80) {
      // non-ASCII character: compare the codepoints of the token
      final TokenBuilder tb = new TokenBuilder(tl);
      int pos = 0;
      while(pos < tl) {
        final int c = cp(token, pos);
        if(c != ch) tb.add(c);
        pos += cl(token, pos);
      }
      return tb.finish();
    }

    // ASCII character: skip deletion if character is not found
    if(!contains(token, ch)) return token;

    // compare single bytes
    final TokenBuilder tb = new TokenBuilder(tl);
    for(int t = 0; t < tl; t++) {
      final byte b = token[t];
      if(b != ch) tb.add(b);
    }
    return tb.finish();
  }

  /**
   * Normalizes the whitespace of the specified token: leading and trailing whitespace
   * is removed, and each remaining sequence of whitespace (see {@link #ws(int)}) is
   * replaced by a single space.
   * @param token token
   * @return the passed on token if it is empty, or a new array with the normalized token
   */
  public static byte[] normalize(final byte[] token) {
    final int tl = token.length;
    if(tl == 0) return token;

    final byte[] result = new byte[tl];
    int size = 0;
    // indicates if the previous byte was a whitespace; initially true to drop leading ones
    boolean skip = true;
    for(int t = 0; t < tl; t++) {
      final byte b = token[t];
      if(ws(b)) {
        // only the first whitespace of a sequence is kept (as space)
        if(!skip) result[size++] = ' ';
        skip = true;
      } else {
        result[size++] = b;
        skip = false;
      }
    }
    // remove trailing space
    if(size > 0 && ws(result[size - 1])) size--;
    return size < tl ? Arrays.copyOf(result, size) : result;
  }

  /**
   * Checks if the specified character is a whitespace, i.e., a tab ({@code 0x09}),
   * line feed ({@code 0x0A}), carriage return ({@code 0x0D}) or space ({@code 0x20}).
   * @param ch the letter to be checked
   * @return result of check
   */
  public static boolean ws(final int ch) {
    switch(ch) {
      case 0x09:
      case 0x0A:
      case 0x0D:
      case 0x20:
        return true;
      default:
        return false;
    }
  }

  /**
   * Checks if the specified character is a computer letter, i.e., one of the ASCII
   * characters A - Z and a - z, or an underscore.
   * @param ch the letter to be checked
   * @return result of check
   */
  public static boolean letter(final int ch) {
    if(ch == '_') return true;
    // outside of the range that spans both upper and lower case letters
    if(ch < 'A' || ch > 'z') return false;
    // exclude the characters located between 'Z' and 'a'
    return ch <= 'Z' || ch >= 'a';
  }

  /**
   * Checks if the specified character is one of the ASCII digits 0 - 9.
   * @param ch the letter to be checked
   * @return result of check
   */
  public static boolean digit(final int ch) {
    final boolean outside = ch < '0' || ch > '9';
    return !outside;
  }

  /**
   * Checks if the specified character is accepted by {@link #letter(int)} or
   * {@link #digit(int)}.
   * @param ch the letter to be checked
   * @return result of check
   */
  public static boolean letterOrDigit(final int ch) {
    if(letter(ch)) return true;
    return digit(ch);
  }

  /**
   * Converts the specified token to upper case. If {@code ascii(token)} succeeds, the
   * bytes are converted one by one; otherwise, the token is converted to a string and
   * {@link String#toUpperCase(Locale)} is applied with {@link Locale#ENGLISH}.
   * @param token token to be converted
   * @return newly created token
   */
  public static byte[] uc(final byte[] token) {
    // fallback: string conversion
    if(!ascii(token)) return token(string(token).toUpperCase(Locale.ENGLISH));

    // fast path: convert single bytes
    final byte[] result = new byte[token.length];
    int r = 0;
    for(final byte b : token) result[r++] = (byte) uc(b);
    return result;
  }

  /**
   * Converts the specified token to title case: a character is converted to lower case
   * if it is preceded by a letter or digit (see {@link Character#isLetterOrDigit(int)}),
   * and to upper case otherwise.
   * @param token token to be converted
   * @return resulting token
   */
  public static byte[] tc(final byte[] token) {
    final int tl = token.length;
    final TokenBuilder tb = new TokenBuilder(tl);
    // indicates if the previous character was a letter or digit
    boolean inWord = false;
    int pos = 0;
    while(pos < tl) {
      final int ch = cp(token, pos);
      if(inWord) tb.add(lc(ch));
      else tb.add(uc(ch));
      inWord = Character.isLetterOrDigit(ch);
      pos += cl(token, pos);
    }
    return tb.finish();
  }

  /**
   * Converts a character to upper case. The ASCII letters a - z are mapped directly,
   * all other characters up to {@code 0x7F} are returned unchanged, and larger values
   * are passed on to {@link Character#toUpperCase(int)}.
   * @param ch character to be converted
   * @return resulting character
   */
  public static int uc(final int ch) {
    if(ch > 0x7F) return Character.toUpperCase(ch);
    final boolean lower = ch >= 'a' && ch <= 'z';
    // upper and lower case ASCII letters are 0x20 apart
    return lower ? ch - ('a' - 'A') : ch;
  }

  /**
   * Converts the specified token to lower case. If {@code ascii(token)} succeeds, the
   * bytes are converted one by one; otherwise, the token is converted to a string and
   * {@link String#toLowerCase(Locale)} is applied with {@link Locale#ENGLISH}.
   * @param token token to be converted
   * @return newly created token
   */
  public static byte[] lc(final byte[] token) {
    // fallback: string conversion
    if(!ascii(token)) return token(string(token).toLowerCase(Locale.ENGLISH));

    // fast path: convert single bytes
    final byte[] result = new byte[token.length];
    int r = 0;
    for(final byte b : token) result[r++] = (byte) lc(b);
    return result;
  }

  /**
   * Converts a character to lower case. The ASCII letters A - Z are mapped directly,
   * all other characters up to {@code 0x7F} are returned unchanged, and larger values
   * are passed on to {@link Character#toLowerCase(int)}.
   * @param ch character to be converted
   * @return resulting character
   */
  public static int lc(final int ch) {
    if(ch > 0x7F) return Character.toLowerCase(ch);
    final boolean upper = ch >= 'A' && ch <= 'Z';
    // upper and lower case ASCII letters are 0x20 apart
    return upper ? ch + ('a' - 'A') : ch;
  }

  /**
   * Returns the prefix of the specified name, i.e., the part that precedes the
   * position returned by {@code indexOf(name, ':')}.
   * @param name name
   * @return prefix or empty token if no colon was found
   */
  public static byte[] prefix(final byte[] name) {
    final int colon = indexOf(name, ':');
    if(colon == -1) return EMPTY;
    return substring(name, 0, colon);
  }

  /**
   * Returns the local name of the specified name, i.e., the part that follows the
   * position returned by {@code indexOf(name, ':')}.
   * @param name name
   * @return local name, or the passed on name if no colon was found
   */
  public static byte[] local(final byte[] name) {
    final int colon = indexOf(name, ':');
    if(colon != -1) return substring(name, colon + 1);
    return name;
  }

  /**
   * Returns a URI encoded token. Bytes that are accepted by {@link #letterOrDigit(int)}
   * or that are contained in the set of preserved characters are adopted unchanged;
   * all other bytes are added as a percent sign followed by two hex digits.
   * @param token token
   * @param iri selects the set of preserved characters: {@code IRIRES} if {@code true},
   *   {@code RES} otherwise
   * @return encoded token
   */
  public static byte[] uri(final byte[] token, final boolean iri) {
    final TokenBuilder tb = new TokenBuilder();
    final int tl = token.length;
    for(int t = 0; t < tl; t++) {
      final byte b = token[t];
      final boolean keep = letterOrDigit(b) || contains(iri ? IRIRES : RES, b);
      if(keep) tb.addByte(b);
      else hex(tb, b);
    }
    return tb.finish();
  }

  /**
   * Escapes the specified token. Bytes in the range {@code 0x20} to {@code 0x7e} are
   * adopted unchanged; all other bytes (including those with negative values, as bytes
   * are signed) are added as a percent sign followed by two hex digits.
   * @param token token
   * @return escaped token
   */
  public static byte[] escape(final byte[] token) {
    final TokenBuilder tb = new TokenBuilder();
    final int tl = token.length;
    for(int t = 0; t < tl; t++) {
      final byte b = token[t];
      final boolean special = b < 0x20 || b > 0x7e;
      if(special) hex(tb, b);
      else tb.addByte(b);
    }
    return tb.finish();
  }

  /**
   * Adds a percent sign and the two-digit hex code of the specified byte
   * (taken from {@code HEX}) to the token builder.
   * @param tb token builder
   * @param value byte to be added
   */
  private static void hex(final TokenBuilder tb, final byte value) {
    // unsigned value of the byte (0 - 255)
    final int b = value & 0xFF;
    tb.add('%');
    // upper four bits
    tb.addByte(HEX[b >> 4]);
    // lower four bits
    tb.addByte(HEX[b & 15]);
  }

  /**
   * Returns a hex representation of the specified byte array. Each byte is represented
   * by two hex digits; the digits for the upper four bits come first.
   * @param value values to be mapped
   * @param uc upper case ({@code A-F} instead of {@code a-f})
   * @return hex representation (twice as long as the input)
   */
  public static byte[] hex(final byte[] value, final boolean uc) {
    // lookup table for all sixteen digits
    final String digits = uc ? "0123456789ABCDEF" : "0123456789abcdef";
    final byte[] data = new byte[value.length << 1];
    int d = 0;
    for(final byte b : value) {
      // masking removes the sign bits that result from shifting negative bytes
      data[d++] = (byte) digits.charAt(b >> 4 & 0x0F);
      data[d++] = (byte) digits.charAt(b & 0x0F);
    }
    return data;
  }

}