package org.basex.util; import java.text.*; import java.util.*; /** * <p>This class provides convenience operations for handling 'Tokens'. * A token is a UTF-8 encoded string. It is represented as a byte array.</p> * * <p>In order to ensure a consistent representation of tokens in the project, all string * conversions should be done via the methods of this class.</p> * * @author BaseX Team 2005-17, BSD License * @author Christian Gruen */ public final class Token { /** Empty token. */ public static final byte[] EMPTY = {}; /** XML token. */ public static final byte[] XML = token("xml"); /** XML token with colon. */ public static final byte[] XMLC = token("xml:"); /** XMLNS token. */ public static final byte[] XMLNS = token("xmlns"); /** XMLNS token with colon. */ public static final byte[] XMLNSC = token("xmlns:"); /** ID token. */ public static final byte[] ID = token("id"); /** IDRef token. */ public static final byte[] IDREF = token("ref"); /** Token 'true'. */ public static final byte[] TRUE = token("true"); /** Token 'false'. */ public static final byte[] FALSE = token("false"); /** Token 'NaN'. */ public static final byte[] NAN = token("NaN"); /** Token 'INF'. */ public static final byte[] INF = token("INF"); /** Token '-INF'. */ public static final byte[] NINF = token("-INF"); /** Minimum long value. */ public static final byte[] MINLONG = token("-9223372036854775808"); /** Space. */ public static final byte[] SPACE = { ' ' }; /** Number '0'. */ public static final byte[] ZERO = { '0' }; /** Number '-0'. */ private static final byte[] MZERO = { '-', '0' }; /** Number '1'. */ public static final byte[] ONE = { '1' }; /** Slash. */ public static final byte[] SLASH = { '/' }; /** Colon. */ public static final byte[] COLON = { ':' }; /** Unicode replacement character. */ public static final char REPLACEMENT = '\uFFFD'; /** Maximum length for hash calculation. */ private static final byte MAXLENGTH = 96; /** Maximum values for converting tokens to integer values. */ private static final int MAXINT = Integer.MAX_VALUE / 10; /** Maximum values for converting tokens to long values. */ private static final long MAXLONG = Long.MAX_VALUE / 10; /** Hex codes. */ public static final byte[] HEX = token("0123456789ABCDEF"); /** Reserved characters. */ private static final byte[] IRIRES = token("!#$%&*'()+,-./:;=?@[]~_"); /** Reserved characters. */ private static final byte[] RES = token("-._~"); /** Comparator for byte arrays. */ public static final Comparator<byte[]> COMP = new Comparator<byte[]>() { @Override public int compare(final byte[] o1, final byte[] o2) { return diff(o1, o2); } }; /** Case-insensitive comparator for byte arrays. */ public static final Comparator<byte[]> LC_COMP = new Comparator<byte[]>() { @Override public int compare(final byte[] o1, final byte[] o2) { return diff(lc(o1), lc(o2)); } }; /** Hidden constructor. */ private Token() { } /** * Returns the specified token as string. * @param token token * @return string */ public static String string(final byte[] token) { return string(token, 0, token.length); } /** * Returns the specified token as string. * @param token token * @param start start position * @param length length * @return string */ public static String string(final byte[] token, final int start, final int length) { if(length <= 0) return ""; /// check if string contains non-ascii characters final int e = start + length; for(int p = start; p < e; ++p) if(token[p] < 0) return utf8(token, start, length); /// copy ascii characters to character array final char[] str = new char[length]; for(int p = 0; p < length; ++p) str[p] = (char) token[start + p]; return new String(str); } /** * Returns a string of the specified UTF8 token. * @param token token * @param start start position * @param length length * @return string */ private static String utf8(final byte[] token, final int start, final int length) { // input is assumed to be correct UTF8. if input contains codepoints // larger than Character.MAX_CODE_POINT, results might be unexpected. final StringBuilder sb = new StringBuilder(length << 1); final int il = Math.min(start + length, token.length); for(int i = start; i < il; i += cl(token, i)) { final int cp = cp(token, i); if(cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { sb.append((char) cp); } else { final int o = cp - Character.MIN_SUPPLEMENTARY_CODE_POINT; sb.append((char) ((o >>> 10) + Character.MIN_HIGH_SURROGATE)); sb.append((char) ((o & 0x3ff) + Character.MIN_LOW_SURROGATE)); } } return sb.toString(); } /** * Checks if the specified token only consists of ASCII characters. * @param token token * @return result of check */ public static boolean ascii(final byte[] token) { for(final byte t : token) if(t < 0) return false; return true; } /** * Converts a string to a byte array. * All strings should be converted by this function to guarantee * a consistent character conversion. * @param string string to be converted * @return byte array */ public static byte[] token(final String string) { final int l = string.length(); if(l == 0) return EMPTY; final byte[] b = new byte[l]; for(int i = 0; i < l; ++i) { final char c = string.charAt(i); if(c > 0x7F) return utf8(string); b[i] = (byte) c; } return b; } /** * Converts the specified strings to tokens. * @param strings strings * @return tokens */ public static byte[][] tokens(final String... strings) { final byte[][] tokens = new byte[strings.length][]; final int tl = tokens.length; for(int t = 0; t < tl; ++t) tokens[t] = token(strings[t]); return tokens; } /** * Converts a string to a UTF8 byte array. * @param string string to be converted * @return byte array */ private static byte[] utf8(final String string) { final char[] arr = string.toCharArray(); final int al = arr.length; final TokenBuilder tb = new TokenBuilder(al << 1); for(int c = 0; c < al; ++c) { final char ch = arr[c]; tb.add(Character.isHighSurrogate(ch) && c < al - 1 && Character.isLowSurrogate(arr[c + 1]) ? Character.toCodePoint(ch, arr[++c]) : ch); } return tb.finish(); } /** * Converts a token from the input encoding to UTF8. * @param token token to be converted * @param encoding input encoding * @return byte array */ public static byte[] utf8(final byte[] token, final String encoding) { // UTF8 (comparison by ref.) or no special characters: return input if(encoding == Strings.UTF8 || ascii(token)) return token; // convert to utf8. if errors occur while converting, an empty is returned. try { return token(new String(token, encoding)); } catch(final Exception ex) { Util.debug(ex); return EMPTY; } } /** * Returns the codepoint (unicode value) of the specified token, starting at * the specified position. Returns a unicode replacement character for invalid values. * @param token token * @param pos character position * @return current character */ public static int cp(final byte[] token, final int pos) { // 0xxxxxxx final byte v = token[pos]; if((v & 0xFF) < 192) return v & 0xFF; // number of bytes to be read final int vl = cl(v); if(pos + vl > token.length) return REPLACEMENT; // 110xxxxx 10xxxxxx if(vl == 2) return (v & 0x1F) << 6 | token[pos + 1] & 0x3F; // 1110xxxx 10xxxxxx 10xxxxxx if(vl == 3) return (v & 0x0F) << 12 | (token[pos + 1] & 0x3F) << 6 | token[pos + 2] & 0x3F; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx return (v & 0x07) << 18 | (token[pos + 1] & 0x3F) << 12 | (token[pos + 2] & 0x3F) << 6 | token[pos + 3] & 0x3F; } /** Character lengths. */ private static final int[] CHLEN = { 1, 1, 1, 1, 2, 2, 3, 4 }; /** * Returns the length of the specified UTF8 byte. * @param cp codepoint * @return character length */ public static int cl(final byte cp) { return cp >= 0 ? 1 : CHLEN[cp >> 4 & 0x7]; } /** * Returns the length of a UTF8 character at the specified position. * @param token token * @param pos position * @return character length */ public static int cl(final byte[] token, final int pos) { return cl(token[pos]); } /** * Converts a token to a sequence of codepoints. * @param token token * @return codepoints */ public static int[] cps(final byte[] token) { int pos = 0; final int len = token.length; final int[] cp = new int[len]; for(int i = 0; i < len; i += cl(token, i)) cp[pos++] = cp(token, i); return pos < len ? Arrays.copyOf(cp, pos) : cp; } /** * Returns the number of codepoints in the token. * @param token token * @return number of codepoints */ public static int length(final byte[] token) { final int tl = token.length; if(ascii(token)) return tl; int l = 0; for(int t = 0; t < tl; t += cl(token, t)) ++l; return l; } /** * Creates a byte array representation of the specified boolean value. * @param bool boolean value to be converted * @return boolean value in byte array */ public static byte[] token(final boolean bool) { return bool ? TRUE : FALSE; } /** * Creates a byte array representation of the specified integer value. * @param integer int value to be converted * @return integer value in byte array */ public static byte[] token(final int integer) { if(integer == 0) return ZERO; if(integer == Integer.MIN_VALUE) return MININT; int n = integer; final boolean m = n < 0; if(m) n = -n; int j = numDigits(n); if(m) ++j; final byte[] num = new byte[j]; // faster division by 10 for values < 81920 (see Integer.getChars) while(n > 81919) { final int q = n / 10; num[--j] = (byte) (n - (q << 3) - (q << 1) + '0'); n = q; } while(n != 0) { final int q = n * 52429 >>> 19; num[--j] = (byte) (n - (q << 3) - (q << 1) + '0'); n = q; } if(m) num[--j] = '-'; return num; } /** * Checks number of digits of the specified integer. * @param integer number to be checked * @return number of digits */ public static int numDigits(final int integer) { for(int i = 0;; ++i) if(integer <= INTSIZE[i]) return i + 1; } /** Minimum integer. */ private static final byte[] MININT = token("-2147483648"); /** Table with integer sizes. */ private static final int[] INTSIZE = { 9, 99, 999, 9999, 99999, 999999, 9999999, 99999999, 999999999, Integer.MAX_VALUE }; /** * Creates a byte array representation from the specified long value, * using Java's standard method. * @param integer value to be converted * @return byte array */ public static byte[] token(final long integer) { return integer >= Integer.MIN_VALUE && integer <= Integer.MAX_VALUE ? token((int) integer) : token(Long.toString(integer)); } /** US charset. */ private static final DecimalFormatSymbols LOC = new DecimalFormatSymbols(Locale.US); /** Scientific double output. */ private static final DecimalFormat SD = new DecimalFormat("0.0##################E0", LOC); /** Decimal double output. */ private static final DecimalFormat DD = new DecimalFormat("#####0.0################", LOC); /** Scientific float output. */ private static final DecimalFormat SF = new DecimalFormat("0.0######E0", LOC); /** Decimal float output. */ private static final DecimalFormat DF = new DecimalFormat("#####0.0######", LOC); /** * Creates a byte array representation from the specified double value. * @param dbl double value to be converted * @return byte array */ public static byte[] token(final double dbl) { final byte[] b = tok(dbl); if(b != null) return b; final double a = Math.abs(dbl); final String s; if(a >= 1.0e-6 && a < 1.0e6) { synchronized(DD) { s = DD.format(dbl); } } else { synchronized(SD) { s = SD.format(dbl); } } return chopNumber(token(s)); } /** * Creates a byte array representation from the specified float value. * @param flt float value to be converted * @return byte array */ public static byte[] token(final float flt) { final byte[] b = tok(flt); if(b != null) return b; final int fl = FLT.length; for(int i = 0; i < fl; ++i) if(flt == FLT[i]) return FLTSTR[i]; final float a = Math.abs(flt); final boolean small = a >= 1.0e-6f && a < 1.0e6f; String s1; if(small) { synchronized(DF) { s1 = DF.format(flt); } } else { synchronized(SF) { s1 = SF.format(flt); } } final String s2 = Float.toString(flt); if(s2.length() < s1.length() && (!s2.contains("E") || !small)) s1 = s2; return chopNumber(token(s1)); } /** * Tries to create a byte array representation from a floating point. * @param value value to be converted * @return byte array, or {@code null} */ private static byte[] tok(final double value) { if(value == Double.POSITIVE_INFINITY) return INF; if(value == Double.NEGATIVE_INFINITY) return NINF; if(value == 0) return 1 / value > 0 ? ZERO : MZERO; if(Double.isNaN(value)) return NAN; final double a = Math.abs(value); if(a < 1.0e6) { final int i = (int) value; if(i == value) return token(i); } return null; } /** * Finishes the numeric token, removing trailing zeroes. * @param token token to be modified * @return token */ public static byte[] chopNumber(final byte[] token) { if(!contains(token, '.') || contains(token, 'e') || contains(token, 'E')) return token; // remove trailing zeroes int l = token.length; while(--l > 0 && token[l] == '0'); return substring(token, 0, token[l] == '.' ? l : l + 1); } /** Constant float values. */ private static final float[] FLT = { 1.0E17f, 1.0E15f, 1.0E13f, 1.0E11f, -1.0E17f, -1.0E15f, -1.0E13f, -1.0E11f }; /** Token representations of float values. */ private static final byte[][] FLTSTR = tokens("1.0E17", "1.0E15", "1.0E13", "1.0E11", "-1.0E17", "-1.0E15", "-1.0E13", "-1.0E11"); /** * Converts the specified token into a double value. * @param token token to be converted * @return resulting double value, or {@link Double#NaN} is returned if the input is invalid */ public static double toDouble(final byte[] token) { final int tl = token.length; int s = -1; while(++s < tl && ws(token[s])); if(s == tl) return Double.NaN; int e = s; boolean f = false; for(int p = s; p < tl; ++p) { final byte b = token[p]; if(e == s) { if(digit(b) || b == '+') continue; if(ws(b)) { e = p + 1; } else { f = b == 'e' || b == 'E' || b == '.' || b == '-'; if(!f) return Double.NaN; } } else if(!ws(b)) { return Double.NaN; } } if(e == s) e = tl; if(f || e - s > 9) return toDouble(token, s, e); final int d = toInt(token, s, e); return d == Integer.MIN_VALUE ? Double.NaN : d; } /** * Converts the specified token into a double value. * {@link Double#NaN} is returned if the input is invalid. * @param token token to be converted * @param start first byte to be parsed * @param end last byte to be parsed - exclusive * @return resulting double value */ private static double toDouble(final byte[] token, final int start, final int end) { try { return Double.parseDouble(string(token, start, end - start)); } catch(final NumberFormatException ex) { return Double.NaN; } } /** * Converts the specified token into an long value. * {@link Long#MIN_VALUE} is returned if the input is invalid. * Note that this may also be the actual value ({@link #MINLONG}). * @param token token to be converted * @return resulting long value */ public static long toLong(final byte[] token) { return toLong(token, 0, token.length); } /** * Converts the specified token into an long value. * {@link Long#MIN_VALUE} is returned if the input is invalid. * Note that this may also be the actual value ({@link #MINLONG}). * @param token token to be converted * @param start first byte to be parsed * @param end last byte to be parsed - exclusive * @return resulting long value */ public static long toLong(final byte[] token, final int start, final int end) { int p = start; while(p < end && ws(token[p])) ++p; if(p == end) return Long.MIN_VALUE; boolean m = false; if(token[p] == '-' || token[p] == '+') m = token[p++] == '-'; if(p == end) return Long.MIN_VALUE; long v = 0; for(; p < end; ++p) { final byte b = token[p]; if(b < '0' || b > '9') break; if(v >= MAXLONG && (b > '7' || v > MAXLONG)) return Long.MIN_VALUE; v = (v << 3) + (v << 1) + b - '0'; } while(p < end && ws(token[p])) ++p; return p < end ? Long.MIN_VALUE : m ? -v : v; } /** * Converts the specified token into an integer value. * {@link Integer#MIN_VALUE} is returned if the input is invalid. * @param token token to be converted * @return resulting integer value */ public static int toInt(final byte[] token) { return toInt(token, 0, token.length); } /** * Converts the specified token into an integer value. * {@link Integer#MIN_VALUE} is returned if the input is invalid. * @param token token to be converted * @param start first byte to be parsed * @param end last byte to be parsed (exclusive) * @return resulting integer value */ private static int toInt(final byte[] token, final int start, final int end) { int p = start; while(p < end && ws(token[p])) ++p; if(p == end) return Integer.MIN_VALUE; boolean m = false; if(token[p] == '-' || token[p] == '+') m = token[p++] == '-'; if(p == end) return Integer.MIN_VALUE; int v = 0; for(; p < end; ++p) { final byte b = token[p]; if(b < '0' || b > '9') break; if(v >= MAXINT && (b > '7' || v > MAXINT)) return Integer.MIN_VALUE; v = (v << 3) + (v << 1) + b - '0'; } while(p < end && ws(token[p])) ++p; return p < end || v < 0 ? Integer.MIN_VALUE : m ? -v : v; } /** * Converts the specified token into a positive integer value. * {@link Integer#MIN_VALUE} is returned if non-digits are found * or if the input is longer than nine characters. * @param token token to be converted * @return resulting integer value */ public static int toSimpleInt(final byte[] token) { final int te = token.length; if(te >= 10 || te == 0) return Integer.MIN_VALUE; if(token[0] == '0') return te == 1 ? 0 : Integer.MIN_VALUE; int v = 0; for(final byte c : token) { if(c < '0' || c > '9') return Integer.MIN_VALUE; v = (v << 3) + (v << 1) + c - '0'; } return v; } /** * Calculates a hash code for the specified token. * @param token specified token * @return hash code */ public static int hash(final byte[] token) { int h = 0; final int l = Math.min(token.length, MAXLENGTH); for(int i = 0; i != l; ++i) h = (h << 5) - h + token[i]; return h; } /** * Compares two tokens for equality. * @param token1 first token * @param token2 token to be compared * @return true if the arrays are equal */ public static boolean eq(final byte[] token1, final byte[] token2) { final int tl = token2.length; if(tl != token1.length) return false; for(int t = 0; t != tl; ++t) if(token2[t] != token1[t]) return false; return true; } /** * Compares several tokens for equality. * @param token token * @param tokens tokens to be compared * @return true if one test is successful */ public static boolean eq(final byte[] token, final byte[]... tokens) { for(final byte[] t : tokens) if(eq(token, t)) return true; return false; } /** * Compares two tokens lexicographically. * @param token first token * @param compare token to be compared * @return 0 if tokens are equal, negative if first token is smaller, * positive if first token is bigger */ public static int diff(final byte[] token, final byte[] compare) { final int tl = token.length; final int cl = compare.length; final int l = Math.min(tl, cl); for(int i = 0; i < l; ++i) { final int c = (token[i] & 0xFF) - (compare[i] & 0xFF); if(c != 0) return c; } return tl - cl; } /** * Returns the smaller token. * @param token first token * @param compare token to be compared * @return smaller token */ public static byte[] min(final byte[] token, final byte[] compare) { return diff(token, compare) < 0 ? token : compare; } /** * Returns the bigger token. * @param token first token * @param compare token to be compared * @return bigger token */ public static byte[] max(final byte[] token, final byte[] compare) { return diff(token, compare) > 0 ? token : compare; } /** * Checks if the first token contains the second token. * @param token token * @param sub token to be found * @return result of test */ public static boolean contains(final byte[] token, final byte[] sub) { return contains(token, sub, 0); } /** * Checks if the first token contains the second token. * @param token token * @param sub token to be found * @param pos start position * @return result of test */ public static boolean contains(final byte[] token, final byte[] sub, final int pos) { return indexOf(token, sub, pos) != -1; } /** * Checks if the first token contains the specified character. * @param token token * @param ch character to be found * @return result of test */ public static boolean contains(final byte[] token, final int ch) { return indexOf(token, ch) != -1; } /** * Returns the position of the specified character or -1. * @param token token * @param ch character to be found * @return position or {@code -1} */ public static int indexOf(final byte[] token, final int ch) { final int tl = token.length; if(ch < 0x80) { for(int t = 0; t < tl; t++) if(token[t] == ch) return t; } else { for(int t = 0; t < tl; t += cl(token, t)) if(cp(token, t) == ch) return t; } return -1; } /** * Returns the last position of the specified character or -1. * @param token token * @param ch character to be found * @return position or {@code -1} */ public static int lastIndexOf(final byte[] token, final int ch) { final int tl = token.length; int p = -1; if(ch < 128) { for(int t = tl - 1; t >= 0; --t) if(token[t] == ch) return t; } else { for(int t = 0; t < tl; t += cl(token, t)) if(cp(token, t) == ch) p = t; } return p; } /** * Returns the position of the specified token or -1. * @param token token * @param sub token to be found * @return position or {@code -1} */ public static int indexOf(final byte[] token, final byte[] sub) { return indexOf(token, sub, 0); } /** * Returns the position of the specified token or -1. * @param token token * @param sub token to be found * @param pos start position * @return result of test */ public static int indexOf(final byte[] token, final byte[] sub, final int pos) { final int sl = sub.length; if(sl == 0) return pos; final int tl = token.length - sl; if(pos > tl) return -1; // compare tokens character wise for(int t = pos; t <= tl; ++t) { int s = 0; while(sub[s] == token[t + s]) if(++s == sl) return t; } return -1; } /** * Checks if the first token starts with the specified character. * @param token token * @param ch character to be found * @return result of test */ public static boolean startsWith(final byte[] token, final int ch) { return startsWith(token, ch, 0); } /** * Checks if the first token starts with the specified character. * @param token token * @param ch character to be found * @param pos start position * @return result of test */ private static boolean startsWith(final byte[] token, final int ch, final int pos) { return pos < token.length && token[pos] == ch; } /** * Checks if the first token starts with the second token. * @param token token * @param sub token to be found * @return result of test */ public static boolean startsWith(final byte[] token, final byte[] sub) { return startsWith(token, sub, 0); } /** * Checks if the first token starts with the second token. * @param token token * @param sub token to be found * @param pos start position * @return result of test */ public static boolean startsWith(final byte[] token, final byte[] sub, final int pos) { final int sl = sub.length; if(sl > token.length - pos) return false; for(int s = 0, p = pos; s < sl; ++s, ++p) { if(sub[s] != token[p]) return false; } return true; } /** * Checks if the first token starts with the specified character. * @param token token * @param ch character to be bound * @return result of test */ public static boolean endsWith(final byte[] token, final int ch) { return token.length != 0 && token[token.length - 1] == ch; } /** * Checks if the first token ends with the second token. * @param token token * @param sub token to be found * @return result of test */ public static boolean endsWith(final byte[] token, final byte[] sub) { final int sl = sub.length; final int tl = token.length; if(sl > tl) return false; for(int s = sl; s > 0; s--) if(sub[sl - s] != token[tl - s]) return false; return true; } /** * Returns a substring of the specified token. * Note that this method ignores Unicode codepoints; use {@link #subtoken} instead. * @param token input token * @param start start position * @return substring */ public static byte[] substring(final byte[] token, final int start) { return substring(token, start, token.length); } /** * Returns a substring of the specified token. * Note that this method ignores Unicode codepoints; use {@link #subtoken} instead. * @param token input token * @param start start position * @param end end position * @return substring */ public static byte[] substring(final byte[] token, final int start, final int end) { final int s = Math.max(0, start); final int e = Math.min(end, token.length); if(s == 0 && e == token.length) return token; return s >= e ? EMPTY : Arrays.copyOfRange(token, s, e); } /** * Returns a partial token. * @param token input token * @param start start position * @return resulting text */ public static byte[] subtoken(final byte[] token, final int start) { return subtoken(token, start, token.length); } /** * Returns a partial token. * @param token input text * @param start start position * @param end end position * @return resulting text */ public static byte[] subtoken(final byte[] token, final int start, final int end) { int s = Math.max(0, start); final int e = Math.min(end, token.length); if(s == 0 && e == token.length) return token; if(s >= e) return EMPTY; int t = Math.max(0, s - 4); for(; t != s && t < e; t += cl(token, t)) { if(t >= s) s = t; } for(; t < e; t += cl(token, t)); return Arrays.copyOfRange(token, s, t); } /** * Splits a token around matches of the given separator. * @param token token to be split * @param sep separation character * @return array */ public static byte[][] split(final byte[] token, final int sep) { final int tl = token.length; final byte[][] split = new byte[tl][]; int sl = 0; final TokenBuilder tb = new TokenBuilder(); for(int t = 0; t < tl; t += cl(token, t)) { final int c = cp(token, t); if(c == sep) { if(!tb.isEmpty()) split[sl++] = tb.next(); } else { tb.add(c); } } if(!tb.isEmpty()) split[sl++] = tb.finish(); return Array.copyOf(split, sl); } /** * Normalizes the specified input and returns its distinct tokens. * Optimized for small number of tokens. * @param token token * @return distinct tokens */ public static byte[][] distinctTokens(final byte[] token) { final byte[][] tokens = split(normalize(token), ' '); int tl = tokens.length; for(int i = 0; i < tl - 1; i++) { for(int j = i + 1; j < tl; j++) { if(eq(tokens[i], tokens[j])) { Array.move(tokens, j + 1, -1, tl - j - 1); j--; tl--; } } } return Array.copyOf(tokens, tl); } /** * Checks if the specified token has only whitespaces. * @param token token * @return true if all characters are whitespaces */ public static boolean ws(final byte[] token) { for(final byte t : token) if(!ws(t)) return false; return true; } /** * Replaces the specified character and returns the result token. * @param token token to be checked * @param search the character to be replaced * @param replace the new character * @return resulting token */ public static byte[] replace(final byte[] token, final int search, final int replace) { if(!contains(token, search)) return token; final TokenBuilder tb = new TokenBuilder(token.length); final int tl = token.length; for(int i = 0; i < tl; i += cl(token, i)) { final int c = cp(token, i); tb.add(c == search ? replace : c); } return tb.finish(); } /** * Removes leading and trailing whitespaces from the specified token. * @param token token to be trimmed * @return trimmed token */ public static byte[] trim(final byte[] token) { int s = -1, e = token.length; while(++s < e) if(!ws(token[s])) break; while(--e > s) if(!ws(token[e])) break; if(++e == token.length && s == 0) return token; return s == e ? EMPTY : Arrays.copyOfRange(token, s, e); } /** * Chops a token to the specified length and adds dots. * @param token token to be chopped * @param max maximum length * @return chopped token */ public static byte[] chop(final byte[] token, final int max) { if(token.length <= max) return token; final byte[] tt = Arrays.copyOf(token, max); if(max > 2) tt[max - 3] = '.'; if(max > 1) tt[max - 2] = '.'; if(max > 0) tt[max - 1] = '.'; return tt; } /** * Concatenates two tokens. * @param token1 first token * @param token2 second token * @return resulting array */ public static byte[] concat(final byte[] token1, final byte[] token2) { final int t1 = token1.length; final int t2 = token2.length; final byte[] tmp = new byte[t1 + t2]; System.arraycopy(token1, 0, tmp, 0, t1); System.arraycopy(token2, 0, tmp, t1, t2); return tmp; } /** * Concatenates three tokens. A {@link TokenBuilder} instance can be used to * concatenate more than three tokens. * @param token1 first token * @param token2 second token * @param token3 third token * @return resulting array */ public static byte[] concat(final byte[] token1, final byte[] token2, final byte[] token3) { final int t1 = token1.length; final int t2 = token2.length; final int t3 = token3.length; final byte[] tmp = new byte[t1 + t2 + t3]; System.arraycopy(token1, 0, tmp, 0, t1); System.arraycopy(token2, 0, tmp, t1, t2); System.arraycopy(token3, 0, tmp, t1 + t2, t3); return tmp; } /** * Deletes a character from the token. * @param token token * @param ch character to be removed * @return resulting token */ public static byte[] delete(final byte[] token, final int ch) { // ascii character if(ch < 0x80) { // skip deletion if character is not found if(!contains(token, ch)) return token; final int tl = token.length; final TokenBuilder tb = new TokenBuilder(tl); for(final byte c : token) { if(c != ch) tb.add(c); } return tb.finish(); } // remove character final int tl = token.length; final TokenBuilder tb = new TokenBuilder(tl); for(int i = 0; i < tl; i += cl(token, i)) { final int c = cp(token, i); if(c != ch) tb.add(c); } return tb.finish(); } /** * Normalizes all whitespace occurrences from the specified token. * @param token token * @return normalized token */ public static byte[] normalize(final byte[] token) { final int l = token.length; if(l == 0) return token; final byte[] tmp = new byte[l]; int c = 0; boolean ws1 = true; for(final byte t : token) { final boolean ws2 = ws(t); if(ws2 && ws1) continue; tmp[c++] = ws2 ? (byte) ' ' : t; ws1 = ws2; } if(c > 0 && ws(tmp[c - 1])) --c; return c == l ? tmp : Arrays.copyOf(tmp, c); } /** * Checks if the specified character is a whitespace. * @param ch the letter to be checked * @return result of check */ public static boolean ws(final int ch) { return ch == 0x09 || ch == 0x0A || ch == 0x0D || ch == 0x20; } /** * Checks if the specified character is a computer letter (A - Z, a - z, _). * @param ch the letter to be checked * @return result of check */ public static boolean letter(final int ch) { return ch >= 'A' && ch <= 'Z' || ch >= 'a' && ch <= 'z' || ch == '_'; } /** * Checks if the specified character is a digit (0 - 9). * @param ch the letter to be checked * @return result of check */ public static boolean digit(final int ch) { return ch >= '0' && ch <= '9'; } /** * Checks if the specified character is a computer letter or digit. * @param ch the letter to be checked * @return result of check */ public static boolean letterOrDigit(final int ch) { return letter(ch) || digit(ch); } /** * Converts the specified token to upper case. * @param token token to be converted * @return resulting token */ public static byte[] uc(final byte[] token) { if(ascii(token)) { final int tl = token.length; final byte[] tok = new byte[tl]; for(int t = 0; t < tl; t++) tok[t] = (byte) uc(token[t]); return tok; } return token(string(token).toUpperCase(Locale.ENGLISH)); } /** * Converts the specified token to title case. * @param token token to be converted * @return resulting token */ public static byte[] tc(final byte[] token) { final int tl = token.length; final TokenBuilder tb = new TokenBuilder(tl); boolean del = false; for(int t = 0; t < tl; t += cl(token, t)) { final int cp = cp(token, t); tb.add(del ? lc(cp) : uc(cp)); del = Character.isLetterOrDigit(cp); } return tb.finish(); } /** * Converts a character to upper case. * @param ch character to be converted * @return resulting character */ public static int uc(final int ch) { return ch >= 'a' && ch <= 'z' ? ch - 0x20 : ch > 0x7F ? Character.toUpperCase(ch) : ch; } /** * Converts the specified token to lower case. * @param token token to be converted * @return resulting token */ public static byte[] lc(final byte[] token) { if(ascii(token)) { final int tl = token.length; final byte[] tok = new byte[tl]; for(int t = 0; t < tl; t++) tok[t] = (byte) lc(token[t]); return tok; } return token(string(token).toLowerCase(Locale.ENGLISH)); } /** * Converts a character to lower case. * @param ch character to be converted * @return resulting character */ public static int lc(final int ch) { return ch >= 'A' && ch <= 'Z' ? ch | 0x20 : ch > 0x7F ? Character.toLowerCase(ch) : ch; } /** * Returns the prefix of the specified token. * @param name name * @return prefix or empty token if no prefix exists */ public static byte[] prefix(final byte[] name) { final int i = indexOf(name, ':'); return i == -1 ? EMPTY : substring(name, 0, i); } /** * Returns the local name of the specified name. * @param name name * @return local name */ public static byte[] local(final byte[] name) { final int i = indexOf(name, ':'); return i == -1 ? name : substring(name, i + 1); } /** * Returns a URI encoded token. * @param token token * @param iri input * @return encoded token */ public static byte[] uri(final byte[] token, final boolean iri) { final TokenBuilder tb = new TokenBuilder(); for(final byte t : token) { if(letterOrDigit(t) || contains(iri ? IRIRES : RES, t)) tb.addByte(t); else hex(tb, t); } return tb.finish(); } /** * Escapes the specified token. * @param token token * @return escaped token */ public static byte[] escape(final byte[] token) { final TokenBuilder tb = new TokenBuilder(); for(final byte t : token) { if(t >= 0x20 && t <= 0x7e) tb.addByte(t); else hex(tb, t); } return tb.finish(); } /** * Adds the specified byte in hex code. * @param tb token builder * @param value byte to be added */ private static void hex(final TokenBuilder tb, final byte value) { tb.add('%'); tb.addByte(HEX[(value & 0xFF) >> 4]); tb.addByte(HEX[value & 0xFF & 15]); } /** * Returns a hex representation of the specified byte array. * @param value values to be mapped * @param uc upper case * @return hex representation */ public static byte[] hex(final byte[] value, final boolean uc) { final int vl = value.length, u = uc ? 0x37 : 0x57; final byte[] data = new byte[vl << 1]; for(int v = 0, c = 0; v < vl; v++) { int b = value[v] >> 4 & 0x0F; data[c++] = (byte) (b + (b > 9 ? u : '0')); b = value[v] & 0x0F; data[c++] = (byte) (b + (b > 9 ? u : '0')); } return data; } }