package org.basex.util;

import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Arrays;
import java.util.Locale;

/**
 * <p>This class provides convenience operations for handling 'Tokens'.
 * Tokens are UTF-8 encoded strings, stored in a byte array.</p>
 *
 * <p>Note that, to guarantee a consistent string representation, all string
 * conversions should be done via the methods of this class.</p>
 *
 * @author BaseX Team 2005-12, BSD License
 * @author Christian Gruen
 */
public final class Token {
  /** Maximum number of leading bytes that {@link #hash} takes into account. */
  private static final byte MAXLENGTH = 96;

  /** Empty token. */
  public static final byte[] EMPTY = {};
  /** XML token. */
  public static final byte[] XML = token("xml");
  /** XML token with colon. */
  public static final byte[] XMLC = token("xml:");
  /** XMLNS token. */
  public static final byte[] XMLNS = token("xmlns");
  /** XMLNS token with colon. */
  public static final byte[] XMLNSC = token("xmlns:");

  /** Token 'true'. */
  public static final byte[] TRUE = token("true");
  /** Token 'false'. */
  public static final byte[] FALSE = token("false");
  /** Token 'null'. */
  public static final byte[] NULL = token("null");
  /** Token 'NaN'. */
  private static final byte[] NAN = token("NaN");
  /** Token 'INF' (positive infinity). */
  public static final byte[] INF = token("INF");
  /** Token '-INF' (negative infinity). */
  public static final byte[] NINF = token("-INF");

  /** Single space character. */
  public static final byte[] SPACE = { ' ' };
  /** Digit '0'. */
  public static final byte[] ZERO = { '0' };
  /** Negative zero ('-0'). */
  private static final byte[] MZERO = { '-', '0' };
  /** Digit '1'. */
  public static final byte[] ONE = { '1' };
  /** Slash. */
  public static final byte[] SLASH = { '/' };
  /** Colon. */
  public static final byte[] COLON = { ':' };

  /** Upper-case hexadecimal digits, indexed by their numeric value (0-15). */
  public static final byte[] HEX = token("0123456789ABCDEF");
  /** Characters that {@link #uri} leaves unescaped in IRI mode.
*/ private static final byte[] IRIRES = token("!#$%&*'()+,-./:;=?@[]~_"); /** Reserved characters. */ private static final byte[] RES = token("-._~"); /** UTF8 encoding string. */ public static final String UTF8 = "UTF-8"; /** UTF8 encoding string (variant). */ public static final String UTF82 = "UTF8"; /** UTF16 encoding string. */ private static final String UTF16 = "UTF-16"; /** UTF16 encoding string. */ private static final String UTF162 = "UTF16"; /** UTF16BE (=UTF16) encoding string. */ public static final String UTF16BE = "UTF-16BE"; /** UTF16 encoding string. */ public static final String UTF16LE = "UTF-16LE"; /** Hidden constructor. */ private Token() { } /** * Returns the specified token as string. * @param token token * @return string */ public static String string(final byte[] token) { return string(token, 0, token.length); } /** * Returns the specified token as string. * @param token token * @param start start position * @param length length * @return string */ public static String string(final byte[] token, final int start, final int length) { if(length <= 0) return ""; final char[] str = new char[length]; for(int i = 0; i < length; ++i) { final byte b = token[start + i]; if(b < 0) return utf8(token, start, length); str[i] = (char) b; } return new String(str); } /** * Returns a string of the specified UTF8 token. * @param token token * @param start start position * @param length length * @return string */ private static String utf8(final byte[] token, final int start, final int length) { // input is assumed to be correct UTF8. if input contains codepoints // larger than Character.MAX_CODE_POINT, results might be unexpected. 
final StringBuilder sb = new StringBuilder(length); final int il = Math.min(start + length, token.length); for(int i = start; i < il; i += cl(token, i)) { final int cp = cp(token, i); if(cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { sb.append((char) cp); } else { final int o = cp - Character.MIN_SUPPLEMENTARY_CODE_POINT; sb.append((char) ((o >>> 10) + Character.MIN_HIGH_SURROGATE)); sb.append((char) ((o & 0x3ff) + Character.MIN_LOW_SURROGATE)); } } return sb.toString(); } /** * Checks if the specified token only consists of ASCII characters. * @param token token * @return result of check */ public static boolean ascii(final byte[] token) { for(final byte t : token) if(t < 0) return false; return true; } /** * Converts a string to a byte array. * All strings should be converted by this function to guarantee * a consistent character conversion. * @param string string to be converted * @return byte array */ public static byte[] token(final String string) { final int l = string.length(); if(l == 0) return EMPTY; final byte[] b = new byte[l]; for(int i = 0; i < l; ++i) { final char c = string.charAt(i); if(c > 0x7F) return utf8(string); b[i] = (byte) c; } return b; } /** * Converts the specified strings to tokens. * @param strings strings * @return tokens */ public static byte[][] tokens(final String... strings) { final byte[][] t = new byte[strings.length][]; for(int i = 0; i < t.length; ++i) t[i] = token(strings[i]); return t; } /** * Converts a string to a UTF8 byte array. * @param string string to be converted * @return byte array */ private static byte[] utf8(final String string) { final char[] arr = string.toCharArray(); final int al = arr.length; final TokenBuilder tb = new TokenBuilder(al << 1); for(int c = 0; c < al; ++c) { final char ch = arr[c]; tb.add(Character.isHighSurrogate(ch) && c < al - 1 && Character.isLowSurrogate(arr[c + 1]) ? Character.toCodePoint(ch, arr[++c]) : ch); } return tb.finish(); } /** * Converts a token from the input encoding to UTF8. 
* @param token token to be converted * @return byte array * @param encoding input encoding */ public static byte[] utf8(final byte[] token, final String encoding) { // UTF8 (comparison by ref.) or no special characters: return input if(encoding == UTF8 || ascii(token)) return token; // convert to utf8. if errors occur while converting, an empty is returned. try { return token(new String(token, encoding)); } catch(final Exception ex) { Util.debug(ex); return EMPTY; } } /** * Returns a unified representation of the specified encoding. * @param encoding input encoding * @param old (optional) old encoding * @return encoding */ public static String normEncoding(final String encoding, final String old) { final String e = encoding.toUpperCase(Locale.ENGLISH); if(eq(e, UTF8, UTF82)) return UTF8; if(e.equals(UTF16BE)) return UTF16BE; if(e.equals(UTF16LE)) return UTF16LE; if(eq(e, UTF16, UTF162)) return old == UTF16BE || old == UTF16LE ? old : UTF16BE; return encoding; } /** * Checks if the specified encoding is supported. * @param encoding encoding * @return result of check */ public static boolean supported(final String encoding) { try { return Charset.isSupported(encoding); } catch(final IllegalArgumentException ex) { return false; } } /** * Returns the codepoint (unicode value) of the specified token, starting at * the specified position. Returns a unicode replacement character for invalid * values. 
* @param token token * @param pos character position * @return current character */ public static int cp(final byte[] token, final int pos) { // 0xxxxxxx final byte v = token[pos]; if((v & 0xFF) < 192) return v & 0xFF; // number of bytes to be read final int vl = cl(v); if(pos + vl > token.length) return 0xFFFD; // 110xxxxx 10xxxxxx if(vl == 2) return (v & 0x1F) << 6 | token[pos + 1] & 0x3F; // 1110xxxx 10xxxxxx 10xxxxxx if(vl == 3) return (v & 0x0F) << 12 | (token[pos + 1] & 0x3F) << 6 | token[pos + 2] & 0x3F; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx return (v & 0x07) << 18 | (token[pos + 1] & 0x3F) << 12 | (token[pos + 2] & 0x3F) << 6 | token[pos + 3] & 0x3F; } /** Character lengths. */ private static final int[] CHLEN = { 1, 1, 1, 1, 2, 2, 3, 4 }; /** * Returns the length of the specified UTF8 byte. * @param cp codepoint * @return character length */ public static int cl(final byte cp) { return cp >= 0 ? 1 : CHLEN[cp >> 4 & 0x7]; } /** * Returns the length of a UTF8 character at the specified position. * @param token token * @param pos position * @return character length */ public static int cl(final byte[] token, final int pos) { return cl(token[pos]); } /** * Converts a token to a sequence of codepoints. * @param token token * @return codepoints */ public static int[] cps(final byte[] token) { int pos = 0; final int len = token.length; final int[] cp = new int[len]; for(int i = 0; i < len; i += cl(token, i)) cp[pos++] = cp(token, i); return pos < len ? Arrays.copyOf(cp, pos) : cp; } /** * Returns the token length. * @param token token * @return length */ public static int len(final byte[] token) { int l = 0; for(int t = 0; t < token.length; t += cl(token, t)) ++l; return l; } /** * Creates a byte array representation of the specified boolean value. * @param bool boolean value to be converted * @return boolean value in byte array */ public static byte[] token(final boolean bool) { return bool ? 
TRUE : FALSE; } /** * Creates a byte array representation of the specified integer value. * @param integer int value to be converted * @return integer value in byte array */ public static byte[] token(final int integer) { if(integer == 0) return ZERO; if(integer == Integer.MIN_VALUE) return MININT; int n = integer; final boolean m = n < 0; if(m) n = -n; int j = numDigits(n); if(m) ++j; final byte[] num = new byte[j]; // faster division by 10 for values < 81920 (see Integer.getChars) while(n > 81919) { final int q = n / 10; num[--j] = (byte) (n - (q << 3) - (q << 1) + '0'); n = q; } while(n != 0) { final int q = n * 52429 >>> 19; num[--j] = (byte) (n - (q << 3) - (q << 1) + '0'); n = q; } if(m) num[--j] = '-'; return num; } /** * Checks number of digits of the specified integer. * @param integer number to be checked * @return number of digits */ public static int numDigits(final int integer) { for(int i = 0;; ++i) if(integer <= INTSIZE[i]) return i + 1; } /** Minimum integer. */ private static final byte[] MININT = token("-2147483648"); /** Table with integer sizes. */ private static final int[] INTSIZE = { 9, 99, 999, 9999, 99999, 999999, 9999999, 99999999, 999999999, Integer.MAX_VALUE }; /** * Creates a byte array representation from the specified long value, * using Java's standard method. * @param integer int value to be converted * @return byte array */ public static byte[] token(final long integer) { return integer >= Integer.MIN_VALUE && integer <= Integer.MAX_VALUE ? token((int) integer) : token(Long.toString(integer)); } /** US charset. */ private static final DecimalFormatSymbols LOC = new DecimalFormatSymbols(Locale.US); /** Scientific double output. */ private static final DecimalFormat SD = new DecimalFormat("0.0##################E0", LOC); /** Decimal double output. */ private static final DecimalFormat DD = new DecimalFormat("#####0.0################", LOC); /** Scientific float output. 
*/ private static final DecimalFormat SF = new DecimalFormat("0.0######E0", LOC); /** Decimal float output. */ private static final DecimalFormat DF = new DecimalFormat("#####0.0######", LOC); /** * Creates a byte array representation from the specified double value; * inspired by Xavier Franc's Qizx. * @param dbl double value to be converted * @return byte array */ public static byte[] token(final double dbl) { final byte[] b = tok(dbl); if(b != null) return b; final double a = Math.abs(dbl); return chopNumber(token(a >= 1e-6 && a < 1e6 ? DD.format(dbl) : SD.format(dbl))); } /** * Creates a byte array representation from the specified float value. * @param flt float value to be converted * @return byte array */ public static byte[] token(final float flt) { final byte[] b = tok(flt); if(b != null) return b; // not that brilliant here.. no chance for elegant code either // due to the nifty differences between Java and XQuery for(int i = 0; i < FLT.length; ++i) if(flt == FLT[i]) return FLTSTR[i]; final float a = Math.abs(flt); final boolean small = a >= 1e-6f && a < 1e6f; String s1 = small ? DF.format(flt) : SF.format(flt); final String s2 = Float.toString(flt); if(s2.length() < s1.length() && (!s2.contains("E") || !small)) s1 = s2; return chopNumber(token(s1)); } /** * Checks if the specified value equals a constant token. * @param dbl value to be converted * @return byte array or zero, or {@code null} */ private static byte[] tok(final double dbl) { if(dbl == 1 / 0d) return INF; if(dbl == -1 / 0d) return NINF; if(dbl == 0) return 1 / dbl > 0 ? ZERO : MZERO; if(Double.isNaN(dbl)) return NAN; final double a = Math.abs(dbl); if(a < 1e6) { final int i = (int) dbl; if(i == dbl) return token(i); } return null; } /** * Finishes the numeric token, removing trailing zeroes. 
* @param token token to be modified * @return token */ public static byte[] chopNumber(final byte[] token) { if(!contains(token, '.') || contains(token, 'e') || contains(token, 'E')) return token; // remove trailing zeroes int l = token.length; while(--l > 0 && token[l] == '0'); return substring(token, 0, token[l] == '.' ? l : l + 1); } /** Constant float values. */ private static final float[] FLT = { 1.0E17f, 1.0E15f, 1.0E13f, 1.0E11f, -1.0E17f, -1.0E15f, -1.0E13f, -1.0E11f }; /** String representations of float values. */ private static final byte[][] FLTSTR = tokens("1.0E17", "1.0E15", "1.0E13", "1.0E11", "-1.0E17", "-1.0E15", "-1.0E13", "-1.0E11"); /** * Converts the specified token into a double value. * {@link Double#NaN} is returned if the input is invalid. * @param token token to be converted * @return resulting double value */ public static double toDouble(final byte[] token) { final int tl = token.length; boolean f = false; for(final int t : token) { if(t >= 0 && t <= ' ' || digit(t)) continue; f = t == 'e' || t == 'E' || t == '.' || t == '-'; if(!f) return Double.NaN; } if(f || tl > 9) return dbl(token); final int d = toInt(token); return d == Integer.MIN_VALUE ? Double.NaN : d; } /** * Converts the specified token into a double value. * {@link Double#NaN} is returned when the input is invalid. * @param token token to be converted * @return resulting double value */ private static double dbl(final byte[] token) { try { return Double.parseDouble(string(token)); } catch(final Exception ex) { return Double.NaN; } } /** * Converts the specified string into an long value. * {@link Long#MIN_VALUE} is returned when the input is invalid. * @param string string to be converted * @return resulting long value */ public static long toLong(final String string) { return toLong(token(string)); } /** * Converts the specified token into an long value. * {@link Long#MIN_VALUE} is returned when the input is invalid. 
* @param token token to be converted * @return resulting long value */ public static long toLong(final byte[] token) { return toLong(token, 0, token.length); } /** * Converts the specified token into an long value. * {@link Long#MIN_VALUE} is returned when the input is invalid. * @param token token to be converted * @param start first byte to be parsed * @param end last byte to be parsed - exclusive * @return resulting long value */ public static long toLong(final byte[] token, final int start, final int end) { int t = start; while(t < end && token[t] <= ' ') ++t; if(t == end) return Long.MIN_VALUE; boolean m = false; if(token[t] == '-' || token[t] == '+') m = token[t++] == '-'; if(t == end) return Long.MIN_VALUE; long v = 0; for(; t < end; ++t) { final byte c = token[t]; if(c < '0' || c > '9') break; final long w = (v << 3) + (v << 1) + c - '0'; if(w < v) return Long.MIN_VALUE; v = w; } while(t < end && token[t] <= ' ') ++t; return t < end ? Long.MIN_VALUE : m ? -v : v; } /** * Converts the specified string into an integer value. * {@link Integer#MIN_VALUE} is returned when the input is invalid. * @param string string to be converted * @return resulting integer value */ public static int toInt(final String string) { return toInt(token(string)); } /** * Converts the specified token into an integer value. * {@link Integer#MIN_VALUE} is returned when the input is invalid. * @param token token to be converted * @return resulting integer value */ public static int toInt(final byte[] token) { return toInt(token, 0, token.length); } /** * Converts the specified token into an integer value. * {@link Integer#MIN_VALUE} is returned when the input is invalid. 
* @param token token to be converted * @param start first byte to be parsed * @param end last byte to be parsed (exclusive) * @return resulting integer value */ public static int toInt(final byte[] token, final int start, final int end) { int t = start; while(t < end && token[t] <= ' ') ++t; if(t == end) return Integer.MIN_VALUE; boolean m = false; if(token[t] == '-' || token[t] == '+') m = token[t++] == '-'; if(t == end) return Integer.MIN_VALUE; int v = 0; for(; t < end; ++t) { final byte c = token[t]; if(c < '0' || c > '9') break; v = (v << 3) + (v << 1) + c - '0'; } while(t < end && token[t] <= ' ') ++t; return t < end ? Integer.MIN_VALUE : m ? -v : v; } /** * Converts the specified token into a positive integer value. * {@link Integer#MIN_VALUE} is returned if non-digits are found * or if the input is longer than nine characters. * @param token token to be converted * @return resulting integer value */ public static int toSimpleInt(final byte[] token) { final int te = token.length; if(te >= 10 || te == 0) return Integer.MIN_VALUE; if(token[0] == '0') return te == 1 ? 0 : Integer.MIN_VALUE; int v = 0; for(int ts = 0; ts < te; ++ts) { final byte c = token[ts]; if(c < '0' || c > '9') return Integer.MIN_VALUE; v = (v << 3) + (v << 1) + c - '0'; } return v; } /** * Calculates a hash code for the specified token. * @param token specified token * @return hash code */ public static int hash(final byte[] token) { int h = 0; final int l = Math.min(token.length, MAXLENGTH); for(int i = 0; i != l; ++i) h = (h << 5) - h + token[i]; return h; } /** * Compares two tokens for equality. * @param token1 first token * @param token2 token to be compared * @return true if the arrays are equal */ public static boolean eq(final byte[] token1, final byte[] token2) { final int tl = token2.length; if(tl != token1.length) return false; for(int t = 0; t != tl; ++t) if(token2[t] != token1[t]) return false; return true; } /** * Compares several tokens for equality. 
* @param token token * @param tokens tokens to be compared * @return true if one test is successful */ public static boolean eq(final byte[] token, final byte[]... tokens) { for(final byte[] t : tokens) if(eq(token, t)) return true; return false; } /** * Compares several strings for equality. * @param str first string * @param strings strings to be compared * @return true if one test is successful */ public static boolean eq(final String str, final String... strings) { for(final String s : strings) { if(str == null ? s == null : str.equals(s)) return true; } return false; } /** * Compares several strings for equality, ignoring the case. * @param str first string * @param strings strings to be compared * @return true if one test is successful */ public static boolean eqic(final String str, final String... strings) { for(final String s : strings) { if(str == null ? s == null : str.equalsIgnoreCase(s)) return true; } return false; } /** * Calculates the difference of two tokens. * @param token first token * @param compare token to be compared * @return 0 if tokens are equal, negative if first token is smaller, * positive if first token is bigger */ public static int diff(final byte[] token, final byte[] compare) { final int tl = token.length; final int cl = compare.length; final int l = Math.min(tl, cl); for(int i = 0; i < l; ++i) { final int c = (token[i] & 0xFF) - (compare[i] & 0xFF); if(c != 0) return c; } return tl - cl; } /** * Calculates the difference of two characters. * @param char1 first character * @param char2 character to be compared * @return 0 if characters are equal, negative if first token is smaller, * positive if first character is bigger */ public static int diff(final byte char1, final byte char2) { return (char1 & 0xFF) - (char2 & 0xFF); } /** * Checks if the first token contains the second token. 
* @param token token * @param sub token to be found * @return result of test */ public static boolean contains(final byte[] token, final byte[] sub) { return indexOf(token, sub) != -1; } /** * Checks if the first token contains the specified character. * @param token token * @param c character to be found * @return result of test */ public static boolean contains(final byte[] token, final int c) { return indexOf(token, c) != -1; } /** * Returns the position of the specified character or -1. * @param token token * @param c character to be found * @return result of test */ public static int indexOf(final byte[] token, final int c) { final int tl = token.length; for(int t = 0; t < tl; ++t) if(token[t] == c) return t; return -1; } /** * Returns the last position of the specified character or -1. * @param token token * @param c character to be found * @return result of test */ public static int lastIndexOf(final byte[] token, final int c) { for(int t = token.length - 1; t >= 0; --t) if(token[t] == c) return t; return -1; } /** * Returns the position of the specified token or -1. * @param token token * @param sub token to be found * @return result of test */ public static int indexOf(final byte[] token, final byte[] sub) { return indexOf(token, sub, 0); } /** * Returns the position of the specified token or -1. * @param token token * @param sub token to be found * @param pos start position * @return result of test */ public static int indexOf(final byte[] token, final byte[] sub, final int pos) { final int sl = sub.length; if(sl == 0) return 0; final int tl = token.length - sl; if(pos > tl) return -1; // compare tokens character wise for(int t = pos; t <= tl; ++t) { int s = 0; while(sub[s] == token[t + s]) if(++s == sl) return t; } return -1; } /** * Checks if the first token starts with the specified character. 
* @param token token * @param ch character to be found * @return result of test */ public static boolean startsWith(final byte[] token, final int ch) { return token.length != 0 && token[0] == ch; } /** * Checks if the first token starts with the second token. * @param token token * @param sub token to be found * @return result of test */ public static boolean startsWith(final byte[] token, final byte[] sub) { final int sl = sub.length; if(sl > token.length) return false; for(int s = 0; s < sl; ++s) if(sub[s] != token[s]) return false; return true; } /** * Checks if the first token starts with the specified character. * @param token token * @param ch character to be bound * @return result of test */ public static boolean endsWith(final byte[] token, final int ch) { return token.length != 0 && token[token.length - 1] == ch; } /** * Checks if the first token ends with the second token. * @param token token * @param sub token to be found * @return result of test */ public static boolean endsWith(final byte[] token, final byte[] sub) { final int sl = sub.length; final int tl = token.length; if(sl > tl) return false; for(int s = sl; s > 0; s--) if(sub[sl - s] != token[tl - s]) return false; return true; } /** * Returns a substring of the specified token. * Note that this method does not correctly split UTF8 character; * use {@link #subtoken} instead. * @param token input token * @param start start position * @return substring */ public static byte[] substring(final byte[] token, final int start) { return substring(token, start, token.length); } /** * Returns a substring of the specified token. * Note that this method does not correctly split UTF8 character; * use {@link #subtoken} instead. 
* @param token input token * @param start start position * @param end end position * @return substring */ public static byte[] substring(final byte[] token, final int start, final int end) { final int s = Math.max(0, start); final int e = Math.min(end, token.length); if(s == 0 && e == token.length) return token; return s >= e ? EMPTY : Arrays.copyOfRange(token, s, e); } /** * Returns a partial token. * @param token input token * @param start start position * @return resulting text */ public static byte[] subtoken(final byte[] token, final int start) { return subtoken(token, start, token.length); } /** * Returns a partial token. * @param token input text * @param start start position * @param end end position * @return resulting text */ public static byte[] subtoken(final byte[] token, final int start, final int end) { int s = Math.max(0, start); final int e = Math.min(end, token.length); if(s == 0 && e == token.length) return token; if(s >= e) return EMPTY; int t = Math.max(0, s - 4); for(; t != s && t < e; t += cl(token, t)) { if(t >= s) s = t; } for(; t < e; t += cl(token, t)); return Arrays.copyOfRange(token, s, t); } /** * Splits the token at all whitespaces and returns an array with all tokens. * @param token token to be split * @param sep separation character * @return array */ public static byte[][] split(final byte[] token, final int sep) { final int l = token.length; final byte[][] split = new byte[l][]; int s = 0; final TokenBuilder tb = new TokenBuilder(); for(int i = 0; i < l; i += cl(token, i)) { final int c = cp(token, i); if(c == sep) { if(tb.size() != 0) { split[s++] = tb.finish(); tb.reset(); } } else { tb.add(c); } } if(tb.size() != 0) split[s++] = tb.finish(); return Array.copyOf(split, s); } /** * Checks if the specified token has only whitespaces. 
* @param token token * @return true if all characters are whitespaces */ public static boolean ws(final byte[] token) { final int tl = token.length; for(int i = 0; i < tl; ++i) if(token[i] < 0 || token[i] > ' ') return false; return true; } /** * Replaces the specified character and returns the result token. * @param token token to be checked * @param search the character to be replaced * @param replace the new character * @return resulting token */ public static byte[] replace(final byte[] token, final int search, final int replace) { final TokenBuilder tb = new TokenBuilder(token.length); final int tl = token.length; for(int i = 0; i < tl; i += cl(token, i)) { final int c = cp(token, i); tb.add(c == search ? replace : c); } return tb.finish(); } /** * Removes leading and trailing whitespaces from the specified token. * @param token token to be trimmed * @return trimmed token */ public static byte[] trim(final byte[] token) { int s = -1; int e = token.length; while(++s < e) if(token[s] > ' ' || token[s] < 0) break; while(--e > s) if(token[e] > ' ' || token[e] < 0) break; if(++e == token.length && s == 0) return token; return s == e ? EMPTY : Arrays.copyOfRange(token, s, e); } /** * Chops a token to the specified length and adds dots. * @param token token to be chopped * @param max maximum length * @return chopped token */ public static byte[] chop(final byte[] token, final int max) { if(token.length <= max) return token; final byte[] tt = Arrays.copyOf(token, max); if(max > 2) tt[max - 3] = '.'; if(max > 1) tt[max - 2] = '.'; if(max > 0) tt[max - 1] = '.'; return tt; } /** * Concatenates two tokens. 
* @param token1 first token * @param token2 second token * @return resulting array */ public static byte[] concat(final byte[] token1, final byte[] token2) { final int t1 = token1.length; final int t2 = token2.length; final byte[] tmp = new byte[t1 + t2]; System.arraycopy(token1, 0, tmp, 0, t1); System.arraycopy(token2, 0, tmp, t1, t2); return tmp; } /** * Concatenates three tokens. A {@link TokenBuilder} instance can be used to * concatenate more than three tokens. * @param token1 first token * @param token2 second token * @param token3 third token * @return resulting array */ public static byte[] concat(final byte[] token1, final byte[] token2, final byte[] token3) { final int t1 = token1.length; final int t2 = token2.length; final int t3 = token3.length; final byte[] tmp = new byte[t1 + t2 + t3]; System.arraycopy(token1, 0, tmp, 0, t1); System.arraycopy(token2, 0, tmp, t1, t2); System.arraycopy(token3, 0, tmp, t1 + t2, t3); return tmp; } /** * Deletes the specified character from the token. * @param token token * @param ch character to be removed * @return resulting token */ public static byte[] delete(final byte[] token, final int ch) { final TokenBuilder tb = new TokenBuilder(token.length); final int tl = token.length; for(int i = 0; i < tl; i += cl(token, i)) { final int c = cp(token, i); if(c != ch) tb.add(c); } return tb.finish(); } /** * Normalizes all whitespace occurrences from the specified token. * @param token token * @return normalized token */ public static byte[] norm(final byte[] token) { final int l = token.length; final byte[] tmp = new byte[l]; int c = 0; boolean ws1 = true; for(int i = 0; i < l; ++i) { final boolean ws2 = ws(token[i]); if(ws2 && ws1) continue; tmp[c++] = ws2 ? (byte) ' ' : token[i]; ws1 = ws2; } if(c > 0 && ws(tmp[c - 1])) --c; return c == l ? tmp : Arrays.copyOf(tmp, c); } /** * Checks if the specified character is a whitespace. 
* @param ch the letter to be checked * @return result of comparison */ public static boolean ws(final int ch) { return ch == 0x09 || ch == 0x0A || ch == 0x0D || ch == 0x20; } /** * Checks if the specified character is a computer letter (A - Z, a - z, _). * @param ch the letter to be checked * @return result of comparison */ public static boolean letter(final int ch) { return ch >= 'A' && ch <= 'Z' || ch >= 'a' && ch <= 'z' || ch == '_'; } /** * Checks if the specified character is a digit (0 - 9). * @param ch the letter to be checked * @return result of comparison */ public static boolean digit(final int ch) { return ch >= '0' && ch <= '9'; } /** * Checks if the specified character is a computer letter or digit. * @param ch the letter to be checked * @return result of comparison */ public static boolean letterOrDigit(final int ch) { return letter(ch) || digit(ch); } /** * Returns true if the specified character is a full-text letter or digit. * @param ch character to be tested * @return result of check */ public static boolean ftChar(final int ch) { return ch >= '0' && (ch < 0x80 ? LOD[ch - '0'] : Character.isLetterOrDigit(ch)); } /** Letter-or-digit table for ASCII codes larger than '0'. */ private static final boolean[] LOD = { true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false }; /** * Converts the specified token to upper case. 
* @param token token to be converted * @return resulting token */ public static byte[] uc(final byte[] token) { if(ascii(token)) { final byte[] tok = new byte[token.length]; for(int i = 0; i < token.length; ++i) tok[i] = (byte) uc(token[i]); return tok; } return token(string(token).toUpperCase(Locale.ENGLISH)); } /** * Converts a character to upper case. * @param ch character to be converted * @return resulting character */ public static int uc(final int ch) { return ch >= 'a' && ch <= 'z' ? ch - 0x20 : ch > 0x7F ? Character.toUpperCase(ch) : ch; } /** * Converts the specified token to lower case. * @param token token to be converted * @return resulting token */ public static byte[] lc(final byte[] token) { if(ascii(token)) { final byte[] tok = new byte[token.length]; for(int i = 0; i < token.length; ++i) tok[i] = (byte) lc(token[i]); return tok; } return token(string(token).toLowerCase(Locale.ENGLISH)); } /** * Converts a character to lower case. * @param ch character to be converted * @return resulting character */ public static int lc(final int ch) { return ch >= 'A' && ch <= 'Z' ? ch | 0x20 : ch > 0x7F ? Character.toLowerCase(ch) : ch; } /** * Returns the prefix of the specified token. * @param name name * @return prefix or empty token if no prefix exists */ public static byte[] prefix(final byte[] name) { final int i = indexOf(name, ':'); return i == -1 ? EMPTY : substring(name, 0, i); } /** * Returns the local name of the specified name. * @param name name * @return local name */ public static byte[] local(final byte[] name) { final int i = indexOf(name, ':'); return i == -1 ? name : substring(name, i + 1); } /** * Returns a URI encoded token. * @param token token * @param iri input * @return encoded token */ public static byte[] uri(final byte[] token, final boolean iri) { final int tl = token.length; final TokenBuilder tb = new TokenBuilder(); for(int t = 0; t < tl; ++t) { final byte b = token[t]; if(letterOrDigit(b) || contains(iri ? 
IRIRES : RES, b)) tb.addByte(b); else hex(tb, b); } return tb.finish(); } /** * Escapes the specified token. * @param token token * @return escaped token */ public static byte[] escape(final byte[] token) { final int tl = token.length; final TokenBuilder tb = new TokenBuilder(); for(int t = 0; t < tl; ++t) { final byte b = token[t]; if(b >= 0x20 && b <= 0x7e) tb.addByte(b); else hex(tb, b); } return tb.finish(); } /** * Adds the specified byte in hex code. * @param tb token builder * @param b byte to be added */ private static void hex(final TokenBuilder tb, final byte b) { tb.add('%'); tb.addByte(HEX[(b & 0xFF) >> 4]); tb.addByte(HEX[b & 0xFF & 15]); } /** * Returns an MD5 hash. * @param string string to be hashed * @return md5 hash */ public static String md5(final String string) { try { final MessageDigest md = MessageDigest.getInstance("MD5"); return string(hex(md.digest(token(string)), false)); } catch(final Exception ex) { throw Util.notexpected(ex); } } /** * Returns a hex representation of the specified byte array. * @param val values to be mapped * @param uc upper case * @return hex representation */ public static byte[] hex(final byte[] val, final boolean uc) { final int u = uc ? 0x37 : 0x57; final byte[] data = new byte[val.length << 1]; for(int d = 0, c = 0; d < val.length; d++) { int b = val[d] >> 4 & 0x0F; data[c++] = (byte) (b + (b > 9 ? u : '0')); b = val[d] & 0x0F; data[c++] = (byte) (b + (b > 9 ? u : '0')); } return data; } /** * Returns a normalized character without diacritics. * This method supports all latin1 characters, including supplements. * @param ch character to be normalized * @return resulting character */ public static int norm(final int ch) { return ch < 0x80 || ch >= 0x200 ? ch : ch()[ch]; } /** * Initializes the array of normalized characters. 
* @return normalization array */ private static synchronized char[] ch() { if(norm == null) { // will be only initialized if needed norm = new char[0x200]; for(int n = 0; n < norm.length; ++n) norm[n] = (char) n; for(final char[] aNC : NC) norm[aNC[0]] = aNC[1]; } return norm; } /** Mapping table for character normalization. */ private static char[] norm; /** Normalized characters. */ private static final char[][] NC = { { '\u00C0', 'A' }, { '\u00C1', 'A' }, { '\u00C2', 'A' }, { '\u00C3', 'A' }, { '\u00C4', 'A' }, { '\u00C5', 'A' }, { '\u00C6', 'A' }, { '\u00C7', 'C' }, { '\u00C8', 'E' }, { '\u00C9', 'E' }, { '\u00CA', 'E' }, { '\u00CB', 'E' }, { '\u00CC', 'I' }, { '\u00CD', 'I' }, { '\u00CE', 'I' }, { '\u00CF', 'I' }, { '\u00D0', 'D' }, { '\u00D1', 'N' }, { '\u00D2', 'O' }, { '\u00D3', 'O' }, { '\u00D4', 'O' }, { '\u00D5', 'O' }, { '\u00D6', 'O' }, { '\u00D8', 'O' }, { '\u00D9', 'U' }, { '\u00DA', 'U' }, { '\u00DB', 'U' }, { '\u00DC', 'U' }, { '\u00DD', 'Y' }, { '\u00DE', 'd' }, { '\u00DF', 's' }, { '\u00E0', 'a' }, { '\u00E1', 'a' }, { '\u00E2', 'a' }, { '\u00E3', 'a' }, { '\u00E4', 'a' }, { '\u00E5', 'a' }, { '\u00E6', 'a' }, { '\u00E7', 'c' }, { '\u00E8', 'e' }, { '\u00E9', 'e' }, { '\u00EA', 'e' }, { '\u00EB', 'e' }, { '\u00EC', 'i' }, { '\u00ED', 'i' }, { '\u00EE', 'i' }, { '\u00EF', 'i' }, { '\u00F0', 'd' }, { '\u00F1', 'n' }, { '\u00F2', 'o' }, { '\u00F3', 'o' }, { '\u00F4', 'o' }, { '\u00F5', 'o' }, { '\u00F6', 'o' }, { '\u00F8', 'o' }, { '\u00F9', 'u' }, { '\u00FA', 'u' }, { '\u00FB', 'u' }, { '\u00FC', 'u' }, { '\u00FD', 'y' }, { '\u00FE', 'd' }, { '\u00FF', 'y' }, { '\u0100', 'A' }, { '\u0101', 'a' }, { '\u0102', 'A' }, { '\u0103', 'a' }, { '\u0104', 'A' }, { '\u0105', 'a' }, { '\u0106', 'C' }, { '\u0107', 'c' }, { '\u0108', 'C' }, { '\u0109', 'c' }, { '\u010A', 'C' }, { '\u010B', 'c' }, { '\u010C', 'C' }, { '\u010D', 'c' }, { '\u010E', 'D' }, { '\u010F', 'd' }, { '\u0110', 'D' }, { '\u0111', 'd' }, { '\u0112', 'E' }, { '\u0113', 'e' }, { '\u0114', 'E' 
}, { '\u0115', 'e' }, { '\u0116', 'E' }, { '\u0117', 'e' }, { '\u0118', 'E' }, { '\u0119', 'e' }, { '\u011A', 'E' }, { '\u011B', 'e' }, { '\u011C', 'G' }, { '\u011D', 'g' }, { '\u011E', 'G' }, { '\u011F', 'g' }, { '\u0120', 'G' }, { '\u0121', 'g' }, { '\u0122', 'G' }, { '\u0123', 'g' }, { '\u0124', 'H' }, { '\u0125', 'h' }, { '\u0126', 'H' }, { '\u0127', 'h' }, { '\u0128', 'I' }, { '\u0129', 'i' }, { '\u012A', 'I' }, { '\u012B', 'i' }, { '\u012C', 'I' }, { '\u012D', 'i' }, { '\u012E', 'I' }, { '\u012F', 'i' }, { '\u0130', 'I' }, { '\u0131', 'i' }, { '\u0132', 'I' }, { '\u0133', 'i' }, { '\u0134', 'J' }, { '\u0135', 'j' }, { '\u0136', 'K' }, { '\u0137', 'k' }, { '\u0138', 'k' }, { '\u0139', 'L' }, { '\u013A', 'l' }, { '\u013B', 'L' }, { '\u013C', 'l' }, { '\u013D', 'L' }, { '\u013E', 'l' }, { '\u013F', 'L' }, { '\u0140', 'l' }, { '\u0141', 'L' }, { '\u0142', 'l' }, { '\u0143', 'N' }, { '\u0144', 'n' }, { '\u0145', 'N' }, { '\u0146', 'n' }, { '\u0147', 'N' }, { '\u0148', 'n' }, { '\u0149', 'n' }, { '\u014A', 'N' }, { '\u014B', 'n' }, { '\u014C', 'O' }, { '\u014D', 'o' }, { '\u014E', 'O' }, { '\u014F', 'o' }, { '\u0150', 'O' }, { '\u0151', 'o' }, { '\u0152', 'O' }, { '\u0153', 'o' }, { '\u0154', 'R' }, { '\u0155', 'r' }, { '\u0156', 'R' }, { '\u0157', 'r' }, { '\u0158', 'R' }, { '\u0159', 'r' }, { '\u015A', 'S' }, { '\u015B', 's' }, { '\u015C', 'S' }, { '\u015D', 's' }, { '\u015E', 'S' }, { '\u015F', 's' }, { '\u0160', 'S' }, { '\u0161', 's' }, { '\u0162', 'T' }, { '\u0163', 't' }, { '\u0164', 'T' }, { '\u0165', 't' }, { '\u0166', 'T' }, { '\u0167', 't' }, { '\u0168', 'U' }, { '\u0169', 'u' }, { '\u016A', 'U' }, { '\u016B', 'u' }, { '\u016C', 'U' }, { '\u016D', 'u' }, { '\u016E', 'U' }, { '\u016F', 'u' }, { '\u0170', 'U' }, { '\u0171', 'u' }, { '\u0172', 'U' }, { '\u0173', 'u' }, { '\u0174', 'W' }, { '\u0175', 'w' }, { '\u0176', 'Y' }, { '\u0177', 'y' }, { '\u0178', 'Y' }, { '\u0179', 'Z' }, { '\u017A', 'z' }, { '\u017B', 'Z' }, { '\u017C', 'z' }, { '\u017D', 'Z' }, { 
'\u017E', 'z' }, { '\u01FA', 'A' }, { '\u01FB', 'a' }, { '\u01FC', 'A' }, { '\u01FD', 'a' }, { '\u01FE', 'O' }, { '\u01FF', 'o' } }; }