package org.basex.util;
import static org.basex.util.Token.*;
import org.basex.util.hash.*;
/**
* This class provides convenience methods for XML-specific character and name
* checks, for encoding arbitrary strings as NCNames, and for resolving HTML entities.
*
* @author BaseX Team 2005-17, BSD License
* @author Christian Gruen
*/
public final class XMLToken {
/**
 * Index for all HTML entities (lazy initialization): the map starts out empty and is
 * filled from {@link #HTMLENTITIES} the first time {@link #getEntity(byte[])} is called.
 */
private static final TokenMap ENTITIESMAP = new TokenMap();
/**
 * The underscore: returned by {@link #encode(byte[], boolean)} for empty input; its single
 * byte is also written as the escape prefix for encoded characters.
 */
private static final byte[] UNDERSCORE = { '_' };
/** Hidden constructor: this class only offers static members and is not meant to be instantiated. */
private XMLToken() { }
/**
 * Checks if the specified code point is a legal XML 1.0 character, i.e., tab, line feed,
 * carriage return, or any code point in the ranges U+0020-U+D7FF, U+E000-U+FFFD and
 * U+10000-U+10FFFF.
 * @param ch the code point to be checked
 * @return {@code true} if the code point is allowed in XML 1.0 documents
 */
public static boolean valid(final int ch) {
  // below the surrogate block: everything from the space upwards, plus tab, LF and CR
  if(ch < 0xD800) return ch >= 0x20 || ch == 0x9 || ch == 0xA || ch == 0xD;
  // above the surrogate block: rest of the BMP up to U+FFFD, and the supplementary planes
  return ch >= 0xE000 && ch <= 0xFFFD || ch >= 0x10000 && ch <= 0x10FFFF;
}
/**
 * Checks if the specified character is a name start character, as required
 * e.g. by QName and NCName. The accepted ranges follow the {@code NameStartChar}
 * production of the XML specification, without the colon.
 * @param ch character
 * @return result of check
 */
public static boolean isNCStartChar(final int ch) {
  // ASCII: letters and underscore
  if(ch < 0x80) return ch >= 'A' && ch <= 'Z' || ch >= 'a' && ch <= 'z' || ch == '_';
  // U+00C0-U+02FF, without the multiplication sign (U+00D7) and the division sign (U+00F7)
  if(ch < 0x300) return ch >= 0xC0 && ch != 0xD7 && ch != 0xF7;
  // remaining ranges; U+0300-U+036F (combining marks) and U+037E are excluded.
  // the fifth range ends at U+2FEF (the upper bound was previously mistyped as 0x2EFF)
  return ch >= 0x370 && ch <= 0x37D || ch >= 0x37F && ch <= 0x1FFF ||
    ch >= 0x200C && ch <= 0x200D || ch >= 0x2070 && ch <= 0x218F ||
    ch >= 0x2C00 && ch <= 0x2FEF || ch >= 0x3001 && ch <= 0xD7FF ||
    ch >= 0xF900 && ch <= 0xFDCF || ch >= 0xFDF0 && ch <= 0xFFFD ||
    ch >= 0x10000 && ch <= 0xEFFFF;
}
/**
 * Checks if the specified character may occur in an NCName: every name start character
 * is accepted, and additionally characters passing the {@code digit} check, {@code '-'},
 * {@code '.'}, U+00B7, U+0300-U+036F, U+203F and U+2040.
 * @param ch character
 * @return result of check
 */
public static boolean isNCChar(final int ch) {
  if(isNCStartChar(ch)) return true;
  // additional name characters below U+0100
  if(ch < 0x100) return digit(ch) || ch == '-' || ch == '.' || ch == 0xB7;
  // additional name characters from U+0100 upwards
  return ch >= 0x300 && ch <= 0x36F || ch == 0x203F || ch == 0x2040;
}
/**
 * Checks if the specified character may start an XML name: in contrast to
 * {@link #isNCStartChar(int)}, the colon is accepted as well.
 * @param ch the character to be checked
 * @return result of check
 */
public static boolean isStartChar(final int ch) {
  return ch == ':' || isNCStartChar(ch);
}
/**
 * Checks if the specified character may occur in an XML name: in contrast to
 * {@link #isNCChar(int)}, the colon is accepted as well.
 * @param ch the character to be checked
 * @return result of check
 */
public static boolean isChar(final int ch) {
  return ch == ':' || isNCChar(ch);
}
/**
 * Checks if the specified token is a valid NCName: it must not be empty, and the
 * NCName scan must consume the complete token.
 * @param value value to be checked
 * @return result of check
 */
public static boolean isNCName(final byte[] value) {
  if(value.length == 0) return false;
  return ncName(value, 0) == value.length;
}
/**
 * Checks if the specified token is a valid name: it must not be empty, its first
 * character must be a start character, and all others must be name characters
 * (colons are allowed everywhere).
 * @param value value to be checked
 * @return result of check
 */
public static boolean isName(final byte[] value) {
  final int vl = value.length;
  if(vl == 0) return false;
  // first character
  if(!isStartChar(cp(value, 0))) return false;
  // remaining characters
  for(int p = cl(value, 0); p < vl; p += cl(value, p)) {
    if(!isChar(cp(value, p))) return false;
  }
  return true;
}
/**
 * Checks if the specified token is a valid NMToken: it must not be empty, and
 * every character must be a name character (no special rule for the first one).
 * @param value value to be checked
 * @return result of check
 */
public static boolean isNMToken(final byte[] value) {
  final int vl = value.length;
  int p = 0;
  while(p < vl) {
    if(!isChar(cp(value, p))) return false;
    p += cl(value, p);
  }
  return vl != 0;
}
/**
 * Checks if the specified token is a valid QName: either a single NCName, or
 * two non-empty NCNames separated by a colon.
 * @param value value to be checked
 * @return result of check
 */
public static boolean isQName(final byte[] value) {
  final int vl = value.length;
  if(vl == 0) return false;

  // name without prefix: first NCName covers the whole token
  final int prefixEnd = ncName(value, 0);
  if(prefixEnd == vl) return true;
  // prefix must not be empty, and it must be followed by a colon
  if(prefixEnd == 0 || value[prefixEnd] != ':') return false;

  // local name must not be empty, and it must reach the end of the token
  final int localStart = prefixEnd + 1;
  final int localEnd = ncName(value, localStart);
  return localEnd == vl && localEnd != localStart;
}
/**
 * Scans the specified token for an NCName, beginning at the given position.
 * @param value value to be checked
 * @param start start position
 * @return position of the first character that does not belong to the NCName
 *   ({@code start} if no NCName was found), or the token length if the end was reached
 */
private static int ncName(final byte[] value, final int start) {
  final int vl = value.length;
  int p = start;
  while(p < vl) {
    final int ch = cp(value, p);
    // the first character is subject to the stricter start character rule
    final boolean ok = p == start ? isNCStartChar(ch) : isNCChar(ch);
    if(!ok) return p;
    p += cl(value, p);
  }
  return vl;
}
/**
 * Checks if the specified name is an id/idref attribute. This is a heuristic that is based on
 * the local name, after it has been passed through {@code lc}: for {@code idref}, it must
 * contain 'idref'; for {@code id}, it must contain 'id', but not 'idref'.
 * The correct approach would be to gather all id/idref attributes and store them as meta data.
 * @param name name
 * @param idref id/idref flag ({@code true}: check for idref, {@code false}: check for id)
 * @return result of check
 */
public static boolean isId(final byte[] name, final boolean idref) {
  final byte[] ln = lc(local(name));
  if(idref) return contains(ln, IDREF);
  return contains(ln, ID) && !contains(ln, IDREF);
}
/**
 * Encodes a string to a valid NCName. An empty string is represented by a single underscore.
 * <ul>
 *   <li> Strict encoding: underscores are doubled, and characters that are not allowed at
 *     their position are written as an underscore and four hex digits (characters beyond
 *     U+FFFF as two such escapes, one per UTF-16 surrogate).</li>
 *   <li> Lax encoding: the string is trimmed, invalid characters are replaced by
 *     underscores, and a leading name character that is no valid start character is
 *     prefixed with an underscore.</li>
 * </ul>
 * @param name token to be encoded
 * @param lax lax encoding (lossy, but better readable)
 * @return valid NCName; the (trimmed) input itself if nothing needed to be encoded
 */
public static byte[] encode(final byte[] name, final boolean lax) {
  // lax encoding: trim whitespaces
  final byte[] token = lax ? trim(name) : name;
  final int tl = token.length;
  if(tl == 0) return UNDERSCORE;

  // find the first character that cannot be adopted unchanged (underscore or invalid char)
  int first = 0;
  while(first < tl) {
    final int ch = cp(token, first);
    if(ch == '_' || !(first == 0 ? isNCStartChar(ch) : isNCChar(ch))) break;
    first += cl(token, first);
  }
  // nothing to encode: return token as is
  if(first >= tl) return token;

  // copy the unchanged prefix, encode the remaining characters
  final TokenBuilder tb = new TokenBuilder(tl << 1).add(token, 0, first);
  for(int p = first; p < tl; p += cl(token, p)) {
    final int ch = cp(token, p);
    if(lax) {
      final boolean nameChar = isNCChar(ch);
      // prefix invalid start chars (numbers, dashes, dots) with underscore
      if(p == 0 && nameChar && !isNCStartChar(ch)) tb.add('_');
      tb.add(nameChar ? ch : '_');
    } else if(ch == '_') {
      // underscores are doubled to distinguish them from escape prefixes
      tb.add('_').add('_');
    } else if(p == 0 ? isNCStartChar(ch) : isNCChar(ch)) {
      tb.add(ch);
    } else if(ch < 0x10000) {
      addEsc(tb, ch);
    } else {
      // supplementary character: escape high and low surrogate separately
      final int off = ch - 0x10000;
      addEsc(tb, (off >>> 10) + 0xD800);
      addEsc(tb, (off & 0x3FF) + 0xDC00);
    }
  }
  return tb.finish();
}
/**
 * Adds the given 16-bit char to the token builder in encoded form: an underscore,
 * followed by four lower-case hex digits (most significant digit first).
 * @param tb token builder
 * @param cp char
 */
private static void addEsc(final TokenBuilder tb, final int cp) {
  tb.addByte(UNDERSCORE[0]);
  // the leading digit is taken as is; the others are masked to four bits
  final int[] nibbles = { cp >>> 12, cp >>> 8 & 0x0F, cp >>> 4 & 0x0F, cp & 0x0F };
  for(final int nb : nibbles) {
    // 87 equals 'a' - 10: values from 10 to 15 are mapped to 'a'-'f'
    tb.addByte((byte) (nb + (nb > 9 ? 87 : '0')));
  }
}
/**
 * Decodes an NCName that was created by {@link #encode(byte[], boolean)} to a string.
 * In the strict encoding, two underscores represent a single underscore, and an underscore
 * followed by four hex digits represents a UTF-16 code unit. A single underscore represents
 * the empty string. Two consecutive escapes that form a surrogate pair are recombined to a
 * single character.
 * @param name name
 * @param lax lax decoding (the name is returned unchanged, as lax encoding is lossy)
 * @return decoded name, or {@code null} if the name contains an invalid or incomplete escape
 */
public static byte[] decode(final byte[] name, final boolean lax) {
  if(lax) return name;

  // convert name back to original representation
  final TokenBuilder tb = new TokenBuilder();
  int uc = 0;
  // mode: 0=normal, 1=one underscore read, 2=two underscores read,
  // 3-6=reading hex digits of an escape (mode - 3 digits have been read so far)
  int mode = 0;
  final int nl = name.length;
  for(int n = 0; n < nl;) {
    final int cp = cp(name, n);
    if(mode >= 3) {
      final int val = escDigit(cp);
      if(val == -1) return null;
      uc = uc << 4 | val;
      if(++mode == 7) {
        // characters beyond U+FFFF are encoded as two escapes (high and low surrogate):
        // merge them again if a high surrogate is directly followed by a low surrogate
        if(uc >= 0xD800 && uc <= 0xDBFF) {
          final int low = escLowSurrogate(name, n + 1);
          if(low != -1) {
            uc = 0x10000 + ((uc - 0xD800) << 10) + (low - 0xDC00);
            // skip underscore and first three digits; the last digit is skipped below
            n += 5;
          }
        }
        tb.add(uc);
        mode = 0;
        uc = 0;
      }
    } else if(cp == '_') {
      // third underscore: first two yield an underscore, third one is processed again
      if(++mode == 3) {
        tb.add('_');
        mode = 0;
        continue;
      }
    } else if(mode == 1) {
      // single underscore followed by other character: start of escape; process again
      mode = 3;
      continue;
    } else if(mode == 2) {
      // two underscores followed by other character: underscore; process again
      tb.add('_');
      mode = 0;
      continue;
    } else {
      // normal character
      tb.add(cp);
      mode = 0;
    }
    n += cl(name, n);
  }

  if(mode == 2) {
    // pending pair of underscores
    tb.add('_');
  } else if(mode != 0 && (mode != 1 || !tb.isEmpty())) {
    // dangling underscore or truncated escape. only a name that solely consists of
    // a single underscore is accepted: it represents the empty string
    return null;
  }
  return tb.finish();
}

/**
 * Returns the value of a hex digit of an escape sequence.
 * @param cp character ({@code 0-9}, {@code A-F} or {@code a-f})
 * @return value between 0 and 15, or {@code -1} if the character is no hex digit
 */
private static int escDigit(final int cp) {
  if(cp >= '0' && cp <= '9') return cp - '0';
  if(cp >= 'A' && cp <= 'F') return cp - 0x37;
  if(cp >= 'a' && cp <= 'f') return cp - 0x57;
  return -1;
}

/**
 * Checks if an escape sequence for a low surrogate starts at the specified position.
 * @param name name
 * @param pos position of the expected underscore
 * @return low surrogate (U+DC00-U+DFFF), or {@code -1} if none was found
 */
private static int escLowSurrogate(final byte[] name, final int pos) {
  if(pos + 5 > name.length || name[pos] != '_') return -1;
  int unit = 0;
  for(int p = pos + 1; p <= pos + 4; p++) {
    final int val = escDigit(name[p]);
    if(val == -1) return -1;
    unit = unit << 4 | val;
  }
  return unit >= 0xDC00 && unit <= 0xDFFF ? unit : -1;
}
/**
 * HTML entities: flat list of pairs. Each entity name (even index) is directly followed
 * by its replacement string (odd index). The pairs are loaded into {@link #ENTITIESMAP}
 * by {@link #getEntity(byte[])}.
 */
private static final String[] HTMLENTITIES = { "Aacute", "\u00c1", "aacute",
"\u00e1", "Acirc", "\u00c2", "acirc", "\u00e2", "acute", "\u00b4",
"AElig", "\u00c6", "aelig", "\u00e6", "Agrave", "\u00c0", "agrave",
"\u00e0", "alefsym", "\u2135", "Alpha", "\u0391", "alpha", "\u03b1",
"and", "\u2227", "ang", "\u2220", "Aring", "\u00c5", "aring", "\u00e5",
"asymp", "\u2248", "Atilde", "\u00c3", "atilde", "\u00e3", "Auml",
"\u00c4", "auml", "\u00e4", "bdquo", "\u201e", "Beta", "\u0392", "beta",
"\u03b2", "brvbar", "\u00a6", "bull", "\u2022", "cap", "\u2229",
"Ccedil", "\u00c7", "ccedil", "\u00e7", "cedil", "\u00b8", "cent",
"\u00a2", "Chi", "\u03a7", "chi", "\u03c7", "circ", "\u02c6", "clubs",
"\u2663", "cong", "\u2245", "copy", "\u00a9", "crarr", "\u21b5", "cup",
"\u222a", "curren", "\u00a4", "dagger", "\u2020", "Dagger", "\u2021",
"darr", "\u2193", "dArr", "\u21d3", "deg", "\u00b0", "Delta", "\u0394",
"delta", "\u03b4", "diams", "\u2666", "divide", "\u00f7", "Eacute",
"\u00c9", "eacute", "\u00e9", "Ecirc", "\u00ca", "ecirc", "\u00ea",
"Egrave", "\u00c8", "egrave", "\u00e8", "empty", "\u2205", "emsp",
"\u2003", "ensp", "\u2002", "Epsilon", "\u0395", "epsilon", "\u03b5",
"equiv", "\u2261", "Eta", "\u0397", "eta", "\u03b7", "ETH", "\u00d0",
"eth", "\u00f0", "Euml", "\u00cb", "euml", "\u00eb", "euro", "\u20ac",
"exist", "\u2203", "fnof", "\u0192", "forall", "\u2200", "frac12",
"\u00bd", "frac14", "\u00bc", "frac34", "\u00be", "frasl", "\u2044",
"Gamma", "\u0393", "gamma", "\u03b3", "ge", "\u2265", "harr", "\u2194",
"hArr", "\u21d4", "hearts", "\u2665", "hellip", "\u2026", "Iacute",
"\u00cd", "iacute", "\u00ed", "Icirc", "\u00ce", "icirc", "\u00ee",
"iexcl", "\u00a1", "Igrave", "\u00cc", "igrave", "\u00ec", "image",
"\u2111", "infin", "\u221e", "int", "\u222b", "Iota", "\u0399", "iota",
"\u03b9", "iquest", "\u00bf", "isin", "\u2208", "Iuml", "\u00cf", "iuml",
"\u00ef", "Kappa", "\u039a", "kappa", "\u03ba", "Lambda", "\u039b",
"lambda", "\u03bb", "lang", "\u2329", "laquo", "\u00ab", "larr",
"\u2190", "lArr", "\u21d0", "lceil", "\u2308", "ldquo", "\u201c", "le",
"\u2264", "lfloor", "\u230a", "lowast", "\u2217", "loz", "\u25ca", "lrm",
"\u200e", "lsaquo", "\u2039", "lsquo", "\u2018", "macr", "\u00af",
"mdash", "\u2014", "micro", "\u00b5", "middot", "\u00b7", "minus",
"\u2212", "Mu", "\u039c", "mu", "\u03bc", "nabla", "\u2207", "nbsp",
"\u00a0", "ndash", "\u2013", "ne", "\u2260", "ni", "\u220b", "not",
"\u00ac", "notin", "\u2209", "nsub", "\u2284", "Ntilde", "\u00d1",
"ntilde", "\u00f1", "Nu", "\u039d", "nu", "\u03bd", "Oacute", "\u00d3",
"oacute", "\u00f3", "Ocirc", "\u00d4", "ocirc", "\u00f4", "OElig",
"\u0152", "oelig", "\u0153", "Ograve", "\u00d2", "ograve", "\u00f2",
"oline", "\u203e", "Omega", "\u03a9", "omega", "\u03c9", "Omicron",
"\u039f", "omicron", "\u03bf", "oplus", "\u2295", "or", "\u2228", "ordf",
"\u00aa", "ordm", "\u00ba", "Oslash", "\u00d8", "oslash", "\u00f8",
"Otilde", "\u00d5", "otilde", "\u00f5", "otimes", "\u2297", "Ouml",
"\u00d6", "ouml", "\u00f6", "para", "\u00b6", "part", "\u2202", "permil",
"\u2030", "perp", "\u22a5", "Phi", "\u03a6", "phi", "\u03c6", "Pi",
"\u03a0", "pi", "\u03c0", "piv", "\u03d6", "plusmn", "\u00b1", "pound",
"\u00a3", "prime", "\u2032", "Prime", "\u2033", "prod", "\u220f", "prop",
"\u221d", "Psi", "\u03a8", "psi", "\u03c8", "radic", "\u221a", "rang",
"\u232a", "raquo", "\u00bb", "rarr", "\u2192", "rArr", "\u21d2", "rceil",
"\u2309", "rdquo", "\u201d", "real", "\u211c", "reg", "\u00ae", "rfloor",
"\u230b", "Rho", "\u03a1", "rho", "\u03c1", "rlm", "\u200f", "rsaquo",
"\u203a", "rsquo", "\u2019", "sbquo", "\u201a", "Scaron", "\u0160",
"scaron", "\u0161", "sdot", "\u22c5", "sect", "\u00a7", "shy", "\u00ad",
"Sigma", "\u03a3", "sigma", "\u03c3", "sigmaf", "\u03c2", "sim",
"\u223c", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286", "sum",
"\u2211", "sup", "\u2283", "sup1", "\u00b9", "sup2", "\u00b2", "sup3",
"\u00b3", "supe", "\u2287", "szlig", "\u00df", "Tau", "\u03a4", "tau",
"\u03c4", "there4", "\u2234", "Theta", "\u0398", "theta", "\u03b8",
"thetasym", "\u03d1", "thinsp", "\u2009", "THORN", "\u00de", "thorn",
"\u00fe", "tilde", "\u02dc", "times", "\u00d7", "trade", "\u2122",
"Uacute", "\u00da", "uacute", "\u00fa", "uarr", "\u2191", "uArr",
"\u21d1", "Ucirc", "\u00db", "ucirc", "\u00fb", "Ugrave", "\u00d9",
"ugrave", "\u00f9", "uml", "\u00a8", "upsih", "\u03d2", "Upsilon",
"\u03a5", "upsilon", "\u03c5", "Uuml", "\u00dc", "uuml", "\u00fc",
"weierp", "\u2118", "Xi", "\u039e", "xi", "\u03be", "Yacute", "\u00dd",
"yacute", "\u00fd", "yen", "\u00a5", "yuml", "\u00ff", "Yuml", "\u0178",
"Zeta", "\u0396", "zeta", "\u03b6", "zwj", "\u200d", "zwnj", "\u200c" };
/**
 * Returns the replacement for the specified HTML entity name or {@code null}.
 * The entity index is populated from the entity table on the first call.
 * NOTE(review): the index is populated without synchronization; confirm that
 * concurrent first calls cannot occur.
 * @param key entity name (without ampersand and semicolon)
 * @return unicode
 */
public static byte[] getEntity(final byte[] key) {
  if(ENTITIESMAP.isEmpty()) {
    // the table alternates between entity names and replacement strings
    int i = 0;
    while(i < HTMLENTITIES.length) {
      ENTITIESMAP.put(HTMLENTITIES[i], HTMLENTITIES[i + 1]);
      i += 2;
    }
  }
  return ENTITIESMAP.get(key);
}
}