// XMLToken.java example
//
// Explorer
// basex-master
package org.basex.util;

import static org.basex.util.Token.*;

import org.basex.util.hash.*;

/**
 * This class provides convenience methods for XML-specific character and
 * name checks, and for encoding arbitrary strings to and from NCNames.
 *
 * @author BaseX Team 2005-17, BSD License
 * @author Christian Gruen
 */
public final class XMLToken {
  /**
   * Index for all HTML entities. The map starts out empty and is filled from
   * {@link #HTMLENTITIES} by the first call to {@link #getEntity(byte[])} (lazy initialization).
   */
  private static final TokenMap ENTITIESMAP = new TokenMap();
  /** The underscore: result of encoding an empty name, and prefix of escaped characters. */
  private static final byte[] UNDERSCORE = { '_' };

  /** Private constructor: the class only offers static members and is not instantiated. */
  private XMLToken() {
  }

  /**
   * Checks if the specified code point is a legal XML 1.0 character: tab, line feed,
   * carriage return, or any code point from {@code 0x20} upwards, except for the surrogate
   * block ({@code 0xD800-0xDFFF}), {@code 0xFFFE}, {@code 0xFFFF} and values beyond
   * {@code 0x10FFFF}.
   * @param ch the code point to be checked
   * @return {@code true} if the code point is allowed in XML 1.0
   */
  public static boolean valid(final int ch) {
    // below the surrogate block: everything from space upwards, plus TAB, LF and CR
    if(ch < 0xD800) return ch >= 0x20 || ch == 0x9 || ch == 0xA || ch == 0xD;
    // rest of the basic plane: starts after the surrogates, ends before 0xFFFE
    if(ch <= 0xFFFD) return ch >= 0xE000;
    // supplementary planes
    return ch >= 0x10000 && ch <= 0x10FFFF;
  }

  /**
   * Checks if the specified character is a name start character, as required
   * e.g. by QName and NCName. The accepted ranges are those of the XML 1.0
   * {@code NameStartChar} production, without the colon.
   * @param ch character (code point)
   * @return {@code true} if the character may start an NCName
   */
  public static boolean isNCStartChar(final int ch) {
    // ASCII: letters and underscore
    if(ch < 0x80) return ch >= 'A' && ch <= 'Z' || ch >= 'a' && ch <= 'z' || ch == '_';
    // 0x80-0x2FF: everything from 0xC0 upwards, except for 0xD7 and 0xF7
    if(ch < 0x300) return ch >= 0xC0 && ch != 0xD7 && ch != 0xF7;
    // remaining ranges; note that 0x300-0x36F is excluded here (allowed only as name character)
    return ch >= 0x370 && ch <= 0x37D || ch >= 0x37F && ch <= 0x1FFF ||
      ch >= 0x200C && ch <= 0x200D || ch >= 0x2070 && ch <= 0x218F ||
      // upper bound is 0x2FEF (the original 0x2EFF wrongly rejected 0x2F00-0x2FEF)
      ch >= 0x2C00 && ch <= 0x2FEF || ch >= 0x3001 && ch <= 0xD7FF ||
      ch >= 0xF900 && ch <= 0xFDCF || ch >= 0xFDF0 && ch <= 0xFFFD ||
      ch >= 0x10000 && ch <= 0xEFFFF;
  }

  /**
   * Checks if the specified character is a name character that may occur inside an NCName:
   * a name start character, a digit, {@code '-'}, {@code '.'}, {@code 0xB7},
   * a character in the range {@code 0x300-0x36F}, {@code 0x203F} or {@code 0x2040}.
   * @param ch character
   * @return result of check
   */
  public static boolean isNCChar(final int ch) {
    if(isNCStartChar(ch)) return true;
    // additional characters below 0x100
    if(ch < 0x100) return digit(ch) || ch == '-' || ch == '.' || ch == 0xB7;
    // additional characters from 0x100 upwards
    return ch >= 0x300 && ch <= 0x36F || ch == 0x203F || ch == 0x2040;
  }

  /**
   * Checks if the specified character may start an XML name: either a colon or
   * a character accepted by {@link #isNCStartChar(int)}.
   * @param ch the character to be checked
   * @return result of check
   */
  public static boolean isStartChar(final int ch) {
    return ch == ':' || isNCStartChar(ch);
  }

  /**
   * Checks if the specified character may occur inside an XML name: either a colon or
   * a character accepted by {@link #isNCChar(int)}.
   * @param ch the character to be checked
   * @return result of check
   */
  public static boolean isChar(final int ch) {
    return ch == ':' || isNCChar(ch);
  }

  /**
   * Checks if the specified token is a valid NCName, i.e., if it is non-empty and
   * consists of NCName characters up to its very end.
   * @param value value to be checked
   * @return result of check
   */
  public static boolean isNCName(final byte[] value) {
    final int len = value.length;
    if(len == 0) return false;
    // valid if the NCName scan consumes the complete token
    return ncName(value, 0) == len;
  }

  /**
   * Checks if the specified token is a valid name: it must be non-empty, its first character
   * must satisfy {@link #isStartChar(int)}, and all others {@link #isChar(int)}.
   * @param value value to be checked
   * @return result of check
   */
  public static boolean isName(final byte[] value) {
    final int len = value.length;
    if(len == 0) return false;

    int pos = 0;
    while(pos < len) {
      final int ch = cp(value, pos);
      // the first character is subject to the stricter start character rule
      final boolean ok = pos == 0 ? isStartChar(ch) : isChar(ch);
      if(!ok) return false;
      pos += cl(value, pos);
    }
    return true;
  }

  /**
   * Checks if the specified token is a valid NMToken: it must be non-empty, and each of
   * its characters must satisfy {@link #isChar(int)}.
   * @param value value to be checked
   * @return result of check
   */
  public static boolean isNMToken(final byte[] value) {
    final int len = value.length;
    if(len == 0) return false;

    int pos = 0;
    while(pos < len) {
      if(!isChar(cp(value, pos))) return false;
      pos += cl(value, pos);
    }
    return true;
  }

  /**
   * Checks if the specified token is a valid QName: either a single NCName, or two
   * non-empty NCNames (prefix and local name) separated by exactly one colon.
   * @param value value to be checked
   * @return result of check
   */
  public static boolean isQName(final byte[] value) {
    final int len = value.length;
    if(len == 0) return false;

    // scan the first NCName
    final int prefixEnd = ncName(value, 0);
    // no prefix: the complete token is an NCName
    if(prefixEnd == len) return true;
    // the first NCName must be non-empty and must be followed by a colon
    if(prefixEnd == 0 || value[prefixEnd] != ':') return false;

    // the local name must be non-empty and must extend to the end of the token
    final int localStart = prefixEnd + 1;
    return localStart < len && ncName(value, localStart) == len;
  }

  /**
   * Scans the specified token for an NCName, beginning at the given position.
   * @param value value to be checked
   * @param start start position
   * @return position of the first character that is not part of the NCName, or the length
   *   of the token if all characters from the start position are accepted
   */
  private static int ncName(final byte[] value, final int start) {
    final int len = value.length;
    int pos = start;
    while(pos < len) {
      final int ch = cp(value, pos);
      // the character at the start position is subject to the start character rule
      final boolean ok = pos == start ? isNCStartChar(ch) : isNCChar(ch);
      if(!ok) return pos;
      pos += cl(value, pos);
    }
    return len;
  }

  /**
   * Checks if the specified name is an id/idref attribute. The check is a heuristic on the
   * lower-cased local name ({@code idref}: local name must contain 'idref';
   * {@code id}: local name must contain 'id', but not 'idref').
   * The correct approach would be to gather all id/idref attributes and store them as meta data.
   * @param name name
   * @param idref id/idref flag ({@code true}: check for idref, {@code false}: check for id)
   * @return result of check
   */
  public static boolean isId(final byte[] name, final boolean idref) {
    final byte[] ln = lc(local(name));
    if(idref) return contains(ln, IDREF);
    return contains(ln, ID) && !contains(ln, IDREF);
  }

  /**
   * Encodes a string to a valid NCName.
   * <ul>
   *   <li> An empty name is encoded as a single underscore.</li>
   *   <li> Strict encoding: each underscore is doubled, and each character that is not allowed
   *     at its position is written as an underscore followed by four lower-case hex digits.
   *     Code points from {@code 0x10000} upwards are written as two such sequences
   *     (high and low surrogate).</li>
   *   <li> Lax encoding: the name is trimmed, each character that is no NCName character is
   *     replaced by an underscore, and an underscore is prepended if the first character is
   *     only valid inside a name.</li>
   * </ul>
   * The input is returned as is if it contains no underscore and no invalid character.
   * @param name token to be encoded
   * @param lax lax encoding (lossy, but better readable)
   * @return valid NCName
   */
  public static byte[] encode(final byte[] name, final boolean lax) {
    // lax encoding: trim whitespaces
    final byte[] token = lax ? trim(name) : name;
    final int len = token.length;
    if(len == 0) return UNDERSCORE;

    // find the first character that requires the name to be rewritten
    int first = 0;
    while(first < len) {
      final int ch = cp(token, first);
      if(ch == '_' || !(first == 0 ? isNCStartChar(ch) : isNCChar(ch))) break;
      first += cl(token, first);
    }
    // nothing to rewrite
    if(first >= len) return token;

    // copy the untouched prefix, then rewrite the remaining characters
    final TokenBuilder tb = new TokenBuilder(len << 1).add(token, 0, first);
    for(int pos = first; pos < len; pos += cl(token, pos)) {
      final int ch = cp(token, pos);
      if(lax) {
        final boolean nc = isNCChar(ch);
        // prefix invalid start chars (numbers, dashes, dots) with underscore
        if(pos == 0 && nc && !isNCStartChar(ch)) tb.add('_');
        tb.add(nc ? ch : '_');
      } else if(ch == '_') {
        // strict: double underscores to keep them apart from escape sequences
        tb.add('_').add('_');
      } else if(pos == 0 ? isNCStartChar(ch) : isNCChar(ch)) {
        tb.add(ch);
      } else if(ch < 0x10000) {
        addEsc(tb, ch);
      } else {
        // split code point into high and low surrogate, escape both
        final int rest = ch - 0x10000;
        addEsc(tb, (rest >>> 10) + 0xD800);
        addEsc(tb, (rest & 0x3FF) + 0xDC00);
      }
    }
    return tb.finish();
  }

  /**
   * Adds the given 16-bit char to the token builder in encoded form: an underscore,
   * followed by four lower-case hex digits (most significant digit first).
   * @param tb token builder
   * @param cp char
   */
  private static void addEsc(final TokenBuilder tb, final int cp) {
    // the leading digit is not masked; callers in this class pass values below 0x10000
    final int[] digits = { cp >>> 12, cp >>> 8 & 0x0F, cp >>> 4 & 0x0F, cp & 0x0F };
    tb.addByte(UNDERSCORE[0]);
    for(final int dg : digits) {
      // 87 equals 'a' - 10: values 10-15 are mapped to 'a'-'f'
      tb.addByte((byte) (dg + (dg > 9 ? 87 : '0')));
    }
  }

  /**
   * Decodes an NCName to a string. This reverts the strict encoding of
   * {@link #encode(byte[], boolean)}: two underscores yield a single underscore, and an
   * underscore followed by four hex digits yields the character with that code.
   * With lax decoding, the name is returned unchanged.
   * @param name name
   * @param lax lax decoding
   * @return decoded token, or {@code null} if not successful (invalid hex digit, or
   *   incomplete escape sequence at the end of a name that yielded output)
   */
  public static byte[] decode(final byte[] name, final boolean lax) {
    // lax: no decoding takes place
    if(lax) return name;

    // convert name back to original representation
    final TokenBuilder tb = new TokenBuilder();
    // code of the escape sequence that is currently assembled
    int uc = 0;

    // mode: 0=normal, 1=one underscore read, 2=two underscores read,
    // 3-6=reading hex digits of an escape sequence (mode - 3 digits read so far)
    int mode = 0;
    final int nl = name.length;
    // the position is advanced at the end of the loop body; branches that end with 'continue'
    // skip this step, causing the current character to be examined again in the new mode
    for(int n = 0; n < nl;) {
      final int cp = cp(name, n);
      if(mode >= 3) {
        // append next hex digit to the code
        uc <<= 4;
        if(cp >= '0' && cp <= '9') {
          uc += cp - '0';
        } else if(cp >= 'A' && cp <= 'F') {
          // 0x37 equals 'A' - 10
          uc += cp - 0x37;
        } else if(cp >= 'a' && cp <= 'f') {
          // 0x57 equals 'a' - 10
          uc += cp - 0x57;
        } else {
          // no hex digit: invalid escape sequence
          return null;
        }
        // fourth digit read: add character, return to normal mode
        // NOTE(review): encode() writes code points >= 0x10000 as two escapes (surrogate
        // halves); each half is added separately here. Confirm whether they are recombined.
        if(++mode == 7) {
          tb.add(uc);
          mode = 0;
          uc = 0;
        }
      } else if(cp == '_') {
        // limit underscore counter: on the third underscore in a row, the first two
        // are emitted as one underscore, and the third one is examined again in normal mode
        if(++mode == 3) {
          tb.add('_');
          mode = 0;
          continue;
        }
      } else if(mode == 1) {
        // unicode: single underscore followed by another character,
        // which is examined again as first hex digit
        mode = 3;
        continue;
      } else if(mode == 2) {
        // underscore: two underscores followed by another character,
        // which is examined again in normal mode
        tb.add('_');
        mode = 0;
        continue;
      } else {
        // normal character
        tb.add(cp);
        mode = 0;
      }
      n += cl(name, n);
    }

    if(mode == 2) {
      // name ends with two underscores
      tb.add('_');
    } else if(mode > 0 && !tb.isEmpty()) {
      // name ends with an incomplete escape sequence
      return null;
    }
    // NOTE(review): an incomplete escape without any output (e.g. a single underscore, which
    // encode() returns for an empty name) is not rejected; the builder result is returned
    return tb.finish();
  }

  /**
   * HTML entities, stored as a flat list of pairs: each entity name (even index) is
   * directly followed by its replacement string (odd index).
   * The pairs are copied to {@link #ENTITIESMAP} by {@link #getEntity(byte[])}.
   */
  private static final String[] HTMLENTITIES = { "Aacute", "\u00c1", "aacute",
    "\u00e1", "Acirc", "\u00c2", "acirc", "\u00e2", "acute", "\u00b4",
    "AElig", "\u00c6", "aelig", "\u00e6", "Agrave", "\u00c0", "agrave",
    "\u00e0", "alefsym", "\u2135", "Alpha", "\u0391", "alpha", "\u03b1",
    "and", "\u2227", "ang", "\u2220", "Aring", "\u00c5", "aring", "\u00e5",
    "asymp", "\u2248", "Atilde", "\u00c3", "atilde", "\u00e3", "Auml",
    "\u00c4", "auml", "\u00e4", "bdquo", "\u201e", "Beta", "\u0392", "beta",
    "\u03b2", "brvbar", "\u00a6", "bull", "\u2022", "cap", "\u2229",
    "Ccedil", "\u00c7", "ccedil", "\u00e7", "cedil", "\u00b8", "cent",
    "\u00a2", "Chi", "\u03a7", "chi", "\u03c7", "circ", "\u02c6", "clubs",
    "\u2663", "cong", "\u2245", "copy", "\u00a9", "crarr", "\u21b5", "cup",
    "\u222a", "curren", "\u00a4", "dagger", "\u2020", "Dagger", "\u2021",
    "darr", "\u2193", "dArr", "\u21d3", "deg", "\u00b0", "Delta", "\u0394",
    "delta", "\u03b4", "diams", "\u2666", "divide", "\u00f7", "Eacute",
    "\u00c9", "eacute", "\u00e9", "Ecirc", "\u00ca", "ecirc", "\u00ea",
    "Egrave", "\u00c8", "egrave", "\u00e8", "empty", "\u2205", "emsp",
    "\u2003", "ensp", "\u2002", "Epsilon", "\u0395", "epsilon", "\u03b5",
    "equiv", "\u2261", "Eta", "\u0397", "eta", "\u03b7", "ETH", "\u00d0",
    "eth", "\u00f0", "Euml", "\u00cb", "euml", "\u00eb", "euro", "\u20ac",
    "exist", "\u2203", "fnof", "\u0192", "forall", "\u2200", "frac12",
    "\u00bd", "frac14", "\u00bc", "frac34", "\u00be", "frasl", "\u2044",
    "Gamma", "\u0393", "gamma", "\u03b3", "ge", "\u2265", "harr", "\u2194",
    "hArr", "\u21d4", "hearts", "\u2665", "hellip", "\u2026", "Iacute",
    "\u00cd", "iacute", "\u00ed", "Icirc", "\u00ce", "icirc", "\u00ee",
    "iexcl", "\u00a1", "Igrave", "\u00cc", "igrave", "\u00ec", "image",
    "\u2111", "infin", "\u221e", "int", "\u222b", "Iota", "\u0399", "iota",
    "\u03b9", "iquest", "\u00bf", "isin", "\u2208", "Iuml", "\u00cf", "iuml",
    "\u00ef", "Kappa", "\u039a", "kappa", "\u03ba", "Lambda", "\u039b",
    "lambda", "\u03bb", "lang", "\u2329", "laquo", "\u00ab", "larr",
    "\u2190", "lArr", "\u21d0", "lceil", "\u2308", "ldquo", "\u201c", "le",
    "\u2264", "lfloor", "\u230a", "lowast", "\u2217", "loz", "\u25ca", "lrm",
    "\u200e", "lsaquo", "\u2039", "lsquo", "\u2018", "macr", "\u00af",
    "mdash", "\u2014", "micro", "\u00b5", "middot", "\u00b7", "minus",
    "\u2212", "Mu", "\u039c", "mu", "\u03bc", "nabla", "\u2207", "nbsp",
    "\u00a0", "ndash", "\u2013", "ne", "\u2260", "ni", "\u220b", "not",
    "\u00ac", "notin", "\u2209", "nsub", "\u2284", "Ntilde", "\u00d1",
    "ntilde", "\u00f1", "Nu", "\u039d", "nu", "\u03bd", "Oacute", "\u00d3",
    "oacute", "\u00f3", "Ocirc", "\u00d4", "ocirc", "\u00f4", "OElig",
    "\u0152", "oelig", "\u0153", "Ograve", "\u00d2", "ograve", "\u00f2",
    "oline", "\u203e", "Omega", "\u03a9", "omega", "\u03c9", "Omicron",
    "\u039f", "omicron", "\u03bf", "oplus", "\u2295", "or", "\u2228", "ordf",
    "\u00aa", "ordm", "\u00ba", "Oslash", "\u00d8", "oslash", "\u00f8",
    "Otilde", "\u00d5", "otilde", "\u00f5", "otimes", "\u2297", "Ouml",
    "\u00d6", "ouml", "\u00f6", "para", "\u00b6", "part", "\u2202", "permil",
    "\u2030", "perp", "\u22a5", "Phi", "\u03a6", "phi", "\u03c6", "Pi",
    "\u03a0", "pi", "\u03c0", "piv", "\u03d6", "plusmn", "\u00b1", "pound",
    "\u00a3", "prime", "\u2032", "Prime", "\u2033", "prod", "\u220f", "prop",
    "\u221d", "Psi", "\u03a8", "psi", "\u03c8", "radic", "\u221a", "rang",
    "\u232a", "raquo", "\u00bb", "rarr", "\u2192", "rArr", "\u21d2", "rceil",
    "\u2309", "rdquo", "\u201d", "real", "\u211c", "reg", "\u00ae", "rfloor",
    "\u230b", "Rho", "\u03a1", "rho", "\u03c1", "rlm", "\u200f", "rsaquo",
    "\u203a", "rsquo", "\u2019", "sbquo", "\u201a", "Scaron", "\u0160",
    "scaron", "\u0161", "sdot", "\u22c5", "sect", "\u00a7", "shy", "\u00ad",
    "Sigma", "\u03a3", "sigma", "\u03c3", "sigmaf", "\u03c2", "sim",
    "\u223c", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286", "sum",
    "\u2211", "sup", "\u2283", "sup1", "\u00b9", "sup2", "\u00b2", "sup3",
    "\u00b3", "supe", "\u2287", "szlig", "\u00df", "Tau", "\u03a4", "tau",
    "\u03c4", "there4", "\u2234", "Theta", "\u0398", "theta", "\u03b8",
    "thetasym", "\u03d1", "thinsp", "\u2009", "THORN", "\u00de", "thorn",
    "\u00fe", "tilde", "\u02dc", "times", "\u00d7", "trade", "\u2122",
    "Uacute", "\u00da", "uacute", "\u00fa", "uarr", "\u2191", "uArr",
    "\u21d1", "Ucirc", "\u00db", "ucirc", "\u00fb", "Ugrave", "\u00d9",
    "ugrave", "\u00f9", "uml", "\u00a8", "upsih", "\u03d2", "Upsilon",
    "\u03a5", "upsilon", "\u03c5", "Uuml", "\u00dc", "uuml", "\u00fc",
    "weierp", "\u2118", "Xi", "\u039e", "xi", "\u03be", "Yacute", "\u00dd",
    "yacute", "\u00fd", "yen", "\u00a5", "yuml", "\u00ff", "Yuml", "\u0178",
    "Zeta", "\u0396", "zeta", "\u03b6", "zwj", "\u200d", "zwnj", "\u200c" };

  /**
   * Returns the unicode for the specified entity or {@code null}.
   * The entity index is filled on first use, i.e., when it is still empty.
   * NOTE(review): the initial fill is not synchronized; confirm that concurrent first
   * calls cannot occur or are harmless.
   * @param key key (entity name)
   * @return unicode
   */
  public static byte[] getEntity(final byte[] key) {
    if(ENTITIESMAP.isEmpty()) {
      // register all name/replacement pairs of the entity table
      int e = 0;
      while(e < HTMLENTITIES.length) {
        final String name = HTMLENTITIES[e++];
        final String unicode = HTMLENTITIES[e++];
        ENTITIESMAP.put(name, unicode);
      }
    }
    return ENTITIESMAP.get(key);
  }
}