package client.net.sf.saxon.ce.expr.number; /** * This class contains static utility methods to test whether a character is alphanumeric, as defined * by the rules of xsl:number: that is, whether it is in one of the Unicode categories * Nd, Nl, No, Lu, Ll, Lt, Lm or Lo */ public class Alphanumeric { private static int[] zeroDigits = { 0x0030, 0x0660, 0x06f0, 0x0966, 0x09e6, 0x0a66, 0x0ae6, 0x0b66, 0x0be6, 0x0c66, 0x0ce6, 0x0d66, 0x0e50, 0x0ed0, 0x0f20, 0x1040, 0x17e0, 0x1810, 0x1946, 0x19d0, 0xff10, 0x104a0, 0x107ce, 0x107d8, 0x107e2, 0x107ec, 0x107f6 }; // These data sets were generated from the Unicode 4.0 database using a custom stylesheet. // (copied below; source in MyJava/Unicode-db4/listAlphanumeric.xsl) // Note that the characters in the CJK Extended Ideograph ranges A and B, 3400-4DB5 and // 20000-2A6D6 as well as 4E00-9FBB and AC00-D7A3 are not listed individually in the database, // and therefore need to be handled specially. /** * Determine whether a Unicode codepoint is alphanumeric, that is, whether it is in one of the * categories Nd, Nl, No, Lu, Ll, Lt, Lm or Lo * @param c the codepoint to be tested * @return true if the codepoint is in one of these categories */ public static boolean isAlphanumeric(int c) { if (c <= 0x7F) { // Fast path for ASCII characters return (c >= 0x30 && c <= 0x39) || (c >= 0x41 && c <= 0x5A) || (c >= 0x61 && c <= 0x7A); } else if (c <= 0xffff) { return Character.isLetterOrDigit((char)c); } else { for (int i=0; i<startAstralAlphaNumeric.length; i++) { if (c <= endAstralAlphaNumeric[i]) { return (c >= startAstralAlphaNumeric[i]); } } return false; } } private static int[] startAstralAlphaNumeric = { 0x10000, 0x1000D, 0x10028, 0x1003C, 0x1003F, 0x10050, 0x10080, 0x10107, 0x10140, 0x1018A, 0x10300, 0x10320, 0x10330, 0x10380, 0x103A0, 0x103C8, 0x103D1, 0x10400, 0x104A0, 0x10800, 0x10808, 0x1080A, 0x10837, 0x1083C, 0x1083F, 0x10A00, 0x10A10, 0x10A15, 0x10A19, 0x10A40, 0x1D400, 0x1D456, 0x1D49E, 0x1D4A2, 0x1D4A5, 0x1D4A9, 0x1D4AE, 0x1D4BB, 0x1D4BD, 0x1D4C5, 0x1D507, 0x1D50D, 0x1D516, 0x1D51E, 0x1D53B, 0x1D540, 0x1D546, 0x1D54A, 0x1D552, 0x1D6A8, 0x1D6C2, 0x1D6DC, 0x1D6FC, 0x1D716, 0x1D736, 0x1D750, 0x1D770, 0x1D78A, 0x1D7AA, 0x1D7C4, 0x1D7CE, 0x20000, 0x2F800 }; private static int[] endAstralAlphaNumeric = { 0x1000B, 0x10026, 0x1003A, 0x1003D, 0x1004D, 0x1005D, 0x100FA, 0x10133, 0x10178, 0x1018A, 0x1031E, 0x10323, 0x1034A, 0x1039D, 0x103C3, 0x103CF, 0x103D5, 0x1049D, 0x104A9, 0x10805, 0x10808, 0x10835, 0x10838, 0x1083C, 0x1083F, 0x10A00, 0x10A13, 0x10A17, 0x10A33, 0x10A47, 0x1D454, 0x1D49C, 0x1D49F, 0x1D4A2, 0x1D4A6, 0x1D4AC, 0x1D4B9, 0x1D4BB, 0x1D4C3, 0x1D505, 0x1D50A, 0x1D514, 0x1D51C, 0x1D539, 0x1D53E, 0x1D544, 0x1D546, 0x1D550, 0x1D6A5, 0x1D6C0, 0x1D6DA, 0x1D6FA, 0x1D714, 0x1D734, 0x1D74E, 0x1D76E, 0x1D788, 0x1D7A8, 0x1D7C2, 0x1D7C9, 0x1D7FF, 0x2A6D6, 0x2FA1D }; /** * Determine whether a character represents a decimal digit and if so, which digit. * @param in the Unicode character being tested. * @return -1 if it's not a decimal digit, otherwise the digit value. */ public static int getDigitValue(int in) { for (int z=0; z<zeroDigits.length; z++) { if (in <= zeroDigits[z]+9) { if (in >= zeroDigits[z]) { return in - zeroDigits[z]; } else { return -1; } } } return -1; } /** * Determine which digit family a decimal digit belongs to: that is, return the corresponding zero digit. * @param in a Unicode character * @return if the character is a digit, return the Unicode character that represents zero in the same digit * family. Otherwise, return -1. */ public static int getDigitFamily(int in){ for (int z=0; z<zeroDigits.length; z++) { if (in <= zeroDigits[z]+9) { if (in >= zeroDigits[z]) { return zeroDigits[z]; } else { return -1; } } } return -1; } private Alphanumeric(){} } // For completeness, here is the stylesheet used to generate these lists of ranges from UnicodeData.txt: //<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" // xmlns:xs="http://www.w3.org/2001/XMLSchema" // xmlns:f="http://saxonica.com/ns/unicode" // exclude-result-prefixes="xs f" //> // //<!-- Output a list of the start and end points of contiguous ranges of characters // classified as letters or digits. // // Note this doesn't handle the CJK Extended Ideograph ranges A and B, 3400-4DB5 and 20000-2A6D6, // which have to be edited in by hand. Also 4E00-9FBB and AC00-D7A3 //--> // //<xsl:output method="text"/> //<xsl:variable name="data" select="doc('UnicodeData.xml')"/> // //<xsl:function name="f:isAlphaNum" as="xs:boolean"> // <xsl:param name="char" as="element(Char)"/> // <xsl:sequence select="$char/Field3=('Nd', 'Nl', 'No', 'Lu', 'Ll', 'Lt', 'Lm', 'Lo')"/> //</xsl:function> // //<xsl:function name="f:hexToInt" as="xs:integer?"> // <xsl:param name="hex" as="xs:string?"/> // <xsl:sequence select="if (empty($hex)) then () else Integer:parseInt($hex, 16)" // xmlns:Integer="java:java.lang.Integer"/> //</xsl:function> // //<xsl:param name="p"/> //<xsl:template name="test"> // <xsl:value-of select="f:hexToInt($p)"/> //</xsl:template> // //<xsl:template name="main"> // // <xsl:text>int[] startPoints = new int[]{</xsl:text> // <xsl:for-each-group select="$data/*/Char" group-adjacent="concat(f:isAlphaNum(.), f:hexToInt(code) - position())"> // <xsl:if test="f:isAlphaNum(.)"> // <xsl:text>0x</xsl:text> // <xsl:value-of select="current-group()[1]/code"/> // <xsl:text>, </xsl:text> // <xsl:if test="position() mod 10 = 0"> </xsl:if> // </xsl:if> // </xsl:for-each-group> // <xsl:text>}; </xsl:text> // <xsl:text>int[] endPoints = new int[]{</xsl:text> // <xsl:for-each-group select="$data/*/Char" group-adjacent="concat(f:isAlphaNum(.), f:hexToInt(code) - position())"> // <xsl:if test="f:isAlphaNum(.)"> // <xsl:text>0x</xsl:text> // <xsl:value-of select="current-group()[last()]/code"/> // <xsl:text>, </xsl:text> // <xsl:if test="position() mod 10 = 0"> </xsl:if> // </xsl:if> // </xsl:for-each-group> // <xsl:text>}; </xsl:text> // //</xsl:template> // // //</xsl:stylesheet> // This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. // If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // This Source Code Form is “Incompatible With Secondary Licenses”, as defined by the Mozilla Public License, v. 2.0.