AlbiteCharacter.java example

Explorer
AlbiteREADER-master
- src
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

package org.albite.lang;

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 *
 * @author albus
 */
public class AlbiteCharacter {

    public static final byte
	UNASSIGNED		= 0,
	UPPERCASE_LETTER	= 1,
	LOWERCASE_LETTER	= 2,
	TITLECASE_LETTER	= 3,
	MODIFIER_LETTER		= 4,
	OTHER_LETTER		= 5,
	NON_SPACING_MARK	= 6,
	ENCLOSING_MARK		= 7,
	COMBINING_SPACING_MARK	= 8,
	DECIMAL_DIGIT_NUMBER	= 9,
	LETTER_NUMBER		= 10,
	OTHER_NUMBER		= 11,
	SPACE_SEPARATOR		= 12,
	LINE_SEPARATOR		= 13,
	PARAGRAPH_SEPARATOR	= 14,
	CONTROL			= 15,
	FORMAT			= 16,
	PRIVATE_USE		= 18,
	SURROGATE		= 19,
	DASH_PUNCTUATION	= 20,
	START_PUNCTUATION	= 21,
	END_PUNCTUATION		= 22,
	CONNECTOR_PUNCTUATION	= 23,
	OTHER_PUNCTUATION	= 24,
	MATH_SYMBOL		= 25,
	CURRENCY_SYMBOL		= 26,
	MODIFIER_SYMBOL		= 27,
	OTHER_SYMBOL		= 28;

    public static final byte[]  X = new byte[1024];
    public static final short[] Y = new short[4032];
    public static final int[]   A = new int[632];

    static {
        /*
         * Load data from external file
         */
        InputStream is = (new Object()).getClass()
                .getResourceAsStream("/res/charmap.bin");

        if (is != null) {
            DataInputStream in = new DataInputStream(is);
            try {
                try {
                    for (int i = 0; i < 1024; i++) {
                        X[i] = in.readByte();
                    }

                    for (int i = 0; i < 4032; i++) {
                        Y[i] = in.readShort();
                    }

                    for (int i = 0; i < 632; i++) {
                        A[i] = in.readInt();
                    }
                } finally {
                    in.close();
                }
            } catch (IOException e) {}
        }
    }

    /**
     * Returns a value indicating a character category.
     *
     * @param   ch      the character to be tested.
     * @return  a value of type int, the character category.
     * @see     java.lang.Character#COMBINING_SPACING_MARK
     * @see     java.lang.Character#CONNECTOR_PUNCTUATION
     * @see     java.lang.Character#CONTROL
     * @see     java.lang.Character#CURRENCY_SYMBOL
     * @see     java.lang.Character#DASH_PUNCTUATION
     * @see     java.lang.Character#DECIMAL_DIGIT_NUMBER
     * @see     java.lang.Character#ENCLOSING_MARK
     * @see     java.lang.Character#END_PUNCTUATION
     * @see     java.lang.Character#FORMAT
     * @see     java.lang.Character#LETTER_NUMBER
     * @see     java.lang.Character#LINE_SEPARATOR
     * @see     java.lang.Character#LOWERCASE_LETTER
     * @see     java.lang.Character#MATH_SYMBOL
     * @see     java.lang.Character#MODIFIER_LETTER
     * @see     java.lang.Character#MODIFIER_SYMBOL
     * @see     java.lang.Character#NON_SPACING_MARK
     * @see     java.lang.Character#OTHER_LETTER
     * @see     java.lang.Character#OTHER_NUMBER
     * @see     java.lang.Character#OTHER_PUNCTUATION
     * @see     java.lang.Character#OTHER_SYMBOL
     * @see     java.lang.Character#PARAGRAPH_SEPARATOR
     * @see     java.lang.Character#PRIVATE_USE
     * @see     java.lang.Character#SPACE_SEPARATOR
     * @see     java.lang.Character#START_PUNCTUATION
     * @see     java.lang.Character#SURROGATE
     * @see     java.lang.Character#TITLECASE_LETTER
     * @see     java.lang.Character#UNASSIGNED
     * @see     java.lang.Character#UPPERCASE_LETTER
     * @since   JDK1.1
     */
    public static int getType(final char ch) {
        return A[Y[(X[ch>>6]<<5)|((ch>>1)&0x1F)]|(ch&0x1)] & 0x1F;
    }

    /**
     * Determines if the specified character is a letter or digit.
     * For a more complete specification that encompasses all Unicode
     * characters, see Gosling, Joy, and Steele, <i>The Java Language
     * Specification</i>.
     *
     * <p> A character is considered to be a letter if and only if
     * it is specified to be a letter or a digit by the Unicode 2.0 standard
     * (category "Lu", "Ll", "Lt", "Lm", "Lo", or "Nd" in the Unicode
     * specification data file).  In other words, isLetterOrDigit is true
     * of a character if and only if either isLetter is true of the character
     * or isDigit is true of the character.
     *
     * @param   ch   the character to be tested.
     * @return  <code>true</code> if the character is a letter or digit;
     *          <code>false</code> otherwise.
     * @see     java.lang.Character#isDigit(char)
     * @see     java.lang.Character#isJavaIdentifierPart(char)
     * @see     java.lang.Character#isJavaLetter(char)
     * @see     java.lang.Character#isJavaLetterOrDigit(char)
     * @see     java.lang.Character#isLetter(char)
     * @see     java.lang.Character#isUnicodeIdentifierPart(char)
     * @since   JDK1.0.2
     */
    public static boolean isLetterOrDigit(final char ch) {
        return (((((1 << UPPERCASE_LETTER) |
                   (1 << LOWERCASE_LETTER) |
                   (1 << TITLECASE_LETTER) |
                   (1 << MODIFIER_LETTER) |
                   (1 << OTHER_LETTER) |
                   (1 << DECIMAL_DIGIT_NUMBER))
                  >> (A[Y[(X[ch>>6]<<5)|((ch>>1)&0x1F)]|(ch&0x1)] & 0x1F)) & 1) != 0);
    }

    /**
     * Determines if the specified character is a letter. For a
     * more complete specification that encompasses all Unicode
     * characters, see Gosling, Joy, and Steele, <i>The Java Language
     * Specification</i>.
     *
     * <p> A character is considered to be a letter if and only if
     * it is specified to be a letter by the Unicode 2.0 standard
     * (category "Lu", "Ll", "Lt", "Lm", or "Lo" in the Unicode
     * specification data file).
     *
     * <p> Note that most ideographic characters are considered
     * to be letters (category "Lo") for this purpose.
     *
     * <p> Note also that not all letters have case: many Unicode characters are
     * letters but are neither uppercase nor lowercase nor titlecase.
     *
     * @param   ch   the character to be tested.
     * @return  <code>true</code> if the character is a letter;
     *          <code>false</code> otherwise.
     * @see     java.lang.Character#isDigit(char)
     * @see     java.lang.Character#isJavaIdentifierStart(char)
     * @see     java.lang.Character#isJavaLetter(char)
     * @see     java.lang.Character#isJavaLetterOrDigit(char)
     * @see     java.lang.Character#isLetterOrDigit(char)
     * @see     java.lang.Character#isLowerCase(char)
     * @see     java.lang.Character#isTitleCase(char)
     * @see     java.lang.Character#isUnicodeIdentifierStart(char)
     * @see     java.lang.Character#isUpperCase(char)
     */
    public static boolean isLetter(final char ch) {
        return (((((1 << UPPERCASE_LETTER) |
                   (1 << LOWERCASE_LETTER) |
                   (1 << TITLECASE_LETTER) |
                   (1 << MODIFIER_LETTER) |
                   (1 << OTHER_LETTER))
                  >> (A[Y[(X[ch>>6]<<5)|((ch>>1)&0x1F)]|(ch&0x1)] & 0x1F)) & 1) != 0);
    }

    /**
     * Determines if the specified character is a digit.
     * A character is considered to be a digit if it is not in the range
     * <code>'\u2000' <= ch <= '\u2FFF'</code>
     * and its Unicode name contains the word
     * "<code>DIGIT</code>". For a more complete
     * specification that encompasses all Unicode characters that are
     * defined as digits, see Gosling, Joy, and Steele, <i>The Java
     * Language Specification</i>.
     * <p>
     * These are the ranges of Unicode characters that are considered digits:
     * <table>
     * <tr><td>0x0030 through 0x0039</td>
     *                        <td>ISO-LATIN-1 digits ('0' through '9')</td></tr>
     * <tr><td>0x0660 through 0x0669</td>  <td>Arabic-Indic digits</td></tr>
     * <tr><td>0x06F0 through 0x06F9</td>
     *                                <td>Extended Arabic-Indic digits</td></tr>
     * <tr><td>0x0966 through 0x096F</td>  <td>Devanagari digits</td></tr>
     * <tr><td>0x09E6 through 0x09EF</td>  <td>Bengali digits</td></tr>
     * <tr><td>0x0A66 through 0x0A6F</td>  <td>Gurmukhi digits</td></tr>
     * <tr><td>0x0AE6 through 0x0AEF</td>  <td>Gujarati digits</td></tr>
     * <tr><td>0x0B66 through 0x0B6F</td>  <td>Oriya digits</td></tr>
     * <tr><td>0x0BE7 through 0x0BEF</td>  <td>Tamil digits</td></tr>
     * <tr><td>0x0C66 through 0x0C6F</td>  <td>Telugu digits</td></tr>
     * <tr><td>0x0CE6 through 0x0CEF</td>  <td>Kannada digits</td></tr>
     * <tr><td>0x0D66 through 0x0D6F</td>  <td>Malayalam digits</td></tr>
     * <tr><td>0x0E50 through 0x0E59</td>  <td>Thai digits</td></tr>
     * <tr><td>0x0ED0 through 0x0ED9</td>  <td>Lao digits</td></tr>
     * <tr><td>0x0F20 through 0x0F29</td>  <td>Tibetan digits</td></tr>
     * <tr><td>0xFF10 through 0xFF19</td>  <td>Fullwidth digits</td></tr>
     * </table>
     *
     * @param   ch   the character to be tested.
     * @return  <code>true</code> if the character is a digit;
     *          <code>false</code> otherwise.
     * @see     java.lang.Character#digit(char, int)
     * @see     java.lang.Character#forDigit(int, int)
     */
    public static boolean isDigit(final char ch) {
        return (A[Y[(X[ch>>6]<<5)|((ch>>1)&0x1F)]|(ch&0x1)] & 0x1F) == DECIMAL_DIGIT_NUMBER;
    }

    /**
     * The given character is mapped to its lowercase equivalent; if the
     * character has no lowercase equivalent, the character itself is
     * returned.
     * <p>
     * A character has a lowercase equivalent if and only if a lowercase
     * mapping is specified for the character in the Unicode attribute
     * table.
     * <p>
     * Note that some Unicode characters in the range
     * <code>'\u2000'</code> to <code>'\u2FFF'</code> have lowercase
     * mappings; this method does map such characters to their lowercase
     * equivalents even though the method <code>isUpperCase</code> does
     * not return <code>true</code> for such characters.
     *
     * @param   ch   the character to be converted.
     * @return  the lowercase equivalent of the character, if any;
     *          otherwise the character itself.
     * @see     java.lang.Character#isLowerCase(char)
     * @see     java.lang.Character#isUpperCase(char)
     * @see     java.lang.Character#toTitleCase(char)
     * @see     java.lang.Character#toUpperCase(char)
     */
    public static char toLowerCase(final char ch) {
        final int val = A[Y[(X[ch>>6]<<5)|((ch>>1)&0x1F)]|(ch&0x1)];

        if ((val & 0x00200000) != 0) {
          return (char)(ch + (val >> 22));
        } else {
          return ch;
        }
    }

    public static char[] toLowerCase(final char[] ch) {
        final char[] res = new char[ch.length];
        for (int i = 0; i < ch.length; i++) {
            res[i] = toLowerCase(ch[i]);
        }

        return res;
    }

    public static String toLowerCase(final String s) {
        return new String(toLowerCase(s.toCharArray()));
    }
}