// Uses: CharInfo

/* Copyright (C) 1999 Artur Biesiadowski
   Copyright (C) 2004 Stephen Crawley

   This file is part of Mauve.

   Mauve is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   Mauve is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Mauve; see the file COPYING.  If not, write to
   the Free Software Foundation, 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.
*/

package gnu.testlet.wonka.lang.Character;

import java.io.*;
import gnu.testlet.Testlet;
import gnu.testlet.TestHarness;
import gnu.testlet.ResourceNotFoundException;

/*
  MISSING:
  Instance tests (constructor, charValue, serialization): should be in
  other file
*/

/**
 * Base class for tests that compare the static classification and
 * conversion methods of <code>java.lang.Character</code> against a table
 * built from a semicolon-separated Unicode database resource.
 *
 * The table holds one {@link CharInfo} per 16-bit <code>char</code> value
 * (indices 0x0000 to 0xFFFF).  Several entries may refer to the same
 * <code>CharInfo</code> instance: every character of a filled-in range and
 * every undefined character share one object per range.
 */
public abstract class UnicodeBase implements Testlet {

  public static boolean testDeprecated;
  public static boolean verbose;
  public static boolean benchmark;

  /** Expected properties of each <code>char</code>, indexed by its value. */
  public CharInfo[] chars = new CharInfo[0x10000];
  public int failures;
  public int tests;
  TestHarness harness;

  public UnicodeBase() {
  }

  /**
   * Builds the expected-properties table from the named resource.
   *
   * @param aHarness harness used for debug output and for recording checks
   * @param filename resource name; it is looked up as "/" + filename
   *        through this object's class
   * @throws IOException if reading the resource fails
   * @throws ResourceNotFoundException if the resource cannot be found
   */
  public UnicodeBase(TestHarness aHarness, String filename)
      throws IOException, ResourceNotFoundException {
    harness = aHarness;

    // getResourceAsStream() reports a missing resource by returning null;
    // turn that into the declared exception rather than an NPE further on.
    InputStream in = this.getClass().getResourceAsStream("/" + filename);
    if (in == null) {
      throw new ResourceNotFoundException(
          "Unicode database not found: /" + filename);
    }

    // Buffered so that end of input can be detected with mark()/reset().
    BufferedReader bir = new BufferedReader(new InputStreamReader(in));
    try {
      harness.debug("Reading unicode database...");
      while (hasMoreInput(bir)) {
        readRecord(bir);
      }
    } finally {
      bir.close();
    }

    // Fill in the character ranges that are reserved in Unicode 3.0
    fillRange("CJK Ideograph", "Lo", 0x4E01, 0x9FA4);
    fillRange("CJK Ideograph Extension A", "Lo", 0x3400, 0x4DB5);
    fillRange("Hangul Syllable", "Lo", 0xAC01, 0xD7A2);
    fillRange("CJK Compatibility Ideograph", "Lo", 0xF901, 0xFA2C);
    fillRange("Surrogate", "Cs", 0xD800, 0xDFFF);
    fillRange("Private Use", "Co", 0xE000, 0xF8FF);

    // Everything still unset is an unassigned character; all such slots
    // share a single CharInfo instance.
    CharInfo undefined = newPlainInfo("UNDEFINED", "Cn");
    for (int i = 0; i <= 0xFFFF; i++) {
      if (chars[i] == null) {
        chars[i] = undefined;
      }
    }

    /* Override the character definitions for Latin letters with digit
       values to cope with the semantics of Character.digit(), etc.
       It is not stated that A-Z and a-z should have getNumericValue()
       (as it is in digit()) */
    assignLetterDigits('A', 'Z');
    assignLetterDigits('a', 'z');
    assignLetterDigits(0xFF21, 0xFF3A);
    assignLetterDigits(0xFF41, 0xFF5A);

    harness.debug("done");
  }

  /**
   * Tells whether another record follows in the input.  Line terminator
   * characters in front of the next record are consumed, so blank lines
   * are skipped.
   *
   * End of input is detected by read() returning -1, not by ready():
   * ready() only says whether a read would block.
   */
  private static boolean hasMoreInput(BufferedReader r) throws IOException {
    for (;;) {
      r.mark(1);
      int c = r.read();
      if (c == -1) {
        return false;
      }
      if (c != '\n' && c != '\r') {
        r.reset();
        return true;
      }
    }
  }

  /**
   * Reads the 15 fields of one database record and stores the resulting
   * CharInfo in the table.  Records whose code value does not fit the
   * table (above 0xFFFF) are read but not stored, so that they cannot
   * overwrite the entry of an unrelated 16-bit character.
   */
  private void readRecord(Reader bir) throws IOException {
    String str;
    CharInfo ci = new CharInfo();

    // 0 - Code value
    str = getNext(bir);
    int code = Integer.parseInt(str, 16);

    // 1 - Character name
    ci.name = getNext(bir);

    // 2 - General category
    ci.category = getNext(bir);

    // 3 - Canonical combining classes
    getNext(bir);

    // 4 - Bidirectional category
    getNext(bir);

    // 5 - Character decomposition mapping
    getNext(bir);

    // 6 - Decimal digit value
    str = getNext(bir);
    if (!str.equals("")) {
      ci.decimalDigit = Integer.parseInt(str, 10);
    } else {
      ci.decimalDigit = -1;
    }

    // 7 - Digit value
    str = getNext(bir);
    if (!str.equals("")) {
      ci.digit = Integer.parseInt(str, 10);
    } else {
      ci.digit = -1;
    }

    // 8 - Numeric value: -1 when the field is empty, -2 when it is
    // negative or not a plain integer.
    str = getNext(bir);
    if (str.equals("")) {
      ci.numericValue = -1;
    } else {
      try {
        ci.numericValue = Integer.parseInt(str, 10);
        if (ci.numericValue < 0) {
          ci.numericValue = -2;
        }
      } catch (NumberFormatException e) {
        ci.numericValue = -2;
      }
    }

    // 9 - Mirrored
    getNext(bir);

    // 10 - Unicode 1.0 name
    getNext(bir);

    // 11 - ISO 10646 comment field
    getNext(bir);

    // 12 - Upper case mapping
    str = getNext(bir);
    if (!str.equals("")) {
      ci.uppercase = (char) Integer.parseInt(str, 16);
    }

    // 13 - Lower case mapping
    str = getNext(bir);
    if (!str.equals("")) {
      ci.lowercase = (char) Integer.parseInt(str, 16);
    }

    // 14 - Title case mapping
    str = getNext(bir);
    if (!str.equals("")) {
      ci.titlecase = (char) Integer.parseInt(str, 16);
    }

    // Character.digit() only treats "Nd" as decimal digits, not "No"
    // or "Nl".  Tweak the character defns accordingly.
    if (ci.digit != -1 && !("Nd".equals(ci.category))) {
      ci.digit = -1;
    }

    if (code >= 0 && code < chars.length) {
      chars[code] = ci;
    }
  }

  /**
   * Creates a CharInfo with the given name and category and with all
   * three numeric properties set to -1.
   */
  private static CharInfo newPlainInfo(String name, String category) {
    CharInfo info = new CharInfo();
    info.name = name;
    info.category = category;
    info.decimalDigit = -1;
    info.digit = -1;
    info.numericValue = -1;
    return info;
  }

  /**
   * Points every table slot from first to last (both inclusive) at one
   * shared CharInfo with the given name and category.
   */
  private void fillRange(String name, String category, int first, int last) {
    CharInfo shared = newPlainInfo(name, category);
    for (int i = first; i <= last; i++) {
      chars[i] = shared;
    }
  }

  /**
   * Gives the characters first..last (both inclusive) the digit values
   * 10, 11, 12, ... and sets their numeric value to the same number.
   */
  private void assignLetterDigits(int first, int last) {
    for (int i = first; i <= last; i++) {
      chars[i].digit = i - first + 10;
      chars[i].numericValue = chars[i].digit; // ??
    }
  }

  /**
   * Reads one field: all characters up to, but not including, the next
   * ';' or '\n'.  Carriage returns are dropped.  At end of input the
   * characters collected so far (possibly none) are returned.
   */
  private String getNext(Reader r) throws IOException {
    StringBuffer sb = new StringBuffer();
    int c;
    // Test the int result before narrowing: -1 means end of input and
    // must not be turned into the character U+FFFF.
    while ((c = r.read()) != -1) {
      if (c == '\r') {
        continue;
      }
      if (c == ';' || c == '\n') {
        break;
      }
      sb.append((char) c);
    }
    return sb.toString();
  }

  /** Returns a description of a character: its hex value and table name. */
  public String stringChar(int ch) {
    return "Character " + Integer.toString(ch, 16) + ":" + chars[ch].name;
  }

  /** Records a failed check with the given message. */
  protected void reportError(String what) {
    harness.check(false, what);
  }

  /** Records a failed check saying that ch was wrongly reported as what. */
  protected void reportError(int ch, String what) {
    harness.check(false, stringChar(ch) + " incorrectly reported as " + what);
  }

  /** Records a passed check. */
  protected void checkPassed() {
    harness.check(true);
  }

  /** Returns true if low <= mid <= high. */
  public boolean range(int mid, int low, int high) {
    return (mid >= low && mid <= high);
  }

  /** Expected result of Character.isIdentifierIgnorable() for i. */
  public boolean ignorable(int i) {
    return (range(i, 0x0000, 0x0008)
            || range(i, 0x000E, 0x001B)
            || range(i, 0x007f, 0x009f)
            || "Cf".equals(chars[i].category));
  }

  /** Expected result of Character.isWhitespace() for i. */
  public boolean whitespace(int i) {
    return ((chars[i].category.charAt(0) == 'Z'
             && i != 0x00a0 && i != 0x2007 && i != 0x202f)
            || range(i, 0x0009, 0x000D)
            || range(i, 0x001C, 0x001F));
  }

  /** Expected result of Character.isJavaIdentifierStart() for i. */
  public boolean identifierStart(int i) {
    return ("Ll".equals(chars[i].category)
            || "Lu".equals(chars[i].category)
            || "Lt".equals(chars[i].category)
            || "Lm".equals(chars[i].category)
            || "Lo".equals(chars[i].category)
            || "Nl".equals(chars[i].category)
            || "Sc".equals(chars[i].category)
            || "Pc".equals(chars[i].category));
  }

  /** Expected result of Character.isUnicodeIdentifierStart() for i. */
  public boolean unicodeIdentifierStart(int i) {
    return ("Ll".equals(chars[i].category)
            || "Lu".equals(chars[i].category)
            || "Lt".equals(chars[i].category)
            || "Lm".equals(chars[i].category)
            || "Lo".equals(chars[i].category)
            || "Nl".equals(chars[i].category));
  }

  /**
   * Checks the static methods of Character for every char value against
   * the table, then checks Character.forDigit() for digits and radixes
   * from -100 to 99.  Each comparison records exactly one check on the
   * harness, passed or failed.  The deprecated methods are only checked
   * when testDeprecated is set.
   */
  public void performTests() {
    for (int x = 0; x <= 0xffff; x++) {
      char i = (char) x;

      // isLowerCase
      if ("Ll".equals(chars[i].category) != Character.isLowerCase((char) i)) {
        reportError(i, (Character.isLowerCase((char) i)
                        ? "lowercase" : "not-lowercase"));
      } else {
        checkPassed();
      }

      // isUpperCase
      if ("Lu".equals(chars[i].category) != Character.isUpperCase((char) i)) {
        reportError(i, (Character.isUpperCase((char) i)
                        ? "uppercase" : "not-uppercase"));
      } else {
        checkPassed();
      }

      // isTitleCase
      if ("Lt".equals(chars[i].category) != Character.isTitleCase((char) i)) {
        reportError(i, (Character.isTitleCase((char) i)
                        ? "titlecase" : "not-titlecase"));
      } else {
        checkPassed();
      }

      // isDigit
      if ("Nd".equals(chars[i].category) != Character.isDigit((char) i)) {
        reportError(i, (Character.isDigit((char) i)
                        ? "digit" : "not-digit"));
      } else {
        checkPassed();
      }

      // isDefined
      if (!chars[i].category.equals("Cn") != Character.isDefined((char) i)) {
        reportError(i, (Character.isDefined((char) i)
                        ? "defined" : "not-defined"));
      } else {
        checkPassed();
      }

      // isLetter
      if ((chars[i].category.charAt(0) == 'L')
          != Character.isLetter((char) i)) {
        reportError(i, (Character.isLetter((char) i)
                        ? "letter" : "not-letter"));
      } else {
        checkPassed();
      }

      // isLetterOrDigit
      if (Character.isLetterOrDigit(i)
          != (Character.isLetter(i) || Character.isDigit(i))) {
        reportError(i, (Character.isLetterOrDigit(i)
                        ? "letterordigit" : "not-letterordigit"));
      } else {
        checkPassed();
      }

      // isSpaceChar
      if ((chars[i].category.charAt(0) == 'Z') != Character.isSpaceChar(i)) {
        reportError(i, (Character.isSpaceChar(i)
                        ? "spacechar" : "not-spacechar"));
      } else {
        checkPassed();
      }

      // isWhiteSpace
      if (whitespace(i) != Character.isWhitespace(i)) {
        reportError(i, Character.isWhitespace(i)
                       ? "whitespace" : "not-whitespace");
      } else {
        checkPassed();
      }

      // isISOControl
      if (((i <= 0x001F) || range(i, 0x007F, 0x009F))
          != Character.isISOControl(i)) {
        reportError(i, Character.isISOControl(i)
                       ? "isocontrol" : "not-isocontrol");
      } else {
        checkPassed();
      }

      // getType: map the numeric type back to its two-letter category
      int type = Character.getType(i);
      String typeStr = null;
      switch (type) {
        case Character.UNASSIGNED: typeStr = "Cn"; break;
        case Character.UPPERCASE_LETTER: typeStr = "Lu"; break;
        case Character.LOWERCASE_LETTER: typeStr = "Ll"; break;
        case Character.TITLECASE_LETTER: typeStr = "Lt"; break;
        case Character.MODIFIER_LETTER: typeStr = "Lm"; break;
        case Character.OTHER_LETTER: typeStr = "Lo"; break;
        case Character.NON_SPACING_MARK: typeStr = "Mn"; break;
        case Character.ENCLOSING_MARK: typeStr = "Me"; break;
        case Character.COMBINING_SPACING_MARK: typeStr = "Mc"; break;
        case Character.DECIMAL_DIGIT_NUMBER: typeStr = "Nd"; break;
        case Character.LETTER_NUMBER: typeStr = "Nl"; break;
        case Character.OTHER_NUMBER: typeStr = "No"; break;
        case Character.SPACE_SEPARATOR: typeStr = "Zs"; break;
        case Character.LINE_SEPARATOR: typeStr = "Zl"; break;
        case Character.PARAGRAPH_SEPARATOR: typeStr = "Zp"; break;
        case Character.CONTROL: typeStr = "Cc"; break;
        case Character.FORMAT: typeStr = "Cf"; break;
        case Character.PRIVATE_USE: typeStr = "Co"; break;
        case Character.SURROGATE: typeStr = "Cs"; break;
        case Character.DASH_PUNCTUATION: typeStr = "Pd"; break;
        case Character.START_PUNCTUATION: typeStr = "Ps"; break;
        case Character.END_PUNCTUATION: typeStr = "Pe"; break;
        case Character.CONNECTOR_PUNCTUATION: typeStr = "Pc"; break;
        case Character.FINAL_QUOTE_PUNCTUATION: typeStr = "Pf"; break;
        case Character.INITIAL_QUOTE_PUNCTUATION: typeStr = "Pi"; break;
        case Character.OTHER_PUNCTUATION: typeStr = "Po"; break;
        case Character.MATH_SYMBOL: typeStr = "Sm"; break;
        case Character.CURRENCY_SYMBOL: typeStr = "Sc"; break;
        case Character.MODIFIER_SYMBOL: typeStr = "Sk"; break;
        case Character.OTHER_SYMBOL: typeStr = "So"; break;
        default: typeStr = "ERROR (" + type + ")"; break;
      }

      // A reported Ps/Pe is also accepted where the table says Pi/Pf.
      if (!(chars[i].category.equals(typeStr)
            || (typeStr.equals("Ps") && chars[i].category.equals("Pi"))
            || (typeStr.equals("Pe") && chars[i].category.equals("Pf")))) {
        reportError(stringChar(i) + " is reported to be type " + typeStr
                    + " instead of " + chars[i].category);
      } else {
        checkPassed();
      }

      // isJavaIdentifierStart
      if (identifierStart(i) != Character.isJavaIdentifierStart(i)) {
        reportError(i, Character.isJavaIdentifierStart(i)
                       ? "javaindentifierstart" : "not-javaidentifierstart");
      } else {
        checkPassed();
      }

      // isJavaIdentifierPart
      typeStr = chars[i].category;
      if ((typeStr.charAt(0) == 'L'
           || typeStr.equals("Sc")
           || typeStr.equals("Pc")
           || typeStr.equals("Nd")
           || typeStr.equals("Nl")
           || typeStr.equals("Mc")
           || typeStr.equals("Mn")
           || typeStr.equals("Cf")
           || (typeStr.equals("Cc") && ignorable(i)))
          != Character.isJavaIdentifierPart(i)) {
        reportError(i, Character.isJavaIdentifierPart(i)
                       ? "javaidentifierpart" : "not-javaidentifierpart");
      } else {
        checkPassed();
      }

      // isUnicodeIdentifierStart
      if (unicodeIdentifierStart(i) != Character.isUnicodeIdentifierStart(i)) {
        reportError(i, Character.isUnicodeIdentifierStart(i)
                       ? "unicodeidentifierstart"
                       : "not-unicodeidentifierstart");
      } else {
        checkPassed();
      }

      // isUnicodeIdentifierPart
      typeStr = chars[i].category;
      if ((typeStr.charAt(0) == 'L'
           || typeStr.equals("Pc")
           || typeStr.equals("Nd")
           || typeStr.equals("Nl")
           || typeStr.equals("Mc")
           || typeStr.equals("Mn")
           || typeStr.equals("Cf")
           || (typeStr.equals("Cc") && ignorable(i)))
          != Character.isUnicodeIdentifierPart(i)) {
        reportError(i, Character.isUnicodeIdentifierPart(i)
                       ? "unicodeidentifierpart"
                       : "not-unicodeidentifierpart");
      } else {
        checkPassed();
      }

      // isIdentifierIgnorable
      if (ignorable(i) != Character.isIdentifierIgnorable(i)) {
        reportError(i, Character.isIdentifierIgnorable(i)
                       ? "identifierignorable" : "not-identifierignorable");
      } else {
        checkPassed();
      }

      // toLowerCase
      char cs = (chars[i].lowercase != 0 ? chars[i].lowercase : i);
      if (Character.toLowerCase(i) != cs) {
        reportError(stringChar(i) + " has wrong lowercase form of "
                    + stringChar(Character.toLowerCase(i))
                    + " instead of " + stringChar(cs));
      } else {
        checkPassed();
      }

      // toUpperCase
      cs = (chars[i].uppercase != 0 ? chars[i].uppercase : i);
      if (Character.toUpperCase(i) != cs) {
        reportError(stringChar(i) + " has wrong uppercase form of "
                    + stringChar(Character.toUpperCase(i))
                    + " instead of " + stringChar(cs));
      } else {
        checkPassed();
      }

      // toTitleCase: the titlecase mapping, else the uppercase mapping,
      // else the character itself; a titlecase letter maps to itself.
      cs = (chars[i].titlecase != 0
            ? chars[i].titlecase
            : (chars[i].uppercase != 0 ? chars[i].uppercase : i));
      if ("Lt".equals(chars[i].category)) {
        cs = i;
      }
      if (Character.toTitleCase(i) != cs) {
        reportError(stringChar(i) + " has wrong titlecase form of "
                    + stringChar(Character.toTitleCase(i))
                    + " instead of " + stringChar(cs));
      } else {
        checkPassed();
      }

      // digit
      for (int radix = Character.MIN_RADIX;
           radix <= Character.MAX_RADIX;
           radix++) {
        int digit = chars[i].digit;
        if (digit >= radix) {
          digit = -1;
        }
        if (Character.digit(i, radix) != digit) {
          reportError(stringChar(i) + " has wrong digit form of "
                      + Character.digit(i, radix) + " for radix " + radix
                      + " instead of " + digit + "(" + chars[i].digit + ")");
        } else {
          checkPassed();
        }
      }

      // getNumericValue
      if (chars[i].numericValue != Character.getNumericValue(i)) {
        reportError(stringChar(i) + " has wrong numeric value of "
                    + Character.getNumericValue(i)
                    + " instead of " + chars[i].numericValue);
      } else {
        // Record the pass as every other comparison does, so that the
        // number of checks does not depend on the outcome.
        checkPassed();
      }

      if (testDeprecated) {
        // isJavaLetter
        if ((i == '$' || i == '_' || Character.isLetter(i))
            != Character.isJavaLetter(i)) {
          reportError(i, (Character.isJavaLetter(i)
                          ? "javaletter" : "not-javaletter"));
        } else {
          checkPassed();
        }

        // isJavaLetterOrDigit
        if ((Character.isJavaLetter(i) || Character.isDigit(i)
             || i == '$' || i == '_')
            != Character.isJavaLetterOrDigit(i)) {
          reportError(i, (Character.isJavaLetterOrDigit(i)
                          ? "javaletterordigit" : "not-javaletterordigit"));
        } else {
          checkPassed();
        }

        // isSpace
        if (((i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\f'))
            != Character.isSpace(i)) {
          reportError(i, (Character.isSpace(i) ? "space" : "non-space"));
        } else {
          checkPassed();
        }
      } // testDeprecated
    } // for

    // forDigit
    for (int r = -100; r < 100; r++) {
      for (int d = -100; d < 100; d++) {
        char dch = Character.forDigit(d, r);
        // Expected result is the NUL character unless both the radix
        // and the digit are in range.
        char wantch = 0;
        if (range(r, Character.MIN_RADIX, Character.MAX_RADIX)
            && range(d, 0, r - 1)) {
          if (d < 10) {
            wantch = (char) ('0' + (char) d);
          } else if (d < 36) {
            wantch = (char) ('a' + d - 10);
          }
        }
        if (dch != wantch) {
          reportError("Error in forDigit(" + d + "," + r + "), got " + dch
                      + " wanted " + wantch);
        } else {
          checkPassed();
        }
      }
    }
  }
}