/* * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /** * This is a tool to generate categoryNames and categoryMap which are used in * CharSet.java. */ package build.tools.generatebreakiteratordata; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.util.StringTokenizer; class CharacterCategory { /** * A list of Unicode category names. */ static final String[] categoryNames = { "Ll", /* Letter, Lowercase */ "Lu", /* Letter, Uppercase */ "Lt", /* Letter, Titlecase */ "Lo", /* Letter, Other */ "Lm", /* Letter, Modifier */ "Nd", /* Number, Decimal Digit */ "Nl", /* Number, Letter */ "No", /* Number, Other */ "Ps", /* Punctuation, Open */ "Pe", /* Punctuation, Close */ "Pi", /* Punctuation, Initial quote */ "Pf", /* Punctuation, Final quote */ "Pd", /* Punctuation, Dash */ "Pc", /* Punctuation, Connector */ "Po", /* Punctuation, Other */ "Sc", /* Symbol, Currency */ "Sm", /* Symbol, Math */ "So", /* Symbol, Other */ "Mn", /* Mark, Non-Spacing */ "Mc", /* Mark, Spacing Combining */ "Me", /* Mark, Enclosing */ "Zl", /* Separator, Line */ "Zp", /* Separator, Paragraph */ "Zs", /* Separator, Space */ "Cc", /* Other, Control */ "Cf", /* Other, Format */ "--", /* Dummy, ignored */ // Don't add anything after the Dummy entry!! }; /** * A array of Unicode code points for each category. */ private static int[][] categoryMap; /** * Generates CategoryMap for GenerateBreakIteratorData. */ static void makeCategoryMap(String filename) { /* Overwrite specfile name */ specfile = filename; /* Generate data in current format (1.5.0) */ generateNewData(); /* Copy generated data to cateogyMap */ categoryMap = new int[categoryNames.length-1][]; for (int i = 0; i < categoryNames.length-1; i++) { int len = newListCount[BMP][i] + newListCount[nonBMP][i]; categoryMap[i] = new int[len]; System.arraycopy(newList[i], 0, categoryMap[i], 0, len); } } /** * Returns categoryMap for the given category. */ static int[] getCategoryMap(int category) { return categoryMap[category]; } /** * Only used for debugging and generating a test program. */ public static void main(String[] args) { /* Parses command-line options */ processArgs(args); /* Generates data in current format (1.5.0) */ generateNewData(); /* * Generates data in older format (1.4.X and earlier) and creates * the old CategoryMap if "oldFilename" is not null. */ if (!oldDatafile.equals("")) { generateOldData(); generateOldDatafile(); } /* Displays summary of generated data */ showSummary(); /* * Generates a test program which compares the new data and the return * values of Character.getType(). * and the old data and the new data. */ generateTestProgram(); } /** * Spec (Unicode data file) */ private static String specfile = "UnicodeData.txt"; /** * Output directory */ private static String outputDir = ""; /** * Old data filename */ private static String oldDatafile = ""; /** * Parses the specified arguments and sets up the variables. */ private static void processArgs(String[] args) { for (int i = 0; i < args.length; i++) { String arg =args[i]; if (arg.equals("-spec")) { specfile = args[++i]; } else if (arg.equals("-old")) { oldDatafile = args[++i]; } else if (arg.equals("-o")) { outputDir = args[++i]; } else { System.err.println("Usage: java CharacterCategory [-spec specfile]"); System.exit(1); } } } /** * Displays summary of generated data */ private static void showSummary() { int oldSum = 0; int newSum = 0; int oldSuppSum = 0; int newSuppSum = 0; for (int i = 0; i < categoryNames.length-1; i++) { int newNum = newListCount[BMP][i] + newListCount[nonBMP][i]; if (oldTotalCount[i] != newNum) { System.err.println("Error: The number of generated data is different between the new approach and the old approach."); } if (oldListCount[SURROGATE][i] != newListCount[nonBMP][i]) { System.err.println("Error: The number of generated supplementarycharacters is different between the new approach and the old approach."); } System.out.println(" " + categoryNames[i] + ": " + oldTotalCount[i] + "(" + oldListCount[BEFORE][i] + " + " + oldListCount[SURROGATE][i] + " + " + oldListCount[AFTER][i] + ")" + " --- " + newNum + "(" + newListCount[BMP][i] + " + " + newListCount[nonBMP][i] + ")"); oldSum += oldListCount[BEFORE][i] * 2 + oldListCount[SURROGATE][i] * 4 + oldListCount[AFTER][i] * 2; newSum += newNum * 4 ; oldSuppSum += oldListCount[SURROGATE][i] * 4; newSuppSum += newListCount[nonBMP][i] * 4; } System.out.println("\nTotal buffer sizes are:\n " + oldSum + "bytes(Including " + oldSuppSum + "bytes for supplementary characters)\n " + newSum + "bytes(Including " + newSuppSum + "bytes for supplementary characters)"); if (!ignoredOld.toString().equals(ignoredNew.toString())) { System.err.println("Ignored categories: Error: List mismatch: " + ignoredOld + " vs. " + ignoredNew); } else { System.out.println("\nIgnored categories: " + ignoredOld); System.out.println("Please confirm that they aren't used in BreakIteratorRules."); } } private static final int HighSurrogate_CodeUnit_Start = 0xD800; private static final int LowSurrogate_CodeUnit_Start = 0xDC00; private static final int Supplementary_CodePoint_Start = 0x10000; private static StringBuffer ignoredOld = new StringBuffer(); private static int[] oldTotalCount = new int[categoryNames.length]; private static int[][] oldListCount = new int[3][categoryNames.length]; private static int[][] oldListLen = new int[3][categoryNames.length]; private static StringBuffer[][] oldList = new StringBuffer[3][categoryNames.length]; private static final int BEFORE = 0; private static final int SURROGATE = 1; private static final int AFTER = 2; /** * Makes CategoryMap in ordler format which had been used by JDK 1.4.X and * earlier versions. */ private static void generateOldData() { /* Initialize arrays. */ for (int i = 0; i<categoryNames.length; i++) { for (int j = BEFORE; j <= AFTER; j++) { oldListCount[j][i] = 0; oldList[j][i] = new StringBuffer(); oldListLen[j][i] = 17; } } storeOldData(); if (oldTotalCount[categoryNames.length-1] != 1) { System.err.println("This should not happen. Unicode data which belongs to an undefined category exists"); System.exit(1); } } private static void storeOldData() { try { FileReader fin = new FileReader(specfile); BufferedReader bin = new BufferedReader(fin); String prevCode = "????"; String line; int prevIndex = categoryNames.length - 1; int prevCodeValue = -1; int curCodeValue = 0; boolean setFirst = false; while ((line = bin.readLine()) != null) { if (line.length() == 0) { continue; } StringTokenizer st = new StringTokenizer(line, ";"); String code = st.nextToken(); char c = code.charAt(0); if (c == '#' || c == '/') { continue; } int i = Integer.valueOf(code, 16).intValue(); String characterName = st.nextToken(); String category = st.nextToken(); int index; for (index = 0; index < categoryNames.length; index++) { if (category.equals(categoryNames[index])) { break; } } if (index != categoryNames.length) { curCodeValue = Integer.parseInt(code, 16); if (prevIndex != index) { appendOldChar(prevIndex, prevCodeValue, prevCode); appendOldChar(index, curCodeValue, code); prevIndex = index; } else if (prevCodeValue != curCodeValue - 1) { if (setFirst && characterName.endsWith(" Last>")) { setFirst = false; } else { appendOldChar(prevIndex, prevCodeValue, prevCode); appendOldChar(index, curCodeValue, code); } } prevCodeValue = curCodeValue; prevCode = code; if (characterName.endsWith(" First>")) { setFirst = true; } } else { if (ignoredOld.indexOf(category) == -1) { ignoredOld.append(category); ignoredOld.append(' '); } } } appendOldChar(prevIndex, prevCodeValue, prevCode); bin.close(); fin.close(); } catch (Exception e) { throw new InternalError(e.toString()); } } private static void appendOldChar(int index, int code, String s) { int range; if (code < HighSurrogate_CodeUnit_Start) { range = BEFORE; } else if (code < Supplementary_CodePoint_Start) { range = AFTER; } else { range = SURROGATE; } if (oldListLen[range][index] > 64) { oldList[range][index].append("\"\n + \""); oldListLen[range][index] = 19; } if (code == 0x22 || code == 0x5c) { oldList[range][index].append('\\'); oldList[range][index].append((char)code); oldListLen[range][index] += 2; } else if (code > 0x20 && code < 0x7F) { oldList[range][index].append((char)code); oldListLen[range][index] ++; } else { if (range == SURROGATE) {// Need to convert code point to code unit oldList[range][index].append(toCodeUnit(code)); oldListLen[range][index] += 12; } else { oldList[range][index].append("\\u"); oldList[range][index].append(s); oldListLen[range][index] += 6; } } oldListCount[range][index] ++; oldTotalCount[index]++; } private static String toCodeUnit(int i) { StringBuffer sb = new StringBuffer(); sb.append("\\u"); sb.append(Integer.toString((i - Supplementary_CodePoint_Start) / 0x400 + HighSurrogate_CodeUnit_Start, 16).toUpperCase()); sb.append("\\u"); sb.append(Integer.toString(i % 0x400 + LowSurrogate_CodeUnit_Start, 16).toUpperCase()); return sb.toString(); } private static int toCodePoint(String s) { char c1 = s.charAt(0); if (s.length() == 1 || !Character.isHighSurrogate(c1)) { return (int)c1; } else { char c2 = s.charAt(1); if (s.length() != 2 || !Character.isLowSurrogate(c2)) { return -1; } return Character.toCodePoint(c1, c2); } } private static StringBuffer ignoredNew = new StringBuffer(); private static int[] newTotalCount = new int[categoryNames.length]; private static int[][] newListCount = new int[2][categoryNames.length]; private static int[][] newList = new int[categoryNames.length][]; private static final int BMP = 0; private static final int nonBMP = 1; /** * Makes CategoryMap in newer format which is used by JDK 1.5.0. */ private static void generateNewData() { /* Initialize arrays. */ for (int i = 0; i<categoryNames.length; i++) { newList[i] = new int[10]; } storeNewData(); if (newListCount[BMP][categoryNames.length-1] != 1) { System.err.println("This should not happen. Unicode data which belongs to an undefined category exists"); System.exit(1); } } private static void storeNewData() { try { FileReader fin = new FileReader(specfile); BufferedReader bin = new BufferedReader(fin); String line; int prevIndex = categoryNames.length - 1; int prevCodeValue = -1; int curCodeValue = 0; boolean setFirst = false; while ((line = bin.readLine()) != null) { if (line.length() == 0) { continue; } StringTokenizer st = new StringTokenizer(line, ";"); String code = st.nextToken(); char c = code.charAt(0); if (c == '#' || c == '/') { continue; } int i = Integer.valueOf(code, 16).intValue(); String characterName = st.nextToken(); String category = st.nextToken(); int index; for (index = 0; index < categoryNames.length; index++) { if (category.equals(categoryNames[index])) { break; } } if (index != categoryNames.length) { curCodeValue = Integer.parseInt(code, 16); if (prevIndex == index) { if (setFirst) { if (characterName.endsWith(" Last>")) { setFirst = false; } else { System.err.println("*** Error 1 at " + code); } } else { if (characterName.endsWith(" First>")) { setFirst = true; } else if (characterName.endsWith(" Last>")) { System.err.println("*** Error 2 at " + code); } else { if (prevCodeValue != curCodeValue - 1) { appendNewChar(prevIndex, prevCodeValue); appendNewChar(index, curCodeValue); } } } } else { if (setFirst) { System.err.println("*** Error 3 at " + code); } else if (characterName.endsWith(" First>")) { setFirst = true; } else if (characterName.endsWith(" Last>")) { System.err.println("*** Error 4 at " + code); } appendNewChar(prevIndex, prevCodeValue); appendNewChar(index, curCodeValue); prevIndex = index; } prevCodeValue = curCodeValue; } else { if (ignoredNew.indexOf(category) == -1) { ignoredNew.append(category); ignoredNew.append(' '); } } } appendNewChar(prevIndex, prevCodeValue); bin.close(); fin.close(); } catch (Exception e) { System.err.println("Error occurred on accessing " + specfile); e.printStackTrace(); System.exit(1); } } private static void appendNewChar(int index, int code) { int bufLen = newList[index].length; if (newTotalCount[index] == bufLen) { int[] tmpBuf = new int[bufLen + 10]; System.arraycopy(newList[index], 0, tmpBuf, 0, bufLen); newList[index] = tmpBuf; } newList[index][newTotalCount[index]++] = code; if (code < 0x10000) { newListCount[BMP][index]++; } else { newListCount[nonBMP][index]++; } } /* Generates the old CategoryMap. */ private static void generateOldDatafile() { try { FileWriter fout = new FileWriter(oldDatafile); BufferedWriter bout = new BufferedWriter(fout); bout.write("\n //\n // The following String[][] can be used in CharSet.java as is.\n //\n\n private static final String[][] categoryMap = {\n"); for (int i = 0; i < categoryNames.length - 1; i++) { if (oldTotalCount[i] != 0) { bout.write(" { \"" + categoryNames[i] + "\","); /* 0x0000-0xD7FF */ if (oldListCount[BEFORE][i] != 0) { bout.write(" \""); bout.write(oldList[BEFORE][i].toString() + "\"\n"); } /* 0xD800-0xFFFF */ if (oldListCount[AFTER][i] != 0) { if (oldListCount[BEFORE][i] != 0) { bout.write(" + \""); } else { bout.write(" \""); } bout.write(oldList[AFTER][i].toString() + "\"\n"); } /* 0xD800DC00(0x10000)-0xDBFF0xDFFFF(0x10FFFF) */ if (oldListCount[SURROGATE][i] != 0) { if (oldListCount[BEFORE][i] != 0 || oldListCount[AFTER][i] != 0) { bout.write(" + \""); } else { bout.write(" \""); } bout.write(oldList[SURROGATE][i].toString() + "\"\n"); } bout.write(" },\n"); } } bout.write(" };\n\n"); bout.close(); fout.close(); } catch (Exception e) { System.err.println("Error occurred on accessing " + oldDatafile); e.printStackTrace(); System.exit(1); } System.out.println("\n" + oldDatafile + " has been generated."); } /** * Test program to be generated */ private static final String outfile = "CharacterCategoryTest.java"; /* * Generates a test program which compare the generated date (newer one) * with the return values of Characger.getType(). */ private static void generateTestProgram() { try { FileWriter fout = new FileWriter(outfile); BufferedWriter bout = new BufferedWriter(fout); bout.write(collationMethod); bout.write("\n //\n // The following arrays can be used in CharSet.java as is.\n //\n\n"); bout.write(" private static final String[] categoryNames = {"); for (int i = 0; i < categoryNames.length - 1; i++) { if (i % 10 == 0) { bout.write("\n "); } bout.write("\"" + categoryNames[i] + "\", "); } bout.write("\n };\n\n"); bout.write(" private static final int[][] categoryMap = {\n"); for (int i = 0; i < categoryNames.length - 1; i++) { StringBuffer sb = new StringBuffer(" { /* Data for \"" + categoryNames[i] + "\" category */"); for (int j = 0; j < newTotalCount[i]; j++) { if (j % 8 == 0) { sb.append("\n "); } sb.append(" 0x"); sb.append(Integer.toString(newList[i][j], 16).toUpperCase()); sb.append(','); } sb.append("\n },\n"); bout.write(sb.toString()); } bout.write(" };\n"); bout.write("\n}\n"); bout.close(); fout.close(); } catch (Exception e) { System.err.println("Error occurred on accessing " + outfile); e.printStackTrace(); System.exit(1); } System.out.println("\n" + outfile + " has been generated."); } static String collationMethod = "public class CharacterCategoryTest {\n\n" + " static final int SIZE = 0x110000;\n" + " static final String[] category = {\n" + " \"Cn\", \"Lu\", \"Ll\", \"Lt\", \"Lm\", \"Lo\", \"Mn\", \"Me\",\n" + " \"Mc\", \"Nd\", \"Nl\", \"No\", \"Zs\", \"Zl\", \"Zp\", \"Cc\",\n" + " \"Cf\", \"\", \"Co\", \"Cs\", \"Pd\", \"Ps\", \"Pe\", \"Pc\",\n" + " \"Po\", \"Sm\", \"Sc\", \"Sk\", \"So\", \"Pi\", \"Pf\"\n" + " };\n\n" + " public static void main(String[] args) {\n" + " boolean err = false;\n" + " byte[] b = new byte[SIZE];\n" + " for (int i = 0; i < SIZE; i++) {\n" + " b[i] = 0;\n" + " }\n" + " for (int i = 0; i < categoryMap.length; i++) {\n" + " byte categoryNum = 0;\n" + " String categoryName = categoryNames[i];\n" + " for (int j = 0; j < category.length; j++) {\n" + " if (categoryName.equals(category[j])) {\n" + " categoryNum = (byte)j;\n" + " break;\n" + " }\n" + " }\n" + " int[] values = categoryMap[i];\n" + " for (int j = 0; j < values.length;) {\n" + " int firstChar = values[j++];\n" + " int lastChar = values[j++];\n" + " for (int k = firstChar; k <= lastChar; k++) {\n" + " b[k] = categoryNum;\n" + " }\n" + " }\n" + " }\n" + " for (int i = 0; i < SIZE; i++) {\n" + " int characterType = Character.getType(i);\n" + " if (b[i] != characterType) {\n" + " /* Co, Cs and Sk categories are ignored in CharacterCategory. */\n" + " if (characterType == Character.PRIVATE_USE ||\n" + " characterType == Character.SURROGATE ||\n" + " characterType == Character.MODIFIER_SYMBOL) {\n" + " continue;\n" + " }\n" + " err = true;\n" + " System.err.println(\"Category conflict for a character(0x\" +\n" + " Integer.toHexString(i) +\n" + " \"). CharSet.categoryMap:\" +\n" + " category[b[i]] +\n" + " \" Character.getType():\" +\n" + " category[characterType]);\n" + " }\n" + " }\n\n" + " if (err) {\n" + " throw new RuntimeException(\"Conflict occurred between Charset.categoryMap and Character.getType()\");\n" + " }\n" + " }\n"; }