/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is HTML Parser C++ Translator code.
*
* The Initial Developer of the Original Code is
* Mozilla Foundation.
* Portions created by the Initial Developer are Copyright (C) 2008
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Henri Sivonen <hsivonen@iki.fi>
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
package nu.validator.htmlparser.generator;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import nu.validator.htmlparser.cpptranslate.CppTypes;
/**
 * Command-line generator for the C++ named-character-reference tables.
 *
 * <p>The program reads an HTML file (first argument) line by line, extracts
 * every entity name and its code point(s) that match {@link #LINE_PATTERN},
 * and writes five files into the target directory (second argument). Each
 * file name is prefixed with {@code CppTypes.classPrefix()}:
 * {@code NamedCharacters.h}, {@code NamedCharactersInclude.h},
 * {@code NamedCharacters.cpp}, {@code NamedCharactersAccel.h} and
 * {@code NamedCharactersAccel.cpp}.
 */
public class GenerateNamedCharactersCpp {

    /**
     * The license for the output of this program except for data files.
     */
    private static final String OUTPUT_LICENSE = "/*\n"
            + " * Copyright (c) 2008-2010 Mozilla Foundation\n"
            + " *\n"
            + " * Permission is hereby granted, free of charge, to any person obtaining a \n"
            + " * copy of this software and associated documentation files (the \"Software\"), \n"
            + " * to deal in the Software without restriction, including without limitation \n"
            + " * the rights to use, copy, modify, merge, publish, distribute, sublicense, \n"
            + " * and/or sell copies of the Software, and to permit persons to whom the \n"
            + " * Software is furnished to do so, subject to the following conditions:\n"
            + " *\n"
            + " * The above copyright notice and this permission notice shall be included in \n"
            + " * all copies or substantial portions of the Software.\n"
            + " *\n"
            + " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"
            + " * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"
            + " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL \n"
            + " * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"
            + " * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING \n"
            + " * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER \n"
            + " * DEALINGS IN THE SOFTWARE.\n" + " */\n\n";

    /**
     * The license for the generated data files.
     */
    private static final String DATA_LICENSE = "/*\n"
            + " * Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and Opera \n"
            + " * Software ASA.\n"
            + " * \n"
            + " * You are granted a license to use, reproduce and create derivative works of \n"
            + " * this document.\n" + " */\n\n";

    /**
     * Matches one table row of the input document. Group 1 is the entity
     * name, group 2 the first hexadecimal code point and the optional group 3
     * a second hexadecimal code point.
     */
    private static final Pattern LINE_PATTERN = Pattern.compile("<td> <code title=\"\">([^<]*)</code> </td> <td> U\\+(\\S*) (?:U\\+(\\S*) )?</td>");

    /** Character encoding used for the input file and for every generated file. */
    private static final String ENCODING = "utf-8";

    /**
     * Literals written, in this order, into the generated
     * {@code WINDOWS_1252_DATA} array. The length of this table also supplies
     * the element count used by the generated {@code initializeStatics()}.
     * NOTE(review): presumably the code units that windows-1252 bytes
     * 0x80-0x9F map to; confirm against the tokenizer before editing.
     */
    private static final String[] WINDOWS_1252_LITERALS = {
        "0x20AC", "0x0081", "0x201A", "0x0192", "0x201E", "0x2026", "0x2020", "0x2021",
        "0x02C6", "0x2030", "0x0160", "0x2039", "0x0152", "0x008D", "0x017D", "0x008F",
        "0x0090", "0x2018", "0x2019", "0x201C", "0x201D", "0x2022", "0x2013", "0x2014",
        "0x02DC", "0x2122", "0x0161", "0x203A", "0x0153", "0x009D", "0x017E", "0x0178"
    };

    /**
     * Formats a UTF-16 code unit as a lower-case, zero-padded, four-digit
     * hexadecimal literal with a {@code 0x} prefix.
     *
     * @param c the value to format; must be in the range 0..0xFFFF
     * @return the literal, e.g. {@code 0x00e9}
     * @throws IllegalArgumentException if {@code c} is outside 0..0xFFFF
     */
    private static String toHexString(int c) {
        if (c < 0 || c > 0xFFFF) {
            throw new IllegalArgumentException(
                    "Not a 16-bit code unit: " + Integer.toHexString(c));
        }
        String digits = Integer.toHexString(c);
        StringBuilder literal = new StringBuilder("0x");
        for (int pad = digits.length(); pad < 4; pad++) {
            literal.append('0');
        }
        return literal.append(digits).toString();
    }

    /**
     * Reads the entity table and writes all generated files.
     *
     * @param args {@code args[0]} is the input HTML file, {@code args[1]} the
     *        directory that receives the generated files
     * @throws IOException if reading or writing fails, or if the input
     *         contains no entity rows
     * @throws IllegalArgumentException if fewer than two arguments are given
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: GenerateNamedCharactersCpp <input-html-file> <target-directory>");
        }
        TreeMap<String, String> entities = readEntities(args[0]);
        if (entities.isEmpty()) {
            // Fail before any file is touched; the accelerator table cannot be
            // built from an empty map.
            throw new IOException("No named character references found in " + args[0]);
        }
        CppTypes cppTypes = new CppTypes(null);
        File targetDirectory = new File(args[1]);
        generateH(targetDirectory, cppTypes, entities);
        generateInclude(targetDirectory, cppTypes, entities);
        generateCpp(targetDirectory, cppTypes, entities);
        generateAccelH(targetDirectory, cppTypes, entities);
        generateAccelCpp(targetDirectory, cppTypes, entities);
    }

    /**
     * Parses the input file into a sorted map from entity name to its value
     * as a Java string of one or two UTF-16 code units.
     *
     * @param inputPath path of the UTF-8 encoded input file
     * @return the entities found, sorted by name
     * @throws IOException if the file cannot be read
     */
    private static TreeMap<String, String> readEntities(String inputPath) throws IOException {
        TreeMap<String, String> entities = new TreeMap<String, String>();
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(inputPath), ENCODING));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher m = LINE_PATTERN.matcher(line);
                while (m.find()) {
                    entities.put(m.group(1), decodeValue(m.group(2), m.group(3)));
                }
            }
        } finally {
            reader.close();
        }
        return entities;
    }

    /**
     * Converts the hexadecimal code point text of one table row to a string.
     *
     * @param firstHex the first (or only) code point in hexadecimal
     * @param secondHex the second code point in hexadecimal, or {@code null}
     *        when the row has a single code point
     * @return a string of one or two UTF-16 code units
     */
    private static String decodeValue(String firstHex, String secondHex) {
        int first = Integer.parseInt(firstHex, 16);
        if (secondHex != null) {
            // two BMP chars
            int second = Integer.parseInt(secondHex, 16);
            return ("" + (char) first) + (char) second;
        }
        // One code point: a single char for the BMP, a surrogate pair above it.
        return new String(Character.toChars(first));
    }

    /**
     * Opens a UTF-8 writer for a file in the target directory.
     *
     * @param targetDirectory the directory that receives the file
     * @param fileName the file name within that directory
     * @return a writer the caller must close
     * @throws IOException if the file cannot be opened for writing
     */
    private static Writer openWriter(File targetDirectory, String fileName) throws IOException {
        return new OutputStreamWriter(
                new FileOutputStream(new File(targetDirectory, fileName)), ENCODING);
    }

    /**
     * Writes one {@code #include "X.h"} line for every entry of
     * {@code cppTypes.namedCharactersIncludes()}.
     */
    private static void writeIncludes(Writer out, CppTypes cppTypes) throws IOException {
        String[] includes = cppTypes.namedCharactersIncludes();
        for (int i = 0; i < includes.length; i++) {
            out.write("#include \"" + includes[i] + ".h\"\n");
        }
    }

    /**
     * Builds the lookup table that maps the first two characters of an entity
     * name to a packed range of rows in the sorted entity list. The table is
     * indexed as {@code [second character][charToIndex(first character)]};
     * each non-zero cell holds {@code (lastRow << 16) | firstRow}.
     *
     * @param entities the sorted entities; must not be empty
     * @return the table; cells for unused prefixes remain zero
     */
    private static int[][] buildHiLoTable(TreeMap<String, String> entities) {
        // Java initializes arrays to zero. Zero is our magic value for no hilo
        // value.
        int[][] hiLoTable = new int['z' + 1]['Z' - 'A' + 1 + 'z' - 'a' + 1];
        String firstName = entities.firstKey();
        int firstKey = charToIndex(firstName.charAt(0));
        int secondKey = firstName.charAt(1);
        int row = 0;
        int lo = 0;
        for (String name : entities.keySet()) {
            int newFirst = charToIndex(name.charAt(0));
            int newSecond = name.charAt(1);
            // NOTE(review): newSecond is the raw character, not an index, so
            // this only trips when the second character is NUL.
            assert !(newFirst == 0 && newSecond == 0) : "Not prepared for name starting with AA";
            if (firstKey != newFirst || secondKey != newSecond) {
                // The prefix changed: close the range of the previous prefix.
                hiLoTable[secondKey][firstKey] = ((row - 1) << 16) | lo;
                lo = row;
                firstKey = newFirst;
                secondKey = newSecond;
            }
            row++;
        }
        // Close the range of the final prefix.
        hiLoTable[secondKey][firstKey] = ((entities.size() - 1) << 16) | lo;
        return hiLoTable;
    }

    /**
     * Writes {@code <prefix>NamedCharactersAccel.cpp}: one static array per
     * non-empty row of the hi/lo table plus the {@code HILO_ACCEL} array of
     * pointers to those rows (0 for empty rows).
     *
     * @param targetDirectory the directory that receives the file
     * @param cppTypes supplies the class prefix and C++ type names
     * @param entities the sorted entities; must not be empty
     * @throws IOException if writing fails
     */
    private static void generateAccelCpp(File targetDirectory,
            CppTypes cppTypes, TreeMap<String, String> entities) throws IOException {
        // Build the table first so a bad name cannot leave a half-written file.
        int[][] hiLoTable = buildHiLoTable(entities);
        Writer out = openWriter(targetDirectory, cppTypes.classPrefix()
                + "NamedCharactersAccel.cpp");
        try {
            out.write(DATA_LICENSE);
            out.write('\n');
            out.write("#include \"" + cppTypes.classPrefix()
                    + "NamedCharactersAccel.h\"\n");
            out.write("\n");

            for (int i = 0; i < hiLoTable.length; i++) {
                if (allZero(hiLoTable[i])) {
                    continue;
                }
                out.write("static " + cppTypes.intType() + " const HILO_ACCEL_"
                        + i + "[] = {\n");
                for (int j = 0; j < hiLoTable[i].length; j++) {
                    if (j != 0) {
                        out.write(", ");
                    }
                    out.write(Integer.toString(hiLoTable[i][j]));
                }
                out.write("\n};\n\n");
            }

            // Use the same type name as the declaration written by
            // generateAccelH so definition and declaration cannot diverge.
            out.write("const " + cppTypes.intType() + "* const " + cppTypes.classPrefix()
                    + "NamedCharactersAccel::HILO_ACCEL[] = {\n");
            for (int i = 0; i < hiLoTable.length; i++) {
                if (i != 0) {
                    out.write(",\n");
                }
                if (allZero(hiLoTable[i])) {
                    out.write(" 0");
                } else {
                    out.write(" HILO_ACCEL_" + i);
                }
            }
            out.write("\n};\n\n");
            out.flush();
        } finally {
            out.close();
        }
    }

    /**
     * Writes {@code <prefix>NamedCharactersAccel.h}, which declares the
     * {@code HILO_ACCEL} table.
     *
     * @param targetDirectory the directory that receives the file
     * @param cppTypes supplies the class prefix, includes and C++ type names
     * @param entities unused by this method
     * @throws IOException if writing fails
     */
    private static void generateAccelH(File targetDirectory, CppTypes cppTypes,
            TreeMap<String, String> entities) throws IOException {
        Writer out = openWriter(targetDirectory, cppTypes.classPrefix()
                + "NamedCharactersAccel.h");
        try {
            out.write(DATA_LICENSE);
            out.write("#ifndef " + cppTypes.classPrefix() + "NamedCharactersAccel_h\n");
            out.write("#define " + cppTypes.classPrefix() + "NamedCharactersAccel_h\n");
            out.write('\n');
            writeIncludes(out, cppTypes);
            out.write('\n');
            out.write("class " + cppTypes.classPrefix() + "NamedCharactersAccel\n");
            out.write("{\n");
            out.write(" public:\n");
            out.write(" static const " + cppTypes.intType()
                    + "* const HILO_ACCEL[];\n");
            out.write("};\n");
            out.write("\n#endif // " + cppTypes.classPrefix()
                    + "NamedCharactersAccel_h\n");
            out.flush();
        } finally {
            out.close();
        }
    }

    /**
     * Writes {@code <prefix>NamedCharacters.h}, which declares the character
     * name struct and the {@code NamedCharacters} class.
     *
     * @param targetDirectory the directory that receives the file
     * @param cppTypes supplies the class prefix, includes and C++ type names
     * @param entities unused by this method
     * @throws IOException if writing fails
     */
    private static void generateH(File targetDirectory, CppTypes cppTypes,
            Map<String, String> entities) throws IOException {
        Writer out = openWriter(targetDirectory, cppTypes.classPrefix()
                + "NamedCharacters.h");
        try {
            out.write(OUTPUT_LICENSE);
            out.write("#ifndef " + cppTypes.classPrefix() + "NamedCharacters_h\n");
            out.write("#define " + cppTypes.classPrefix() + "NamedCharacters_h\n");
            out.write('\n');
            writeIncludes(out, cppTypes);

            out.write("\nstruct ");
            out.write(cppTypes.characterNameTypeDeclaration());
            out.write(" {\n ");
            out.write(cppTypes.unsignedShortType());
            out.write(" nameStart;\n ");
            out.write(cppTypes.unsignedShortType());
            out.write(" nameLen;\n #ifdef DEBUG\n ");
            out.write(cppTypes.intType());
            out.write(" n;\n #endif\n ");
            out.write(cppTypes.intType());
            out.write(" length() const;\n ");
            out.write(cppTypes.charType());
            out.write(" charAt(");
            out.write(cppTypes.intType());
            out.write(" index) const;\n};\n\n");

            out.write("class " + cppTypes.classPrefix() + "NamedCharacters\n");
            out.write("{\n");
            out.write(" public:\n");
            out.write(" static const " + cppTypes.characterNameTypeDeclaration() + " NAMES[];\n");
            out.write(" static const " + cppTypes.charType() + " VALUES[][2];\n");
            out.write(" static " + cppTypes.charType() + "** WINDOWS_1252;\n");
            out.write(" static void initializeStatics();\n");
            out.write(" static void releaseStatics();\n");
            out.write("};\n");
            out.write("\n#endif // " + cppTypes.classPrefix()
                    + "NamedCharacters_h\n");
            out.flush();
        } finally {
            out.close();
        }
    }

    /**
     * Writes {@code <prefix>NamedCharactersInclude.h}: a header comment, a
     * {@code #define _ ,} helper and one {@code NAMED_CHARACTER_REFERENCE}
     * invocation per entity, in map iteration order.
     *
     * @param targetDirectory the directory that receives the file
     * @param cppTypes supplies the class prefix
     * @param entities entity names mapped to their one- or two-unit values
     * @throws IOException if writing fails
     */
    private static void generateInclude(File targetDirectory,
            CppTypes cppTypes, Map<String, String> entities) throws IOException {
        Writer out = openWriter(targetDirectory, cppTypes.classPrefix()
                + "NamedCharactersInclude.h");
        try {
            out.write(DATA_LICENSE);
            out.write("/* Data generated from the table of named character references found at\n");
            out.write(" *\n");
            out.write(" * http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html#named-character-references\n");
            out.write(" *\n");
            out.write(" * Files that #include this file must #define NAMED_CHARACTER_REFERENCE as a\n");
            // The macro takes five arguments (N, CHARS, LEN, FLAG, VALUE), as
            // defined by defineMacroAndInclude; the emitted text says so.
            out.write(" * macro of five parameters:\n");
            out.write(" *\n");
            out.write(" * 1. a unique integer N identifying the Nth [0,1,..] macro expansion in this file,\n");
            out.write(" * 2. a comma-separated sequence of characters comprising the character name,\n");
            out.write(" * without the first two letters or 0 if the sequence would be empty. \n");
            out.write(" * See Tokenizer.java.\n");
            out.write(" * 3. the length of this sequence of characters,\n");
            out.write(" * 4. placeholder flag (0 if argument #2 is not a placeholder and 1 if it is),\n");
            out.write(" * 5. a comma-separated sequence of char16_t literals corresponding\n");
            out.write(" * to the code-point(s) of the named character.\n");
            out.write(" *\n");
            out.write(" * The macro expansion doesn't have to refer to all or any of these parameters,\n");
            out.write(" * but common sense dictates that it should involve at least one of them.\n");
            out.write(" */\n");
            out.write("\n");
            out.write("// This #define allows the NAMED_CHARACTER_REFERENCE macro to accept comma-\n");
            out.write("// separated sequences as single macro arguments. Using commas directly would\n");
            out.write("// split the sequence into multiple macro arguments.\n");
            out.write("#define _ ,\n");
            out.write("\n");

            int index = 0;
            for (Map.Entry<String, String> entity : entities.entrySet()) {
                String name = entity.getKey();
                out.write("NAMED_CHARACTER_REFERENCE(" + index + ", ");
                index++;
                writeNameInitializer(out, name, " _ ");
                out.write(", " + (name.length() - 2) + ", ");
                // Flag is 1 when the name has only two letters, in which case
                // writeNameInitializer emitted the placeholder 0.
                out.write((name.length() == 2 ? "1" : "0") + ", ");
                writeValueInitializer(out, entity.getValue(), " _ ");
                out.write(")\n");
            }
            out.write("\n");
            out.write("#undef _\n");
            out.flush();
        } finally {
            out.close();
        }
    }

    /**
     * Writes the characters of {@code name} after its first two as quoted
     * character literals joined by {@code separator}, preceded by a comment
     * showing the two skipped characters. Writes {@code 0} when nothing
     * remains after the first two characters.
     *
     * @param out the destination
     * @param name the entity name; must have at least two characters
     * @param separator text placed between consecutive literals
     * @throws IOException if writing fails
     */
    private static void writeNameInitializer(Writer out,
            String name, String separator)
            throws IOException {
        out.write("/* " + name.charAt(0) + " " + name.charAt(1) + " */ ");
        if (name.length() == 2) {
            out.write("0");
            return;
        }
        int last = name.length() - 1;
        for (int i = 2; i <= last; i++) {
            out.write("'" + name.charAt(i) + "'");
            if (i < last) {
                out.write(separator);
            }
        }
    }

    /**
     * Writes the value as exactly two hexadecimal literals joined by
     * {@code separator}; a one-unit value is padded with {@code 0}.
     *
     * @param out the destination
     * @param value the entity value; must have at least one character
     * @param separator text placed between the two literals
     * @throws IOException if writing fails
     */
    private static void writeValueInitializer(Writer out,
            String value, String separator)
            throws IOException {
        out.write(toHexString(value.charAt(0)));
        out.write(separator);
        out.write(value.length() == 1 ? "0" : toHexString(value.charAt(1)));
    }

    /**
     * Writes a {@code NAMED_CHARACTER_REFERENCE} definition with the given
     * expansion, an {@code #include} of the data file and the matching
     * {@code #undef}.
     *
     * @param out the destination
     * @param expansion the macro body
     * @param includeFile the file to include while the macro is defined
     * @throws IOException if writing fails
     */
    private static void defineMacroAndInclude(Writer out, String expansion,
            String includeFile) throws IOException {
        out.write("#define NAMED_CHARACTER_REFERENCE(N, CHARS, LEN, FLAG, VALUE) \\\n"
                + expansion + "\n");
        out.write("#include \"" + includeFile + "\"\n");
        out.write("#undef NAMED_CHARACTER_REFERENCE\n");
    }

    /**
     * Like {@link #defineMacroAndInclude(Writer, String, String)}, but emits
     * {@code debugExpansion} under {@code #ifdef DEBUG} and {@code expansion}
     * otherwise.
     *
     * @param out the destination
     * @param expansion the macro body for non-debug builds
     * @param debugExpansion the macro body for debug builds
     * @param includeFile the file to include while the macro is defined
     * @throws IOException if writing fails
     */
    private static void defineMacroAndInclude(Writer out, String expansion,
            String debugExpansion, String includeFile) throws IOException {
        out.write("#ifdef DEBUG\n");
        out.write(" #define NAMED_CHARACTER_REFERENCE(N, CHARS, LEN, FLAG, VALUE) \\\n"
                + debugExpansion + "\n");
        out.write("#else\n");
        out.write(" #define NAMED_CHARACTER_REFERENCE(N, CHARS, LEN, FLAG, VALUE) \\\n"
                + expansion + "\n");
        out.write("#endif\n");
        out.write("#include \"" + includeFile + "\"\n");
        out.write("#undef NAMED_CHARACTER_REFERENCE\n");
    }

    /**
     * Writes the out-of-class definition line of a static member of the
     * generated {@code NamedCharacters} class.
     *
     * @param out the destination
     * @param cppTypes supplies the class prefix
     * @param type the C++ type of the member
     * @param name the member name
     * @throws IOException if writing fails
     */
    private static void writeStaticMemberDeclaration(Writer out,
            CppTypes cppTypes, String type, String name) throws IOException {
        out.write(type + " " + cppTypes.classPrefix() + "NamedCharacters::"
                + name + ";\n");
    }

    /**
     * Maps an ASCII letter to a table column: {@code A-Z} to 0..25 and
     * {@code a-z} to 26..51.
     *
     * @param c the first character of an entity name
     * @return the column index
     * @throws IllegalArgumentException if {@code c} is not an ASCII letter
     */
    private static int charToIndex(char c) {
        if (c >= 'A' && c <= 'Z') {
            return c - 'A';
        }
        if (c >= 'a' && c <= 'z') {
            return c - 'a' + 26;
        }
        throw new IllegalArgumentException("Bad char in named character name: "
                + c);
    }

    /**
     * Tells whether every element of the array is zero.
     *
     * @param arr the array to inspect
     * @return {@code true} if no element is non-zero (including when empty)
     */
    private static boolean allZero(int[] arr) {
        for (int value : arr) {
            if (value != 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Writes the generated {@code WINDOWS_1252_DATA} array from
     * {@link #WINDOWS_1252_LITERALS}, one literal per line.
     */
    private static void writeWindows1252Data(Writer out, CppTypes cppTypes) throws IOException {
        out.write("static " + cppTypes.charType()
                + " const WINDOWS_1252_DATA[] = {\n");
        int last = WINDOWS_1252_LITERALS.length - 1;
        for (int i = 0; i <= last; i++) {
            out.write(" " + WINDOWS_1252_LITERALS[i]);
            // No comma after the final literal.
            out.write(i < last ? ",\n" : "\n");
        }
        out.write("};\n\n");
    }

    /**
     * Writes {@code <prefix>NamedCharacters.cpp}: the {@code VALUES} and
     * {@code NAMES} tables (built by re-including the data file under
     * different macro definitions), the windows-1252 data, the name struct's
     * accessors and the static initialization and release functions.
     *
     * @param targetDirectory the directory that receives the file
     * @param cppTypes supplies the class prefix, includes and C++ type names
     * @param entities unused by this method
     * @throws IOException if writing fails
     */
    private static void generateCpp(File targetDirectory, CppTypes cppTypes,
            Map<String, String> entities) throws IOException {
        String includeFile = cppTypes.classPrefix()
                + "NamedCharactersInclude.h";
        Writer out = openWriter(targetDirectory, cppTypes.classPrefix()
                + "NamedCharacters.cpp");
        try {
            out.write(OUTPUT_LICENSE);
            out.write("#define " + cppTypes.classPrefix()
                    + "NamedCharacters_cpp_\n");
            writeIncludes(out, cppTypes);
            out.write('\n');
            out.write("#include \"" + cppTypes.classPrefix()
                    + "NamedCharacters.h\"\n");
            out.write("\n");

            out.write("const " + cppTypes.charType() + " " + cppTypes.classPrefix()
                    + "NamedCharacters::VALUES[][2] = {\n");
            defineMacroAndInclude(out, "{ VALUE },", includeFile);
            // The useless terminator entry makes the above macro simpler with
            // compilers that whine about a comma after the last item
            out.write("{0, 0} };\n\n");

            writeStaticMemberDeclaration(out, cppTypes,
                    cppTypes.charType() + "**", "WINDOWS_1252");
            writeWindows1252Data(out, cppTypes);

            out.write("/**\n");
            out.write(" * To avoid having lots of pointers in the |charData| array, below,\n");
            out.write(" * which would cause us to have to do lots of relocations at library\n");
            out.write(" * load time, store all the string data for the names in one big array.\n");
            out.write(" * Then use tricks with enums to help us build an array that contains\n");
            out.write(" * the positions of each within the big arrays.\n");
            out.write(" */\n\n");

            out.write("static const " + cppTypes.byteType() + " ALL_NAMES[] = {\n");
            defineMacroAndInclude(out, "CHARS ,", includeFile);
            out.write("};\n\n");

            out.write("enum NamePositions {\n");
            out.write(" DUMMY_INITIAL_NAME_POSITION = 0,\n");
            out.write("/* enums don't take up space, so generate _START and _END */\n");
            defineMacroAndInclude(out,
                    "NAME_##N##_DUMMY, /* automatically one higher than previous */ \\\n"
                            + "NAME_##N##_START = NAME_##N##_DUMMY - 1, \\\n"
                            + "NAME_##N##_END = NAME_##N##_START + LEN + FLAG,",
                    includeFile);
            out.write(" DUMMY_FINAL_NAME_VALUE\n");
            out.write("};\n\n");

            String arrayLengthMacro = cppTypes.arrayLengthMacro();
            String staticAssert = cppTypes.staticAssert();
            if (staticAssert != null && arrayLengthMacro != null) {
                out.write("/* check that the start positions will fit in 16 bits */\n");
                out.write(staticAssert + "(" + arrayLengthMacro
                        + "(ALL_NAMES) < 0x10000);\n\n");
            }

            out.write("const " + cppTypes.characterNameTypeDeclaration() + " " + cppTypes.classPrefix()
                    + "NamedCharacters::NAMES[] = {\n");
            defineMacroAndInclude(out, "{ NAME_##N##_START, LEN, },", "{ NAME_##N##_START, LEN, N },", includeFile);
            out.write("};\n\n");

            out.write(cppTypes.intType());
            out.write("\n");
            out.write(cppTypes.characterNameTypeDeclaration());
            out.write("::length() const\n{\n return nameLen;\n}\n\n");

            out.write(cppTypes.charType());
            out.write("\n");
            out.write(cppTypes.characterNameTypeDeclaration());
            out.write("::charAt(");
            // Same type name as the declaration written by generateH.
            out.write(cppTypes.intType());
            out.write(" index) const\n{\n return static_cast<");
            out.write(cppTypes.charType());
            out.write("> (ALL_NAMES[nameStart + index]);\n}\n\n");

            // Element count comes from the table that WINDOWS_1252_DATA is
            // generated from, so the two cannot drift apart.
            int windows1252Count = WINDOWS_1252_LITERALS.length;
            out.write("void\n");
            out.write(cppTypes.classPrefix()
                    + "NamedCharacters::initializeStatics()\n");
            out.write("{\n");
            out.write(" WINDOWS_1252 = new " + cppTypes.charType() + "*["
                    + windows1252Count + "];\n");
            out.write(" for (" + cppTypes.intType() + " i = 0; i < "
                    + windows1252Count + "; ++i) {\n");
            out.write(" WINDOWS_1252[i] = (" + cppTypes.charType()
                    + "*)&(WINDOWS_1252_DATA[i]);\n");
            out.write(" }\n");
            out.write("}\n");
            out.write("\n");
            out.write("void\n");
            out.write(cppTypes.classPrefix()
                    + "NamedCharacters::releaseStatics()\n");
            out.write("{\n");
            out.write(" delete[] WINDOWS_1252;\n");
            out.write("}\n");
            out.flush();
        } finally {
            out.close();
        }
    }
}