/** * Copyright (c) 2012-2016 André Bargull * Alle Rechte vorbehalten / All Rights Reserved. Use is subject to license terms. * * <https://github.com/anba/es6draft> */ package com.github.anba.es6draft.regexp; import java.io.IOException; import java.io.PrintStream; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collection; import java.util.Formatter; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; /** * Utility class to generate the case folding data used in {@link CaseFoldData} */ @SuppressWarnings("unused") final class CaseFoldDataGenerator { private CaseFoldDataGenerator() { } public static void main(String[] args) throws IOException { Path unicode = Paths.get(args.length > 0 ? args[0] : "/tmp/unicode/Unicode8.0"); // String range = generateSpaceRange(unicode); // System.out.println(range); CaseFold bmpMapping = generateCaseFoldDataBMP(unicode); CaseFold unicodeMapping = generateCaseFoldDataUnicode(unicode); CaseFold caseFoldMapping = generateCaseFoldData(unicode); System.out.println(unicodeMapping.equals(caseFoldMapping)); // unicodeMapping.printTo(System.out); // String caseFoldTest = generateCaseFoldTest(unicode); // System.out.println(caseFoldTest); } static CaseFold generateCaseFoldDataBMP(Path unicode) throws IOException { Map<Integer, Integer> toUpper = new HashMap<>(); Map<Integer, Integer> toLower = new HashMap<>(); caseMappings(unicode, toUpper, toLower); CaseFold caseFolding = new CaseFold(); codePoints(unicode).forEach(codeValue -> { int caseFold; if (toUpper.containsKey(codeValue)) { caseFold = toUpper.get(codeValue); } else { return; } // ES2015, 21.2.2.8.2 Runtime Semantics: Canonicalize ( ch ) // 1. Ignore non-BMP code points. if (codeValue > 0xffff) { return; } // 2. Ignore mapping outside of basic multilingual plane. if (caseFold > 0xffff) { return; } // 3. Ignore mapping from non-ASCII to ASCII. if (codeValue > 0x7f && caseFold <= 0x7f) { return; } caseFolding.add(codeValue, caseFold); }); return caseFolding; } static CaseFold generateCaseFoldDataUnicode(Path unicode) throws IOException { Map<Integer, Integer> toUpper = new HashMap<>(); Map<Integer, Integer> toLower = new HashMap<>(); caseMappings(unicode, toUpper, toLower); CaseFold caseFolding = new CaseFold(); codePoints(unicode).forEach(codeValue -> { int caseFold; if (isCherokeeUppercase(codeValue)) { // Switch Cherokee uppercase/lowercase for compatibility with CaseFolding.txt output. return; } else if (isCherokeeLowercase(codeValue)) { caseFold = toUpper.get(codeValue); } else if (toLower.containsKey(codeValue)) { caseFold = toLower.get(codeValue); } else if (toUpper.containsKey(codeValue) && toLower.containsKey(toUpper.get(codeValue)) && codeValue != toLower.get(toUpper.get(codeValue))) { caseFold = toLower.get(toUpper.get(codeValue)); } else { return; } caseFolding.add(codeValue, caseFold); }); return caseFolding; } static CaseFold generateCaseFoldData(Path unicode) throws IOException { CaseFold caseFolding = new CaseFold(); caseFolding(unicode).forEach(m -> { char kind = m.group("status").charAt(0); if (kind == 'T' || kind == 'F') { return; } if (!isSingleCodePoint(m.group("mapping"))) { System.err.println("Invalid line: " + m.group()); } int codeValue = Integer.parseInt(m.group("code"), 16); int caseFold = Integer.parseInt(m.group("mapping"), 16); caseFolding.add(codeValue, caseFold); }); return caseFolding; } /** * Generates the test data for "unicode_case_folding.jsm". */ static String generateCaseFoldTest(Path unicode) throws IOException { class CaseFoldRange { final Stream.Builder<int[]> builder = Stream.builder(); boolean started, inRange; int startCodeValue, startCaseFold; int endCodeValue, endCaseFold; int steps; Stream<int[]> stream() { if (started) { builder.add(new int[] { startCodeValue, startCaseFold, endCodeValue, endCaseFold, steps }); } return builder.build(); } CaseFoldRange add(int[] m) { int codeValue = m[0]; int caseFold = m[1]; assert codeValue > endCodeValue; int step1 = codeValue - endCodeValue; int step2 = caseFold - endCaseFold; if (started && step1 == step2 && (!inRange || step1 == steps)) { endCodeValue = codeValue; endCaseFold = caseFold; steps = step1; inRange = true; } else { if (started) { builder.add(new int[] { startCodeValue, startCaseFold, endCodeValue, endCaseFold, steps }); } startCodeValue = endCodeValue = codeValue; startCaseFold = endCaseFold = caseFold; steps = 1; started = true; inRange = false; } return this; } CaseFoldRange unsupportedCombine(CaseFoldRange other) { throw new IllegalStateException(); } } Map<Integer, Integer> toUpper = new HashMap<>(); Map<Integer, Integer> toLower = new HashMap<>(); caseMappings(unicode, toUpper, toLower); return caseFolding(unicode).filter(m -> { char kind = m.group("status").charAt(0); return kind == 'C' || kind == 'S'; }).map(m -> { return new int[] { Integer.parseInt(m.group("code"), 16), Integer.parseInt(m.group("mapping"), 16) }; }).sequential().reduce(new CaseFoldRange(), CaseFoldRange::add, CaseFoldRange::unsupportedCombine).stream() .map(range -> { int startCodeValue = range[0], startCaseFold = range[1]; int endCodeValue = range[2], endCaseFold = range[3]; int steps = range[4]; String type; if (startCodeValue <= 0xff && startCaseFold <= 0xff && endCodeValue <= 0xff && endCaseFold <= 0xff) { type = "latin"; } else if (startCodeValue <= 0xffff && startCaseFold <= 0xffff && endCodeValue <= 0xffff && endCaseFold <= 0xffff) { type = "basic"; } else { type = "supplementary"; } String options = ""; if (startCodeValue == endCodeValue && startCaseFold == endCaseFold) { if (startCodeValue > 0x7f && startCaseFold <= 0x7f) { // Single mapping with case folding into ASCII range. options = ", {unicode: true}"; } else { // Single mapping with different case-fold-upper value. int codePoint = startCodeValue; int upperCase = toUpper.getOrDefault(codePoint, codePoint); int caseFold = toLower.getOrDefault(upperCase, upperCase); int caseFoldUpper = toUpper.getOrDefault(caseFold, caseFold); if (codePoint == upperCase && codePoint != caseFoldUpper) { options = ", {unicode: true}"; } } } return String.format("test(range(0x%x, 0x%x, %d), range(0x%x, 0x%x, %d), %s%s);", startCodeValue, endCodeValue, steps, startCaseFold, endCaseFold, steps, type, options); }).collect(Collectors.joining("\n")); } private static final class CaseFold { private final List<Integer> caseFold_From = new ArrayList<>(); private final List<Integer> caseFold_To = new ArrayList<>(); private final List<Integer> caseUnfold_From = new ArrayList<>(); private final Map<Integer, List<Integer>> caseUnfold_To = new LinkedHashMap<>(); void add(int codeValue, int caseFold) { assert codeValue != caseFold : String.format("%d == %d", codeValue, caseFold); caseFold_From.add(codeValue); caseFold_To.add(caseFold); if (!caseUnfold_To.containsKey(caseFold)) { caseUnfold_From.add(caseFold); caseUnfold_To.put(caseFold, new ArrayList<>()); } caseUnfold_To.get(caseFold).add(codeValue); } void printTo(PrintStream stream) { stream.println(array("CaseFold_From", caseFold_From)); stream.println(array("CaseFold_To", caseFold_To)); stream.println(array("CaseUnfold_From", caseUnfold_From)); stream.println(array("CaseUnfold_To", caseUnfold_To.values())); } @Override public boolean equals(Object obj) { if (!(obj instanceof CaseFold)) { return false; } CaseFold other = (CaseFold) obj; if (!caseFold_From.equals(other.caseFold_From)) { return false; } if (!caseFold_To.equals(other.caseFold_To)) { return false; } if (!caseUnfold_From.equals(other.caseUnfold_From)) { return false; } if (!caseUnfold_To.equals(other.caseUnfold_To)) { return false; } return true; } private static String array(String name, List<Integer> codePoints) { try (Formatter fmt = new Formatter(new StringBuilder(), Locale.ROOT)) { fmt.format("static final int[] %s = {%n/* @formatter:off */%n", name); boolean isNewLine = true; int index = 0; for (int codePoint : codePoints) { if (isNewLine) { isNewLine = false; } fmt.format("0x%x,", codePoint); if (++index % 8 == 0) { isNewLine = true; fmt.format("%n"); } else { fmt.format(" "); } } if (!isNewLine) { fmt.format("%n"); } fmt.format("/* @formatter:on */%n};%n"); return fmt.toString(); } } private static String array(String name, Collection<List<Integer>> codePoints) { try (Formatter fmt = new Formatter(new StringBuilder(), Locale.ROOT)) { fmt.format("static final int[][] %s = {%n/* @formatter:off */%n", name); boolean isNewLine = true; int index = 0; for (List<Integer> codePoint : codePoints) { if (isNewLine) { isNewLine = false; } fmt.format("{"); String prefix = ""; for (int cp : codePoint) { fmt.format("%s0x%x", prefix, cp); prefix = ", "; } fmt.format("},"); if (++index % 6 == 0) { isNewLine = true; fmt.format("%n"); } else { fmt.format(" "); } } if (!isNewLine) { fmt.format("%n"); } fmt.format("/* @formatter:on */%n};%n"); return fmt.toString(); } } } private static IntStream codePoints(Path unicode) throws IOException { return unicodeData(unicode).mapToInt(m -> Integer.parseInt(m.group("codeValue"), 16)); } private static Stream<Matcher> unicodeData(Path unicode) throws IOException { // ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html String codeValue = "(?<codeValue>[0-9A-F]{4,6})"; String characterName = "(?<characterName>[A-Z0-9\\- ]+|<control>|<(?<rangeName>[A-Za-z0-9 ]+), (?<range>First|Last)>)"; String generalCategory = "(?<generalCategory>[A-Z][a-z])"; String canonicalCombiningClass = "(?<canonicalCombiningClass>[0-9]+)"; String bidirectionalCategory = "(?<bidirectionalCategory>[A-Z]{1,3})"; String characterDecompositionMapping = "(?<characterDecompositionMapping>(?:<[A-Za-z]+> )?[0-9A-F]{4,6}(?: [0-9A-F]{4,6})*)?"; String decimalDigitValue = "(?<decimalDigitValue>[0-9])?"; String digitValue = "(?<digitValue>[0-9]+)?"; String numericValue = "(?<numericValue>-?[0-9]+(?:/[0-9]+)?)?"; String mirrored = "(?<mirrored>Y|N)"; String unicode1Name = "(?<unicode1Name>[^;]*)"; String commentField = "(?<commentField>[^;]*)"; String uppercaseMapping = "(?<uppercaseMapping>[0-9A-F]{4,6})?"; String lowercaseMapping = "(?<lowercaseMapping>[0-9A-F]{4,6})?"; String titlecaseMapping = "(?<titlecaseMapping>[0-9A-F]{4,6})?"; Pattern p = Pattern.compile(String.join(";", codeValue, characterName, generalCategory, canonicalCombiningClass, bidirectionalCategory, characterDecompositionMapping, decimalDigitValue, digitValue, numericValue, mirrored, unicode1Name, commentField, uppercaseMapping, lowercaseMapping, titlecaseMapping)); return unicodeStream(unicode.resolve("UnicodeData.txt"), p); } private static Stream<Matcher> caseFolding(Path unicode) throws IOException { // Format "<code>; <status>; <mapping>; # <name>" defined in CaseFolding.txt. String code = "(?<code>[0-9A-F]{4,6})"; String status = "(?<status>[CFST])"; String mapping = "(?<mapping>[0-9A-F]{4,6}(?: [0-9A-F]{4,6})*)"; String name = "# (?<name>.*)"; Pattern p = Pattern.compile(String.join("; ", code, status, mapping, name)); return unicodeStream(unicode.resolve("CaseFolding.txt"), p); } private static Stream<Matcher> unicodeStream(Path path, Pattern pattern) throws IOException { return Files.lines(path, StandardCharsets.UTF_8).filter(line -> !(line.isEmpty() || line.charAt(0) == '#')) .map(line -> { Matcher matcher = pattern.matcher(line); if (!matcher.matches()) { System.err.println("Invalid line: " + line); } return matcher; }); } private static void caseMappings(Path unicode, Map<Integer, Integer> toUpper, Map<Integer, Integer> toLower) throws IOException { unicodeData(unicode).forEach(matcher -> { int codeValue = Integer.parseInt(matcher.group("codeValue"), 16); if (codeValue == 0x0130 || codeValue == 0x0131) { // Skip: LATIN CAPITAL LETTER I WITH DOT ABOVE // Skip: LATIN SMALL LETTER DOTLESS I return; } String uppercaseMapping = matcher.group("uppercaseMapping"); String lowercaseMapping = matcher.group("lowercaseMapping"); if (uppercaseMapping != null && isSingleCodePoint(uppercaseMapping)) { toUpper.put(codeValue, Integer.parseInt(uppercaseMapping, 16)); } if (lowercaseMapping != null && isSingleCodePoint(lowercaseMapping)) { toLower.put(codeValue, Integer.parseInt(lowercaseMapping, 16)); } }); } private static boolean isCherokeeUppercase(int codeValue) { return (0x13A0 <= codeValue && codeValue <= 0x13EF) || (0x13F0 <= codeValue && codeValue <= 0x13F5); } private static boolean isCherokeeLowercase(int codeValue) { return (0xAB70 <= codeValue && codeValue <= 0xABBF) || (0x13F8 <= codeValue && codeValue <= 0x13FD); } private static boolean isSingleCodePoint(String s) { if (4 <= s.length() && s.length() <= 6) { for (int i = 0; i < s.length(); ++i) { if (Character.digit(s.charAt(i), 16) < 0) { return false; } } return true; } return false; } /** * Generate {@link UEncoding#codeRangeSpace} array */ static String generateSpaceRange(Path unicode) throws IOException { Set<Integer> spaceSeparator = unicodeData(unicode).filter(m -> "Zs".equals(m.group("generalCategory"))) .map(m -> Integer.parseInt(m.group("codeValue"), 16)).collect(Collectors.toSet()); StringBuilder code = new StringBuilder(); int count = 0; for (int c = Character.MIN_CODE_POINT; c <= Character.MAX_CODE_POINT; ++c) { if (isSpace(c, spaceSeparator)) { count += 1; int from = c, to = c; for (int d = from + 1; d <= Character.MAX_VALUE && isSpace(d, spaceSeparator); ++d) { to = d; } code.append(String.format(", 0x%04x, 0x%04x", from, to)); c = to; } } return code.insert(0, count).toString(); } private static boolean isSpace(int c, Set<Integer> spaceSeparator) { switch (c) { /* ES2015 11.2 White Space */ case 0x0009: case 0x000B: case 0x000C: case 0x0020: case 0x00A0: case 0xFEFF: return true; /* ES2015 11.3 Line Terminators */ case 0x000A: case 0x000D: case 0x2028: case 0x2029: return true; /* ES2015 11.2 White Space */ default: return spaceSeparator.contains(c); } } }