/* * Copyright (C) 2014. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 3 or * version 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ package uk.me.parabola.util; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.util.Formatter; import java.util.HashMap; import java.util.Map; import java.util.NavigableSet; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.ibm.icu.text.CollationElementIterator; import com.ibm.icu.text.Collator; import com.ibm.icu.text.RuleBasedCollator; //import java.text.CollationElementIterator; //import java.text.Collator; //import java.text.RuleBasedCollator; /** * Create a set of rules for a given code page. * * Should be usable, perhaps with a few tweaks. * Works with unicode too, need to choose which blocks to take for unicode. * * @author Steve Ratcliffe */ public class CollationRules { private CharsetDecoder decoder; private final NavigableSet<CharPosition> positionMap = new TreeSet<>(); private final NavigableSet<CharPosition> basePositionMap = new TreeSet<>(); private final Map<Character, CharPosition> charMap = new HashMap<>(); private boolean isUnicode; private Charset charset; public static void main(String[] args) { String charsetName = args[0]; CollationRules main = new CollationRules(); main.go(charsetName); } private void go(String charsetName) { RuleBasedCollator col = (RuleBasedCollator) Collator.getInstance(); charset = Charset.forName(charsetName); if (charsetName.equalsIgnoreCase("utf-8")) isUnicode = true; decoder = charset.newDecoder(); if (isUnicode) addUnicode(); else addBlock(col, 0); printCharMap(); printExpansions(); } private void addBlock(RuleBasedCollator col, int block) { for (int i = 0; i < 0x100; i++) { int ch = (block << 8) + i; String testString = getString(ch); char conv = testString.charAt(0); if (Character.getType(conv) == Character.UNASSIGNED || conv == 0xfffd) continue; CollationElementIterator it = col.getCollationElementIterator(testString); System.out.printf("# %s ", fmtChar(testString.charAt(0))); int next; int index = 0; CharPosition cp = new CharPosition(0); while ((next = it.next()) != CollationElementIterator.NULLORDER) { if (index == 0) { cp = new CharPosition(ch); cp.setOrder(next); } else { assert index < 3; if ((next & 0xffff0000) == 0) { cp.addOrder(next, index); } else { cp.addChar(new CharPosition(ch)); cp.setOrder(next); } } index++; } System.out.printf(" %s %d", cp, Character.getType(cp.getUnicode())); System.out.println(); tweak(cp); if (ch > 0) positionMap.add(cp); if (cp.nextChar == null) { basePositionMap.add(cp); charMap.put(conv, cp); } } } private void addUnicode() { Pattern pat = Pattern.compile("([0-9A-F]{4,5}) ? ; \\[[.*](.*)\\] #.*"); try (FileReader r = new FileReader("allkeys.txt")) { try (BufferedReader br = new BufferedReader(r)) { String line; while ((line = br.readLine()) != null) { Matcher matcher = pat.matcher(line); if (matcher.matches()) { String weights = matcher.group(2); int ch = Integer.parseInt(matcher.group(1), 16); if (ch > 0xffff) continue; System.out.printf("# %04x %s ", ch, fmtChar(ch)); String[] split = weights.split("]\\[[.*]"); int index = 0; CharPosition cp = new CharPosition(0); for (String s : split) { String[] ws = s.split("\\."); int next = Integer.parseInt(ws[0], 16) << 16 | ((Integer.parseInt(ws[1], 16) << 8) & 0xff00) | ((Integer.parseInt(ws[2], 16)) & 0xff); if (index == 0) { cp = new CharPosition(ch); cp.setOrder(next); } else { if ((next & 0xffff0000) == 0) { cp.addOrder(next, index); } else { cp.addChar(new CharPosition(ch)); cp.setOrder(next); } } index++; } System.out.printf(" %s %d\n", cp, Character.getType(cp.getUnicode())); tweak(cp); if (ch > 0) positionMap.add(cp); if (cp.nextChar == null) { basePositionMap.add(cp); charMap.put((char) ch, cp); } } else { System.out.println("# NOMATCH: " + line); } } } } catch (IOException e) { e.printStackTrace(); } } /** * Fix up a few characters that we always want to be in well known places. * * @param cp The position to change. */ private void tweak(CharPosition cp) { if (cp.val < 8) cp.third = cp.val + 7; if (!isUnicode) { switch (cp.getUnicode()) { case '¼': cp.nextChar = charMap.get('/').copy(); cp.nextChar.nextChar = charMap.get('4'); break; case '½': cp.nextChar = charMap.get('/').copy(); cp.nextChar.nextChar = charMap.get('2'); break; case '¾': cp.nextChar = charMap.get('/').copy(); cp.nextChar.nextChar = charMap.get('4'); break; } } switch (cp.getUnicode()) { case '˜': CharPosition tilde = charMap.get('~'); cp.first = tilde.first; cp.second = tilde.second + 1; cp.third = tilde.third + 1; cp.nextChar = null; break; } } private String getString(int i) { if (isUnicode) return new String(new char[]{(char) i}); else { byte[] b = {(byte) i}; return new String(b, 0, 1, charset); } } private void printCharMap() { Formatter chars = new Formatter(); chars.format("\n"); CharPosition last = new CharPosition(0); last.first = 0; for (CharPosition cp : positionMap) { if (cp.isExpansion()) continue; if (cp.first != last.first) { chars.format("\n < "); } else if (cp.second != last.second) { chars.format(" ; "); } else if (cp.third != last.third) { chars.format(","); } else { chars.format("="); } last = cp; int uni = toUnicode(cp.val); chars.format("%s", fmtChar(uni)); } System.out.println(chars); } private void printExpansions() { for (CharPosition cp : positionMap) { if (!cp.isExpansion()) continue; Formatter fmt = new Formatter(); //noinspection MalformedFormatString fmt.format("expand %c to", cp.getUnicode()); boolean ok = true; for (CharPosition cp2 = cp; cp2 != null; cp2 = cp2.nextChar) { cp2.second = 0x50000; int top = (cp2.third >> 16) & 0xff; cp2.third = (top == 0x9e || top == 0xa2 || top == 0x2b) ? 0x9b0000 : 0; CharPosition floor = basePositionMap.ceiling(cp2); if (floor == null || floor.getUnicode() == 0xfffd) { fmt.format(" NF"); ok = false; } else { fmt.format(" %s", fmtChar(floor.getUnicode())); } } System.out.println((ok ? "" : "# ") + fmt.toString()); // Print comments to help find problems. for (CharPosition cp2 = cp; cp2 != null; cp2 = cp2.nextChar) { CharPosition floor = basePositionMap.ceiling(cp2); if (floor == null) { System.out.println("#FIX: NF ref=" + cp2); } else { System.out.println("#floor is " + fmtChar(toUnicode(floor.val)) + ", " + floor + ", ref is " + cp2); } } } } private String fmtChar(int val) { boolean asChar = true; switch (val) { case '<': case ';': case ',': case '=': case '#': asChar = false; break; default: switch (Character.getType(val)) { case Character.UNASSIGNED: case Character.NON_SPACING_MARK: case Character.FORMAT: case Character.CONTROL: case Character.SPACE_SEPARATOR: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: asChar = false; } } if (asChar) { //noinspection MalformedFormatString return String.format("%c", val); } else { return String.format("%04x", val); } } private int toUnicode(int c) { if (isUnicode) return c; ByteBuffer b = ByteBuffer.allocate(1); b.put((byte) c); b.flip(); try { CharBuffer chars = decoder.decode(b); return chars.charAt(0); } catch (CharacterCodingException e) { return '?'; } } class CharPosition implements Comparable<CharPosition> { private final int val; private int first; private int second; private int third; private CharPosition nextChar; public CharPosition(int charValue) { this.val = charValue; } public int compareTo(CharPosition other) { if (other.first == first) return compareSecond(other); else if (first < other.first) return -1; else return 1; } private int compareSecond(CharPosition c2) { if (c2.second == second) return compareThird(c2); else if (second < c2.second) return -1; else return 1; } private int compareThird(CharPosition c2) { if (third == c2.third) return new Integer(val).compareTo(c2.val); else if (third < c2.third) return -1; else return 1; } public String toString() { Formatter fmt = new Formatter(); toString(fmt); return fmt.toString(); } private void toString(Formatter fmt) { fmt.format("[%04x %02x %02x]", first, second, third); if (nextChar != null) nextChar.toString(fmt); } public void setOrder(int next) { if (nextChar != null) { nextChar.setOrder(next); return; } first = (next >> 16) & 0xffff; second = (next << 8) & 0xff0000; third = (next << 16) & 0xff0000; } public void addOrder(int next, int count) { assert ((next >>> 16) & 0xffff) == 0; if (this.nextChar != null) { this.nextChar.addOrder(next, count); return; } second += ((next >> 8) & 0xff) << (2-count)*8; third += ((next) & 0xff) << (2-count)*8; } public boolean isExpansion() { return nextChar != null; } public void addChar(CharPosition pos) { if (nextChar != null) { nextChar.addChar(pos); return; } nextChar = pos; } public int getUnicode() { return toUnicode(val); } public CharPosition copy() { CharPosition cp = new CharPosition(this.val); cp.first = this.first; cp.second = this.second; cp.third = this.third; return cp; } } }