/*
* Copyright (C) 2014.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 3 or
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
package uk.me.parabola.util;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.Formatter;
import java.util.HashMap;
import java.util.Map;
import java.util.NavigableSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.ibm.icu.text.CollationElementIterator;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
//import java.text.CollationElementIterator;
//import java.text.Collator;
//import java.text.RuleBasedCollator;
/**
 * Create a set of collation rules for a given code page.
 *
 * <p>Run with a single argument, the charset name. For UTF-8 the weights are
 * read from a file called {@code allkeys.txt} in the current directory; for
 * any other charset each of the 256 byte values is decoded and looked up in
 * the default ICU collator. The rules are written to standard output, with
 * lines starting with {@code #} carrying diagnostic comments.
 *
 * <p>Should be usable, perhaps with a few tweaks.
 * Works with unicode too, need to choose which blocks to take for unicode.
 *
 * @author Steve Ratcliffe
 */
public class CollationRules {
    /**
     * A single-code-point line of allkeys.txt: group 1 is the hex code point,
     * group 2 is everything between the first "[." / "[*" and the last "]".
     */
    private static final Pattern ALLKEYS_ENTRY = Pattern.compile("([0-9A-F]{4,5}) ? ; \\[[.*](.*)\\] #.*");

    private CharsetDecoder decoder;

    /** Every position with a non-zero character value, expansions included. */
    private final NavigableSet<CharPosition> positionMap = new TreeSet<>();

    /** Only the positions that are not expansions; used to resolve expansion parts. */
    private final NavigableSet<CharPosition> basePositionMap = new TreeSet<>();

    /** Non-expansion positions keyed by their unicode character. */
    private final Map<Character, CharPosition> charMap = new HashMap<>();

    private boolean isUnicode;
    private Charset charset;

    /**
     * Command line entry point.
     *
     * @param args A single element: the name of the charset to create rules for.
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: CollationRules <charset-name>");
            System.exit(1);
            return;
        }
        new CollationRules().go(args[0]);
    }

    /**
     * Collect the positions for the charset and print the resulting rules.
     *
     * @param charsetName Any name or alias accepted by {@link Charset#forName(String)}.
     */
    private void go(String charsetName) {
        charset = Charset.forName(charsetName);
        // Compare the resolved charset, so that aliases such as "utf8" are recognised too.
        isUnicode = charset.equals(Charset.forName("UTF-8"));
        decoder = charset.newDecoder();

        if (isUnicode)
            addUnicode();
        else
            addBlock((RuleBasedCollator) Collator.getInstance(), 0);

        printCharMap();
        printExpansions();
    }

    /**
     * Add the 256 characters of one block, taking the weights from the collator.
     *
     * @param col The collator that supplies the collation elements.
     * @param block The high byte of the character values; the values
     * {@code (block << 8)} to {@code (block << 8) + 0xff} are processed.
     */
    private void addBlock(RuleBasedCollator col, int block) {
        for (int i = 0; i < 0x100; i++) {
            int ch = (block << 8) + i;
            String testString = getString(ch);
            if (testString.isEmpty())
                continue; // nothing was decoded for this value

            char conv = testString.charAt(0);
            if (Character.getType(conv) == Character.UNASSIGNED || conv == 0xfffd)
                continue;

            CollationElementIterator it = col.getCollationElementIterator(testString);
            System.out.printf("# %s ", fmtChar(conv));

            // Stays as a zero-valued placeholder if the iterator yields no elements at all.
            CharPosition cp = new CharPosition(0);
            int index = 0;
            int next;
            while ((next = it.next()) != CollationElementIterator.NULLORDER) {
                assert index < 3;
                cp = appendElement(cp, ch, next, index);
                index++;
            }

            System.out.printf(" %s %d", cp, Character.getType(cp.getUnicode()));
            System.out.println();

            register(ch, conv, cp);
        }
    }

    /**
     * Add the characters listed in allkeys.txt (read from the current directory).
     * Lines that do not match the single-code-point pattern are echoed as
     * {@code # NOMATCH} comments; code points above 0xffff are skipped.
     */
    private void addUnicode() {
        try (FileReader r = new FileReader("allkeys.txt");
                BufferedReader br = new BufferedReader(r)) {
            String line;
            while ((line = br.readLine()) != null) {
                Matcher matcher = ALLKEYS_ENTRY.matcher(line);
                if (!matcher.matches()) {
                    System.out.println("# NOMATCH: " + line);
                    continue;
                }

                String weights = matcher.group(2);
                int ch = Integer.parseInt(matcher.group(1), 16);
                if (ch > 0xffff)
                    continue;

                System.out.printf("# %04x %s ", ch, fmtChar(ch));

                CharPosition cp = new CharPosition(0);
                int index = 0;
                // Consecutive weight groups are separated by "][." or "][*".
                for (String s : weights.split("]\\[[.*]")) {
                    cp = appendElement(cp, ch, parseElement(s), index);
                    index++;
                }

                System.out.printf(" %s %d\n", cp, Character.getType(cp.getUnicode()));

                register(ch, (char) ch, cp);
            }
        } catch (IOException e) {
            // As before, carry on and print whatever was collected; just say why it is incomplete.
            System.err.println("Cannot read allkeys.txt: " + e);
        }
    }

    /**
     * Pack one dotted weight group from allkeys.txt into a single int: the
     * first weight goes into the top 16 bits, the low byte of the second into
     * bits 8-15 and the low byte of the third into bits 0-7. Any further
     * weights in the group are ignored.
     */
    private int parseElement(String group) {
        String[] ws = group.split("\\.");
        return Integer.parseInt(ws[0], 16) << 16
                | ((Integer.parseInt(ws[1], 16) << 8) & 0xff00)
                | (Integer.parseInt(ws[2], 16) & 0xff);
    }

    /**
     * Fold one collation element into the position that is being built.
     *
     * @param head The position built so far; ignored when index is zero.
     * @param ch The character value the position belongs to.
     * @param element The packed collation element.
     * @param index Zero for the first element of the character, counting up from there.
     * @return The head of the position chain, which is a new object when index is zero.
     */
    private CharPosition appendElement(CharPosition head, int ch, int element, int index) {
        if (index == 0) {
            CharPosition fresh = new CharPosition(ch);
            fresh.setOrder(element);
            return fresh;
        }

        if ((element & 0xffff0000) == 0) {
            // Top 16 bits are zero: merge the lower weights into the last link of the chain.
            head.addOrder(element, index);
        } else {
            // Otherwise the element becomes a new link, making this an expansion.
            head.addChar(new CharPosition(ch));
            head.setOrder(element);
        }
        return head;
    }

    /**
     * Apply the fixed tweaks to a completed position and file it in the lookup
     * structures.
     *
     * @param ch The character value; a value of zero is kept out of the position map.
     * @param key The unicode character under which a non-expansion is stored in the char map.
     * @param cp The completed position.
     */
    private void register(int ch, char key, CharPosition cp) {
        // Must happen before the position goes into the sorted sets, as it changes the sort key.
        tweak(cp);

        if (ch > 0)
            positionMap.add(cp);

        if (cp.nextChar == null) {
            basePositionMap.add(cp);
            charMap.put(key, cp);
        }
    }

    /**
     * Fix up a few characters that we always want to be in well known places.
     *
     * @param cp The position to change.
     * @throws IllegalStateException if '/' or '~' is needed but has not been recorded yet.
     */
    private void tweak(CharPosition cp) {
        if (cp.val < 8)
            cp.third = cp.val + 7;

        int uni = cp.getUnicode();

        if (!isUnicode) {
            // The fraction keeps its own first weights and is followed by '/' and the denominator.
            switch (uni) {
            case '¼':
                expandFraction(cp, '4');
                break;
            case '½':
                expandFraction(cp, '2');
                break;
            case '¾':
                expandFraction(cp, '4');
                break;
            }
        }

        if (uni == '˜') {
            // Place it directly after the ordinary tilde, as a plain (non-expanding) character.
            CharPosition tilde = basePosition('~');
            cp.first = tilde.first;
            cp.second = tilde.second + 1;
            cp.third = tilde.third + 1;
            cp.nextChar = null;
        }
    }

    /**
     * Replace whatever follows cp with a copy of '/' followed by the position
     * of the denominator. The denominator link is left empty when that
     * character has not been recorded yet.
     */
    private void expandFraction(CharPosition cp, char denominator) {
        cp.nextChar = basePosition('/').copy();
        cp.nextChar.nextChar = charMap.get(denominator);
    }

    /**
     * Look up the recorded non-expansion position of a character.
     *
     * @throws IllegalStateException if no position has been recorded for it.
     */
    private CharPosition basePosition(char c) {
        CharPosition pos = charMap.get(c);
        if (pos == null)
            throw new IllegalStateException(
                    String.format("no base position recorded yet for U+%04X", (int) c));
        return pos;
    }

    /**
     * Turn a character value into the one-character string that is given to
     * the collator: the value itself in unicode mode, otherwise the low byte
     * decoded with the selected charset.
     */
    private String getString(int i) {
        if (isUnicode)
            return String.valueOf((char) i);

        byte[] b = {(byte) i};
        return new String(b, 0, 1, charset);
    }

    /**
     * Print the ordering of all non-expansion characters. Characters are
     * separated by {@code <}, {@code ;}, {@code ,} or {@code =} depending on
     * whether the first, second, third or none of the weights differ from
     * those of the previous character.
     */
    private void printCharMap() {
        StringBuilder chars = new StringBuilder("\n");

        // Start from all-zero weights so that the first character is compared against zero.
        CharPosition last = new CharPosition(0);
        for (CharPosition cp : positionMap) {
            if (cp.isExpansion())
                continue;

            if (cp.first != last.first)
                chars.append("\n < ");
            else if (cp.second != last.second)
                chars.append(" ; ");
            else if (cp.third != last.third)
                chars.append(",");
            else
                chars.append("=");

            last = cp;
            chars.append(fmtChar(cp.getUnicode()));
        }

        System.out.println(chars);
    }

    /**
     * Print an {@code expand} line for every expansion, followed by comment
     * lines showing how each part was resolved. A line with an unresolved part
     * is itself commented out.
     */
    private void printExpansions() {
        for (CharPosition cp : positionMap) {
            if (!cp.isExpansion())
                continue;

            // Search with a detached copy. The chain itself must not be modified: its head is
            // an element of positionMap and later links can be elements of basePositionMap, and
            // changing the sort key of an element that is inside a TreeSet corrupts the set.
            CharPosition probes = normalisedCopy(cp);

            //noinspection MalformedFormatString
            StringBuilder line = new StringBuilder(String.format("expand %c to", cp.getUnicode()));
            boolean ok = true;
            for (CharPosition probe = probes; probe != null; probe = probe.nextChar) {
                CharPosition match = basePositionMap.ceiling(probe);
                if (match == null || match.getUnicode() == 0xfffd) {
                    line.append(" NF");
                    ok = false;
                } else {
                    line.append(' ').append(fmtChar(match.getUnicode()));
                }
            }
            System.out.println((ok ? "" : "# ") + line);

            // Print comments to help find problems.
            for (CharPosition probe = probes; probe != null; probe = probe.nextChar) {
                CharPosition match = basePositionMap.ceiling(probe);
                if (match == null) {
                    System.out.println("#FIX: NF ref=" + probe);
                } else {
                    System.out.println("#floor is " + fmtChar(match.getUnicode()) + ", " + match + ", ref is " + probe);
                }
            }
        }
    }

    /**
     * Copy a whole position chain, normalising the lower weights of every link
     * for the base character search: second is forced to 0x50000 and third to
     * 0x9b0000 when its top byte is 0x9e, 0xa2 or 0x2b, or to zero otherwise.
     * NOTE(review): the meaning of these constants is not visible in this file;
     * 0x50000 is presumably the common secondary weight - confirm.
     */
    private CharPosition normalisedCopy(CharPosition source) {
        CharPosition head = null;
        CharPosition tail = null;
        for (CharPosition p = source; p != null; p = p.nextChar) {
            CharPosition c = p.copy();
            c.second = 0x50000;
            int top = (p.third >> 16) & 0xff;
            c.third = (top == 0x9e || top == 0xa2 || top == 0x2b) ? 0x9b0000 : 0;

            if (head == null)
                head = c;
            else
                tail.nextChar = c;
            tail = c;
        }
        return head;
    }

    /**
     * Format a character for the output: as itself when that is safe, or as at
     * least four hex digits when it is a rule separator, the comment character,
     * or belongs to a category that would not be visible.
     */
    private String fmtChar(int val) {
        if (isPrintedAsChar(val)) {
            //noinspection MalformedFormatString
            return String.format("%c", val);
        }
        return String.format("%04x", val);
    }

    /** True if the character can be written literally into the rules. */
    private boolean isPrintedAsChar(int val) {
        switch (val) {
        case '<':
        case ';':
        case ',':
        case '=':
        case '#':
            return false;
        default:
            break;
        }

        switch (Character.getType(val)) {
        case Character.UNASSIGNED:
        case Character.NON_SPACING_MARK:
        case Character.FORMAT:
        case Character.CONTROL:
        case Character.SPACE_SEPARATOR:
        case Character.LINE_SEPARATOR:
        case Character.PARAGRAPH_SEPARATOR:
            return false;
        default:
            return true;
        }
    }

    /**
     * Convert a character value to unicode. In unicode mode the value is
     * returned unchanged; otherwise its low byte is decoded with the selected
     * charset and '?' is returned if it cannot be decoded.
     */
    private int toUnicode(int c) {
        if (isUnicode)
            return c;

        ByteBuffer b = ByteBuffer.wrap(new byte[] {(byte) c});
        try {
            CharBuffer chars = decoder.decode(b);
            return chars.charAt(0);
        } catch (CharacterCodingException e) {
            return '?';
        }
    }

    /**
     * The collation weights of one character. A character that expands to
     * several collation elements is held as a chain linked through nextChar.
     * Positions sort by first, second and third weight and finally by
     * character value; the chain plays no part in the ordering.
     */
    class CharPosition implements Comparable<CharPosition> {
        private final int val;
        private int first;
        private int second;
        private int third;
        private CharPosition nextChar;

        /**
         * @param charValue The character value in the selected charset.
         */
        public CharPosition(int charValue) {
            this.val = charValue;
        }

        @Override
        public int compareTo(CharPosition other) {
            int c = Integer.compare(first, other.first);
            if (c != 0)
                return c;

            c = Integer.compare(second, other.second);
            if (c != 0)
                return c;

            c = Integer.compare(third, other.third);
            if (c != 0)
                return c;

            return Integer.compare(val, other.val);
        }

        /** The weights of this position and of every following link, each as {@code [first second third]} in hex. */
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            for (CharPosition p = this; p != null; p = p.nextChar)
                sb.append(String.format("[%04x %02x %02x]", p.first, p.second, p.third));
            return sb.toString();
        }

        /**
         * Set the weights of the last link of the chain from a packed
         * collation element: the top 16 bits become first, and bits 8-15 and
         * bits 0-7 are each moved up to bits 16-23 of second and third.
         */
        public void setOrder(int next) {
            CharPosition tail = tail();
            tail.first = (next >> 16) & 0xffff;
            tail.second = (next << 8) & 0xff0000;
            tail.third = (next << 16) & 0xff0000;
        }

        /**
         * Merge the lower weights of a further element, whose top 16 bits must
         * be zero, into the last link of the chain.
         *
         * @param next The packed collation element.
         * @param count The index of the element; its bytes are added one byte
         * lower for 1 and two bytes lower for 2 than those set by setOrder.
         */
        public void addOrder(int next, int count) {
            assert ((next >>> 16) & 0xffff) == 0;

            CharPosition tail = tail();
            int shift = (2 - count) * 8;
            tail.second += ((next >> 8) & 0xff) << shift;
            tail.third += (next & 0xff) << shift;
        }

        /** True if this position is followed by further links. */
        public boolean isExpansion() {
            return nextChar != null;
        }

        /** Append a position to the end of the chain. */
        public void addChar(CharPosition pos) {
            tail().nextChar = pos;
        }

        /** The unicode value of the character, converted from the selected charset. */
        public int getUnicode() {
            return toUnicode(val);
        }

        /** A copy with the same character value and weights, but without the following links. */
        public CharPosition copy() {
            CharPosition cp = new CharPosition(this.val);
            cp.first = this.first;
            cp.second = this.second;
            cp.third = this.third;
            return cp;
        }

        /** The last link of the chain, which is this object when there is no chain. */
        private CharPosition tail() {
            CharPosition p = this;
            while (p.nextChar != null)
                p = p.nextChar;
            return p;
        }
    }
}