package org.geogebra.common.util; import java.util.HashMap; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; public class Korean { static StringBuilder sb; static HashMap<Character, Character> koreanLeadToTail; static void init() { if (koreanLeadToTail == null) { koreanLeadToTail = new HashMap<Character, Character>(); } koreanLeadToTail.put(new Character('\u1100'), new Character('\u11a8')); koreanLeadToTail.put(new Character('\u1101'), new Character('\u11a9')); koreanLeadToTail.put(new Character('\u1102'), new Character('\u11ab')); koreanLeadToTail.put(new Character('\u1103'), new Character('\u11ae')); koreanLeadToTail.put(new Character('\u1104'), new Character('\u1104')); // map // to // itself koreanLeadToTail.put(new Character('\u1105'), new Character('\u11af')); koreanLeadToTail.put(new Character('\u1106'), new Character('\u11b7')); koreanLeadToTail.put(new Character('\u1107'), new Character('\u11b8')); koreanLeadToTail.put(new Character('\u1108'), new Character('\u1108')); // map // to // itself koreanLeadToTail.put(new Character('\u1109'), new Character('\u11ba')); koreanLeadToTail.put(new Character('\u110a'), new Character('\u11bb')); koreanLeadToTail.put(new Character('\u110b'), new Character('\u11bc')); koreanLeadToTail.put(new Character('\u110c'), new Character('\u11bd')); koreanLeadToTail.put(new Character('\u110d'), new Character('\u110d')); // map // to // itself koreanLeadToTail.put(new Character('\u110e'), new Character('\u11be')); koreanLeadToTail.put(new Character('\u110f'), new Character('\u11bf')); koreanLeadToTail.put(new Character('\u1110'), new Character('\u11c0')); koreanLeadToTail.put(new Character('\u1111'), new Character('\u11c1')); koreanLeadToTail.put(new Character('\u1112'), new Character('\u11c2')); } /* * convert eg \uB458 to \u1103\u116e\u11af * * and \uB450 to \u1103\u116E */ public static String flattenKorean(String s) { init(); if (sb == null) { sb = new StringBuilder(); } else { sb.setLength(0); } boolean lastWasVowel = false; for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); if (isKoreanMultiChar(c)) { appendKoreanMultiChar(sb, c); } else { // if a "lead char" follows a vowel, turn into a "tail char" if (lastWasVowel && isKoreanLeadChar(c)) { sb.append(koreanLeadToTail.get(Character.valueOf(c)) .charValue()); } else { sb.append(c); } } lastWasVowel = isKoreanVowelChar(sb.charAt(sb.length() - 1)); } return sb.toString(); } // static { // for (char i = 0x1100; i <= 0x1112; i++) { // for (char j = 0x1161; j <= 0x1175; j++) { // String s = i + "" + j; // Log.debug(i + " " + j + " " + StringUtil.toHexString(s) // + StringUtil.toHexString(unflattenKorean(s).toString())); // } // } // // for (char i = 0xac00; i <= 0xD788; i += 1) { // String s = i + ""; // Log.debug(i + " " + StringUtil.toHexString(s) + " " // + isKoreanLeadPlusVowelChar(i)); // } // } // from 0xac00 to 0xd788, every 28th character is a combination of 2 // characters not 3 static boolean isKoreanLeadPlusVowelChar(char c) { if (c >= 0xac00 && c <= 0xd7af) { int ch = c - 0xac00; if ((ch % 28) == 0) { return true; } } return false; } private static boolean isKoreanMultiChar(char c) { if (c >= 0xac00 && c <= 0xd7af) { return true; } return false; } private static boolean isKoreanLeadChar(char c) { if (c >= 0x1100 && c <= 0x1112) { return true; } return false; } private static boolean isKoreanVowelChar(char c) { if (c >= 0x1161 && c <= 0x1175) { return true; } return false; } private static boolean isKoreanTailChar(char c) { if (c >= 0x11a8 && c <= 0x11c2) { return true; } return false; } /* * convert eg \u1103\u116e\u11af to \uB458 * * also converts 2 chars eg \u1103\u116E to \uB450 */ public static StringBuilder unflattenKorean(String str) { StringBuilder ret = new StringBuilder(); char lead = 0; char vowel = 0; char tail = 0; for (int i = 0; i < str.length(); i++) { boolean korean = false; char c = str.charAt(i); if (isKoreanLeadChar(c)) { korean = true; if (lead != 0) { appendKoreanChar(ret, lead, vowel, tail); lead = 0; vowel = 0; tail = 0; } lead = c; } if (isKoreanVowelChar(c)) { korean = true; vowel = c; } if (isKoreanTailChar(c)) { korean = true; tail = c; appendKoreanChar(ret, lead, vowel, tail); lead = 0; vowel = 0; tail = 0; } if (!korean) { ret.append(c); } } // make sure last char done! if (lead != 0) { appendKoreanChar(ret, lead, vowel, tail); } return ret; } private static void appendKoreanChar(StringBuilder ret, char lead, char vowel, char tail) { int lead0 = lead - 0x1100 + 1; int vowel0 = vowel - 0x1161 + 1; int tail0 = tail == 0 ? 0 : tail - 0x11a8 + 1; // http://gernot-katzers-spice-pages.com/var/korean_hangul_unicode.html char unicode = (char) (tail0 + (vowel0 - 1) * 28 + (lead0 - 1) * 588 + 44032); ret.append(unicode); } /* * http://www.kfunigraz.ac.at/~katzer/korean_hangul_unicode.html * http://gernot-katzers-spice-pages.com/var/korean_hangul_unicode.html */ private static void appendKoreanMultiChar(StringBuilder sb, char c) { char tail = (char) (0x11a7 + (c - 44032) % 28); char vowel = (char) (0x1161 + ((c - 44032 - (tail - 0x11a7)) % 588) / 28); char lead = (char) (0x1100 + (c - 44032) / 588); // Application.debug(Util.toHexString(c)+" decoded to // "+Util.toHexString(lead)+Util.toHexString(vowel)+Util.toHexString(tail)); sb.append(lead); sb.append(vowel); if (!isKoreanLeadPlusVowelChar(c)) { sb.append(tail); } } // static { // String s = "\ub450"; // s = flattenKorean(s); // Log.debug("length = " + s.length()); // for (int i = 0; i < s.length(); i++) { // Log.debug(StringUtil.toHexString(s.charAt(i))); // } // // s = "\u1103\u116E"; // s = unflattenKorean(s).toString(); // Log.debug("\u1103\u116E goes to " + StringUtil.toHexString(s)); // } /* * avoid having to press shift by merging eg \u1100\u1100 to \u1101 * http://www.kfunigraz.ac.at/~katzer/korean_hangul_unicode.html */ @SuppressFBWarnings({ "SF_SWITCH_FALLTHROUGH", "missing break is deliberate" }) public static String mergeDoubleCharacters(String str) { if (str.length() < 2) { return str; } if (sb == null) { sb = new StringBuilder(); } else { sb.setLength(0); } char c, c2; for (int i = 0; i < str.length() - 1; i++) { int offset = 1; switch (c = str.charAt(i)) { case '\u1161': // these character are "doubled" by adding 2 to their // Unicode value case '\u1162': case '\u1165': case '\u1166': offset++; // fall through // case '\u1100' : // these character are "doubled" by adding 1 // to their Unicode value case '\u1103': // case '\u1107' : case '\u1109': case '\u110c': case '\u11a8': case '\u11ba': if (str.charAt(i + 1) == c) { sb.append((char) (c + offset)); // eg \u1101 ie doubled char i++; } else { sb.append(c); } break; case '\u1169': c2 = str.charAt(i + 1); if (c2 == '\u1161') { sb.append('\u116a'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1162') { sb.append('\u116b'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1175') { sb.append('\u116c'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1169') { sb.append('\u116d'); // eg \u1101 ie doubled char i++; } else { sb.append(c); } break; case '\u1105': c2 = str.charAt(i + 1); if (c2 == '\u1100') { sb.append('\u11b0'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1106') { sb.append('\u11b1'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1107') { sb.append('\u11b2'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1109') { sb.append('\u11b3'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1110') { sb.append('\u11b4'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1112') { sb.append('\u11b6'); // eg \u1101 ie doubled char i++; } else { sb.append(c); } break; case '\u116e': c2 = str.charAt(i + 1); if (c2 == '\u1165') { sb.append('\u116f'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1166') { sb.append('\u1170'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1175') { sb.append('\u1171'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u116e') { sb.append('\u1172'); // eg \u1101 ie doubled char i++; } else { sb.append(c); } break; case '\u1173': c2 = str.charAt(i + 1); if (c2 == '\u1175') { sb.append('\u1174'); // eg \u1101 ie doubled char i++; } else { sb.append(c); } break; case '\u1100': c2 = str.charAt(i + 1); if (c2 == '\u1100') { sb.append('\u11a9'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1109') { sb.append('\u11aa'); // eg \u1101 ie doubled char i++; } else { sb.append(c); } break; case '\u1102': c2 = str.charAt(i + 1); if (c2 == '\u110c') { sb.append('\u11ac'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1112') { sb.append('\u11ad'); // eg \u1101 ie doubled char i++; } else { sb.append(c); } break; case '\u1111': c2 = str.charAt(i + 1); if (c2 == '\u1111') { sb.append('\u11b5'); // eg \u1101 ie doubled char i++; } else { sb.append(c); } break; case '\u1107': c2 = str.charAt(i + 1); if (c2 == '\u1109') { sb.append('\u11b9'); // eg \u1101 ie doubled char i++; } else if (c2 == '\u1107') { sb.append('\u1108'); // eg \u1101 ie doubled char i++; } else { sb.append(c); } break; default: sb.append(c); } if (i == str.length() - 2) { sb.append(str.charAt(str.length() - 1)); } } return sb.toString(); } }