package com.formulasearchengine.mathosphere.mlp.text;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMap.Builder;
import com.google.common.collect.Lists;

import org.apache.commons.lang3.CharUtils;

import java.lang.Character.UnicodeBlock;
import java.util.List;
import java.util.Map;

/**
 * Replaces styled mathematical Unicode characters (code points of the "Mathematical Alphanumeric
 * Symbols" and "Letterlike Symbols" blocks) by their plain Latin, Greek or digit counterparts.
 * Code points outside these two blocks are passed through unchanged.
 */
public class UnicodeUtils {

  /** First code point of the styled Latin range (U+1D400). */
  private static final int BOLD_A = 119808;
  /** Last code point of the styled Latin range (U+1D6A3). */
  private static final int MONOSPACE_z = 120483;

  /** First code point of the styled Greek range (U+1D6A8). */
  private static final int BOLD_ALPHA = 120488;
  /** Last code point of the styled Greek range (U+1D7C9). */
  private static final int BOLD_ITALIC_VAR_PI_SMALL = 120777;

  /** First code point of the styled digit range (U+1D7CE). */
  private static final int BOLD_0 = 120782;
  /** Last code point of the styled digit range (U+1D7FF). */
  private static final int MONOSPACE_9 = 120831;

  // Each reference list holds one "style run" in the order used inside the styled ranges above;
  // a styled code point is mapped back by taking its offset modulo the run length.
  private static final List<String> LATIN_NORMAL =
      asList("ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz");

  private static final List<String> GREEK_NORMAL =
      asList("ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡϴΣΤΥΦΧΨΩ" + "\u2207" + "αβγδεζηθικλμνξοπρςστυφχψω∂ϵϑϰϕϱϖ");

  private static final List<String> DIGIT_NORMAL = asList("0123456789");

  private static final Map<Integer, String> LETTER_LIKE_MAPPING = buildLetterLikeMap();

  /**
   * Builds the immutable lookup table from letterlike symbols to their plain replacement.
   *
   * @return map from code point to replacement string
   */
  private static Map<Integer, String> buildLetterLikeMap() {
    // see table here
    // http://unicode-table.com/en/blocks/letterlike-symbols/
    Builder<Integer, String> letterLike = ImmutableMap.builder();
    letterLike.put(0x2102, "C");
    letterLike.put(0x210A, "g");
    letterLike.put(0x210B, "H");
    letterLike.put(0x210C, "H");
    letterLike.put(0x210D, "H");
    // U+210E (Planck constant) is the italic small h that fills the reserved slot U+1D455
    letterLike.put(0x210E, "h");
    letterLike.put(0x210F, "h");
    letterLike.put(0x2110, "I");
    letterLike.put(0x2111, "I");
    letterLike.put(0x2112, "L");
    letterLike.put(0x2113, "l");
    letterLike.put(0x2115, "N");
    letterLike.put(0x2118, "P");
    letterLike.put(0x2119, "P");
    letterLike.put(0x211A, "Q");
    letterLike.put(0x211B, "R");
    letterLike.put(0x211C, "R");
    letterLike.put(0x211D, "R");
    letterLike.put(0x2124, "Z");
    letterLike.put(0x2126, "Ω");
    // U+2128 (black-letter capital Z) fills the reserved fraktur slot U+1D51D
    letterLike.put(0x2128, "Z");
    letterLike.put(0x212C, "B");
    letterLike.put(0x212D, "C");
    letterLike.put(0x212F, "e");
    letterLike.put(0x2130, "E");
    letterLike.put(0x2131, "F");
    letterLike.put(0x2133, "M");
    letterLike.put(0x2134, "o");
    letterLike.put(0x213C, "π");
    letterLike.put(0x213D, "γ");
    letterLike.put(0x213E, "Γ");
    letterLike.put(0x213F, "Π");
    letterLike.put(0x2140, "Σ");
    letterLike.put(0x2145, "D");
    letterLike.put(0x2146, "d");
    letterLike.put(0x2147, "e");
    letterLike.put(0x2148, "i");
    letterLike.put(0x2149, "j");
    return letterLike.build();
  }

  /**
   * Normalizes every code point of the given string via {@link #normalizeCharacter(int)} and
   * concatenates the results.
   *
   * @param in the string to normalize, must not be <code>null</code>
   * @return the normalized string
   * @throws NullPointerException if <code>in</code> is <code>null</code>
   */
  public static String normalizeString(String in) {
    int[] codePoints = in.codePoints().toArray();
    StringBuilder res = new StringBuilder(in.length());
    for (int code : codePoints) {
      res.append(normalizeCharacter(code));
    }
    return res.toString();
  }

  /**
   * Normalizes a single code point.
   *
   * @param codePoint the code point to normalize
   * @return the plain replacement if the code point is a known mathematical alphanumeric or
   *         letterlike symbol, the empty string if it is not a valid code point, otherwise the
   *         code point itself as a string
   */
  public static String normalizeCharacter(int codePoint) {
    if (!Character.isValidCodePoint(codePoint)) {
      return "";
    }

    // TODO: long search? maybe replace with own implementation
    UnicodeBlock block = Character.UnicodeBlock.of(codePoint);

    if (block == Character.UnicodeBlock.MATHEMATICAL_ALPHANUMERIC_SYMBOLS) {
      return normalizeMath(codePoint);
    }

    if (block == Character.UnicodeBlock.LETTERLIKE_SYMBOLS) {
      return normalizeLetterLike(codePoint);
    }

    return codePointToString(codePoint);
  }

  /**
   * Maps a code point of the mathematical alphanumeric block to its plain form. Code points of
   * that block lying outside the Latin, Greek and digit ranges are returned unchanged.
   */
  private static String normalizeMath(int codePoint) {
    // see here
    // http://unicode-table.com/en/blocks/mathematical-alphanumeric-symbols/
    if (isMathLatin(codePoint)) {
      return processLatin(codePoint);
    }

    if (isMathGreek(codePoint)) {
      return processGreek(codePoint);
    }

    if (isMathDigit(codePoint)) {
      return processDigit(codePoint);
    }

    return codePointToString(codePoint);
  }

  /**
   * Detects the following types of mathematical unicode chars: <ul> <li>bold</li> <li>italic</li>
   * <li>bold italic</li> <li>script</li> <li>bold script</li> <li>fraktur</li>
   * <li>double-struck</li> <li>bold fraktur</li> <li>sans-serif</li> <li>sans-serif bold</li>
   * <li>sans-serif italic</li> <li>sans-serif bold italic</li> <li>monospace</li> </ul>
   *
   * @param codePoint character code to check
   * @return <code>true</code> if the code belongs to one of the mentioned categories
   */
  public static boolean isMathLatin(int codePoint) {
    return BOLD_A <= codePoint && codePoint <= MONOSPACE_z;
  }

  /**
   * Detects the following types of mathematical unicode chars of the Greek alphabet: <ul>
   * <li>bold</li> <li>italic</li> <li>bold italic</li> <li>sans-serif bold</li>
   * <li>sans-serif bold italic</li> </ul> <p> Also handles extra symbols like
   * \varphi, \varpi, \nabla
   *
   * @param codePoint character code to check
   * @return <code>true</code> if the code belongs to one of the mentioned categories
   */
  public static boolean isMathGreek(int codePoint) {
    return BOLD_ALPHA <= codePoint && codePoint <= BOLD_ITALIC_VAR_PI_SMALL;
  }

  /**
   * Detects the following types of mathematical unicode chars for digits: <ul> <li>bold</li>
   * <li>double-struck</li> <li>sans-serif</li> <li>sans-serif bold</li> <li>monospace</li> </ul>
   *
   * @param codePoint character code to check
   * @return <code>true</code> if the code belongs to one of the mentioned categories
   */
  public static boolean isMathDigit(int codePoint) {
    return BOLD_0 <= codePoint && codePoint <= MONOSPACE_9;
  }

  private static String processLatin(int codePoint) {
    return replace(codePoint, BOLD_A, LATIN_NORMAL);
  }

  private static String processGreek(int codePoint) {
    return replace(codePoint, BOLD_ALPHA, GREEK_NORMAL);
  }

  private static String processDigit(int codePoint) {
    return replace(codePoint, BOLD_0, DIGIT_NORMAL);
  }

  /**
   * Looks up the plain counterpart of a styled code point.
   *
   * @param codePoint             the styled code point
   * @param firstCharacterInGroup first code point of the styled range the code point lies in
   * @param referenceList         plain characters of one style run, in range order
   * @return the plain character at the code point's position within its style run
   */
  private static String replace(int codePoint, int firstCharacterInGroup,
                                List<String> referenceList) {
    int relative = codePoint - firstCharacterInGroup;
    // the styled ranges repeat the same alphabet once per style, so wrap around the run length
    int index = relative % referenceList.size();
    return referenceList.get(index);
  }

  /** Splits a string into a list of one-character strings (one entry per UTF-16 char). */
  private static List<String> asList(String string) {
    List<String> list = Lists.newArrayListWithCapacity(string.length());
    for (int i = 0; i < string.length(); i++) {
      list.add(CharUtils.toString(string.charAt(i)));
    }
    return list;
  }

  /** Converts a code point to a string, using surrogate pairs outside the BMP. */
  private static String codePointToString(int codePoint) {
    if (Character.isBmpCodePoint(codePoint)) {
      return CharUtils.toString((char) codePoint);
    }

    char[] chars = Character.toChars(codePoint);
    return new String(chars);
  }

  /** Maps a letterlike symbol to its replacement; unmapped code points are returned unchanged. */
  private static String normalizeLetterLike(int codePoint) {
    String mapped = LETTER_LIKE_MAPPING.get(codePoint);
    if (mapped != null) {
      return mapped;
    }
    return codePointToString(codePoint);
  }
}