package fr.neamar.kiss.normalizer;

import android.util.Pair;

import java.text.Normalizer;

/**
 * String utils to handle accented characters for search and highlighting
 */
public class StringNormalizer {
    private StringNormalizer() {
    }

    /**
     * Make the given string easier to compare by performing a number of simplifications on it
     * <p/>
     * 1. Decompose combination characters into their respective parts (see below)
     * 2. Strip all combining character marks (see below)
     * 3. Strip some other common-but-not-very-useful characters (such as dashes)
     * 4. Lower-case the string
     * <p/>
     * Combination characters are characters that (essentially) have the same meaning as one or
     * more other, more common, characters. Examples for these include:
     * Roman numerals (`Ⅱ` → `II`) and half-width katakana (`ﾐ` → `ミ`)
     * <p/>
     * Combining character marks are diacritics and other extra strokes that are often found as
     * part of many characters in non-English roman scripts. Examples for these include:
     * Diaereses (`ë` → `e`), acutes (`á` → `a`) and macrons (`ō` → `o`)
     * <p/>
     * The returned map has exactly one entry per {@code char} (UTF-16 code unit) of the returned
     * string, so {@code map[i]} is always valid for {@code 0 <= i < result.length()}. Each entry
     * holds the {@code char} offset in {@code input} of the code point that produced the result
     * character; both halves of a surrogate pair map to the same source offset.
     *
     * @param input string input, with accents and anything else you can think of; must not be
     *              {@code null}
     * @return normalized string and list that maps each result string position to its source
     * string position
     */
    public static Pair<String, int[]> normalizeWithMap(String input) {
        int inputLength = input.length();
        StringBuilder resultString = new StringBuilder(inputLength);
        // One source code point may expand to several result characters, hence the head-room
        // (the argument is presumably a capacity hint — see IntSequenceBuilder)
        IntSequenceBuilder resultMap = new IntSequenceBuilder(inputLength * 3 / 2);
        StringBuilder charBuffer = new StringBuilder(2);

        int inputOffset = 0;
        while (inputOffset < inputLength) {
            int inputChar = input.codePointAt(inputOffset);

            // Decompose the single code point at the current position
            charBuffer.setLength(0);
            charBuffer.appendCodePoint(inputChar);
            String decomposed = Normalizer.normalize(charBuffer, Normalizer.Form.NFKD);

            // `inputChar` may be decomposed to four (or maybe even more) new code points
            int decomposedOffset = 0;
            while (decomposedOffset < decomposed.length()) {
                int resultChar = decomposed.codePointAt(decomposedOffset);

                if (!isStripped(resultChar)) {
                    int lowered = Character.toLowerCase(resultChar);
                    resultString.appendCodePoint(lowered);

                    // `appendCodePoint` writes two chars for supplementary code points, so
                    // record one map entry per char written to keep the map index-aligned
                    // with the result string
                    for (int i = Character.charCount(lowered); i > 0; i--) {
                        resultMap.add(inputOffset);
                    }
                }

                decomposedOffset += Character.charCount(resultChar);
            }

            inputOffset += Character.charCount(inputChar);
        }

        // All combining marks produced by the decomposition were dropped above, so the
        // result is returned as-is, without converting it back to a composed Unicode form.
        return new Pair<>(resultString.toString(), resultMap.toArray());
    }

    /**
     * Tell whether a code point is dropped from the normalized output.
     *
     * @param codePoint code point taken from the NFKD-decomposed text
     * @return {@code true} for combining marks (as produced by the decomposition) and dashes
     */
    private static boolean isStripped(int codePoint) {
        switch (Character.getType(codePoint)) {
            case Character.NON_SPACING_MARK:
            case Character.COMBINING_SPACING_MARK:
                // Some combining character found
                return true;
            case Character.DASH_PUNCTUATION:
                // Some other unwanted character found
                return true;
            default:
                return false;
        }
    }

    /**
     * Make the given string easier to compare by performing a number of simplifications on it
     *
     * @param input string input, with accents and anything else you can think of
     * @return normalized string
     * @see StringNormalizer#normalizeWithMap(String)
     */
    public static String normalize(String input) {
        return StringNormalizer.normalizeWithMap(input).first;
    }
}