package com.cloudhopper.commons.charset;

/*
 * #%L
 * ch-commons-charset
 * %%
 * Copyright (C) 2012 Cloudhopper by Twitter
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.text.Normalizer;

/**
 * Utility class for working with text used on mobile phones (primarily SMS).
 * Helpful methods for converting unicode characters into their ascii equivalents
 * such as smart quotes to dumb quotes.
 *
 * @author joelauer (twitter: @jjlauer or <a href="http://twitter.com/jjlauer" target=window>http://twitter.com/jjlauer</a>)
 */
public class MobileTextUtil {

    /**
     * Replacement table used by {@link #replaceSafeUnicodeChars(StringBuilder)}.
     * Each row is a pair: index 0 is the source character to look for and
     * index 1 is the character that replaces it.
     * <p>
     * See http://en.wikipedia.org/wiki/Quotation_mark_glyphs
     * <p>
     * NOTE: U+0131 (lower case dotless i) to '1' and U+0130 (capital I with
     * dot) to 'i' used to be part of this table. They were removed at the
     * recommendation of Turkcell because those replacements changed the
     * meaning of the text too much.
     */
    public static final char[][] CHAR_TABLE = {
        { '\u2013', '-' },      // en dash
        { '\u2014', '-' },      // em dash
        { '\u2018', '\'' },     // left single quotation mark
        { '\u2019', '\'' },     // right single quotation mark
        { '\u201A', '\'' },     // single low-9 quotation mark
        { '\u201B', '\'' },     // single high-reversed-9 quotation mark (single reversed comma)
        { '\u201C', '"' },      // left double quotation mark
        { '\u201D', '"' },      // right double quotation mark
        { '\u201E', '"' },      // double low-9 quotation mark
        { '\u201F', '"' },      // double high-reversed-9 quotation mark (double reversed comma)
        { '\u2020', '+' },      // dagger
        { '\u2022', '.' },      // bullet
        { '\u2026', '.' },      // horizontal ellipsis; actually "...", but just replacing with "."
        { '\u2039', '<' },      // single left-pointing angle quotation mark
        { '\u203A', '>' },      // single right-pointing angle quotation mark
    };

    /** Chars below this value are plain ASCII and never have a decomposition. */
    private static final int FIRST_NON_ASCII = 0x80;

    /** First code point of the Unicode "Combining Diacritical Marks" block. */
    private static final int DIACRITICS_FIRST = 0x0300;

    /** Last code point of the Unicode "Combining Diacritical Marks" block. */
    private static final int DIACRITICS_LAST = 0x036F;

    /**
     * Replace unicode characters with their ascii equivalents, limiting
     * replacement to "safe" characters such as smart quotes to dumb quotes.
     * "Safe" is subjective, but generally the agreement is that these character
     * replacements should not change the meaning of the string in any meaningful
     * way. The replacement is done in place, one char for one char, so the
     * length of the buffer never changes.
     *
     * @param buffer The buffer containing the characters to analyze and replace
     *      if necessary.
     * @return The number of characters replaced
     */
    public static int replaceSafeUnicodeChars(StringBuilder buffer) {
        int replaced = 0;
        for (int i = 0; i < buffer.length(); i++) {
            char c = buffer.charAt(i);
            for (int j = 0; j < CHAR_TABLE.length; j++) {
                if (c == CHAR_TABLE[j][0]) {
                    buffer.setCharAt(i, CHAR_TABLE[j][1]);
                    replaced++;
                    // a position is replaced (and counted) at most once, so
                    // there is no point in scanning the rest of the table
                    break;
                }
            }
        }
        return replaced;
    }

    /**
     * Replace accented characters with their unaccented base character. For
     * example, convert U+00E9 (e with acute) to a plain "e".
     * <p>
     * Each char of the buffer is looked at on its own: if its canonical
     * decomposition (NFD) is a base character followed only by marks from the
     * Unicode "Combining Diacritical Marks" block (U+0300 to U+036F), the char
     * is overwritten in place with that base character. Everything else is left
     * exactly as it was, which means:
     * <ul>
     *   <li>the length of the buffer never changes;</li>
     *   <li>characters that decompose into something other than a base plus
     *       diacritics (for example Hangul syllables) are not modified and are
     *       not counted;</li>
     *   <li>a character carrying several accents counts as one replacement;</li>
     *   <li>combining marks that are already separate characters in the buffer
     *       are not removed.</li>
     * </ul>
     *
     * @param buffer The buffer containing the characters to analyze and replace
     *      if necessary.
     * @return The number of characters replaced
     */
    public static int replaceAccentedChars(StringBuilder buffer) {
        int replaced = 0;
        for (int i = 0; i < buffer.length(); i++) {
            char c = buffer.charAt(i);
            // fast path: ascii has nothing to strip, and one half of a
            // surrogate pair cannot be decomposed on its own
            if (c < FIRST_NON_ASCII || Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
                continue;
            }
            String decomposed = Normalizer.normalize(String.valueOf(c), Normalizer.Form.NFD);
            if (isBaseFollowedByDiacritics(decomposed)) {
                buffer.setCharAt(i, decomposed.charAt(0));
                replaced++;
            }
        }
        return replaced;
    }

    /**
     * Checks whether a decomposed sequence is one base character followed by
     * one or more combining diacritical marks and nothing else.
     */
    private static boolean isBaseFollowedByDiacritics(String decomposed) {
        if (decomposed.length() < 2 || isCombiningDiacritic(decomposed.charAt(0))) {
            return false;
        }
        for (int i = 1; i < decomposed.length(); i++) {
            if (!isCombiningDiacritic(decomposed.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** Checks whether a char lies in the "Combining Diacritical Marks" block. */
    private static boolean isCombiningDiacritic(char c) {
        return c >= DIACRITICS_FIRST && c <= DIACRITICS_LAST;
    }
}