package edu.berkeley.cs.nlp.ocular.data.textreader;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeMap;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeSet;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.setUnion;
import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;
import static edu.berkeley.cs.nlp.ocular.util.Tuple3.Tuple3;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import edu.berkeley.cs.nlp.ocular.util.StringHelper;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import edu.berkeley.cs.nlp.ocular.util.Tuple3;
import tberg.murphy.indexer.Indexer;
/**
* @author Dan Garrette (dhgarrette@gmail.com)
*/
public class Charset {
public static final String SPACE = " ";
public static final String HYPHEN = "-";
public static final Set<String> LOWERCASE_LATIN_LETTERS = makeSet("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z");
public static final Set<String> LOWERCASE_VOWELS = makeSet("a", "e", "i", "o", "u");
public static final Map<String,String> LIGATURES = makeMap(Tuple2("Æ","AE"), Tuple2("æ","ae"), Tuple2("Œ","OE"), Tuple2("œ","oe"));
public static final String LONG_S = "\u017F"; // ſ
public static final Set<String> BANNED_CHARS = makeSet("@", "$", "%");
/**
* Punctuation symbols that should be made available for any language,
* regardless of whether they are seen in the language model training
* material.
*/
public static final Set<String> UNIV_PUNC = makeSet("&", ".", ",", "[", "]", HYPHEN, "*", "§", "¶");
private static boolean isPunctuation(char c) {
return !Character.isWhitespace(c) && !Character.isAlphabetic(c) && !Character.isDigit(c);
}
public static boolean isPunctuationChar(String s) {
for (char c: removeAnyDiacriticFromChar(s).toCharArray())
if (!isPunctuation(c)) return false;
return true;
}
public static final String GRAVE_COMBINING = "\u0300";
public static final String ACUTE_COMBINING = "\u0301";
public static final String CIRCUMFLEX_COMBINING = "\u0302";
public static final String TILDE_COMBINING = "\u0303";
public static final String MACRON_COMBINING = "\u0304"; // shorter overline
public static final String BREVE_COMBINING = "\u0306";
public static final String DIAERESIS_COMBINING = "\u0308"; // == umlaut
public static final String CEDILLA_COMBINING = "\u0327";
public static final String MACRON_BELOW_COMBINING = "\0331";
private static boolean isCombiningChar(String c) {
return (("\u0300".compareTo(c) <= 0 && c.compareTo("\u036F") <= 0) ||
("\u1AB0".compareTo(c) <= 0 && c.compareTo("\u1AFF") <= 0) ||
("\u1DC0".compareTo(c) <= 0 && c.compareTo("\u1DFF") <= 0) ||
("\u20D0".compareTo(c) <= 0 && c.compareTo("\u20FF") <= 0) ||
("\uFE20".compareTo(c) <= 0 && c.compareTo("\uFE2F") <= 0));
}
public static final String GRAVE_ESCAPE = "\\`";
public static final String ACUTE_ESCAPE = "\\'";
public static final String CIRCUMFLEX_ESCAPE = "\\^";
public static final String TILDE_ESCAPE = "\\~";
public static final String MACRON_ESCAPE = "\\-"; // shorter overline
public static final String BREVE_ESCAPE = "\\v";
public static final String DIAERESIS_ESCAPE = "\\\""; // == umlaut
public static final String CEDILLA_ESCAPE = "\\c";
public static final String MACRON_BELOW_ESCAPE = "\\_";
private static final HashMap<String,String> COMBINING_TO_ESCAPE_MAP = new HashMap<String,String>();
static {
COMBINING_TO_ESCAPE_MAP.put(GRAVE_COMBINING, GRAVE_ESCAPE);
COMBINING_TO_ESCAPE_MAP.put(ACUTE_COMBINING, ACUTE_ESCAPE);
COMBINING_TO_ESCAPE_MAP.put(CIRCUMFLEX_COMBINING, CIRCUMFLEX_ESCAPE);
COMBINING_TO_ESCAPE_MAP.put(TILDE_COMBINING, TILDE_ESCAPE);
COMBINING_TO_ESCAPE_MAP.put(MACRON_COMBINING, MACRON_ESCAPE);
COMBINING_TO_ESCAPE_MAP.put(BREVE_COMBINING, BREVE_ESCAPE);
COMBINING_TO_ESCAPE_MAP.put(DIAERESIS_COMBINING, DIAERESIS_ESCAPE);
COMBINING_TO_ESCAPE_MAP.put(CEDILLA_COMBINING, CEDILLA_ESCAPE);
COMBINING_TO_ESCAPE_MAP.put(MACRON_BELOW_COMBINING, MACRON_BELOW_ESCAPE);
}
// private static String combiningToEscape(String combiningChar) {
// String escape = COMBINING_TO_ESCAPE_MAP.get(combiningChar);
// if (escape != null)
// return escape;
// else
// throw new RuntimeException("Unrecognized combining char: [" + combiningChar + "] (" + StringHelper.toUnicode(combiningChar) + ")");
// }
private static String escapeToCombining(String escSeq) {
if (GRAVE_ESCAPE.equals(escSeq))
return GRAVE_COMBINING;
else if (ACUTE_ESCAPE.equals(escSeq))
return ACUTE_COMBINING;
else if (CIRCUMFLEX_ESCAPE.equals(escSeq))
return CIRCUMFLEX_COMBINING;
else if (TILDE_ESCAPE.equals(escSeq))
return TILDE_COMBINING;
else if (MACRON_ESCAPE.equals(escSeq))
return MACRON_COMBINING;
else if (BREVE_ESCAPE.equals(escSeq))
return BREVE_COMBINING;
else if (DIAERESIS_ESCAPE.equals(escSeq))
return DIAERESIS_COMBINING;
else if (CEDILLA_ESCAPE.equals(escSeq))
return CEDILLA_COMBINING;
else if (MACRON_BELOW_ESCAPE.equals(escSeq))
return MACRON_BELOW_COMBINING;
else
throw new RuntimeException("Unrecognized escape sequence: [" + escSeq + "]");
}
private static final Map<String, String> PRECOMPOSED_TO_ESCAPED_MAP = new HashMap<String, String>();
static {
PRECOMPOSED_TO_ESCAPED_MAP.put("à", "\\`a"); // \`a
PRECOMPOSED_TO_ESCAPED_MAP.put("á", "\\'a"); // \'a
PRECOMPOSED_TO_ESCAPED_MAP.put("â", "\\^a"); // \^a
PRECOMPOSED_TO_ESCAPED_MAP.put("ä", "\\\"a"); // \"a
PRECOMPOSED_TO_ESCAPED_MAP.put("ã", "\\~a"); // \~a
PRECOMPOSED_TO_ESCAPED_MAP.put("ā", "\\-a"); // \-a
PRECOMPOSED_TO_ESCAPED_MAP.put("ă", "\\va"); // \va
PRECOMPOSED_TO_ESCAPED_MAP.put("è", "\\`e"); // \`e
PRECOMPOSED_TO_ESCAPED_MAP.put("é", "\\'e"); // \'e
PRECOMPOSED_TO_ESCAPED_MAP.put("ê", "\\^e"); // \^e
PRECOMPOSED_TO_ESCAPED_MAP.put("ë", "\\\"e"); // \"e
PRECOMPOSED_TO_ESCAPED_MAP.put("ẽ", "\\~e"); // \~e
PRECOMPOSED_TO_ESCAPED_MAP.put("ē", "\\-e"); // \-e
PRECOMPOSED_TO_ESCAPED_MAP.put("ĕ", "\\ve"); // \ve
PRECOMPOSED_TO_ESCAPED_MAP.put("ì", "\\`i"); // \`i
PRECOMPOSED_TO_ESCAPED_MAP.put("í", "\\'i"); // \'i
PRECOMPOSED_TO_ESCAPED_MAP.put("î", "\\^i"); // \^i
PRECOMPOSED_TO_ESCAPED_MAP.put("ï", "\\\"i"); // \"i
PRECOMPOSED_TO_ESCAPED_MAP.put("ĩ", "\\~i"); // \~i
PRECOMPOSED_TO_ESCAPED_MAP.put("ī", "\\-i"); // \-i
PRECOMPOSED_TO_ESCAPED_MAP.put("ĭ", "\\vi"); // \vi
//PRECOMPOSED_TO_ESCAPED_MAP.put("ı", "\\ii"); // \ii
PRECOMPOSED_TO_ESCAPED_MAP.put("ò", "\\`o"); // \`o
PRECOMPOSED_TO_ESCAPED_MAP.put("ó", "\\'o"); // \'o
PRECOMPOSED_TO_ESCAPED_MAP.put("ô", "\\^o"); // \^o
PRECOMPOSED_TO_ESCAPED_MAP.put("ö", "\\\"o"); // \"o
PRECOMPOSED_TO_ESCAPED_MAP.put("õ", "\\~o"); // \~o
PRECOMPOSED_TO_ESCAPED_MAP.put("ō", "\\-o"); // \-o
PRECOMPOSED_TO_ESCAPED_MAP.put("ŏ", "\\vo"); // \vo
PRECOMPOSED_TO_ESCAPED_MAP.put("ù", "\\`u"); // \`u
PRECOMPOSED_TO_ESCAPED_MAP.put("ú", "\\'u"); // \'u
PRECOMPOSED_TO_ESCAPED_MAP.put("û", "\\^u"); // \^u
PRECOMPOSED_TO_ESCAPED_MAP.put("ü", "\\\"u"); // \"u
PRECOMPOSED_TO_ESCAPED_MAP.put("ũ", "\\~u"); // \~u
PRECOMPOSED_TO_ESCAPED_MAP.put("ū", "\\-u"); // \-u
PRECOMPOSED_TO_ESCAPED_MAP.put("ŭ", "\\vu"); // \vu
PRECOMPOSED_TO_ESCAPED_MAP.put("ñ", "\\~n"); // \~n
PRECOMPOSED_TO_ESCAPED_MAP.put("ç", "\\cc"); // \cc
PRECOMPOSED_TO_ESCAPED_MAP.put("À", "\\`A"); // \`A
PRECOMPOSED_TO_ESCAPED_MAP.put("Á", "\\'A"); // \'A
PRECOMPOSED_TO_ESCAPED_MAP.put("Â", "\\^A"); // \^A
PRECOMPOSED_TO_ESCAPED_MAP.put("Ä", "\\\"A"); // \"A
PRECOMPOSED_TO_ESCAPED_MAP.put("Ã", "\\~A"); // \~A
PRECOMPOSED_TO_ESCAPED_MAP.put("Ā", "\\-A"); // \-A
PRECOMPOSED_TO_ESCAPED_MAP.put("Ă", "\\vA"); // \vA
PRECOMPOSED_TO_ESCAPED_MAP.put("È", "\\`E"); // \`E
PRECOMPOSED_TO_ESCAPED_MAP.put("É", "\\'E"); // \'E
PRECOMPOSED_TO_ESCAPED_MAP.put("Ê", "\\^E"); // \^E
PRECOMPOSED_TO_ESCAPED_MAP.put("Ë", "\\\"E"); // \"E
PRECOMPOSED_TO_ESCAPED_MAP.put("Ẽ", "\\~E"); // \~E
PRECOMPOSED_TO_ESCAPED_MAP.put("Ē", "\\-E"); // \-E
PRECOMPOSED_TO_ESCAPED_MAP.put("Ĕ", "\\vE"); // \ve
PRECOMPOSED_TO_ESCAPED_MAP.put("Ì", "\\`I"); // \`I
PRECOMPOSED_TO_ESCAPED_MAP.put("Í", "\\'I"); // \'I
PRECOMPOSED_TO_ESCAPED_MAP.put("Î", "\\^I"); // \^I
PRECOMPOSED_TO_ESCAPED_MAP.put("Ï", "\\\"I"); // \"I
PRECOMPOSED_TO_ESCAPED_MAP.put("Ĩ", "\\~I"); // \~I
PRECOMPOSED_TO_ESCAPED_MAP.put("Ī", "\\-I"); // \-I
PRECOMPOSED_TO_ESCAPED_MAP.put("Ĭ", "\\vI"); // \vI
PRECOMPOSED_TO_ESCAPED_MAP.put("Ò", "\\`O"); // \`O
PRECOMPOSED_TO_ESCAPED_MAP.put("Ó", "\\'O"); // \'O
PRECOMPOSED_TO_ESCAPED_MAP.put("Ô", "\\^O"); // \^O
PRECOMPOSED_TO_ESCAPED_MAP.put("Ö", "\\\"O"); // \"O
PRECOMPOSED_TO_ESCAPED_MAP.put("Õ", "\\~O"); // \~O
PRECOMPOSED_TO_ESCAPED_MAP.put("Ō", "\\-O"); // \-O
PRECOMPOSED_TO_ESCAPED_MAP.put("Ŏ", "\\vO"); // \vO
PRECOMPOSED_TO_ESCAPED_MAP.put("Ù", "\\`U"); // \`U
PRECOMPOSED_TO_ESCAPED_MAP.put("Ú", "\\'U"); // \'U
PRECOMPOSED_TO_ESCAPED_MAP.put("Û", "\\^U"); // \^U
PRECOMPOSED_TO_ESCAPED_MAP.put("Ü", "\\\"U"); // \"U
PRECOMPOSED_TO_ESCAPED_MAP.put("Ũ", "\\~U"); // \~U
PRECOMPOSED_TO_ESCAPED_MAP.put("Ū", "\\-U"); // \-U
PRECOMPOSED_TO_ESCAPED_MAP.put("Ŭ", "\\vU"); // \vU
PRECOMPOSED_TO_ESCAPED_MAP.put("Ñ", "\\~N"); // \~N
PRECOMPOSED_TO_ESCAPED_MAP.put("Ç", "\\cC"); // \cC
// note: superscript is marked \s as in superscript o = \so and superscript r is \sr
//note for "breve" (u over letter) mark \va
}
private static final Map<String, String> PRECOMPOSED_TO_COMBINED_MAP = new HashMap<String, String>();
static {
for (Map.Entry<String, String> entry : PRECOMPOSED_TO_ESCAPED_MAP.entrySet()) {
String value = entry.getValue();
String baseChar = value.substring(value.length() - 1);
String escapeCodes = value.substring(0, value.length() - 1);
if (escapeCodes.length() % 2 != 0) throw new RuntimeException("problem with precomposed mapping: " + value);
StringBuilder baseWithCombining = new StringBuilder(baseChar);
for (int i = escapeCodes.length() - 2; i >= 0; i -= 2)
baseWithCombining.append(escapeToCombining(escapeCodes.substring(i, i + 2)));
PRECOMPOSED_TO_COMBINED_MAP.put(entry.getKey(), baseWithCombining.toString());
}
}
private static final Map<String, String> COMBINED_TO_PRECOMPOSED_MAP = new HashMap<String, String>();
static {
for (Map.Entry<String, String> entry : PRECOMPOSED_TO_COMBINED_MAP.entrySet()) {
COMBINED_TO_PRECOMPOSED_MAP.put(entry.getValue(), entry.getKey());
}
}
public static final Set<String> CHARS_THAT_CAN_BE_REPLACED = setUnion(LOWERCASE_LATIN_LETTERS, makeSet("ç")); // TODO: Change this?
public static final Set<String> VALID_CHAR_SUBSTITUTIONS = LOWERCASE_LATIN_LETTERS; // TODO: Change this?
public static final Set<String> CHARS_THAT_CAN_DOUBLED = LOWERCASE_LATIN_LETTERS; // TODO: Change this?
public static final Set<String> CHARS_THAT_CAN_BE_DECORATED_WITH_AN_ELISION_TILDE = LOWERCASE_LATIN_LETTERS; // TODO: Change this?
public static final Set<String> CHARS_THAT_CAN_BE_ELIDED = LOWERCASE_LATIN_LETTERS; // TODO: Change this?
private static final Set<String> COMBINING_DIACRITICS_THAT_CAN_BE_DISREGARDED = makeSet(GRAVE_COMBINING, ACUTE_COMBINING);
public static final Set<String> LETTERS_WITH_DISREGARDEDABLE_DIACRITICS = LOWERCASE_VOWELS;
public static Set<Integer> makePunctSet(Indexer<String> charIndexer) {
Set<Integer> punctSet = new HashSet<Integer>();
for (String c : charIndexer.getObjects()) {
if (isPunctuationChar(c))
punctSet.add(charIndexer.getIndex(c));
}
return punctSet;
}
public static Set<Integer> makeCanBeReplacedSet(Indexer<String> charIndexer) {
Set<Integer> canBeReplaced = new HashSet<Integer>();
for (String c : charIndexer.getObjects()) {
if (CHARS_THAT_CAN_BE_REPLACED.contains(c))
canBeReplaced.add(charIndexer.getIndex(c));
}
return canBeReplaced;
}
public static Set<Integer> makeValidSubstitutionCharsSet(Indexer<String> charIndexer) {
Set<Integer> validSubstitutionChars = new HashSet<Integer>();
for (String c : charIndexer.getObjects()) {
if (VALID_CHAR_SUBSTITUTIONS.contains(c))
validSubstitutionChars.add(charIndexer.getIndex(c));
}
return validSubstitutionChars;
}
public static Set<Integer> makeValidDoublableSet(Indexer<String> charIndexer) {
Set<Integer> validDoublableChars = new HashSet<Integer>();
for (String c : charIndexer.getObjects()) {
if (CHARS_THAT_CAN_DOUBLED.contains(c))
validDoublableChars.add(charIndexer.getIndex(c));
}
return validDoublableChars;
}
public static Set<Integer> makeCanBeElidedSet(Indexer<String> charIndexer) {
Set<Integer> canBeElided = new HashSet<Integer>();
for (String c : charIndexer.getObjects()) {
if (CHARS_THAT_CAN_BE_ELIDED.contains(c))
canBeElided.add(charIndexer.getIndex(c));
}
return canBeElided;
}
public static Map<Integer,Integer> makeAddTildeMap(Indexer<String> charIndexer) {
Map<Integer,Integer> m = new HashMap<Integer, Integer>();
for (String original : charIndexer.getObjects()) {
Tuple2<String,List<String>> originalLetterAndCombiningDiacritics = normalizeCharSeparateDiacritics(original);
String baseLetter = originalLetterAndCombiningDiacritics._1;
if (CHARS_THAT_CAN_BE_DECORATED_WITH_AN_ELISION_TILDE.contains(original)) {
m.put(charIndexer.getIndex(original), charIndexer.getIndex(addTilde(baseLetter)));
}
else if (LETTERS_WITH_DISREGARDEDABLE_DIACRITICS.contains(baseLetter)) {
for (String diacritic : originalLetterAndCombiningDiacritics._2) {
if (COMBINING_DIACRITICS_THAT_CAN_BE_DISREGARDED.contains(diacritic)) {
m.put(charIndexer.getIndex(original), charIndexer.getIndex(addTilde(baseLetter)));
break;
}
}
}
}
return m;
}
public static Map<Integer,List<Integer>> makeLigatureMap(Indexer<String> charIndexer) {
Map<Integer,List<Integer>> m = new HashMap<Integer, List<Integer>>();
for (Map.Entry<String,String> entry : LIGATURES.entrySet()) {
List<String> ligature = readNormalizeCharacters(entry.getKey());
if (ligature.size() > 1) throw new RuntimeException("Ligature ["+entry.getKey()+"] has more than one character: "+ligature);
List<Integer> l = new ArrayList<Integer>();
for (String c : readNormalizeCharacters(entry.getValue()))
l.add(charIndexer.getIndex(c));
m.put(charIndexer.getIndex(ligature.get(0)), l);
}
return m;
}
public static Map<Integer,Integer> makeDiacriticDisregardMap(Indexer<String> charIndexer) {
Map<Integer,Integer> m = new HashMap<Integer,Integer>();
for (String original : charIndexer.getObjects()) { // find accented letters
Tuple2<String,List<String>> originalLetterAndCombiningDiacritics = normalizeCharSeparateDiacritics(original);
String baseLetter = originalLetterAndCombiningDiacritics._1;
if (LETTERS_WITH_DISREGARDEDABLE_DIACRITICS.contains(baseLetter)) {
for (String diacritic : originalLetterAndCombiningDiacritics._2) {
if (COMBINING_DIACRITICS_THAT_CAN_BE_DISREGARDED.contains(diacritic)) {
m.put(charIndexer.getIndex(original), charIndexer.getIndex(baseLetter));
break;
}
}
}
}
return m;
}
public static String addTilde(String c) {
return normalizeChar(c + TILDE_COMBINING);
}
/**
* Get the character code including any escaped diacritics that precede
* the letter and any unicode "combining characters" that follow it.
*
* Precomposed accents are given the highest priority. Combining characters
* are interpreted as left-associative and high-priority, while escapes are
* right-associative and low-priority. So, for a letter x with precomposed
* diacritic 0, combining chars 1,2,3, and escapes 4,5,6, the input 654x123
* becomes encoded (with escapes) as 6543210x, and decoded (with precomposed
* and combining characters) as x01234656.
*
* @param c A single character, potentially with diacritics encoded in any
* form (composed, precomposed, escaped).
* @return A string representing a single fully-escaped character, with all
* diacritics (combining and precomposed) converted to their equivalent escape
* sequences.
* @throws RuntimeException if the parameter `s` does not represent a single
* (potentially composed or escaped) character.
*/
public static String normalizeChar(String c) {
Tuple2<String, List<String>> letterAndDiacritics = normalizeCharSeparateDiacritics(c);
return letterAndDiacritics._1 + StringHelper.join(letterAndDiacritics._2);
}
/**
* @see edu.berkeley.cs.nlp.ocular.data.textreader.textreader.Charset.normalizeChar
*
* @param c A single character, potentially with diacritics encoded in any
* form (composed, precomposed, escaped).
* @return A fully-normalized character, with all diacritics (combining and
* precomposed) converted to their equivalent normalized forms and placed in
* a list to be returned with the bare letter.
* @throws RuntimeException if the parameter `s` does not represent a single
* (potentially composed or escaped) character.
*/
public static Tuple2<String,List<String>> normalizeCharSeparateDiacritics(String c) {
Tuple3<String, List<String>, Integer> letterAndLength = readLetterAndNormalDiacriticsAt(c, 0);
int length = letterAndLength._3;
if (c.length() != length) throw new RuntimeException("Could not escape ["+c+"] because it contains more than one character ("+StringHelper.toUnicode(c)+")");
return Tuple2(letterAndLength._1, letterAndLength._2);
}
/**
* Read a single character from the line, starting at the given offset.
*
* @see edu.berkeley.cs.nlp.ocular.data.textreader.textreader.Charset.normalizeChar
*
* @param line A line of text possibly containing characters with diacritics
* composed, precomposed, or escaped.
* @param offset The offset point in `line` from which to start reading for a
* character.
* @return A fully-normalized character string, with all diacritics (combining
* and precomposed) converted to their equivalent combining forms. Also
* return the length in the ORIGINAL string of the span used to produce this
* normalized character (to use as an offset when scanning through the string).
*/
private static Tuple2<String, Integer> readNormalizeCharAt(String line, int offset) {
Tuple3<String, List<String>, Integer> result = readLetterAndNormalDiacriticsAt(line, offset);
String c = result._1 + StringHelper.join(result._2);
int length = result._3;
return Tuple2(c, length);
}
/**
* Read a single character from the line including a list of all its diacritics,
* starting at the given offset.
*
* @see edu.berkeley.cs.nlp.ocular.data.textreader.textreader.Charset.normalizeChar
*
* @param line A line of text possibly containing characters with diacritics
* composed, precomposed, or normalized.
* @param offset The offset point in `line` from which to start reading for a
* character.
* @return A fully-normalized character, with all diacritics (combining and
* precomposed) converted to their equivalent combining forms and put in a list,
* the base letter with all diacritics removed, and the length in the ORIGINAL
* string of the span used to produce this normalized character (to use as an
* offset when scanning through the string).
*/
private static Tuple3<String, List<String>, Integer> readLetterAndNormalDiacriticsAt(String line, int offset) {
int lineLen = line.length();
if (offset >= lineLen) throw new RuntimeException("offset must be less than the line length");
if (lineLen - offset >= 2 && line.substring(offset, offset + 2).equals("\\\\"))
return Tuple3("\\\\", (List<String>)new ArrayList<String>(), 2); // "\\" is its own character (for "\"), not an escaped diacritic
List<String> escapeDiacritics = new ArrayList<String>(); // in reversed order!
List<String> combiningDiacritics = new ArrayList<String>();
// get any escape prefixes characters
int i = offset;
while (i < lineLen && line.charAt(i) == '\\') {
if (i + 1 >= lineLen) throw new RuntimeException("expected more after escape symbol, but found nothing: " + i + "," + lineLen + " " + line.substring(Math.max(0, i - 10), i) + "[" + line.substring(i) + "]");
String escape = line.substring(i, i + 2);
escapeDiacritics.add(0, escape);
i += 2; // accept the 2-character escape sequence
}
if (i >= lineLen) throw new RuntimeException("expected a letter after escape code, but found nothing: " + i + "," + lineLen + " " + line.substring(Math.max(0, i - 50), i) + "[" + line.substring(i) + "]");
String letter = String.valueOf(line.charAt(i));
if (isCombiningChar(letter)) throw new RuntimeException("found unexpected combining char: " + i + "," + lineLen + " " + line.substring(Math.max(0, i - 50), i) + "[" + line.substring(i) + "]");
i += 1; // accept the letter itself
// get any combining characters
while (i < lineLen) {
String next = line.substring(i, i + 1);
if (!isCombiningChar(next)) break;
combiningDiacritics.add(next);
i++; // accept the combining character
}
String deprecomposedChar = PRECOMPOSED_TO_COMBINED_MAP.get(letter);
String letterOnly;
if (deprecomposedChar == null) {
letterOnly = letter;
}
else {
letterOnly = String.valueOf(deprecomposedChar.charAt(0));
for (int j = 1; j < deprecomposedChar.length(); ++j)
combiningDiacritics.add(0, String.valueOf(deprecomposedChar.charAt(j)));
}
for (String diacritic : escapeDiacritics) {
if (diacritic.equals("\\i")) {
if (!letterOnly.equals("i")) throw new RuntimeException("the \\i escape sequence can only be used on the character 'i' (to indicate a no-dot i)");
letterOnly = "ı";
}
else {
combiningDiacritics.add(escapeToCombining(diacritic));
}
}
if (letterOnly.length() != 1) throw new RuntimeException("base letter should be length 1, found: " + letterOnly);
if (!combiningDiacritics.isEmpty()) {
char letterChar = letterOnly.charAt(0);
if (!(Character.isAlphabetic(letterChar)))
throw new RuntimeException("because there were diacritics, letter is expected, but something else was found: " + i + "," + lineLen + " " + line.substring(Math.max(0, i - 50), i) + "[" + line.substring(i) + "]");
}
return Tuple3(letterOnly, combiningDiacritics, i - offset);
}
/**
* Convert a string into a sequence of diacritic-normalized characters.
*
* @see edu.berkeley.cs.nlp.ocular.data.textreader.textreader.Charset.normalizeChar
*
* @param line A line of text possibly containing characters with diacritics
* composed, precomposed, or escaped.
* @return A fully-normalized character string, with all diacritics (combining
* and precomposed) converted to their equivalent combining chars.
*/
public static List<String> readNormalizeCharacters(String line) {
List<String> normalizedChars = new ArrayList<String>();
int i = 0;
while (i < line.length()) {
Tuple2<String, Integer> normalizedCharAndLength = readNormalizeCharAt(line, i);
String c = normalizedCharAndLength._1;
int length = normalizedCharAndLength._2;
normalizedChars.add(c);
i += length; // advance to the next character
}
return normalizedChars;
}
/**
* Convert character into unicode precomposed and combining characters
*/
public static String unescapeChar(String c, boolean precomposedOnly) {
if (c.equals("\\\\")) return "\\";
Tuple2<String,List<String>> letterAndNormalDiacritics = normalizeCharSeparateDiacritics(c); // use combining chars only (and make sure it's a valid character)
String baseLetter = letterAndNormalDiacritics._1;
List<String> diacritics = letterAndNormalDiacritics._2;
if (diacritics.isEmpty()) return baseLetter;
StringBuilder b = new StringBuilder();
// Attempt to make a precomposed letter, falling back to composed otherwise
String firstDiacritic = diacritics.get(0);
String precomposed = COMBINED_TO_PRECOMPOSED_MAP.get(baseLetter + firstDiacritic);
if (precomposed != null)
b.append(precomposed);
else {
b.append(baseLetter);
if (!precomposedOnly) b.append(firstDiacritic);
}
if (precomposedOnly) {
// Handle the rest of the diacritics
for (int i = (precomposed != null ? 1 : 0); i < diacritics.size(); ++i) {
String escape = COMBINING_TO_ESCAPE_MAP.get(diacritics.get(i));
if (escape != null)
b.insert(0, escape);
else
b.append(StringHelper.toUnicode(diacritics.get(i)));
}
}
else {
// Handle the rest of the diacritics
for (int i = 1; i < diacritics.size(); ++i) {
b.append(diacritics.get(i));
}
}
return b.toString();
}
/**
* Convert character into unicode precomposed and combining characters
*/
public static String unescapeChar(String c) {
return unescapeChar(c, false);
}
/**
* Convert character into a base character and explicit escape sequences
*/
public static String fullyEscapeChar(String c) {
if (c.equals("\\\\")) return c;
Tuple2<String,List<String>> letterAndNormalDiacritics = normalizeCharSeparateDiacritics(c); // use combining chars only (and make sure it's a valid character)
String baseLetter = letterAndNormalDiacritics._1;
List<String> diacritics = letterAndNormalDiacritics._2;
if (baseLetter.equals("ı"))
baseLetter = "\\ii";
if (diacritics.isEmpty()) return baseLetter;
StringBuilder b = new StringBuilder(baseLetter);
// Handle the rest of the diacritics
for (int i = 0; i < diacritics.size(); ++i) {
String escape = COMBINING_TO_ESCAPE_MAP.get(diacritics.get(i));
if (escape != null)
b.insert(0, escape);
else
b.append(StringHelper.toUnicode(diacritics.get(i)));
}
return b.toString();
}
public static String removeAnyDiacriticFromChar(String c) {
return normalizeCharSeparateDiacritics(c)._1;
}
}