/* * Copyright (C) 2010. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 3 or * version 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ package uk.me.parabola.imgfmt.app.labelenc; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Arrays; import java.util.Locale; import uk.me.parabola.log.Logger; /** * A simple transliterator that transliterates character by character based * on pre-prepared tables. It is not context sensitive - the same input character * always produces the same output character(s), so the results are * not very good for languages where that is important. * * Tables are only read when needed, so for a typical map only a small * number of files will actually be read. */ public class TableTransliterator implements Transliterator { private static final Logger log = Logger.getLogger(TableTransliterator.class); private final String[][] rows = new String[256][]; private final boolean useLatin; private boolean forceUppercase; public TableTransliterator(String targetCharset) { if (targetCharset.equals("latin1") || targetCharset.equals("cp1252")) useLatin = true; else useLatin = false; } /** * Convert a string into a string that uses only ascii characters. * * @param s The original string. It can use any unicode character. Can be null in which case null will * be returned. * @return A string that uses only ascii characters that is a transcription or * transliteration of the original string. */ public String transliterate(String s) { if (s == null) return null; StringBuilder sb = new StringBuilder(s.length() + 5); for (char c : s.toCharArray()) { if (c <= (useLatin? 0xff: 0x7f)) { sb.append(c); } else { int row = c >>> 8; String[] rowmap = rows[row]; if (rowmap == null) rowmap = loadRow(row); sb.append(rowmap[c & 0xff]); } } String text = sb.toString(); if (forceUppercase) text = text.toUpperCase(Locale.ENGLISH); return text; } public void forceUppercase(boolean uc) { forceUppercase = uc; } /** * Load one row of characters. This means unicode characters that are of the * form U+RRXX where RR is the row. * * @param row Row number 0-255. * @return An array of strings, one for each character in the row. If there is * no ascii representation then a '?' character will fill that * position. */ private String[] loadRow(int row) { if (rows[row] != null) return rows[row]; String[] newRow = new String[256]; rows[row] = newRow; // Default all to a question mark Arrays.fill(newRow, "?"); // If we are doing latin1, see if there is a specific file for latin // characters first. if (useLatin) { String name = String.format("/chars/latin1/row%02x.trans", row); readCharFile(name, newRow); } // Fill in any remaining characters from the ascii mappings. String name = String.format("/chars/ascii/row%02x.trans", row); readCharFile(name, newRow); return newRow; } private void readCharFile(String name, String[] newRow) { InputStream is = getClass().getResourceAsStream(name); if (is == null) return; try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "utf-8")); String line; while ((line = br.readLine()) != null) { line = line.trim(); if (line.isEmpty() || line.charAt(0) == '#') continue; String[] fields = line.split("\\s+"); if (fields.length < 2) continue; String upoint = fields[0]; String translation = fields[1]; if ("?".equals(translation)) continue; if (upoint.length() != 6 || upoint.charAt(0) != 'U') continue; // The first field must look like 'U+RRXX', we extract the XX part int index = Integer.parseInt(upoint.substring(4), 16); if (newRow[index].equals("?")) { if (forceUppercase) newRow[index] = translation.toUpperCase(Locale.ENGLISH); else newRow[index] = translation; } } } catch (IOException e) { log.error("Could not read character translation table"); } } }