package edu.stanford.nlp.trees.international.pennchinese;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;

import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.util.Generics;

/**
 * This class is a Function which transforms a String of traditional
 * text into a String of simplified text.  It does this by looking for
 * and extracting all single characters from a CEDict file.
 * <br>
 * There are a few hardcoded translations to cover for ambiguities in
 * the simplified translations of traditional characters.  A hardcoded
 * translation always takes precedence over whatever the dictionary
 * file says about the same character.
 *
 * <ul>
 * <li> 鹼: mapped to 碱, although 硷 is listed as a possibility in CEDict.
 * <li> 於: mapped to 于, although 於 is listed as a possibility in CEDict.
 * <li> 祇: mapped to 只, although 祇 is listed as a possibility in CEDict.
 * <li> 彷: sometimes also 彷, but 仿 is more common.
 * <li> 甚: sometimes also 甚, but 什 is more common.
 * <li> 麽: can appear as 幺麽, but very rare.  Hardcoded for now
 *      unless that causes problems.
 * </ul>
 *
 * @author John Bauer
 */
public class TraditionalSimplifiedCharacterMap implements Function<String, String> {

  /** Maps a single traditional character to its simplified replacement. */
  Map<String, String> map = Generics.newHashMap();

  /**
   * Pairs of {traditional, simplified} which override the dictionary
   * file for characters whose dictionary entries are ambiguous.
   */
  String[][] HARDCODED = {{"鹼", "碱"},
                          {"於", "于"},
                          {"祇", "只"},
                          {"彷", "仿"},
                          {"甚", "什"},
                          {"麽", "么"}};

  /**
   * Builds the map from the dictionary file located at {@code CEDict.path()}.
   *
   * @throws RuntimeIOException if the dictionary cannot be read
   */
  public TraditionalSimplifiedCharacterMap() {
    this(CEDict.path());
  }

  /**
   * Builds the map from the UTF-8 encoded dictionary file at {@code path}.
   *
   * @param path location of the dictionary file
   * @throws RuntimeIOException if the dictionary cannot be opened or read
   */
  public TraditionalSimplifiedCharacterMap(String path) {
    // TODO: gzipped maps might be faster
    // try-with-resources closes the streams even when init() throws
    try (FileInputStream fis = new FileInputStream(path);
         InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
         BufferedReader br = new BufferedReader(isr)) {
      init(br);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Fills {@link #map} with the hardcoded translations and then with
   * every single-character entry read from {@code reader}.
   * <br>
   * Lines starting with '#' are skipped.  A line is treated as a
   * single-character entry when its second and fourth characters are
   * spaces; its first character is then the traditional form and its
   * third character the simplified form.  All other lines are ignored.
   * The reader is not closed by this method.
   *
   * @param reader source of dictionary lines
   * @throws RuntimeException if a character which is not hardcoded is
   *         given two different simplified forms
   * @throws RuntimeIOException if reading from {@code reader} fails
   */
  void init(BufferedReader reader) {
    try {
      Set<String> hardcodedSet = Generics.newHashSet();
      for (String[] transform : HARDCODED) {
        String traditional = transform[0];
        String simplified = transform[1];
        hardcodedSet.add(traditional);
        map.put(traditional, simplified);
      }

      String line;
      while ((line = reader.readLine()) != null) {
        if (line.startsWith("#")) {
          continue;
        }
        // We're only interested in lines that represent a single
        // character.  Index 3 is read below, so the line needs at
        // least four characters.
        if (line.length() < 4 || line.charAt(1) != ' ' || line.charAt(3) != ' ') {
          continue;
        }
        String traditional = line.substring(0, 1);
        String simplified = line.substring(2, 3);
        // Our hardcoded list is better than the dictionary for the
        // characters we've hardcoded, so never let the file overwrite it
        if (hardcodedSet.contains(traditional)) {
          continue;
        }
        // Fail on duplicates.  Only a few come up in cedict, and
        // those that do should already be accommodated
        if (map.containsKey(traditional) &&
            !simplified.equals(map.get(traditional))) {
          throw new RuntimeException("Character " + traditional +
                                     " mapped to " + simplified +
                                     " already mapped to " +
                                     map.get(traditional));
        }
        map.put(traditional, simplified);
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Translates {@code input} one char at a time, replacing each char
   * which has an entry in {@link #map} and copying every other char
   * unchanged.
   *
   * @param input text to translate
   * @return the translated text
   */
  @Override
  public String apply(String input) {
    StringBuilder translated = new StringBuilder(input.length());
    for (int i = 0; i < input.length(); ++i) {
      String c = input.substring(i, i + 1);
      translated.append(map.getOrDefault(c, c));
    }
    return translated.toString();
  }

  /**
   * Reads lines from {@code br} until it is exhausted, writing the
   * translation of each line to {@code bw} followed by a line
   * separator.  Neither argument is closed or flushed by this method.
   *
   * @param br source of the text to translate
   * @param bw destination for the translated text
   * @throws RuntimeIOException if reading or writing fails
   */
  public void translateLines(BufferedReader br, BufferedWriter bw) {
    try {
      String line;
      while ((line = br.readLine()) != null) {
        bw.write(apply(line));
        bw.newLine();
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Translates the UTF-8 encoded file {@code input} line by line and
   * writes the result, also UTF-8 encoded, to the file {@code output}.
   *
   * @param input path of the file to read
   * @param output path of the file to write
   * @throws RuntimeIOException if either file cannot be opened, read
   *         or written
   */
  public void translateFile(String input, String output) {
    // Resources are closed in reverse order of declaration, so the
    // writer is flushed and closed first, even if translation fails
    try (FileInputStream fis = new FileInputStream(input);
         InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
         BufferedReader br = new BufferedReader(isr);
         FileOutputStream fos = new FileOutputStream(output);
         OutputStreamWriter osw = new OutputStreamWriter(fos, StandardCharsets.UTF_8);
         BufferedWriter bw = new BufferedWriter(osw)) {
      translateLines(br, bw);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Translates the file named by {@code args[0]} into the file named
   * by {@code args[1]}, using the default dictionary.
   *
   * @param args input path followed by output path
   * @throws IllegalArgumentException if fewer than two arguments are given
   */
  public static void main(String[] args) {
    // Check the arguments before paying for loading the dictionary
    if (args.length < 2) {
      throw new IllegalArgumentException(
          "Usage: java " + TraditionalSimplifiedCharacterMap.class.getName() +
          " <input file> <output file>");
    }
    TraditionalSimplifiedCharacterMap mapper = new TraditionalSimplifiedCharacterMap();
    mapper.translateFile(args[0], args[1]);
  }
}