TraditionalSimplifiedCharacterMap.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.trees.international.pennchinese;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.OutputStreamWriter;

import java.util.Map;
import java.util.Set;

import edu.stanford.nlp.io.RuntimeIOException;
import java.util.function.Function;
import edu.stanford.nlp.util.Generics;

/**
 * This class is a Function which transforms a String of traditional
 * text into a string of simplified text.  It does this by looking for
 * and extracting all single characters from a CEDict file.
 * <br>
 * There are a few hardcoded translations to cover for ambiguities in
 * the simplified translations of traditional characters.
 *
 * <ul>
 * <li> 鹼: mapped to 碱, although 硷 is listed as a possibility in CEDict.
 * <li> 於: mapped to 于, although 於 is listed as a possibility in CEDict.
 * <li> 祇: mapped to 只, although 祇 is listed as a possibility in CEDict.
 * <li> 彷: sometimes also 彷, but 仿 is more common.
 * <li> 甚: sometimes also 甚, but 什 is more common.
 * <li> 麽: can appear as 幺麽, but very rare.  Hardcoded for now
 *          unless that causes problems.
 * </ul>
 * @author John Bauer
 */
public class TraditionalSimplifiedCharacterMap implements Function<String, String> {
  Map<String, String> map = Generics.newHashMap();

  String[][] HARDCODED = {{"鹼", "碱"},
                          {"於", "于"},
                          {"祇", "只"},
                          {"彷", "仿"},
                          {"甚", "什"},
                          {"麽", "么"}};

  public TraditionalSimplifiedCharacterMap() {
    this(CEDict.path());
  }

  public TraditionalSimplifiedCharacterMap(String path) {
    // TODO: gzipped maps might be faster
    try {
      FileInputStream fis = new FileInputStream(path);
      InputStreamReader isr = new InputStreamReader(fis, "utf-8");
      BufferedReader br = new BufferedReader(isr);
      init(br);
      br.close();
      isr.close();
      fis.close();
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  void init(BufferedReader reader) {
    try {
      Set<String> hardcodedSet = Generics.newHashSet();
      for (String[] transform : HARDCODED) {
        hardcodedSet.add(transform[0]);
        String traditional = transform[0];
        String simplified = transform[1];
        map.put(traditional, simplified);
      }

      String line;
      while ((line = reader.readLine()) != null) {
        if (line.startsWith("#")) {
          continue;
        }
        if (line.length() >= 3 &&
            line.charAt(1) == ' ' && line.charAt(3) == ' ') {
          // We're only interested in lines that represent a single character
          String traditional = line.substring(0, 1);
          String simplified = line.substring(2, 3);
          // Fail on duplicates.  Only a few come up in cedict, and
          // those that do should already be accommodated
          if (map.containsKey(traditional) && !hardcodedSet.contains(traditional) &&
              !simplified.equals(map.get(traditional))) {
            throw new RuntimeException("Character " + traditional + " mapped to " +
                                       simplified + " already mapped to " +
                                       map.get(traditional));
          }
          map.put(traditional, simplified);
        }
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  public String apply(String input) {
    StringBuilder translated = new StringBuilder();
    for (int i = 0; i < input.length(); ++i) {
      String c = input.substring(i, i + 1);
      if (map.containsKey(c)) {
        translated.append(map.get(c));
      } else {
        translated.append(c);
      }
    }
    return translated.toString();
  }

  public void translateLines(BufferedReader br, BufferedWriter bw) {
    try {
      String line;
      while ((line = br.readLine()) != null) {
        bw.write(apply(line));
        bw.newLine();
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  public void translateFile(String input, String output) {
    try {
      FileInputStream fis = new FileInputStream(input);
      InputStreamReader isr = new InputStreamReader(fis, "utf-8");
      BufferedReader br = new BufferedReader(isr);

      FileOutputStream fos = new FileOutputStream(output);
      OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
      BufferedWriter bw = new BufferedWriter(osw);

      translateLines(br, bw);

      bw.close();
      osw.close();
      fos.close();

      br.close();
      isr.close();
      fis.close();
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  public static void main(String[] args) {
    TraditionalSimplifiedCharacterMap mapper = new TraditionalSimplifiedCharacterMap();
    mapper.translateFile(args[0], args[1]);
  }
}