// ChineseEnglishWordMap -- a mapping from Chinese to English words. // Copyright (c) 2002, 2003, 2004 The Board of Trustees of // The Leland Stanford Junior University. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // Map is taken from CEDict Chinese-English Lexicon. Future versions // will support multiple Lexicons. // // http://www.mandarintools.com/cedict.html // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. // // For more information, bug reports, fixes, contact: // Christopher Manning // Dept of Computer Science, Gates 1A // Stanford CA 94305-9010 // USA // Support/Questions: java-nlp-user@lists.stanford.edu // Licensing: java-nlp-support@lists.stanford.edu package edu.stanford.nlp.trees.international.pennchinese; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.trees.TreebankLanguagePack; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.StringUtils; import java.io.*; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * A class for mapping Chinese words to English. Uses CEDict free Lexicon. * * @author Galen Andrew */ public class ChineseEnglishWordMap implements Serializable { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ChineseEnglishWordMap.class); /** * */ private static final long serialVersionUID = 7655332268578049993L; private Map<String, Set<String>> map = Generics.newHashMap(10000); // large dictionary! private static final String defaultPattern = "[^ ]+ ([^ ]+)[^/]+/(.+)/"; private static final String defaultDelimiter = "[/;]"; private static final String defaultCharset = "UTF-8"; private static final String punctuations[] = { "\uff08.*?\uff09", "\\(.*?\\)", "<.*?>", "[\u2033\u20dd\u25cb\u25ef\u2039\u2329\u27e8\u203a\u232a\u27e9\u00ab\u27ea\u00bb\u27eb\u2308\u230b\u27e6\u27e7\u3030\uff5e\u201c\u2036\u201d\u2033\u2307\u301c\u3012\u29c4\u300a\u300b\u3000]", "^to "}; private static final boolean DEBUG = false; private boolean normalized = false; /** * SingletonHolder is loaded on the first execution of getInstance(). */ private static class SingletonHolder { private SingletonHolder() {} private final static ChineseEnglishWordMap INSTANCE = new ChineseEnglishWordMap(); } /** * A method for getting a singleton instance of this class. * In general, you should use this method rather than the constructor, * since each instance of the class is a large data file in memory. * * @return An instance of ChineseEnglishWordMap */ public static ChineseEnglishWordMap getInstance() { return SingletonHolder.INSTANCE; } /** * Does the word exist in the dictionary? * @param key The word in Chinese * @return Whether it is in the dictionary */ public boolean containsKey(String key) { key = key.toLowerCase(); key = key.trim(); return map.containsKey(key); } /** * * @param key a Chinese word * @return the English translation (null if not in dictionary) */ public Set<String> getAllTranslations(String key) { key = key.toLowerCase(); key = key.trim(); return map.get(key); } /** * * @param key a Chinese word * @return the English translations as an array (null if not in dictionary) */ public String getFirstTranslation(String key) { key = key.toLowerCase(); key = key.trim(); Set<String> strings = map.get(key); if (strings == null) return null; else return strings.iterator().next(); } public void readCEDict(String dictPath) { readCEDict(dictPath, defaultPattern, defaultDelimiter, defaultCharset); } private String normalize(String t) { String origT; if (DEBUG) { origT = t; } if ( ! this.normalized) { return t; } for (String punc : punctuations) { t = t.replaceAll(punc, ""); } t = t.trim(); if (DEBUG && !origT.equals(t)) { log.info("orig="+origT); log.info("norm="+t); } return t; } private Set<String> normalize(Set<String> trans) { if (!this.normalized) { return trans; } Set<String> set = Generics.newHashSet(); for (String t : trans) { t = normalize(t); if ( ! t.equals("")) { set.add(t); } } return set; } public void readCEDict(String dictPath, String pattern, String delimiter, String charset) { try { BufferedReader infile = new BufferedReader(new InputStreamReader(new FileInputStream(dictPath), charset)); Pattern p = Pattern.compile(pattern); for (String line = infile.readLine(); line != null; line = infile.readLine()) { Matcher m = p.matcher(line); if (m.matches()) { String word = (m.group(1)).toLowerCase(); word = word.trim(); // don't want leading or trailing spaces String transGroup = m.group(2); String[] trans = transGroup.split(delimiter); // TODO: strip out punctuations from translation if (map.containsKey(word)) { Set<String> oldtrans = map.get(word); for (String t : trans) { t = normalize(t); if ( ! t.equals("")) { if ( ! oldtrans.contains(t)) { oldtrans.add(t); } } } } else { Set<String> transList = new LinkedHashSet<>(Arrays.asList(trans)); String normW = normalize(word); Set<String> normSet = normalize(transList); if ( ! normW.equals("") && normSet.size() > 0) { map.put(normW, normSet); } } } } infile.close(); } catch (IOException e) { throw new RuntimeException("IOException reading CEDict from file " + dictPath, e); } } /** * Make a ChineseEnglishWordMap with a default CEDict path. * It looks for the file "cedict_ts.u8" in the working directory, for the * value of the CEDICT environment variable, and in a Stanford NLP Group * specific place. It throws an exception if a dictionary cannot be found. */ public ChineseEnglishWordMap() { String path = CEDict.path(); readCEDict(path); } /** * Make a ChineseEnglishWordMap * @param dictPath the path/filename of the CEDict */ public ChineseEnglishWordMap(String dictPath) { readCEDict(dictPath); } /** * Make a ChineseEnglishWordMap * @param dictPath the path/filename of the CEDict * @param normalized whether the entries in dictionary are normalized or not */ public ChineseEnglishWordMap(String dictPath, boolean normalized) { this.normalized = normalized; readCEDict(dictPath); } public ChineseEnglishWordMap(String dictPath, String pattern, String delimiter, String charset) { readCEDict(dictPath, pattern, delimiter, charset); } public ChineseEnglishWordMap(String dictPath, String pattern, String delimiter, String charset, boolean normalized) { this.normalized = normalized; readCEDict(dictPath, pattern, delimiter, charset); } private static boolean isDigits(String in) { for (int i = 0, len = in.length(); i < len; i++) { if ( ! Character.isDigit(in.charAt(i))) { return false; } } return true; } /** * Returns a reversed map of the current map. * * @return A reversed map of the current map. */ public Map<String, Set<String>> getReverseMap() { Set<Map.Entry<String,Set<String>>> entries = map.entrySet(); Map<String, Set<String>> rMap = Generics.newHashMap(entries.size()); for (Map.Entry<String,Set<String>> me : entries) { String k = me.getKey(); Set<String> transList = me.getValue(); for (String trans : transList) { Set<String> entry = rMap.get(trans); if (entry == null) { // reduce default size as most will be small Set<String> toAdd = new LinkedHashSet<>(6); toAdd.add(k); rMap.put(trans, toAdd); } else { entry.add(k); } } } return rMap; } /** * Add all of the mappings from the specified map to the current map. */ public int addMap(Map<String, Set<String>> addM) { int newTrans = 0; for (Map.Entry<String,Set<String>> me : addM.entrySet()) { String k = me.getKey(); Set<String> addList = me.getValue(); Set<String> origList = map.get(k); if (origList == null) { map.put(k, new LinkedHashSet<>(addList)); Set<String> newList = map.get(k); if (newList != null && newList.size() != 0) { newTrans+=addList.size(); } } else { for (String toAdd : addList) { if (!(origList.contains(toAdd))) { origList.add(toAdd); newTrans++; } } } } return newTrans; } @Override public String toString() { return map.toString(); } public int size() { return map.size(); } /** * The main method reads (segmented, whitespace delimited) words from a file * and prints them with their English translation(s). * * The path and filename of the CEDict Lexicon can be supplied via the * "-dictPath" flag; otherwise the default filename "cedict_ts.u8" in the * current directory is checked. * * By default, only the first translation is printed. If the "-all" flag * is given, all translations are printed. * * The input and output encoding can be specified using the "-encoding" flag. * Otherwise UTF-8 is assumed. */ public static void main(String[] args) throws IOException { Map<String, Integer> flagsToNumArgs = Generics.newHashMap(); flagsToNumArgs.put("-dictPath" , 1); flagsToNumArgs.put("-encoding" , 1); Map<String, String[]> argMap = StringUtils.argsToMap(args, flagsToNumArgs); String[] otherArgs = argMap.get(null); if (otherArgs.length < 1) { log.info("usage: ChineseEnglishWordMap [-all] [-dictPath path] [-encoding enc_string] inputFile"); System.exit(1); } String filename = otherArgs[0]; boolean allTranslations = argMap.containsKey("-all"); String charset = defaultCharset; if (argMap.containsKey("-encoding")) { charset = argMap.get("-encoding")[0]; } BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset)); TreebankLanguagePack tlp = new ChineseTreebankLanguagePack(); String[] dpString = argMap.get("-dictPath"); ChineseEnglishWordMap cewm = (dpString == null) ? new ChineseEnglishWordMap() : new ChineseEnglishWordMap(dpString[0]); int totalWords = 0, coveredWords = 0; PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, charset), true); for (String line = r.readLine(); line != null; line = r.readLine()) { String[] words = line.split("\\s", 1000); for (String word : words) { totalWords++; if (word.length() == 0) continue; pw.print(StringUtils.pad(word + ':', 8)); if (tlp.isPunctuationWord(word)) { totalWords--; pw.print(word); } else if (isDigits(word)) { pw.print(word + " [NUMBER]"); } else if (cewm.containsKey(word)) { coveredWords++; if (allTranslations) { List<String> trans = new ArrayList<>(cewm.getAllTranslations(word)); for (String s : trans) { pw.print((trans.indexOf(s) > 0 ? "|" : "") + s); } } else { pw.print(cewm.getFirstTranslation(word)); } } else { pw.print("[UNK]"); } pw.println(); } pw.println(); } r.close(); log.info("Finished translating " + totalWords + " words ("); log.info(coveredWords + " were in dictionary)."); } }