package ruc.irm.similarity.util; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; /** * 拼音处理的工具,负责从拼音词典加载内容,根据汉字词语或汉字查找拼音 * * @author <a href="mailto:iamxiatian@gmail.com">夏天</a> * @organization 中国人民大学信息资源管理学院 知识工程实验室 */ public class PinyinUtils { /** 拼音的Map词典, 一个汉字可能对应多个拼音, 它所有的拼音放到一个集合中 */ private Map<Character, Set<String>> pinyinDict = null; /** 单例 */ private static PinyinUtils instance = null; private PinyinUtils() throws IOException{ //从classpath中加载拼音词典文件 String pinyinDictFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/F02-GB2312-to-PuTongHua-PinYin.txt"; InputStream input = this.getClass().getClassLoader().getResourceAsStream(pinyinDictFile); BufferedReader in = new BufferedReader(new InputStreamReader(input, "UTF-8")); String line = null; MyTraverseEvent event = new MyTraverseEvent(); while ((line = in.readLine()) != null) { event.visit(line); } input.close(); in.close(); this.pinyinDict = event.getPinyins(); } public static PinyinUtils getInstance(){ if(instance == null){ try { instance = new PinyinUtils(); } catch (IOException e) { e.printStackTrace(); } } return instance; } /** * 获取汉字的拼音, 由于汉字具有多音字,故返回一个集合 * @param hanzi * @return */ public Set<String> getPinyin(Character hanzi){ Set<String> set = pinyinDict.get(hanzi); if(set==null || set.size()==0){ set = new HashSet<String>(); set.add(hanzi.toString()); } return set; } /** * 获取词语的拼音, 一个词语可能对应多个拼音,把所有可能的组合放到集合中返回 * @param word * @return */ public Set<String> getPinyin(String word){ Set<String> word_set = new HashSet<String>(); for(int i=0; i<word.length(); i++){ Set<String> hanzi_set = getPinyin(word.charAt(i)); if(word_set==null || word_set.size()==0){ word_set.addAll(hanzi_set); continue; } Set<String> tmp_set = new HashSet<String>(); for(String w:word_set){ for(String h:hanzi_set){ tmp_set.add(w + h); } } word_set = tmp_set; } return word_set; } /** * 获取拼音字符串,多音字只取一个 * @param word * @return */ public String getPinyinSingle(String word){ StringBuffer sb = new StringBuffer(); for(int i=0; i<word.length(); i++){ sb.append(getPinyin(word.charAt(i)).iterator().next()); } return sb.toString(); } /** * 获取拼音串,对于多音字,给出所有拼音 * @param word * @return */ public String getPinyinString(String word){ StringBuffer sb = new StringBuffer(); for(int i=0; i<word.length(); i++){ Set<String> pinyin = getPinyin(word.charAt(i)); sb.append(pinyin.toString()); } return sb.toString(); } /** * 获取拼音首字母 * @param word * @return */ public String getPinyinHead(String word){ StringBuffer sb = new StringBuffer(); for(int i=0; i<word.length(); i++){ sb.append(getPinyin(word.charAt(i)).iterator().next().charAt(0)); } return sb.toString(); } private static class MyTraverseEvent { /** 一个汉字对应多个拼音, 多个拼音放到集合中 */ private Map<Character, Set<String>> pinyins = null; public MyTraverseEvent(){ this.pinyins = new HashMap<Character, Set<String>>(); } public Map<Character, Set<String>> getPinyins(){ return pinyins; } public boolean visit(String item) { if(item.startsWith("//")){ return true; } char hanzi = item.charAt(0); //String pinyin = item.substring(2, item.length()-1); String pinyin = item.substring(2, item.length()); Set<String> set = pinyins.get(hanzi); if(set==null){ set = new HashSet<String>(); } set.add(pinyin); pinyins.put(hanzi, set); return true; } } }