/*
* Copyright (c) 2008-2016 Computer Network Information Center (CNIC), Chinese Academy of Sciences.
*
* This file is part of Duckling project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package cn.vlabs.umt.common.util;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.log4j.Logger;
/**
* @date 2011-11-15
* @author JohnX
*/
public final class PinyinUtil {
private PinyinUtil() {
}
public static final char CH_START = '\u4e00';
public static final char CH_END = '\u9fa5';
public static final char SPLIT_CHAR = ';';
public static final int CH_CODE_VALUE = 128;
public static final int DB_CHNAME_LEN = 3;
private static final Logger LOG = Logger.getLogger(PinyinUtil.class);
/**
* 只获得拼音 比如"拼音",获得pinyin
* */
public static String getPinyinOnly(String str) {
return getPart(str, 0);
}
/**
* 只获得缩写 比如"拼音" ,获得py
* */
public static String getShortPinyin(String str) {
return CommonUtils.isNull(getPart(str, 1)) ? getPart(str, 0) : getPart(str, 1);
}
private static String getPart(String str, int index) {
String[] split = getPinyin(str).split(";");
if (!CommonUtils.isNull(split) && split.length > index) {
return split[index];
}
return "";
}
/**
* 获得拼音,如果不是汉字,则返回原字符串
* @param 需要转换成拼音的汉字
* @return pinyin;py
* */
public static String getPinyin(String str) {
if (CommonUtils.isNull(str)) {
return ";";
}
HanyuPinyinOutputFormat outputFormat = new HanyuPinyinOutputFormat();
outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
outputFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
outputFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
StringBuilder sb = new StringBuilder();
appendPinyin(str, sb, outputFormat);
appendFirstCharInCH(str, sb, outputFormat);
return sb.toString();
}
/**
* 汉语人名 转为 拼音组成的英文人名
* 按英文人名规则,名在前,姓在后,以空格区分
* @param chines
* 汉字人名
* @return 拼音的英文人名
*/
public static String getPinyinMingXing(String chinesName){
if(PinyinUtil.isEnName(chinesName)){
return chinesName;
}
String lastName = "";
StringBuilder firstName = new StringBuilder("");
StringBuilder engNameBuff = new StringBuilder();
String pinyinChar = "";
char[] nameChar = chinesName.toCharArray();
HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
defaultFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
int count = 0;
int division = isDoubleLastName(chinesName)?2:1; /** used for divide the lastname and firstname*/
if(isMixChEn(nameChar)){
return getMixChEn(nameChar);
}
for (int i = 0; i < nameChar.length; i++) {
if (nameChar[i] > CH_CODE_VALUE) {
count++;
try {
String[] temp = PinyinHelper.toHanyuPinyinStringArray(
nameChar[i], defaultFormat);
pinyinChar = (null != temp)? temp[0]:"";
} catch (BadHanyuPinyinOutputFormatCombination e) {
LOG.error(e);
}
if(count<=division){
lastName += pinyinChar;
}else{
/** 处理名中含u:特殊情况,名转换为u,姓默认为v */
if(pinyinChar.endsWith("v")){
pinyinChar = pinyinChar.replace("v", "u");
}
firstName.append(pinyinChar);
}
}
}
engNameBuff.append(firstName).append(" ").append(lastName);
return engNameBuff.toString();
}
/**
* 将str转换成拼音,并将转化结果插入到sb中
* @param str 待转换的中文串(可中英混合)
* @param sb 保存结果的StringBuilder对象
* @param outputFormat 拼音格式化对象
*/
private static void appendPinyin(String str, StringBuilder sb, HanyuPinyinOutputFormat outputFormat){
for (int i = 0; i < str.length(); i++) {
char ch = str.charAt(i);
if (ch < PinyinUtil.CH_START || ch > PinyinUtil.CH_END) {
sb.append(ch);
} else {
String[] pinyinArray = null;
try {
pinyinArray = PinyinHelper.toHanyuPinyinStringArray(ch, outputFormat);
} catch (BadHanyuPinyinOutputFormatCombination e) {
LOG.error(e.getMessage());
}
if (pinyinArray != null && pinyinArray.length > 0) {
sb.append(pinyinArray[0]);
}
}
}
}
/**
* 将Str中的中文字符的首字母提取出来,并插入到sb中,与前面的字符串用";"隔开
* @param str 待转换的中文串(可中英文混合)
* @param sb 保存结果的StringBuilder对象
* @param outputFormat 拼音格式化对象
*/
private static void appendFirstCharInCH(String str, StringBuilder sb, HanyuPinyinOutputFormat outputFormat){
boolean firstChineseChar = true;
for (int i = 0; i < str.length(); i++) {
char ch = str.charAt(i);
if (ch >= PinyinUtil.CH_START && ch <= PinyinUtil.CH_END) {
if (firstChineseChar) {
sb.append(SPLIT_CHAR);
firstChineseChar = false;
}
String[] pinyinArray = null;
try {
pinyinArray = PinyinHelper.toHanyuPinyinStringArray(ch, outputFormat);
} catch (BadHanyuPinyinOutputFormatCombination e) {
LOG.error(e.getMessage());
}
if (pinyinArray != null && pinyinArray.length > 0) {
sb.append(pinyinArray[0].charAt(0));
}
}
}
if (sb.indexOf(";") < 0) {
sb.append(";");
}
}
/**
* 判断字符数组array是否是中英文混合
* @param array
* @return
*/
private static boolean isMixChEn(char[] array){
if(null == array || array.length<=0){
return false;
}
boolean en = false;
boolean zh = false;
for(int i=0; i<array.length && !(en && zh); i++){
if(array[i] > CH_CODE_VALUE){
zh = true;
}else{
en = true;
}
}
return en && zh;
}
/**
* 将中英文混合的array转换成英文字符串
* @param array
* @return
*/
private static String getMixChEn(char[] array){
StringBuilder sb = new StringBuilder();
if(null != array && array.length>0){
for(int i=0; i<array.length; i++){
if(array[i] > CH_CODE_VALUE){
sb.append(getPinyinOnly(String.valueOf(array[i])));
}else{
sb.append(array[i]);
}
}
}
return sb.toString();
}
/**
* 将中文姓名转化为拼音,结果由三部分组成:姓+空格+名
* @param chiName 中文名
* @return 英文名
*/
public static String getPinyinXingMing(String chiName){
String mingXing = getPinyinMingXing(chiName);
int space = mingXing.indexOf(' ');
if(space>0){
String ming = mingXing.substring(0, space);
String xing = mingXing.substring(space+1, mingXing.length());
return xing+" "+ming;
}else{
return mingXing;
}
}
private static String[] doubleNames = {"皇甫","公孙","慕容","欧阳","上官","司马","司徒","尉迟","长孙","诸葛"};
/**判定是否为复姓
* @param chineseName 中文姓名
* @return boolean
* */
public static boolean isDoubleLastName(String chinesName){
if(chinesName.length() >= DB_CHNAME_LEN){
for(int i=0;i<doubleNames.length;i++){
if(chinesName.startsWith(doubleNames[i])){
return true;
}
}
}
return false;
}
/**判定是否是中文
* @param name 中文姓名
* @return boolean
* */
public static boolean isChineseName(String name) {
char[] nameChar = name.toCharArray();
return nameChar[0] < CH_CODE_VALUE ? false : true;
}
/**判定是否是纯拼音(英文)名字
* @param name 中文姓名
* @return boolean
* */
public static boolean isEnName(String name) {
char[] nameChar = name.toCharArray();
if(null == nameChar || nameChar.length<=0){
return false;
}
boolean en = false;
boolean zh = false;
for(int i=0; i<nameChar.length && !(en && zh); i++){
if(nameChar[i] > CH_CODE_VALUE){
zh = true;
}else{
en = true;
}
}
return en && !zh;
}
}