CharUtils.java example

Explorer
ef-orm-master
/*
 * JEF - Copyright 2009-2010 Jiyi (mr.jiyi@gmail.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package jef.tools.string;

import jef.common.wrapper.IntRange;
import jef.common.wrapper.IntRangeGroup;
import jef.tools.StringUtils;

public final class CharUtils extends org.apache.commons.lang.CharUtils {
	/*
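	Values are routinely converted back and forth between long, int, char and byte, but doing so is more dangerous than it looks.
	A long occupies 8 bytes, an int 4 bytes, a short or a char 2 bytes, and a byte 1 byte.
	Widening conversions (narrow type to wide type) are safe; narrowing conversions (wide type to narrow type) may lose data and are dangerous.
	Converting long to int drops the high-order bits, which is easy to notice, but the other narrowing conversions are just as easy to get wrong.
	The range of short is -32768 ~ 32767.
	For example, InputStream.read() returns an int; once it is cast to char the range becomes 0~65535 (0~FFFF).
	To detect the end of the stream (-1) you must test the int before casting it to char; testing after the cast is wrong.
	Unfortunately the Java syntax lets you write a direct comparison such as char == -1 (the char is implicitly widened to int).
	
	The range of byte is -128 ~ 127, not 0 ~ 255, which is another common source of mistakes
	(-128 ~ 127 is also the range covered by the Integer cache).
	Therefore, converting a byte to an unsigned int should be written as
	int unsignedByte = signedByte >= 0 ? signedByte : 256 + signedByte;  
	and converting an int to a byte should be written as (this logic should be equivalent to a (byte) cast)
	int byteValue;  
	int temp = intValue % 256;  
	if ( intValue < 0) {  
	  byteValue =  temp < -128 ? 256 + temp : temp;  
	}  
	else {  
	  byteValue =  temp > 127 ? temp - 256 : temp;  
	}  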
	System.out.println();
	System.out.println(byte2hex(md));
	*/
	
	/**
	 * Constant: all ASCII decimal digits.
	 * <p>
	 * Note: this and the other {@code char[]} constants below are plain arrays
	 * and therefore mutable; callers must treat them as read-only.
	 */
	public static final char[] NUMBERS = "0123456789".toCharArray();
	/**
	 * Constant: hexadecimal digits, in both upper and lower case.
	 */
	public static final char[] HEX_NUMBERS = "0123456789ABCDEFabcdef".toCharArray();
	/**
	 * Constant: all upper-case ASCII letters.
	 */
	public static final char[] ALPHA_UPPERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
	/**
	 * Constant: all lower-case ASCII letters.
	 */
	public static final char[] ALPHA_LOWERS = "abcdefghijklmnopqrstuvwxyz".toCharArray();
	/**
	 * Constant: upper-case followed by lower-case ASCII letters.
	 * (Previously the upper-case run read "...STNVWXYZ": 'U' was missing and 'N' duplicated.)
	 */
	public static final char[] ALPHAS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".toCharArray();
	/**
	 * Constant: the common ASCII punctuation marks, including the space.
	 */
	public static final char[] SYMBOLS = " !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".toCharArray();
	/**
	 * Constant: digits, ASCII letters and the underscore.
	 * (Previously the upper-case run read "...STNVWXYZ": 'U' was missing and 'N' duplicated.)
	 */
	public static final char[] ALPHA_NUM_UNDERLINE = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_".toCharArray();
	/**
	 * Constant: characters allowed to appear in a URL - letters, digits,
	 * underscore and a set of other common symbols.
	 */
	public static final char[] CHARS_IN_URL = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890_&?=#%;~,./-+".toCharArray();

	/**
	 * Source characters handed to {@code StringUtils.replaceChars} by {@code toGB18030}.
	 * Holds 12 code units, the same number as {@link #CHN_POINT}.
	 * NOTE(review): presumably the character at index i here is replaced by the
	 * character at index i of CHN_POINT - confirm against StringUtils.replaceChars.
	 */
	public static final String JAP_POINT = String.valueOf(new char[] {
			12539, 65381, 40658, 65378, 65379, 9834,
			12316, 65533, 8722, 8810, 8811, 63 });
	/**
	 * Replacement characters handed to {@code StringUtils.replaceChars} by {@code toGB18030}.
	 * Holds 12 code units, the same number as {@link #JAP_POINT}.
	 */
	public static final String CHN_POINT = String.valueOf(new char[] {
			183, 183, 40657, '『', '』', 65374,
			65374, 13199, '-', '《', '》', '？' });
	
	/**
	 * Tests whether a character is an ASCII decimal digit.
	 *
	 * @param c the character to test
	 * @return {@code true} if {@code c} lies between '0' and '9' inclusive
	 */
	public static final boolean isNumber(char c) {
		return '0' <= c && c <= '9';
	}

	/**
	 * Tests whether a character is a space: either the ASCII space (32) or
	 * the full-width (Chinese) space (12288).
	 *
	 * @param c the character to test
	 * @return {@code true} if {@code c} is one of the two space characters
	 */
	public static final boolean isSpace(char c) {
		switch (c) {
		case 32:
		case 12288:
			return true;
		default:
			return false;
		}
	}

	/**
	 * Tests whether a character is an upper-case ASCII letter.
	 *
	 * @param c the character to test
	 * @return {@code true} if {@code c} lies between 'A' and 'Z' inclusive
	 */
	public static final boolean isUpperAlpha(char c) {
		return 'A' <= c && c <= 'Z';
	}

	/**
	 * Tests whether a character is a lower-case ASCII letter.
	 *
	 * @param c the character to test
	 * @return {@code true} if {@code c} lies between 'a' and 'z' inclusive
	 */
	public static final boolean isLowerAlpha(char c) {
		return 'a' <= c && c <= 'z';
	}

	/**
	 * Tests whether a character is an ASCII symbol, i.e. any character in the
	 * range 32..126 that is neither a digit nor a letter. The ASCII space (32)
	 * counts as a symbol here.
	 *
	 * @param c the character to test
	 * @return {@code true} if {@code c} is an ASCII symbol
	 */
	public static boolean isSymbol(char c) {
		if (c < 32 || c > 126) {
			return false;
		}
		boolean digit = c >= 48 && c <= 57;
		boolean upperLetter = c >= 65 && c <= 90;
		boolean lowerLetter = c >= 97 && c <= 122;
		return !(digit || upperLetter || lowerLetter);
	}

	/**
	 * Tests whether a character is treated as a control (non-visible) character.
	 * <p>
	 * Returns {@code true} for every character below 32 and for every
	 * character above 255.
	 * NOTE(review): the "above 255" half also covers all CJK characters, which
	 * are visible - confirm that this is what callers expect.
	 *
	 * @param c the character to test
	 * @return {@code true} if {@code c} is outside the range 32..255
	 */
	public static final boolean isCtrl(char c) {
		boolean within32To255 = c >= 32 && c <= 255;
		return !within32To255;
	}

	/**
	 * Tests whether a character lies in the range U+4E00..U+9FA5 (inclusive).
	 *
	 * @param c the character to test
	 * @return {@code true} if {@code c} is within that range
	 */
	public static boolean isChinese(char c) {
		if (c < 0x4e00) {
			return false;
		}
		return c <= 0x9fa5;
	}
	
	/**
	 * Tests whether a character is regarded as East Asian (symbols included):
	 * any character above 255 except 65279 (U+FEFF).
	 *
	 * @param c the character to test
	 * @return <tt>true</tt> if {@code c} is above 255 and is not 65279, otherwise <tt>false</tt>
	 */
	public static boolean isAsian(char c) {
		if (c == 65279) {
			return false;
		}
		return c > 255;
	}

	/**
	 * Tests whether a character is a full-width symbol (or other non-letter
	 * wide character), determined by exclusion.
	 * <p>
	 * The result is {@code false} for ideographs in U+4E00..U+9FA5
	 * (19968..40869, both ends inclusive, matching {@code isChinese}), for
	 * katakana, hiragana, full-width digits, full-width letters and the two
	 * space characters; everything else yields {@code true}. No check is made
	 * that {@code c} is a wide character at all, so other input that fails
	 * every exclusion also yields {@code true}.
	 *
	 * @param c the character to test
	 * @return <tt>true</tt> if {@code c} is not excluded by any of the checks above
	 */
	public static boolean isAsianSymbol(char c) {
		// Inclusive bounds: 19968 (U+4E00) and 40869 (U+9FA5) are ideographs
		// themselves and must not be reported as symbols.
		if (c >= 19968 && c <= 40869)
			return false;
		if (isKatakana(c) || isNumberSBC(c) || isHiragana(c) || isSpace(c) || isAlphaSBC(c))
			return false;
		return true;
	}
	/** Code-unit bounds 65313 and 65338 (U+FF21, U+FF3A) used for full-width upper-case letters. */
	public static final IntRange SBC_ALPHA_UPPER = new IntRange(0xFF21, 0xFF3A);
	/** Code-unit bounds 65345 and 65370 (U+FF41, U+FF5A) used for full-width lower-case letters. */
	public static final IntRange SBC_ALPHA_LOWER = new IntRange(0xFF41, 0xFF5A);
	/** Group combining the lower-case and upper-case full-width letter ranges. */
	public static final IntRangeGroup SBC_ALPHA = new IntRangeGroup(SBC_ALPHA_LOWER, SBC_ALPHA_UPPER);
	/** Code-unit bounds 65296 and 65305 (U+FF10, U+FF19) used for full-width digits. */
	public static final IntRange SBC_NUMBER = new IntRange(0xFF10, 0xFF19);
	/** Code-unit bounds 65281 and 65374 (U+FF01, U+FF5E): full-width characters other than the space. */
	public static final IntRange SBC_CHARS_WITHOUT_SPACE = new IntRange(0xFF01, 0xFF5E);
	/** The full-width space, code unit 12288 (U+3000). */
	public static final char SBC_SPACE = (char) 0x3000;
	/**
	 * Tests whether a character is a full-width (SBC) digit.
	 * <p>
	 * Delegates to {@code SBC_NUMBER.contains}; boundary handling is whatever
	 * that method implements.
	 *
	 * @param c the character to test
	 * @return the result of {@code SBC_NUMBER.contains(c)}
	 */
	public static final boolean isNumberSBC(char c) {
		return SBC_NUMBER.contains(c);
	}

	/**
	 * Tests whether a character is a full-width (SBC) letter.
	 * <p>
	 * Delegates to {@code SBC_ALPHA.contains}, which groups the full-width
	 * lower-case and upper-case ranges.
	 *
	 * @param c the character to test
	 * @return the result of {@code SBC_ALPHA.contains((int) c)}
	 */
	public static final boolean isAlphaSBC(char c) {
		// Presumably equivalent to (c>=65313 && c<=65338) || (c>=65345 && c<=65370),
		// assuming the range bounds are inclusive - TODO confirm.
		return SBC_ALPHA.contains((int) c);
	}

	/**
	 * Tests whether a character is katakana, i.e. lies in the range
	 * 12449..12542 (U+30A1..U+30FE) inclusive.
	 *
	 * @param c the character to test
	 * @return {@code true} if {@code c} is within that range
	 */
	public static final boolean isKatakana(char c) {
		return !(c < 12449 || c > 12542);
	}

	/**
	 * Tests whether a character is hiragana, i.e. lies in the range
	 * 12353..12435 (U+3041..U+3093) inclusive.
	 *
	 * @param c the character to test
	 * @return {@code true} if {@code c} is within that range, {@code false} otherwise
	 */
	public static final boolean isHiragana(char c) {
		return !(c < 12353 || c > 12435);
	}

	/**
	 * Classifies a character.
	 * <p>
	 * The checks run in this order: ASCII letter, ASCII digit, ASCII symbol,
	 * space, control character, then East Asian symbol versus East Asian
	 * character. Because the symbol check runs before the space check, the
	 * ASCII space is reported as {@code SYMBOL}; only the full-width space
	 * reaches {@code SPACE}.
	 * <p>
	 * {@code isCtrl} answers {@code true} for every character above 255, so
	 * it is only consulted for characters up to 255 and for 65279 (U+FEFF).
	 * Without that restriction every CJK character would be classified as
	 * {@code CTRL} and the {@code ASIAN} result could never be returned.
	 *
	 * @param c the character to classify
	 * @return the {@link CharType} of {@code c}
	 */
	public static CharType getType(char c) {
		if (isUpperAlpha(c) || isLowerAlpha(c)) {
			return CharType.ALPHA;
		} else if (isNumber(c)) {
			return CharType.NUMBER;
		} else if (isSymbol(c)) {
			return CharType.SYMBOL;
		} else if (isSpace(c)) {
			return CharType.SPACE;
		}
		// Wide characters (above 255, U+FEFF excepted) must bypass isCtrl,
		// which would otherwise claim all of them.
		boolean wide = c > 255 && c != 65279;
		if (!wide && isCtrl(c)) {
			return CharType.CTRL;
		}
		return isAsianSymbol(c) ? CharType.ASIAN_SYMBOL : CharType.ASIAN;
	}
	
	public enum CharType {
		// 字母 数字 符号 控制字符 空格
		ALPHA, NUMBER, SYMBOL, CTRL, SPACE, ASIAN_SYMBOL, // 东亚字符中的符号（以GB18030编码为准）
		ASIAN, // 东亚字符（不含符号）
	}
	
	/**
	 * Converts half-width characters in a string to full-width (SBC case).
	 * <p>
	 * The half-width space (32) becomes the full-width space (12288). The
	 * other printable ASCII characters (33-126) map onto the full-width range
	 * (65281-65374), which is a constant offset of 65248. Every other
	 * character, including the control characters 0-31 and DEL (127), is
	 * left unchanged.
	 *
	 * @param input the text to convert; must not be {@code null}
	 * @return a new string with the half-width characters replaced
	 * @throws NullPointerException if {@code input} is {@code null}
	 */
	public static String ToSBC(String input) {
		char[] chars = input.toCharArray();
		for (int i = 0; i < chars.length; i++) {
			char ch = chars[i];
			if (ch == 32) {
				chars[i] = (char) 12288;
			} else if (ch > 32 && ch < 127) {
				// Only 33..126 have a full-width counterpart; shifting control
				// characters such as tab or newline would corrupt them.
				chars[i] = (char) (ch + 65248);
			}
		}
		return new String(chars);
	}

	/**
	 * Converts a half-width character to its full-width form.
	 * <p>
	 * The half-width space (32) becomes the full-width space (12288). The
	 * other printable ASCII characters (33-126) are shifted by 65248 onto the
	 * full-width range (65281-65374). Every other character, including the
	 * control characters 0-31 and DEL (127), is returned unchanged.
	 *
	 * @param c the character to convert
	 * @return the full-width counterpart of {@code c}, or {@code c} itself if it has none
	 */
	public static char toSBC(char c) {
		if (c == 32) {
			return (char) 12288; // full-width space
		}
		// Only 33..126 have a full-width counterpart; control characters such
		// as tab or newline must not be shifted.
		if (c > 32 && c < 127) {
			return (char) (c + 65248);
		}
		return c;
	}
	
	/**
	 * Converts full-width characters in a string to half-width (DBC case).
	 * <p>
	 * The full-width space (12288) becomes the half-width space (32). The
	 * other full-width characters (65281-65374) are shifted down by 65248
	 * onto the half-width range (33-126). All other characters are copied
	 * unchanged.
	 *
	 * @param input the text to convert; must not be {@code null}
	 * @return a new string with the full-width characters replaced
	 */
	public static String toDBC(String input) {
		final int length = input.length();
		StringBuilder narrow = new StringBuilder(length);
		for (int pos = 0; pos < length; pos++) {
			char ch = input.charAt(pos);
			if (ch == 12288) {
				narrow.append((char) 32);
			} else if (ch >= 65281 && ch <= 65374) {
				narrow.append((char) (ch - 65248));
			} else {
				narrow.append(ch);
			}
		}
		return narrow.toString();
	}
	
	/**
	 * Converts a full-width character to its half-width form.
	 * <p>
	 * The full-width space (12288) becomes the half-width space (32). The
	 * other full-width characters (65281-65374) are shifted down by 65248.
	 * Any other character is returned unchanged.
	 *
	 * @param c the character to convert
	 * @return the half-width counterpart of {@code c}, or {@code c} itself if it has none
	 */
	public static char toDBC(char c) {
		if (c == 12288) {
			return ' ';
		}
		boolean fullWidthForm = c >= 65281 && c <= 65374;
		return fullWidthForm ? (char) (c - 65248) : c;
	}
	
	/**
	 * Prepares a Unicode string for conversion to GB18030 so that no
	 * characters are dropped; some characters GB18030 does not support are
	 * mapped to similar ones.
	 * <p>
	 * First the text is passed through {@code StringUtils.replaceChars} with
	 * {@code JAP_POINT} and {@code CHN_POINT}. Then every code unit in
	 * 65380..65439 is shifted down by 52933. A new string is built only when
	 * at least one code unit was shifted.
	 *
	 * @param line the text to convert
	 * @return the converted text
	 */
	public static String toGB18030(String line){
		String replaced = StringUtils.replaceChars(line, JAP_POINT, CHN_POINT);
		char[] buffer = replaced.toCharArray();
		boolean changed = false;
		int index = 0;
		while (index < buffer.length) {
			char current = buffer[index];
			if (current >= 65380 && current <= 65439) {
				buffer[index] = (char) (current - 52933);
				changed = true;
			}
			index++;
		}
		return changed ? new String(buffer) : replaced;
	}
}