/*
* JEF - Copyright 2009-2010 Jiyi (mr.jiyi@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package jef.tools.string;
import jef.common.wrapper.IntRange;
import jef.common.wrapper.IntRangeGroup;
import jef.tools.StringUtils;
public final class CharUtils extends org.apache.commons.lang.CharUtils {
/*
long int char byte经常转来转去,但实际上危险性是很大的。
long占用8个字节,int 占用4个字节,short / char占用两个字节 byte一个字节。
凡是窄类型转宽类型是安全的,但是宽类型转窄类型就可能造成数据丢失,是危险的。
long转int会丢失高位,这个很容易引起注意,但是其他几种有很容易出问题。
short的范围是-32767 ~ 32768.
比如 InputStream.read(),得到一个int,转成char后,范围就是0~65535.(0~FFFF),
如果要判断流的结束(-1),那么就必须在转成char之前,用int去判断,转成char之后判断就有错。
但是偏偏java语法是允许你写出 char ==-1这样的直接比较(隐式转换,从char转int).
byte的范围是 -128 ~ 127.而不是0 到255,这也是很容易搞错的地方,
byte的实际范围是 -128 ~ 127 ,即Integer的cache的范围,因此将int转换到
因此,将byte转int应该这样写
int unsignedByte = signedByte >= 0 ? signedByte : 256 + signedByte;
将int转byte应该这样写(这个逻辑和(byte)的转换效果应该是等同的)
int byteValue;
int temp = intValue % 256;
if ( intValue < 0) {
byteValue = temp < -128 ? 256 + temp : temp;
}
else {
byteValue = temp > 127 ? temp - 256 : temp;
}
System.out.println();
System.out.println(byte2hex(md));
*/
/**
* 常量:所有数字字符
*/
public static final char[] NUMBERS = "0123456789".toCharArray();
/**
* 常量:十六进制数字字符
*/
public static final char[] HEX_NUMBERS = "0123456789ABCDEFabcdef".toCharArray();
/**
* 常量:所有大写字母
*/
public static final char[] ALPHA_UPPERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
/**
* 常量:所有小写字母
*/
public static final char[] ALPHA_LOWERS = "abcdefghijklmnopqrstuvwxyz".toCharArray();
/**
* 常量:大写和小写字母
*/
public static final char[] ALPHAS = "ABCDEFGHIJKLMNOPQRSTNVWXYZabcdefghijklmnopqrstuvwxyz".toCharArray();
/**
* 常量:所有常用标点符号和空格
*/
public static final char[] SYMBOLS = " !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".toCharArray();
/**
* 常量:字母,数字和下划线
*/
public static final char[] ALPHA_NUM_UNDERLINE = "0123456789ABCDEFGHIJKLMNOPQRSTNVWXYZabcdefghijklmnopqrstuvwxyz_".toCharArray();
/**
* 常量:允许在URL中出现的字符,包含字母数字下划线和其他常用符号
*/
public static final char[] CHARS_IN_URL = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890_&?=#%;~,./-+".toCharArray();
public static final String JAP_POINT=new String(new char[]{12539,65381,40658,65378,65379,9834, 12316, 65533,8722,8810,8811,63});
public static final String CHN_POINT=new String(new char[]{183, 183, 40657,'『','』', 65374 ,65374,13199,'-','《','》' ,'?'});
/**
* 是否为数字
* @param c
* @return true if char is a number
*/
public static final boolean isNumber(char c) {
return c >= 48 && c <= 57;
}
/**
* 是否为空格(含中文空格)
* @param c
* @return true if the char is space (chinese space included)
*/
public static final boolean isSpace(char c) {
return c == 32 || c == 12288;
}
/**
* 是否为大写字母
* @param c
* @return true if the char is a alphabat in upper case
*/
public static final boolean isUpperAlpha(char c) {
return c >= 65 && c <= 90;
}
/**
* 是否为小写字母
* @param c
* @return true if the char is alphabat in lower case.
*/
public static final boolean isLowerAlpha(char c) {
return c >= 97 && c <= 122;
}
/**
* 是否为各种符号
* @param c
* @return true if the char is a symbol
*/
public static boolean isSymbol(char c) {
return (c >= 32 && c <= 47) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c > 122 && c < 127);
}
/**
* 是否为控制字符
* @param c
* @return true if the char is not a visible character
*/
public static final boolean isCtrl(char c) {
return (c < 32) || c > 255;
}
public static boolean isChinese(char c) {
return c>=0x4e00 && c<=0x9fa5;
}
/**
* 是否为东亚字符(含符号)
* @param c
* @return <tt>true</tt> if char is chinese or japanese.., otherwise <tt>false</tt>
*/
public static boolean isAsian(char c) {
return (c > 255 && c != 65279);
}
/**
* 是否为GB18030符号或控制字符
* @param c
* @return <tt>true</tt> if char is a 全角符号
*/
public static boolean isAsianSymbol(char c) {
if (c > 19968 && c < 40869)
return false;
if (isKatakana(c) || isNumberSBC(c) || isHiragana(c) || isSpace(c) || isAlphaSBC(c))
return false;
return true;
}
public static final IntRange SBC_ALPHA_UPPER = new IntRange(65313, 65338);
public static final IntRange SBC_ALPHA_LOWER = new IntRange(65345, 65370);
public static final IntRangeGroup SBC_ALPHA = new IntRangeGroup(SBC_ALPHA_LOWER, SBC_ALPHA_UPPER);
public static final IntRange SBC_NUMBER = new IntRange(65296, 65305);
public static final IntRange SBC_CHARS_WITHOUT_SPACE = new IntRange(65281,65374);
public static final char SBC_SPACE=(char) 12288;
/**
* 是否为GB18030全角数字
* @param c
* @return
*/
public static final boolean isNumberSBC(char c) {
return SBC_NUMBER.contains(c);
}
/**
* 是否为GB18030全角字母
* @param c
* @return true if a char is a 全角中文字母
*/
public static final boolean isAlphaSBC(char c) {
return SBC_ALPHA.contains((int) c);// (c>=65313 && c<=65338) ||(c>=65345
// && c<=65370);
}
/**
* 是否为GB18030片假名
* @param c
* @return true if the char is a 片假名
*/
public static final boolean isKatakana(char c) {
return (c >= 12449 && c <= 12542);
}
/**
* 是否为GB18030 平假名
* @param c
* @return 如果是平假名返回true,反之
*/
public static final boolean isHiragana(char c) {
return (c >= 12353 && c <= 12435);
}
/**
* 获得字符的类型
* @param c
* @return enum CharType
*/
public static CharType getType(char c) {
if (isUpperAlpha(c) || isLowerAlpha(c)) {
return CharType.ALPHA;
} else if (isNumber(c)) {
return CharType.NUMBER;
} else if (isSymbol(c)) {
return CharType.SYMBOL;
} else if (isSpace(c)) {
return CharType.SPACE;
} else if (isCtrl(c)) {
return CharType.CTRL;
} else if (isAsianSymbol(c)) {
return CharType.ASIAN_SYMBOL;
} else {
return CharType.ASIAN;
}
}
public enum CharType {
// 字母 数字 符号 控制字符 空格
ALPHA, NUMBER, SYMBOL, CTRL, SPACE, ASIAN_SYMBOL, // 东亚字符中的符号(以GB18030编码为准)
ASIAN, // 东亚字符(不含符号)
}
/**
* 转全角的函数(SBC case)
* 全角空格为12288,半角空格为32
* 其他字符半角(33-126)与全角(65281-65374)的对应关系是:均相差65248
* @param input
* @return 全角字符的文字
*/
public static String ToSBC(String input) {
// 半角转全角:
char[] c = input.toCharArray();
for (int i = 0; i < c.length; i++) {
if (c[i] == 32) {
c[i] = (char) 12288;
continue;
}
if (c[i] < 127)
c[i] = (char) (c[i] + 65248);
}
return new String(c);
}
/**
* 半角字符转全角
* @param c
* @return 全角字符
*/
public static char toSBC(char c) {
if (c == 32)return SBC_SPACE;
if (c < 127){
return (char) (c + 65248);
}
return c;
}
/**
*全角字符转半角(DBC case)
*全角空格为12288,半角空格为32
*其他字符半角(33-126)与全角(65281-65374)的对应关系是:均相差65248
*/
public static String toDBC(String input) {
char[] c = input.toCharArray();
for (int i = 0; i < c.length; i++) {
if (c[i] == 12288) {
c[i] = (char) 32;
continue;
}
if (c[i] > 65280 && c[i] < 65375)
c[i] = (char) (c[i] - 65248);
}
return new String(c);
}
/**
* 全角字符转半角字符
* @param c
* @return 半角字符
*/
public static char toDBC(char c) {
if (c == 12288)return (char) 32;
if (c > 65280 && c < 65375)
c = (char) (c - 65248);
return c;
}
/**
* 确保unicode字符串能安全的转换为GB18030,不会丢弃字符。一些GB18030不支持的字符转换到接近的字符上。
* @param line
* @return
*/
public static String toGB18030(String line){
line = StringUtils.replaceChars(line, JAP_POINT, CHN_POINT);
char[] cs = line.toCharArray();
boolean flag = false;
for (int i = 0; i < cs.length; i++) {
char c = cs[i];
if (c > 65379 && c < 65440) {
cs[i] = (char) ((int) cs[i] - 52933);
flag = true;
}
}
if (flag) {
line = new String(cs);
}
return line;
}
}