package com.athena.asm.tool.infosense;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
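/**
* Handles phone numbers that are obfuscated in the following way:
* the characters 〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖洞幺参拐oO①②③④⑤⑥⑦⑧⑨
* are used in place of the corresponding digits.
* Assumptions:
* 1. Parentheses are never nested.
* 2. Segments are separated by at most one whitespace character.
* 3. '-' cannot be the first character.
* 4. A phone number must have at least 5 digits; a number written purely
*    with special (substitute) characters must have at least 8.
* @author aleck
*
*/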
public class PhoneNumSensor extends Sensor {
// 其余可能的字符,只用于检测,不要求specials映射对应(但最好是对应)
private static final String OTHER_DIGITS = "〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖洞幺参拐oO①②③④⑤⑥⑦⑧⑨";
// 正则表达式
private static final Pattern phone = Pattern.compile(
"((\\+?)(([0-9PpWw" + OTHER_DIGITS + "])+|\\(([0-9PpWw" + OTHER_DIGITS + "])+\\)))" +
"((\\s|[\\+\\-])?(([0-9PpWw" + OTHER_DIGITS + "])+|\\(([0-9PpWw" + OTHER_DIGITS + "])+\\)))*"
);
private static final Map<Character, Character> specials;
static {
specials = new HashMap<Character, Character>();
// simple form
specials.put('〇', '0');
specials.put('一', '1');
specials.put('二', '2');
specials.put('三', '3');
specials.put('四', '4');
specials.put('五', '5');
specials.put('六', '6');
specials.put('七', '7');
specials.put('八', '8');
specials.put('九', '9');
// capital form
specials.put('零', '0');
specials.put('贰', '1');
specials.put('叁', '2');
specials.put('肆', '3');
specials.put('伍', '4');
specials.put('陆', '5');
specials.put('柒', '6');
specials.put('捌', '7');
specials.put('玖', '8');
specials.put('拾', '9');
// others
specials.put('洞', '0');
specials.put('幺', '1');
specials.put('参', '3');
specials.put('拐', '7');
specials.put('o', '0');
specials.put('O', '0');
specials.put('①', '1');
specials.put('②', '2');
specials.put('③', '3');
specials.put('④', '4');
specials.put('⑤', '5');
specials.put('⑥', '6');
specials.put('⑦', '7');
specials.put('⑧', '8');
specials.put('⑨', '9');
}
protected PhoneNumSensor() {
super(Type.PHONE_NUMBER);
}
public String normalize(String raw) {
// 可以被去掉的字符集合
// 数字,字母p和w不可以被去掉
final String ignore = "+-()";
StringBuilder output = new StringBuilder();
for (int i = 0; i < raw.length(); i++) {
Character ch = raw.charAt(i);
if (Character.isWhitespace(ch) || ignore.indexOf(ch) != -1) {
// ignore
} else if (specials.containsKey(ch)) {
// translate
output.append(specials.get(ch));
} else {
output.append(ch);
}
}
return output.toString();
}
@Override
public List<Info> scan(CharSequence text) {
List<Info> ret = new ArrayList<Info>();
Matcher matcher = phone.matcher(text);
while (matcher.find()) {
String original = matcher.group();
String content = normalize(original);
if (satisfyLengthConstraint(content)) {
ret.add(new Info(Type.PHONE_NUMBER, content, original, matcher.start()));
}
}
return ret;
}
/**
* 检查是否满足长度要求
* 1. 最少长度为8,对于纯中文串,最少长度为5
* 2. 最大长度为16
* @param content
* @return
*/
private boolean satisfyLengthConstraint(String content) {
return (content.length() <= 16) &&
(content.length() >= 8 || !isPureSpecial(content) && content.length() >= 5);
}
private boolean isPureSpecial(String content) {
for (int i = 0; i < content.length(); i++) {
Character ch = content.charAt(i);
if (!specials.containsKey(ch)) {
return false;
}
}
return true;
}
}