package edu.stanford.nlp.trees.international.pennchinese;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.*;
import java.util.regex.Pattern;
/** This class contains a few String constants and
* static methods for dealing with Chinese text.
* <p/>
* <b>Warning:</b> The code contains a version that uses codePoint methods
* to handle full Unicode. But it seems to tickle some bugs in
* Sun's JDK 1.5. It works correctly with JDK 1.6+. By default it is
* enabled. The version that only handles BMP characters can be used by editing the code. The
* latter prints a warning message if it sees a high-surrogate character.
*
* @author Christopher Manning
*/
public class ChineseUtils {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(ChineseUtils.class);
/** Whether to only support BMP character normalization.
* If set to true, this is more limited, but avoids bugs in JDK 1.5.
*/
private static final boolean ONLY_BMP = false;
// These are good Unicode whitespace regexes for any language!
public static final String ONEWHITE = "[\\s\\p{Zs}]";
public static final String WHITE = ONEWHITE + "*";
public static final String WHITEPLUS = ONEWHITE + "+";
// Chinese numbers 1-10
public static final String NUMBERS = "[\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341]";
// List of characters similar to \u00B7 listed in the Unicode 5.0 manual
public static final String MID_DOT_REGEX_STR = "[\u00B7\u0387\u2022\u2024\u2027\u2219\u22C5\u30FB]";
// These are the constants for the normalize method
public static final int LEAVE = 0;
public static final int ASCII = 1;
public static final int NORMALIZE = 1; // Unicode normalization moves to low
public static final int FULLWIDTH = 2;
public static final int DELETE = 3;
public static final int DELETE_EXCEPT_BETWEEN_ASCII = 4;
public static final int MAX_LEGAL = 4;
// private int[] puaChars = { 0xE005 };
// private int[] uniChars = { 0x42B5 };
// not instantiable
private ChineseUtils() {}
public static boolean isNumber(char c) {
return (StringUtils.matches(String.valueOf(c), NUMBERS) || Character.isDigit(c));
}
public static String normalize(String in) {
return normalize(in, FULLWIDTH, ASCII);
}
public static String normalize(String in, int ascii, int spaceChar) {
return normalize(in, ascii, spaceChar, LEAVE);
}
/** This will normalize a Unicode String in various ways. This routine
* correctly handles characters outside the basic multilingual plane.
*
* @param in The String to be normalized
* @param ascii For characters conceptually in the ASCII range of
* ! through ~ (U+0021 through U+007E or U+FF01 through U+FF5E),
* if this is ChineseUtils.LEAVE, then do nothing,
* if it is ASCII then map them from the Chinese Full Width range
* to ASCII values, and if it is FULLWIDTH then do the reverse.
* @param spaceChar For characters that satisfy Character.isSpaceChar(),
* if this is ChineseUtils.LEAVE, then do nothing,
* if it is ASCII then map them to the space character U+0020, and
* if it is FULLWIDTH then map them to U+3000.
* @param midDot For a set of 7 characters that are roughly middle dot characters,
* if this is ChineseUtils.LEAVE, then do nothing,
* if it is NORMALIZE then map them to the extended Latin character U+00B7, and
* if it is FULLWIDTH then map them to U+30FB.
* @return The in String normalized according to the other arguments.
*/
public static String normalize(String in,
int ascii,
int spaceChar,
int midDot) {
if (ascii < 0 || ascii > MAX_LEGAL ||
spaceChar < 0 || spaceChar > MAX_LEGAL) {
throw new IllegalArgumentException("ChineseUtils: Unknown parameter option");
}
if (ONLY_BMP) {
return normalizeBMP(in, ascii, spaceChar, midDot);
} else {
return normalizeUnicode(in, ascii, spaceChar, midDot);
}
}
private static String normalizeBMP(String in, int ascii, int spaceChar, int midDot) {
StringBuilder out = new StringBuilder();
int len = in.length();
for (int i = 0; i < len; i++) {
char cp = in.charAt(i);
if (Character.isHighSurrogate(cp)) {
if (i + 1 < len) {
log.warn("ChineseUtils.normalize warning: non-BMP codepoint U+" +
Integer.toHexString(Character.codePointAt(in, i)) + " in " + in);
} else {
log.warn("ChineseUtils.normalize warning: unmatched high surrogate character U+" +
Integer.toHexString(Character.codePointAt(in, i)) + " in " + in);
}
}
Character.UnicodeBlock cub = Character.UnicodeBlock.of(cp);
if (cub == Character.UnicodeBlock.PRIVATE_USE_AREA ||
cub == Character.UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_A ||
cub == Character.UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_B) {
EncodingPrintWriter.err.println("ChineseUtils.normalize warning: private use area codepoint U+" + Integer.toHexString(cp) + " in " + in);
}
boolean delete = false;
switch (ascii) {
case LEAVE:
break;
case ASCII:
if (cp >= '\uFF01' && cp <= '\uFF5E') {
cp -= (0xFF00 - 0x0020);
}
break;
case FULLWIDTH:
if (cp >= '\u0021' && cp <= '\u007E') {
cp += (0xFF00 - 0x0020);
}
break;
default:
throw new IllegalArgumentException("ChineseUtils: Unsupported parameter option: ascii=" + ascii);
}
switch (spaceChar) {
case LEAVE:
break;
case ASCII:
if (Character.isSpaceChar(cp)) {
cp = ' ';
}
break;
case FULLWIDTH:
if (Character.isSpaceChar(cp)) {
cp = '\u3000';
}
break;
case DELETE:
if (Character.isSpaceChar(cp)) {
delete = true;
}
break;
case DELETE_EXCEPT_BETWEEN_ASCII:
char cpp = 0;
if (i > 0) { cpp = in.charAt(i - 1); }
char cpn = 0;
if (i < (len - 1)) { cpn = in.charAt(i + 1); }
// EncodingPrintWriter.out.println("cp: " + cp + "; cpp: " + cpp + "cpn: " + cpn +
// "; isSpace: " + Character.isSpaceChar(cp) + "; isAsciiLHL: " + isAsciiLowHigh(cpp) +
// "; isAsciiLHR: " + isAsciiLowHigh(cpn), "UTF-8");
if (Character.isSpaceChar(cp) && ! (isAsciiLowHigh(cpp) && isAsciiLowHigh(cpn))) {
delete = true;
}
}
switch (midDot) {
case LEAVE:
break;
case NORMALIZE:
if (isMidDot(cp)) {
cp = '\u00B7';
}
break;
case FULLWIDTH:
if (isMidDot(cp)) {
cp = '\u30FB';
}
break;
case DELETE:
if (isMidDot(cp)) {
delete = true;
}
break;
default:
throw new IllegalArgumentException("ChineseUtils: Unsupported parameter option: midDot=" + midDot);
}
if ( ! delete) {
out.append(cp);
}
} // end for
return out.toString();
}
private static String normalizeUnicode(String in, int ascii, int spaceChar, int midDot) {
StringBuilder out = new StringBuilder();
int len = in.length();
// Do it properly with codepoints, for non-BMP Unicode as well
// int numCP = in.codePointCount(0, len);
int cpp = 0; // previous codepoint
for (int offset = 0, cp; offset < len; offset += Character.charCount(cp)) {
// int offset = in.offsetByCodePoints(0, offset);
cp = in.codePointAt(offset);
Character.UnicodeBlock cub = Character.UnicodeBlock.of(cp);
if (cub == Character.UnicodeBlock.PRIVATE_USE_AREA ||
cub == Character.UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_A ||
cub == Character.UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_B) {
EncodingPrintWriter.err.println("ChineseUtils.normalize warning: private use area codepoint U+" + Integer.toHexString(cp) + " in " + in);
}
boolean delete = false;
switch (ascii) {
case LEAVE:
break;
case ASCII:
if (cp >= '\uFF01' && cp <= '\uFF5E') {
cp -= (0xFF00 - 0x0020);
}
break;
case FULLWIDTH:
if (cp >= '\u0021' && cp <= '\u007E') {
cp += (0xFF00 - 0x0020);
}
break;
default:
throw new IllegalArgumentException("ChineseUtils: Unsupported parameter option: ascii=" + ascii);
}
switch (spaceChar) {
case LEAVE:
break;
case ASCII:
if (Character.isSpaceChar(cp)) {
cp = ' ';
}
break;
case FULLWIDTH:
if (Character.isSpaceChar(cp)) {
cp = '\u3000';
}
break;
case DELETE:
if (Character.isSpaceChar(cp)) {
delete = true;
}
break;
case DELETE_EXCEPT_BETWEEN_ASCII:
int nextOffset = offset + Character.charCount(cp);
int cpn = 0;
if (nextOffset < len) {
cpn = in.codePointAt(nextOffset);
}
if (Character.isSpaceChar(cp) && ! (isAsciiLowHigh(cpp) && isAsciiLowHigh(cpn))) {
delete = true;
}
}
switch (midDot) {
case LEAVE:
break;
case NORMALIZE:
if (isMidDot(cp)) {
cp = '\u00B7';
}
break;
case FULLWIDTH:
if (isMidDot(cp)) {
cp = '\u30FB';
}
break;
case DELETE:
if (isMidDot(cp)) {
delete = true;
}
break;
default:
throw new IllegalArgumentException("ChineseUtils: Unsupported parameter option: midDot=" + midDot);
}
if ( ! delete) {
out.appendCodePoint(cp);
}
cpp = cp;
} // end for
return out.toString();
}
private static boolean isMidDot(int cp) {
return cp == '\u00B7' || cp == '\u0387' || cp == '\u2022' ||
cp == '\u2024' || cp == '\u2027' || cp == '\u2219' ||
cp == '\u22C5' || cp == '\u30FB';
}
private static boolean isAsciiLowHigh(int cp) {
return cp >= '\uFF01' && cp <= '\uFF5E' ||
cp >= '\u0021' && cp <= '\u007E';
}
/** Mainly for testing. Usage:
* {@code ChineseUtils ascii spaceChar word*}
*
* ascii and spaceChar are integers: 0 = leave, 1 = ascii, 2 = fullwidth.
* The words listed are then normalized and sent to stdout.
* If no words are given, the program reads from and normalizes stdin.
* Input is assumed to be in UTF-8.
*
* @param args Command line arguments as above
* @throws IOException If any problems accessing command-line files
*/
public static void main(String[] args) throws IOException {
if (args.length < 3) {
log.info("usage: ChineseUtils ascii space midDot word*");
log.info(" First 3 args are int flags; a filter or maps args as words; assumes UTF-8");
return;
}
int i = Integer.parseInt(args[0]);
int j = Integer.parseInt(args[1]);
int midDot = Integer.parseInt(args[2]);
if (args.length > 3) {
for (int k = 3; k < args.length; k++) {
EncodingPrintWriter.out.println(normalize(args[k], i, j, midDot));
}
} else {
BufferedReader r = IOUtils.readerFromStdin("UTF-8");
for (String line; (line = r.readLine()) != null; ) {
EncodingPrintWriter.out.println(normalize(line, i, j, midDot));
}
}
}
// year, month, day chars. Sometime try adding \u53f7 and see if it helps...
private static final Pattern dateChars = Pattern.compile("[\u5E74\u6708\u65E5]+");
// year, month, day chars. Adding \u53F7 and seeing if it helps...
private static final Pattern dateCharsPlus = Pattern.compile("[\u5E74\u6708\u65E5\u53f7]+");
// number chars (Chinese and Western).
// You get U+25CB circle masquerading as zero in mt data - or even in Sighan 2003 ctb
// add U+25EF for good measure (larger geometric circle)
// private static final Pattern numberChars = Pattern.compile("[0-90-9" +
// "一二三四五六七八九十" +
// "零〇百千万亿兩○◯〡-〩〸-〺]");
private static final Pattern numberChars = Pattern.compile("[0-9\uff10-\uff19" +
"\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4E5D\u5341" +
"\u96F6\u3007\u767E\u5343\u4E07\u4ebf\u5169\u25cb\u25ef\u3021-\u3029\u3038-\u303A]+");
// A-Za-z, narrow and full width
private static final Pattern letterChars = Pattern.compile("[A-Za-z\uFF21-\uFF3A\uFF41-\uFF5A]+");
private static final Pattern periodChars = Pattern.compile("[\ufe52\u2027\uff0e.\u70B9]+");
// two punctuation classes for Low and Ng style features.
private static final Pattern separatingPuncChars = Pattern.compile("[]!\"(),;:<=>?\\[\\\\`{|}~^\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030" +
"\uff3d\uff01\uff02\uff08\uff09\uff0c\uff1b\uff1a\uff1c\uff1d\uff1e\uff1f" +
"\uff3b\uff3c\uff40\uff5b\uff5c\uff5d\uff5e\uff3e]+");
private static final Pattern ambiguousPuncChars = Pattern.compile("[-#$%&'*+/@_\uff0d\uff03\uff04\uff05\uff06\uff07\uff0a\uff0b\uff0f\uff20\uff3f]+");
private static final Pattern midDotPattern = Pattern.compile(ChineseUtils.MID_DOT_REGEX_STR + "+");
public static String shapeOf(CharSequence input,
boolean augmentedDateChars,
boolean useMidDotShape) {
String shape;
if (augmentedDateChars && dateCharsPlus.matcher(input).matches()) {
shape = "D";
} else if (input.charAt(0) == '第') {
return "o"; // detect those Chinese ordinals!
} else if (dateChars.matcher(input).matches()) {
shape = "D";
} else if (numberChars.matcher(input).matches()) {
shape = "N";
} else if (letterChars.matcher(input).matches()) {
shape = "L";
} else if (periodChars.matcher(input).matches()) {
shape = "P";
} else if (separatingPuncChars.matcher(input).matches()) {
shape = "S";
} else if (ambiguousPuncChars.matcher(input).matches()) {
shape = "A";
} else if (useMidDotShape && midDotPattern.matcher(input).matches()) {
shape = "M";
} else {
shape = "C";
}
return shape;
}
}