package edu.stanford.nlp.wordseg;

import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static edu.stanford.nlp.trees.international.pennchinese.ChineseUtils.WHITE;
import static edu.stanford.nlp.trees.international.pennchinese.ChineseUtils.WHITEPLUS;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
// TODO: ChineseStringUtils and ChineseUtils should be put somewhere common
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;

public class ChineseStringUtils {

  private static final boolean DEBUG = false;
  private static final boolean DEBUG_MORE = false;

  public static boolean isLetterASCII(char c) {
    return c <= 127 && Character.isLetter(c);
  }
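  /**
   * Combines the CRF's per-character segmentation decisions in {@code doc}
   * back into a single whitespace-segmented sentence string. A character
   * whose {@code AnswerAnnotation} is "1" starts a new word, except at the
   * start of the sentence; the whitespace-related flags control whether
   * spaces in the original input are preserved. For example (hypothetical
   * input), the characters of "中国很大" with answers 1,0,1,0 come back as
   * "中国 很大".
   *
   * @param doc   one character per CoreLabel, with segmenter answers filled in
   * @param flags the segmenter flags governing whitespace and postprocessing
   * @return the segmented sentence as a single String
   */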
  public static String combineSegmentedSentence(List<CoreLabel> doc, SeqClassifierFlags flags) {
    // Hey all: Some of the code that was previously here for
    // whitespace normalization was a bit hackish as well as
    // obviously broken for some test cases. So... I went ahead and
    // rewrote it.
    //
    // Also, putting everything into 'testContent' is a bit wasteful
    // memory-wise. But it's on my near-term todo list to
    // code something that's a bit more memory efficient.
    //
    // Finally, if these changes ended up breaking anything,
    // just e-mail me (cerd@colorado.edu), and I'll try to fix it
    // asap. -cer (6/14/2006)

    /* Sun Oct  7 19:55:09 2007
       I'm actually not using "testContent" anymore.
       I think it's broken because the whole test file has been read over and over again,
       and testContentIdx has been set to 0 every time, while "doc" is moving line by line!!!!
       -pichuan
    */

    int testContentIdx = 0;
    StringBuilder ans = new StringBuilder();              // the actual output we will return
    StringBuilder unmod_ans = new StringBuilder();        // the original output from the CoreLabel
    StringBuilder unmod_normed_ans = new StringBuilder(); // the normalized output from the CoreLabel
    CoreLabel wi = null;
    for (Iterator<CoreLabel> wordIter = doc.iterator(); wordIter.hasNext(); testContentIdx++) {
      CoreLabel pwi = wi;
      wi = wordIter.next();
      boolean originalWhiteSpace = "1".equals(wi.get(CoreAnnotations.SpaceBeforeAnnotation.class));

      // if the CRF says "START" (segmented), and it's not the first word..
      if (wi.get(CoreAnnotations.AnswerAnnotation.class).equals("1")
          && !("0".equals(String.valueOf(wi.get(CoreAnnotations.PositionAnnotation.class))))) {
        // check if we need to preserve the "no space" between English characters
        boolean seg = true; // since it's in the "1" condition.. default is to seg
        if (flags.keepEnglishWhitespaces) {
          if (testContentIdx > 0) {
            char prevChar = pwi.get(CoreAnnotations.OriginalCharAnnotation.class).charAt(0);
            char currChar = wi.get(CoreAnnotations.OriginalCharAnnotation.class).charAt(0);
            if (isLetterASCII(prevChar) && isLetterASCII(currChar)) {
              // keep the "no space" before wi
              if ( ! originalWhiteSpace) {
                seg = false;
              }
            }
          }
        }

        // if there was a space and keepAllWhitespaces is true, restore it no matter what
        if (flags.keepAllWhitespaces && originalWhiteSpace) {
          seg = true;
        }
        if (seg) {
          if (originalWhiteSpace) {
            ans.append('\u1924'); // a pretty Limbu character which is later changed to a space
          } else {
            ans.append(' ');
          }
        }
        unmod_ans.append(' ');
        unmod_normed_ans.append(' ');
      } else {
        boolean seg = false; // since it's in the "0" condition.. default is not to seg

        // Changed after conversation with Huihsin.
        //
        // Decided that all words consisting of English/ASCII characters
        // should be separated from the surrounding Chinese characters. -cer
        /* Sun Oct  7 22:14:46 2007 (pichuan)
           the comment above was from DanC.
           I changed the code but I think I'm doing the same thing here.
        */
        if (testContentIdx > 0) {
          char prevChar = pwi.get(CoreAnnotations.OriginalCharAnnotation.class).charAt(0);
          char currChar = wi.get(CoreAnnotations.OriginalCharAnnotation.class).charAt(0);
          if ((prevChar < (char) 128) != (currChar < (char) 128)) {
            if (ChineseUtils.isNumber(prevChar) && ChineseUtils.isNumber(currChar)) {
              // cdm: you would get here if you had an ASCII number next to a
              // Unihan range number. Does that happen? It presumably
              // shouldn't do any harm.... [cdm, oct 2007]
            } else if (flags.separateASCIIandRange) {
              seg = true;
            }
          }
        }

        if (flags.keepEnglishWhitespaces) {
          if (testContentIdx > 0) {
            char prevChar = pwi.get(CoreAnnotations.OriginalCharAnnotation.class).charAt(0);
            char currChar = wi.get(CoreAnnotations.OriginalCharAnnotation.class).charAt(0);
            if (isLetterASCII(prevChar) && isLetterASCII(currChar)
                || isLetterASCII(prevChar) && ChineseUtils.isNumber(currChar)
                || ChineseUtils.isNumber(prevChar) && isLetterASCII(currChar)) {
              // keep the "space" before wi
              if ("1".equals(wi.get(CoreAnnotations.SpaceBeforeAnnotation.class))) {
                seg = true;
              }
            }
          }
        }

        // if there was a space and keepAllWhitespaces is true, restore it no matter what
        if (flags.keepAllWhitespaces) {
          if (!("0".equals(String.valueOf(wi.get(CoreAnnotations.PositionAnnotation.class))))
              && "1".equals(wi.get(CoreAnnotations.SpaceBeforeAnnotation.class))) {
            seg = true;
          }
        }
        if (seg) {
          if (originalWhiteSpace) {
            ans.append('\u1924'); // a pretty Limbu character which is later changed to a space
          } else {
            ans.append(' ');
          }
        }
      }
      ans.append(wi.get(CoreAnnotations.OriginalCharAnnotation.class));
      unmod_ans.append(wi.get(CoreAnnotations.OriginalCharAnnotation.class));
      unmod_normed_ans.append(wi.get(CoreAnnotations.CharAnnotation.class));
    }
    String ansStr = ans.toString();
    if (flags.sighanPostProcessing) {
      if ( ! flags.keepAllWhitespaces) {
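  // A note on the U+1924 placeholder used above (illustration only): with
  // keepAllWhitespaces set, an original space as in "IBM 电脑" is carried
  // through as "IBM" + U+1924 + "电脑", so the Sighan postprocessing cannot
  // delete it; the placeholder is converted back to a real space only after
  // postProcessingAnswer() has run.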
        // remove the Limbu char now, so it can be deleted in postprocessing
        ansStr = ansStr.replaceAll("\u1924", " ");
      }
      ansStr = postProcessingAnswer(ansStr, flags);
    }
    // definitely remove the Limbu char if it survived till now
    ansStr = ansStr.replaceAll("\u1924", " ");
    if (DEBUG) {
      EncodingPrintWriter.err.println("CLASSIFIER(normed): " + unmod_normed_ans, "UTF-8");
      EncodingPrintWriter.err.println("CLASSIFIER: " + unmod_ans, "UTF-8");
      EncodingPrintWriter.err.println("POSTPROCESSED: " + ansStr, "UTF-8");
    }
    return ansStr;
  }

  /**
   * Post-processes the answer to be output.
   * This post-processing is not dependent on the original input.
   */
  private static String postProcessingAnswer(String ans, SeqClassifierFlags flags) {
    if (flags.useHk) {
      // System.err.println("Using HK post processing.");
      return postProcessingAnswerHK(ans);
    } else if (flags.useAs) {
      // System.err.println("Using AS post processing.");
      return postProcessingAnswerAS(ans);
    } else if (flags.usePk) {
      // System.err.println("Using PK post processing.");
      return postProcessingAnswerPK(ans, flags.keepAllWhitespaces);
    } else if (flags.useMsr) {
      // System.err.println("Using MSR post processing.");
      return postProcessingAnswerMSR(ans);
    } else {
      // System.err.println("Using CTB post processing.");
      return postProcessingAnswerCTB(ans, flags.keepAllWhitespaces, flags.suppressMidDotPostprocessing);
    }
  }

  static Pattern[] puncsPat = null;
  static Character[] puncs = null;

  private static String separatePuncs(String ans) {
    /* Make sure certain punctuation marks only ever appear as their own word
       (segmented from everything else). */
    /* These punctuation marks are derived directly from the training set. */
    if (puncs == null) {
      puncs = new Character[]{'\u3001', '\u3002', '\u3003', '\u3008', '\u3009', '\u300a',
                              '\u300b', '\u300c', '\u300d', '\u300e', '\u300f', '\u3010',
                              '\u3011', '\u3014', '\u3015'};
    }
    if (puncsPat == null) {
      // System.err.println("Compile Puncs");
      puncsPat = new Pattern[puncs.length];
      for (int i = 0; i < puncs.length; i++) {
        Character punc = puncs[i];
        puncsPat[i] = Pattern.compile(WHITE + punc + WHITE);
      }
    }
    for (int i = 0; i < puncsPat.length; i++) {
      Pattern p = puncsPat[i];
      Character punc = puncs[i];
      Matcher m = p.matcher(ans);
      ans = m.replaceAll(" " + punc + " ");
    }
    ans = ans.trim();
    return ans;
  }

  private static String separatePuncs(Character[] puncs_in, String ans) {
    /* Make sure certain punctuation marks only ever appear as their own word
       (segmented from everything else). */
    /* These punctuation marks are derived directly from the training set. */
    // Note that the punctuation set and the compiled patterns are cached in
    // static fields, so puncs_in is only consulted on the first call.
    if (puncs == null) {
      puncs = puncs_in;
    }
    if (puncsPat == null) {
      // System.err.println("Compile Puncs");
      puncsPat = new Pattern[puncs.length];
      for (int i = 0; i < puncs.length; i++) {
        Character punc = puncs[i];
        if (punc == '(' || punc == ')') { // escape regex metacharacters
          puncsPat[i] = Pattern.compile(WHITE + "\\" + punc + WHITE);
        } else {
          puncsPat[i] = Pattern.compile(WHITE + punc + WHITE);
        }
      }
    }
    for (int i = 0; i < puncsPat.length; i++) {
      Pattern p = puncsPat[i];
      Character punc = puncs[i];
      Matcher m = p.matcher(ans);
      ans = m.replaceAll(" " + punc + " ");
    }
    ans = ans.trim();
    return ans;
  }
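  // For example (hypothetical input), separatePuncs above turns both
  // "你好。再见" and "你好 。 再见" into "你好 。 再见": each listed
  // punctuation mark ends up as its own whitespace-delimited token, with any
  // existing surrounding whitespace normalized to single spaces.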
  /** The one extant use of this method is to connect a U+30FB (katakana
   * middle dot) with preceding and following non-space characters (in CTB
   * postprocessing). I would hypothesize that if mid dot chars were correctly
   * recognized in shape contexts, then this would be unnecessary [cdm 2007].
   * Also, note that IBM GALE normalization seems to produce U+30FB and not
   * U+00B7.
   *
   * @param punc character to be joined to surrounding chars
   * @param ans Input string which may or may not contain punc
   * @return String with spaces removed between any instance of punc and
   *     surrounding chars.
   */
  private static String gluePunc(Character punc, String ans) {
    Pattern p = Pattern.compile(WHITE + punc);
    Matcher m = p.matcher(ans);
    ans = m.replaceAll(String.valueOf(punc));
    p = Pattern.compile(punc + WHITE);
    m = p.matcher(ans);
    ans = m.replaceAll(String.valueOf(punc));
    ans = ans.trim();
    return ans;
  }

  static Character[] colons = {'\ufe55', ':', '\uff1a'};
  static Pattern[] colonsPat = null;
  static Pattern[] colonsWhitePat = null;

  private static String processColons(String ans, String numPat) {
    /* ':'
       1. if "5:6" then put together
       2. if others, separate ':' and others
       *** Note!! All the "digits" are actually extracted/learned from the
           training data!!!! They are not real "digits" knowledge.
       *** See /u/nlp/data/chinese-segmenter/Sighan2005/dict/wordlist for the
           list we extracted. */

    // first, just separate all ':'
    if (colonsPat == null) {
      colonsPat = new Pattern[colons.length];
      for (int i = 0; i < colons.length; i++) {
        Character colon = colons[i];
        colonsPat[i] = Pattern.compile(WHITE + colon + WHITE);
      }
    }
    for (int i = 0; i < colons.length; i++) {
      Character colon = colons[i];
      Pattern p = colonsPat[i];
      Matcher m = p.matcher(ans);
      ans = m.replaceAll(" " + colon + " ");
    }

    if (colonsWhitePat == null) {
      colonsWhitePat = new Pattern[colons.length];
      for (int i = 0; i < colons.length; i++) {
        Character colon = colons[i];
        colonsWhitePat[i] = Pattern.compile("(" + numPat + ")" + WHITEPLUS + colon + WHITEPLUS + "(" + numPat + ")");
      }
    }
    // second, combine "5:6" patterns
    for (int i = 0; i < colons.length; i++) {
      Character colon = colons[i];
      Pattern p = colonsWhitePat[i];
      Matcher m = p.matcher(ans);
      while (m.find()) {
        ans = m.replaceAll("$1" + colon + "$2");
        m = p.matcher(ans);
      }
    }
    ans = ans.trim();
    return ans;
  }
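  // For example (hypothetical input), with numPat = "[0-9]+" processColons
  // turns "比分 3 : 2 获胜" into "比分 3:2 获胜", while a colon not flanked
  // by "digits", as in "他 说:你 好", comes out separated: "他 说 : 你 好".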
  private static final Pattern percentsPat = Pattern.compile(WHITE + "([\uff05%])" + WHITE);
  private static final String percentStr = WHITEPLUS + "([\uff05%])";
  private static Pattern percentsWhitePat; // = null;

  private static String processPercents(String ans, String numPat) {
    // 1. if "6%" then put together
    // 2. if others, separate '%' and others
    // System.err.println("Process percents called!");

    // first, just separate all '%'
    Matcher m = percentsPat.matcher(ans);
    ans = m.replaceAll(" $1 ");

    // second, combine "6%" patterns
    if (percentsWhitePat == null) {
      percentsWhitePat = Pattern.compile("(" + numPat + ")" + percentStr);
    }
    Matcher m2 = percentsWhitePat.matcher(ans);
    ans = m2.replaceAll("$1$2");
    ans = ans.trim();
    return ans;
  }

  private static String processDots(String ans, String numPat) {
    /* all "\d\.\d" patterns */
    String dots = "[\ufe52\u2027\uff0e.]";
    Pattern p = Pattern.compile("(" + numPat + ")" + WHITEPLUS + "(" + dots + ")" + WHITEPLUS + "(" + numPat + ")");
    Matcher m = p.matcher(ans);
    while (m.find()) {
      ans = m.replaceAll("$1$2$3");
      m = p.matcher(ans);
    }

    p = Pattern.compile("(" + numPat + ")(" + dots + ")" + WHITEPLUS + "(" + numPat + ")");
    m = p.matcher(ans);
    while (m.find()) {
      ans = m.replaceAll("$1$2$3");
      m = p.matcher(ans);
    }

    p = Pattern.compile("(" + numPat + ")" + WHITEPLUS + "(" + dots + ")(" + numPat + ")");
    m = p.matcher(ans);
    while (m.find()) {
      ans = m.replaceAll("$1$2$3");
      m = p.matcher(ans);
    }

    ans = ans.trim();
    return ans;
  }

  private static String processCommas(String ans) {
    String numPat = "[0-9\uff10-\uff19]";
    String nonNumPat = "[^0-9\uff10-\uff19]";
    String commas = ",";
    // Pattern p = Pattern.compile(WHITE + commas + WHITE);
    ans = ans.replaceAll(",", " , ");
    ans = ans.replaceAll("  ", " ");
    if (DEBUG) EncodingPrintWriter.err.println("ANS (before comma norm): " + ans, "UTF-8");
    Pattern p = Pattern.compile("(" + numPat + ")" + WHITE + "(" + commas + ")" + WHITE + "(" + numPat + "{3}" + nonNumPat + ")");
    // cdm: I added the {3} to be a crude fix so it wouldn't join back
    // up small numbers. Only proper thousands markers. But it's a
    // crude hack, which should be done better.
    // In fact this whole method is horrible and should be done better!
    /* -- cdm: I didn't understand this code, and changed it to what
       -- seemed sane to me: replaceAll replaces them all in one step....
    Matcher m = p.matcher(ans);
    while (m.find()) {
      ans = m.replaceAll("$1$2$3");
      m = p.matcher(ans);
    }
    */
    /* ++ cdm: The replacement */
    Matcher m = p.matcher(ans);
    if (m.find()) {
      ans = m.replaceAll("$1$2$3");
    }

    /*
    p = Pattern.compile("(" + nonNumPat + ")" + WHITE + "(" + commas + ")" + WHITE + "(" + numPat + ")");
    m = p.matcher(ans);
    while (m.find()) {
      ans = m.replaceAll("$1 $2 $3");
      m = p.matcher(ans);
    }
    p = Pattern.compile("(" + numPat + ")" + WHITE + "(" + commas + ")" + WHITE + "(" + nonNumPat + ")");
    m = p.matcher(ans);
    while (m.find()) {
      ans = m.replaceAll("$1 $2 $3");
      m = p.matcher(ans);
    }
    p = Pattern.compile("(" + nonNumPat + ")" + WHITE + "(" + commas + ")" + WHITE + "(" + nonNumPat + ")");
    m = p.matcher(ans);
    while (m.find()) {
      ans = m.replaceAll("$1 $2 $3");
      m = p.matcher(ans);
    }
    */
    ans = ans.trim();
    return ans;
  }
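  // For example (hypothetical input), processCommas joins a thousands marker
  // back up, so "1 , 234 人" becomes "1,234 人", while "三 , 四" keeps its
  // comma separated, since the {3}-digit requirement above only matches real
  // thousands groups.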
  static String postProcessingAnswerCTB(String ans, boolean keepAllWhitespaces, boolean suppressMidDotPostprocessing) {
    Character[] puncs = {'\u3001', '\u3002', '\u3003', '\u3008', '\u3009', '\u300a', '\u300b',
                         '\u300c', '\u300d', '\u300e', '\u300f', '\u3010', '\u3011', '\u3014',
                         '\u3015', '\u0028', '\u0029', '\u0022', '\u003c', '\u003e'};
    String numPat = "[0-9\uff10-\uff19]+";
    // if ( ! keepAllWhitespaces) {   // these should now never delete an original space
    ans = separatePuncs(puncs, ans);
    if ( ! suppressMidDotPostprocessing) {
      ans = gluePunc('\u30fb', ans); // this is a 'connector' - the katakana middle dot char
    }
    ans = processColons(ans, numPat);
    ans = processPercents(ans, numPat);
    ans = processDots(ans, numPat);
    ans = processCommas(ans);
    // }
    ans = ans.trim();
    return ans;
  }

  private static String postProcessingAnswerPK(String ans, boolean keepAllWhitespaces) {
    Character[] puncs = {'\u3001', '\u3002', '\u3003', '\u3008', '\u3009', '\u300a', '\u300b',
                         '\u300c', '\u300d', '\u300e', '\u300f', '\u3010', '\u3011', '\u3014',
                         '\u3015', '\u2103'};
    ans = separatePuncs(puncs, ans);
    /* Note!! All the "digits" are actually extracted/learned from the training data!!!!
       They are not real "digits" knowledge.
       See /u/nlp/data/chinese-segmenter/Sighan2005/dict/wordlist for the list we extracted. */
    String numPat = "[0-9\uff10-\uff19\uff0e\u00b7\u4e00\u5341\u767e]+";
    if ( ! keepAllWhitespaces) {
      ans = processColons(ans, numPat);
      ans = processPercents(ans, numPat);
      ans = processDots(ans, numPat);
      ans = processCommas(ans);

      /* "\u2014\u2014\u2014" and "\u2026\u2026" should be together */
      String[] puncPatterns = {"\u2014" + WHITE + "\u2014" + WHITE + "\u2014", "\u2026" + WHITE + "\u2026"};
      String[] correctPunc = {"\u2014\u2014\u2014", "\u2026\u2026"};
      // String[] puncPatterns = {"\u2014 \u2014 \u2014", "\u2026 \u2026"};
      for (int i = 0; i < puncPatterns.length; i++) {
        Pattern p = Pattern.compile(WHITE + puncPatterns[i] + WHITE);
        Matcher m = p.matcher(ans);
        ans = m.replaceAll(" " + correctPunc[i] + " ");
      }
    }
    ans = ans.trim();
    return ans;
  }

  private static String postProcessingAnswerMSR(String ans) {
    ans = separatePuncs(ans);
    return ans;
  }

  private static String postProcessingAnswerAS(String ans) {
    ans = separatePuncs(ans);
    /* Note!! All the "digits" are actually extracted/learned from the training data!!!!
       They are not real "digits" knowledge.
       See /u/nlp/data/chinese-segmenter/Sighan2005/dict/wordlist for the list we extracted. */
    String numPat = "[\uff10-\uff19\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u5343]+";
    ans = processColons(ans, numPat);
    ans = processPercents(ans, numPat);
    ans = processDots(ans, numPat);
    ans = processCommas(ans);
    return ans;
  }
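  // For example (hypothetical input), the rejoining loop below (also used in
  // postProcessingAnswerPK above) turns a spaced-out dash sequence
  // "U+2014 U+2014 U+2014" back into the single token U+2014U+2014U+2014, and
  // a spaced ellipsis "U+2026 U+2026" back into U+2026U+2026, since the
  // training data treats each such sequence as one word.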
  private static String postProcessingAnswerHK(String ans) {
    Character[] puncs = {'\u3001', '\u3002', '\u3003', '\u3008', '\u3009', '\u300a', '\u300b',
                         '\u300c', '\u300d', '\u300e', '\u300f', '\u3010', '\u3011', '\u3014',
                         '\u3015', '\u2103'};
    ans = separatePuncs(puncs, ans);
    /* Note!! All the "digits" are actually extracted/learned from the training data!!!!
       They are not real "digits" knowledge.
       See /u/nlp/data/chinese-segmenter/Sighan2005/dict/wordlist for the list we extracted. */
    String numPat = "[0-9]+";
    ans = processColons(ans, numPat);

    /* "\u2014\u2014\u2014" and "\u2026\u2026" should be together */
    String[] puncPatterns = {"\u2014" + WHITE + "\u2014" + WHITE + "\u2014", "\u2026" + WHITE + "\u2026"};
    String[] correctPunc = {"\u2014\u2014\u2014", "\u2026\u2026"};
    // String[] puncPatterns = {"\u2014 \u2014 \u2014", "\u2026 \u2026"};
    for (int i = 0; i < puncPatterns.length; i++) {
      Pattern p = Pattern.compile(WHITE + puncPatterns[i] + WHITE);
      Matcher m = p.matcher(ans);
      ans = m.replaceAll(" " + correctPunc[i] + " ");
    }
    ans = ans.trim();
    return ans;
  }

  /** Just for testing. */
  public static void main(String[] args) {
    String input = args[0];
    String enc = args[1];
    for (String line : ObjectBank.getLineIterator(new File(input), enc)) {
      // System.out.println(postProcessingAnswerHK(line));
      EncodingPrintWriter.out.println(processPercents(line, "[0-9\uff10-\uff19]+"), "UTF-8");
    }
  }

}