package edu.stanford.nlp.process;

import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Serializable;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

import static edu.stanford.nlp.trees.international.pennchinese.ChineseUtils.WHITEPLUS;

/**
 * Convert a Chinese Document into a List of sentence Strings.
 * <p>
 * Sentences are split after a run of sentence-final punctuation (see
 * {@code fullStopsSet}) together with any closing quotes/brackets
 * (see {@code rightMarkSet}) and whitespace that immediately follow it.
 *
 * @author Pi-Chuan Chang
 */
public class ChineseDocumentToSentenceProcessor implements Serializable {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ChineseDocumentToSentenceProcessor.class);

  // todo: This class is a mess. We should try to get it out of core

  private static final long serialVersionUID = 4054964767812217460L;

  // NOTE(review): the non-ASCII punctuation below is written with Unicode escapes so
  // that it survives encoding damage. The fullwidth entries were reconstructed from
  // duplicated ASCII entries (and one invalid char literal) -- please confirm.

  /**
   * Characters that end a sentence.
   * Deliberately excludes U+FF0E, the fullwidth full stop (too often separates
   * English first/last name, etc.)
   */
  private static final Set<Character> fullStopsSet = Generics.newHashSet(Arrays.asList(new Character[]{
      '\u3002',  // ideographic full stop
      '\uFF01',  // fullwidth exclamation mark
      '\uFF1F',  // fullwidth question mark
      '!',
      '?'}));

  /** Closing quotes and brackets that still belong to the sentence that has just ended. */
  private static final Set<Character> rightMarkSet = Generics.newHashSet(Arrays.asList(new Character[]{
      '\u201D',  // right double quotation mark
      '\u2019',  // right single quotation mark
      '\u300B',  // right double angle bracket
      '\u300F',  // right white corner bracket
      '\u3009',  // right angle bracket
      '\u300D',  // right corner bracket
      '\uFF1E',  // fullwidth greater-than sign
      '\uFF07',  // fullwidth apostrophe
      '\uFF09',  // fullwidth right parenthesis
      '\'',
      '"',
      ')',
      ']',
      '>'}));

  // private final String normalizationTableFile;

  /** The only encoding used for reading the normalization table and for I/O in {@link #main}. */
  private static final String encoding = "UTF-8";

  /** (from, to) replacement pairs read from the normalization table file, or {@code null} if none was given. */
  private final List<Pair<String,String>> normalizationTable;

  /** Creates a processor without a normalization table. */
  public ChineseDocumentToSentenceProcessor() {
    this(null);
  }

  /** Two whitespace-separated fields on a line of the normalization table. */
  private static final Pattern PAIR_PATTERN = Pattern.compile("([^\\s]+)\\s+([^\\s]+)");

  /**
   * @param normalizationTableFile A file listing character pairs for
   *     normalization.  Currently the normalization table must be in UTF-8.
   *     If this parameter is {@code null}, the default normalization
   *     of the zero-argument constructor is used.
   */
  public ChineseDocumentToSentenceProcessor(String normalizationTableFile) {
    // this.normalizationTableFile = normalizationTableFile;
    if (normalizationTableFile != null) {
      normalizationTable = new ArrayList<>();
      for (String line : ObjectBank.getLineIterator(new File(normalizationTableFile), encoding)) {
        Matcher pairMatcher = PAIR_PATTERN.matcher(line);
        if (pairMatcher.find()) {
          normalizationTable.add(new Pair<>(pairMatcher.group(1), pairMatcher.group(2)));
        } else {
          // Lines that do not contain two fields are reported and skipped.
          log.info("Didn't match: "+line);
        }
      }
    } else {
      normalizationTable = null;
    }
  }

  /*
  public ChineseDocumentToSentenceProcessor(String normalizationTableFile, String encoding) {
    log.info("WARNING: ChineseDocumentToSentenceProcessor ignores normalizationTableFile argument!");
    log.info("WARNING: ChineseDocumentToSentenceProcessor ignores encoding argument!");
    // encoding is never read locally
    this.encoding = encoding;
  }
  */

  /**
   * Applies {@code ChineseUtils.normalize} and then this processor's
   * normalization table (if any) to the input.
   * <p>
   * This should now become disused, and other people should call
   * ChineseUtils directly!  CDM June 2006.
   *
   * @param in The text to normalize
   * @return The normalized text
   */
  public String normalization(String in) {
    String norm = ChineseUtils.normalize(in);
    return normalize(norm);
  }

  private static final Pattern WHITEPLUS_PATTERN = Pattern.compile(WHITEPLUS);
  private static final Pattern START_WHITEPLUS_PATTERN = Pattern.compile('^' + WHITEPLUS);
  private static final Pattern END_WHITEPLUS_PATTERN = Pattern.compile(WHITEPLUS + '$');

  /**
   * Collapses each whitespace run to a single space and then applies every
   * pair of the normalization table, in table order, as a literal
   * search-and-replace.  Returns the input untouched if there is no table.
   */
  private String normalize(String inputString) {
    if (normalizationTable == null) {
      return inputString;
    }
    String result = WHITEPLUS_PATTERN.matcher(inputString).replaceAll(" ");
    for (Pair<String,String> p : normalizationTable) {
      // String.replace treats BOTH arguments literally, so a replacement that
      // contains '$' or '\' cannot be misread as a group reference or escape.
      result = result.replace(p.first(), p.second());
    }
    return result;
  }

  /**
   * usage: java ChineseDocumentToSentenceProcessor [-segmentIBM]
   * -file filename [-encoding encoding]
   * <p>
   * The -segmentIBM option is for IBM GALE-specific splitting of an
   * XML element into sentences.  In that mode, {@code -parseInside regex}
   * restricts splitting to text that follows a tag whose name matches the
   * regex, and {@code -alwaysAddS} wraps text in {@code <s>} elements even
   * when it is a single sentence.  Without -segmentIBM the file is treated as
   * HTML and one sentence is printed per line.
   * <p>
   * The {@code -encoding} option is accepted but ignored: UTF-8 is always used.
   */
  public static void main(String[] args) throws Exception {
    Properties props = StringUtils.argsToProperties(args);
    boolean alwaysAddS = props.containsKey("alwaysAddS");
    if ( ! props.containsKey("file")) {
      log.info("usage: java ChineseDocumentToSentenceProcessor [-segmentIBM] -file filename [-encoding encoding]");
      return;
    }
    if (props.containsKey("encoding")) {
      log.info("WARNING: for now the default encoding is "+encoding+". It's not changeable for now");
    }

    String input = IOUtils.slurpFileNoExceptions(props.getProperty("file"), encoding);
    // String input = StringUtils.slurpGBURLNoExceptions(new URL(props.getProperty("file")));

    if (props.containsKey("segmentIBM")) {
      // Whitespace tokens, with "\n" delivered as a token of its own.
      Tokenizer<Word> tok = WhitespaceTokenizer.
              newWordWhitespaceTokenizer(new StringReader(input), true);
      String parseInside = props.getProperty("parseInside", "");

      Pattern completeTag = Pattern.compile("<.*>");                         // token is a whole tag
      Pattern tagStart = Pattern.compile("\uFEFF?<[\\p{Alpha}]+");           // token opens a multi-token tag
      Pattern tagEnd = Pattern.compile("[A-Za-z0-9=\"]+>");                  // token closes a multi-token tag
      Pattern parseInsideTag = Pattern.compile("<(?:" + parseInside + ")[ >]");

      StringBuilder buff = new StringBuilder();      // text collected since the last tag/newline
      StringBuilder sgmlbuff = new StringBuilder();  // tokens of the tag currently being read
      String lastSgml = "";
      boolean inSGML = false;
      int splitItems = 0;
      int numAdded = 0;

      // Closing the writer also closes System.out; nothing is written to it afterwards.
      try (PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true)) {
        while (tok.hasNext()) {
          String s = tok.next().word();
          if (tagStart.matcher(s).matches()) {
            inSGML = true;
            sgmlbuff.append(s).append(" ");
          } else if (completeTag.matcher(s).matches() || inSGML && tagEnd.matcher(s).matches() || "\n".equals(s)) {
            inSGML = false;
            if (buff.toString().trim().length() > 0) {
              boolean processIt = parseInside.isEmpty() || parseInsideTag.matcher(lastSgml).find();
              if (processIt) {
                List<String> sents = fromPlainText(buff.toString(), true);
                if (alwaysAddS || sents.size() > 1) {
                  int id = 1;
                  for (String sent : sents) {
                    pw.print("<s id=\"" + id + "\">");
                    pw.print(sent);
                    pw.println("</s>");
                    id++;
                  }
                  if (sents.size() > 1) {
                    splitItems++;
                    numAdded += sents.size() - 1;
                  }
                } else if (sents.size() == 1) {
                  pw.print(sents.get(0));
                }
              } else {
                pw.print(buff);
              }
              buff = new StringBuilder();
            }
            sgmlbuff.append(s);
            pw.print(sgmlbuff);
            lastSgml = sgmlbuff.toString();
            sgmlbuff = new StringBuilder();
          } else if (inSGML) {
            sgmlbuff.append(s).append(" ");
          } else {
            buff.append(s).append(" ");
          }
        } // end while (tok.hasNext())
        // NOTE(review): text still held in buff/sgmlbuff at end of input is not written out.
        pw.flush();
      }
      log.info("Split " + splitItems + " segments, adding " + numAdded + " sentences.");
    } else {
      List<String> sents = fromHTML(input);
      // NOTE(review): the sentences are written to System.err, not System.out -- confirm this is intended.
      PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.err, encoding), true);
      for (String sent : sents) {
        pw.println(sent);
      }
    }
  }

  /**
   * Strip off HTML tags before processing.
   * Only the simplest tag stripping is implemented.
   *
   * @param inputString Chinese document text which contains HTML tags
   * @return a List of sentence strings
   * @throws IOException If the HTML parser fails to read the text
   */
  public static List<String> fromHTML(String inputString) throws IOException {
    List<String> ans = new ArrayList<>();
    MyHTMLParser parser = new MyHTMLParser();
    for (String textChunk : parser.parse(inputString)) {
      ans.addAll(fromPlainText(textChunk));
    }
    return ans;
  }

  /**
   * Splits unsegmented text into sentences.
   *
   * @param contentString Chinese document text
   * @return a List of sentence strings
   * @throws IOException declared for API compatibility
   */
  public static List<String> fromPlainText(String contentString) throws IOException {
    return fromPlainText(contentString, false);
  }

  /**
   * Splits text into sentences.  A sentence ends at a character of
   * {@code fullStopsSet}; any closing marks, whitespace and further
   * sentence-final punctuation directly after it stay in the same sentence.
   *
   * @param contentString Chinese document text
   * @param segmented If true, the text is taken to be word-segmented: a
   *     sentence-final mark only counts when it is the first character or
   *     follows a space character (so punctuation inside e.g. URLs does not
   *     split), and only leading/trailing whitespace is removed from each
   *     sentence.  If false, all whitespace is removed from each sentence.
   * @return a List of non-empty sentence strings
   * @throws IOException declared for API compatibility
   */
  public static List<String> fromPlainText(String contentString, boolean segmented) throws IOException {
    if (segmented) {
      contentString = ChineseUtils.normalize(contentString, ChineseUtils.LEAVE, ChineseUtils.ASCII);
    } else {
      contentString = ChineseUtils.normalize(contentString, ChineseUtils.FULLWIDTH, ChineseUtils.ASCII);
    }

    List<String> sentenceList = new ArrayList<>();
    StringBuilder sentence = new StringBuilder();
    boolean sentenceEnd = false;  // true once the current sentence has seen its final punctuation
    int lastCh = -1;              // previous character, or -1 at the start of the text

    for (int i = 0, len = contentString.length(); i < len; i++) {
      char c = contentString.charAt(i);
      if ( ! sentenceEnd) {
        sentence.append(c);
        // In segmented mode require a standalone punctuation mark -- cf. URLs
        if (fullStopsSet.contains(c) &&
            ( ! segmented || lastCh == -1 || Character.isSpaceChar(lastCh))) {
          sentenceEnd = true;
        }
      } else if (rightMarkSet.contains(c) || isRegexWhitespace(c) || fullStopsSet.contains(c)) {
        // trailing material that still belongs to the finished sentence
        sentence.append(c);
      } else {
        // first character of the next sentence: emit the finished one
        sentenceEnd = false;
        addSentence(sentence, segmented, sentenceList);
        sentence.setLength(0);
        sentence.append(c);
      }
      lastCh = c;
    }
    addSentence(sentence, segmented, sentenceList);
    return sentenceList;
  }

  /** Tests whether {@code c} is one of the characters matched by the regex class {@code \s}. */
  private static boolean isRegexWhitespace(char c) {
    return c == ' ' || c == '\t' || c == '\n' || c == 0x0B || c == '\f' || c == '\r';
  }

  /** Strips whitespace from {@code raw} and appends it to {@code sentences} unless it is then empty. */
  private static void addSentence(CharSequence raw, boolean segmented, List<String> sentences) {
    String cleaned = removeWhitespace(raw.toString(), segmented);
    if (cleaned.length() > 0) {
      sentences.add(cleaned);
    }
  }

  /**
   * In non-segmented mode, all whitespace is removed,
   * in segmented mode only leading and trailing whitespace goes away.
   */
  private static String removeWhitespace(String str, boolean segmented) {
    if (str.length() == 0) {
      return str;
    }
    str = START_WHITEPLUS_PATTERN.matcher(str).replaceAll("");
    str = END_WHITEPLUS_PATTERN.matcher(str).replaceAll("");
    if ( ! segmented) {
      str = WHITEPLUS_PATTERN.matcher(str).replaceAll("");
    }
    return str;
  }


  /**
   * Parser callback that collects every non-blank text chunk of an HTML
   * document as one string and remembers the most recent title text.
   */
  static class MyHTMLParser extends HTMLEditorKit.ParserCallback {

    protected StringBuffer textBuffer;
    protected List<String> sentences;
    protected String title;
    protected boolean isTitle;
    protected boolean isBody;
    protected boolean isScript;
    protected boolean isBreak;

    public MyHTMLParser() {
      super();
      title = "";
      isTitle = false;
      isBody = false;
      isScript = false;
      isBreak = false;
      // Initialized here so that callbacks are safe even before parse() is called;
      // parse() replaces both.
      textBuffer = new StringBuffer(200);
      sentences = new ArrayList<>();
    }

    /**
     * Records the text as the title when inside a TITLE element and, in every
     * case, adds it (with no-break spaces removed and trimmed) to the
     * collected chunks unless it is blank.
     */
    @Override
    public void handleText(char[] data, int pos) {
      if (data.length == 0) return;

      if (isTitle) {
        title = new String(data);
      }
      // NOTE(review): isBody/isScript/isBreak are tracked but not consulted here, so
      // title text and any script text passed to this callback are emitted as well.
      textBuffer.append(data);
      String text = textBuffer.toString().replace("\u00a0", "").trim();
      if (text.length() == 0) return;  // blank so far: keep buffering
      sentences.add(text);
      textBuffer = new StringBuffer(500);
    }

    /**
     * Sets the matching flag when a TITLE, BODY or SCRIPT element starts, and
     * records whether the tag breaks the flow.
     */
    @Override
    public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrSet, int pos) {
      if (tag == HTML.Tag.TITLE) {
        isTitle = true;
      } else if (tag == HTML.Tag.BODY) {
        isBody = true;
      } else if (tag == HTML.Tag.SCRIPT) {
        isScript = true;
      }
      isBreak = tag.breaksFlow();
    }

    /**
     * Clears the matching flag when a TITLE, BODY or SCRIPT element ends.
     */
    @Override
    public void handleEndTag(HTML.Tag tag, int pos) {
      if (tag == HTML.Tag.TITLE) {
        isTitle = false;
      } else if (tag == HTML.Tag.BODY) {
        isBody = false;
      } else if (tag == HTML.Tag.SCRIPT) {
        isScript = false;
      }
    }

    /** Parses the document at the given URL. */
    public List<String> parse(URL url) throws IOException {
      return parse(IOUtils.slurpURL(url));
    }

    /** Parses everything that can be read from the given Reader. */
    public List<String> parse(Reader r) throws IOException {
      return parse(IOUtils.slurpReader(r));
    }

    /**
     * The parse method that actually does the work.
     * Now it first gets rid of singleton tags before running.
     *
     * @param text The HTML text
     * @return The non-blank text chunks of the document, in document order
     * @throws IOException If the underlying parser fails
     */
    public List<String> parse(String text) throws IOException {
      // Rewrite "/>" as ">" and "<?" as "<" before handing the text to the Swing parser.
      text = text.replace("/>", ">").replace("<?", "<");

      StringReader r = new StringReader(text);
      textBuffer = new StringBuffer(200);
      sentences = new ArrayList<>();
      new ParserDelegator().parse(r, this, true);
      return sentences;
    }

    /** Returns the text most recently seen inside a TITLE element, or "" if none. */
    public String title() {
      return title;
    }

    /*
    public static void main(String[] args) throws IOException {
      MyHTMLParser parser = new MyHTMLParser();
      String input = StringUtils.slurpGBURLNoExceptions(new URL(args[0]));
      List<String> result = parser.parse(input);
      PrintWriter orig = new PrintWriter("file.orig");
      PrintWriter parsed = new PrintWriter("file.parsed");
      log.info("output to file.orig");
      orig.println(input);
      for (String s : result) {
        log.info("output to file.parsed");
        parsed.println(s);
        parsed.println("-----------------------------------------");
      }
      orig.close();
      parsed.close();
    }
    */

  }

} // end class ChineseDocumentToSentenceProcessor