ChineseDocumentToSentenceProcessor.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.process;

import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Serializable;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

import static edu.stanford.nlp.trees.international.pennchinese.ChineseUtils.WHITEPLUS;

/**
 * Convert a Chinese Document into a List of sentence Strings.
 *
 * @author Pi-Chuan Chang
 */
public class ChineseDocumentToSentenceProcessor implements Serializable  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ChineseDocumentToSentenceProcessor.class);

  // todo: This class is a mess. We should try to get it out of core

  private static final long serialVersionUID = 4054964767812217460L;

  private static final Set<Character> fullStopsSet = Generics.newHashSet(Arrays.asList(new Character[]{'。', '！', '？', '!', '?'}));
  // not \uff0e . (too often separates English first/last name, etc.)

  private static final Set<Character> rightMarkSet = Generics.newHashSet(Arrays.asList(new Character[]{'”', '’', '》', '』', '〉', '」', '＞', '＇', '）', '\'', '"', ')', ']', '>'}));

  // private final String normalizationTableFile;

  private static final String encoding = "UTF-8";
  private final List<Pair<String,String>> normalizationTable;


  public ChineseDocumentToSentenceProcessor() {
    this(null);
  }

  private static final Pattern PAIR_PATTERN = Pattern.compile("([^\\s]+)\\s+([^\\s]+)");

  /** @param normalizationTableFile A file listing character pairs for
   *     normalization.  Currently the normalization table must be in UTF-8.
   *     If this parameter is {@code null}, the default normalization
   *     of the zero-argument constructor is used.
   */
  public ChineseDocumentToSentenceProcessor(String normalizationTableFile) {
    // this.normalizationTableFile = normalizationTableFile;
    if (normalizationTableFile != null) {
      normalizationTable = new ArrayList<>();
      for (String line : ObjectBank.getLineIterator(new File(normalizationTableFile), encoding)) {
        Matcher pairMatcher = PAIR_PATTERN.matcher(line);
        if (pairMatcher.find()) {
          normalizationTable.add(new Pair<>(pairMatcher.group(1), pairMatcher.group(2)));
        } else {
          log.info("Didn't match: "+line);
        }
      }
    } else {
      normalizationTable = null;
    }
  }
  /*
  public ChineseDocumentToSentenceProcessor(String normalizationTableFile, String encoding) {
    log.info("WARNING: ChineseDocumentToSentenceProcessor ignores normalizationTableFile argument!");
    log.info("WARNING: ChineseDocumentToSentenceProcessor ignores encoding argument!");
    // encoding is never read locally
    this.encoding = encoding;
  }
  */


  /** This should now become disused, and other people should call
   *  ChineseUtils directly!  CDM June 2006.
   */
  public String normalization(String in) {
    //log.info("BEFOR NORM: "+in);
    String norm = ChineseUtils.normalize(in);
    String out = normalize(norm);
    //log.info("AFTER NORM: "+out);
    return out;
  }

  private static final Pattern WHITEPLUS_PATTERN = Pattern.compile(WHITEPLUS);
  private static final Pattern START_WHITEPLUS_PATTERN = Pattern.compile('^' + WHITEPLUS);
  private static final Pattern END_WHITEPLUS_PATTERN = Pattern.compile(WHITEPLUS + '$');

  private String normalize(String inputString) {
    if (normalizationTable == null) {
      return inputString;
    }

    Pattern replacePattern = WHITEPLUS_PATTERN;
    Matcher replaceMatcher = replacePattern.matcher(inputString);
    inputString = replaceMatcher.replaceAll(" ");

    for (Pair<String,String> p : normalizationTable) {
      replacePattern = Pattern.compile(p.first(), Pattern.LITERAL);
      replaceMatcher = replacePattern.matcher(inputString);
      String escape = p.second();
      if (escape.equals("$")) {escape="\\$";}
      inputString = replaceMatcher.replaceAll(escape);
    }
    return inputString;
  }


  /** usage: java ChineseDocumentToSentenceProcessor [-segmentIBM]
   *  -file filename [-encoding encoding]
   *  <p>
   *  The -segmentIBM option is for IBM GALE-specific splitting of an
   *  XML element into sentences.
   */
  public static void main(String[] args) throws Exception {
    //String encoding = "GB18030";
    Properties props = StringUtils.argsToProperties(args);
    // log.info("Here are the properties:");
    // props.list(System.err);
    boolean alwaysAddS = props.containsKey("alwaysAddS");
    ChineseDocumentToSentenceProcessor cp;
    if (! props.containsKey("file")) {
      log.info("usage: java ChineseDocumentToSentenceProcessor [-segmentIBM] -file filename [-encoding encoding]");
      return;
    }
    cp = new ChineseDocumentToSentenceProcessor();
    if (props.containsKey("encoding")) {
      log.info("WARNING: for now the default encoding is "+cp.encoding+". It's not changeable for now");
    }
    String input = IOUtils.slurpFileNoExceptions(props.getProperty("file"),
                                                     cp.encoding);
    // String input = StringUtils.slurpGBURLNoExceptions(new URL(props.getProperty("file")));

    if (props.containsKey("segmentIBM")) {
      Tokenizer<Word> tok = WhitespaceTokenizer.
        newWordWhitespaceTokenizer(new StringReader(input), true);
      String parseInside = props.getProperty("parseInside");
      if (parseInside == null) parseInside = "";

      Pattern p1, p2, p3, p4;
      PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, cp.encoding), true);
      StringBuilder buff = new StringBuilder();
      StringBuilder sgmlbuff = new StringBuilder();
      String lastSgml = "";

      p1 = Pattern.compile("<.*>");
      p2 = Pattern.compile("\uFEFF?<[\\p{Alpha}]+");
      p3 = Pattern.compile("[A-Za-z0-9=\"]+>");
      p4 = Pattern.compile("<(?:" + parseInside + ")[ >]");

      boolean inSGML = false;
      int splitItems = 0;
      int numAdded = 0;
      while (tok.hasNext()) {
        String s = tok.next().word();
        // pw.println("The token is |" + s + "|");
        if (p2.matcher(s).matches()) {
          inSGML = true;
          sgmlbuff.append(s).append(" ");
        } else if (p1.matcher(s).matches() || inSGML && p3.matcher(s).matches() || "\n".equals(s)) {
          inSGML = false;
          if (buff.toString().trim().length() > 0) {
            // pw.println("Dumping sentences");
            // pw.println("Buff is " + buff);
            boolean processIt = false;
            if (parseInside.equals("")) {
              processIt = true;
            } else if (p4.matcher(lastSgml).find()) {
              processIt = true;
            }
            if (processIt) {
              List<String> sents = ChineseDocumentToSentenceProcessor.fromPlainText(buff.toString(), true);
              // pw.println("Sents is " + sents);
              // pw.println();
              if (alwaysAddS || sents.size() > 1) {
                int i = 1;
                for (String str : sents) {
                  pw.print("<s id=\"" + i + "\">");
                  pw.print(str);
                  pw.println("</s>");
                  i++;
                }
                if (sents.size() > 1) {
                  splitItems++;
                  numAdded += sents.size() - 1;
                }
              } else if (sents.size() == 1) {
                pw.print(sents.get(0));
              }
            } else {
              pw.print(buff);
            }
            buff = new StringBuilder();
          }
          sgmlbuff.append(s);
          // pw.println("sgmlbuff is " + sgmlbuff);
          pw.print(sgmlbuff);
          lastSgml = sgmlbuff.toString();
          sgmlbuff = new StringBuilder();
        } else {
          if (inSGML) {
            sgmlbuff.append(s).append(" ");
          } else {
            buff.append(s).append(" ");
          }
          // pw.println("Buff is now |" + buff + "|");
        }
      } // end while (tok.hasNext()) {
      // empty remaining buffers
      pw.flush();
      pw.close();
      log.info("Split " + splitItems + " segments, adding " +
                         numAdded + " sentences.");
    } else {
      List<String> sent = cp.fromHTML(input);
      PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.err, cp.encoding), true);

      for (String a : sent) {
        pw.println(a);
      }
    }
  }


  /**
   * Strip off HTML tags before processing.
   * Only the simplest tag stripping is implemented.
   *
   * @param inputString Chinese document text which contains HTML tags
   * @return a List of sentence strings
   */
  public static List<String> fromHTML(String inputString) throws IOException {
    //HTMLParser parser = new HTMLParser();
    //return fromPlainText(parser.parse(inputString));
    List<String> ans = new ArrayList<>();
    MyHTMLParser parser = new MyHTMLParser();
    List<String> sents = parser.parse(inputString);
    for (String s : sents) {
      ans.addAll(fromPlainText(s));
    }
    return ans;
  }


  /**
   * @param contentString Chinese document text
   * @return a List of sentence strings
   * @throws IOException
   */
  public static List<String> fromPlainText(String contentString) throws IOException {
    return fromPlainText(contentString, false);
  }

  public static List<String> fromPlainText(String contentString, boolean segmented) throws IOException {
    if (segmented) {
      contentString = ChineseUtils.normalize(contentString,
                                             ChineseUtils.LEAVE,
                                             ChineseUtils.ASCII);
    } else {
      contentString = ChineseUtils.normalize(contentString,
                                             ChineseUtils.FULLWIDTH,
                                             ChineseUtils.ASCII);
    }

    String sentenceString = "";

    char[] content = contentString.toCharArray();
    boolean sentenceEnd = false;
    List<String> sentenceList = new ArrayList<>();

    int lastCh = -1;
    for (Character c : content) {
      // EncodingPrintWriter.out.println("Char is |" + c + "|", "UTF-8");
      String newChar = c.toString();

      if ( ! sentenceEnd) {
        if (segmented && fullStopsSet.contains(c) &&
            (lastCh == -1 || Character.isSpaceChar(lastCh))) {
          // require it to be a standalone punctuation mark -- cf. URLs
          sentenceString += newChar;
          sentenceEnd = true;
        } else if ( ! segmented && fullStopsSet.contains(c)) {
          // EncodingPrintWriter.out.println("  End of sent char", "UTF-8");
          sentenceString += newChar;
          sentenceEnd = true;
        } else {
          sentenceString += newChar;
        }
      } else { // sentenceEnd == true
        if (rightMarkSet.contains(c)) {
          sentenceString += newChar;
          // EncodingPrintWriter.out.println("  Right mark char", "UTF-8");
        } else if (newChar.matches("\\s")) {
          sentenceString += newChar;
        } else if (fullStopsSet.contains(c)) {
          // EncodingPrintWriter.out.println("  End of sent char (2+)", "UTF-8");
          sentenceString += newChar;
        } else { // otherwise
          if (sentenceString.length() > 0) {
            sentenceEnd = false;
          }
          sentenceString = removeWhitespace(sentenceString, segmented);
          if (sentenceString.length() > 0) {
            //log.info("<<< "+sentenceString+" >>>");
            sentenceList.add(sentenceString);
          }
          sentenceString = "";
          sentenceString += newChar;
        }
      }
      lastCh = c.charValue();
    } // end for (Character c : content)

    sentenceString = removeWhitespace(sentenceString, segmented);
    if (sentenceString.length() > 0) {
      //log.info("<<< "+sentenceString+" >>>");
      sentenceList.add(sentenceString);
    }
    return sentenceList;
  }

  /** In non-segmented mode, all whitespace is removed,
   *  in segmented mode only leading and trailing whitespace goes away.
   *
   */
  private static String removeWhitespace(String str, boolean segmented) {
    if (str.length() > 0) {
      //System.out.println("Add: "+sentenceString);
      Pattern replacePattern = START_WHITEPLUS_PATTERN;
      Matcher replaceMatcher = replacePattern.matcher(str);
      str = replaceMatcher.replaceAll("");
      replacePattern = END_WHITEPLUS_PATTERN;
      replaceMatcher = replacePattern.matcher(str);
      str = replaceMatcher.replaceAll("");

      if ( ! segmented) {
        replacePattern = WHITEPLUS_PATTERN;
        replaceMatcher = replacePattern.matcher(str);
        str = replaceMatcher.replaceAll("");
      }
    }
    return str;
  }



  static class MyHTMLParser extends HTMLEditorKit.ParserCallback {

    protected StringBuffer textBuffer;
    protected List<String> sentences;
    protected String title;
    protected boolean isTitle;
    protected boolean isBody;
    protected boolean isScript;
    protected boolean isBreak;

    public MyHTMLParser() {
      super();
      title = "";
      isTitle = false;
      isBody = false;
      isScript = false;
      isBreak = false;
    }


    @Override
    public void handleText(char[] data, int pos) {
      if (data.length == 0) return;

      if (isTitle) {
        title = new String(data);
      } else if (isBody && !isScript) {
        //textBuffer.append(data).append(" ");
      }
      //if (isBreak) {
      if (true) {
        textBuffer.append(data);
        String text = textBuffer.toString();
        text = text.replaceAll("\u00a0","");
        text = text.trim();
        if (text.length()==0) return;


        sentences.add(text);
        textBuffer = new StringBuffer(500);
      }
    }

    /**
     * Sets a flag if the start tag is the "TITLE" element start tag.
     */
    @Override
    public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrSet, int pos) {
      if (tag == HTML.Tag.TITLE) {
        isTitle = true;
      } else if (tag == HTML.Tag.BODY) {
        isBody = true;
      } else if (tag == HTML.Tag.SCRIPT) {
        isScript = true;
      }

      isBreak = tag.breaksFlow();
    }

    /**
     * Sets a flag if the end tag is the "TITLE" element end tag
     */
    @Override
    public void handleEndTag(HTML.Tag tag, int pos) {
      if (tag == HTML.Tag.TITLE) {
        isTitle = false;
      } else if (tag == HTML.Tag.BODY) {
        isBody = false;
      } else if (tag == HTML.Tag.SCRIPT) {
        isScript = false;
      }
    }

    public List<String> parse(URL url) throws IOException {
      return (parse(IOUtils.slurpURL(url)));
    }

    public List<String> parse(Reader r) throws IOException {
      return parse(IOUtils.slurpReader(r));
    }

    /**
     * The parse method that actually does the work.
     * Now it first gets rid of singleton tags before running.
     * @throws IOException
     */
    public List<String> parse(String text) throws IOException {
      text = text.replaceAll("/>", ">");
      text = text.replaceAll("<\\?","<");
      StringReader r = new StringReader(text);
      textBuffer = new StringBuffer(200);
      sentences = new ArrayList<>();
      new ParserDelegator().parse(r, this, true);
      return sentences;
    }

    public String title() {
      return title;
    }

    /*
    public static void main(String[] args) throws IOException {
      MyHTMLParser parser = new MyHTMLParser();
      String input = StringUtils.slurpGBURLNoExceptions(new URL(args[0]));
      List<String> result = parser.parse(input);
      PrintWriter orig = new PrintWriter("file.orig");
      PrintWriter parsed = new PrintWriter("file.parsed");
      log.info("output to file.orig");
      orig.println(input);
      for (String s : result) {
        log.info("output to file.parsed");
        parsed.println(s);
        parsed.println("-----------------------------------------");
      }
      orig.close();
      parsed.close();
    }
    */

  }


} // end class ChineseDocumentToSentenceProcessor