CHTBTokenizer.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.trees.international.pennchinese;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;

/**
 * A simple tokenizer for tokenizing Penn Chinese Treebank files.  A
 * token is any parenthesis, node label, or terminal.  All SGML
 * content of the files is ignored.
 *
 * @author Roger Levy
 * @version 01/17/2003
 */
public class CHTBTokenizer extends AbstractTokenizer<String>  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(CHTBTokenizer.class);

  /** The underlying lexer that splits the character stream into tokens. */
  private final CHTBLexer lexer;


  /**
   * Constructs a new tokenizer from a Reader.  Note that getting
   * the bytes going into the Reader into Java-internal Unicode is
   * not the tokenizer's job.  This can be done by converting the
   * file with {@code ConvertEncodingThread}, or by specifying
   * the file's encoding explicitly in the Reader with
   * java.io.{@code InputStreamReader}.
   *
   * @param r Reader supplying the (already decoded) treebank text
   */
  public CHTBTokenizer(Reader r) {
    lexer = new CHTBLexer(r);
  }


  /**
   * Internally fetches the next token, skipping everything the lexer
   * reports as {@code CHTBLexer.IGNORE}.
   *
   * @return The next token in the token stream, or null if none exists.
   *         Null is also returned if the underlying Reader fails with an
   *         IOException; in that case the failure is logged rather than
   *         silently discarded.
   */
  @Override
  protected String getNext() {
    try {
      int tokenType;
      // Keep pulling from the lexer until it yields something other than IGNORE.
      do {
        tokenType = lexer.yylex();
      } while (tokenType == CHTBLexer.IGNORE);

      if (tokenType == CHTBLexer.YYEOF) {
        return null;
      }
      return lexer.match();
    } catch (IOException ioe) {
      // Best effort: end the token stream, but make the failure visible so
      // that a truncated read is not mistaken for a clean end of file.
      log.error("CHTBTokenizer: I/O error while reading tokens; ending token stream: " + ioe);
      return null;
    }
  }


  /**
   * The main() method tokenizes a file in the specified Encoding
   * and prints it to standard output in the specified Encoding,
   * one token per line.
   * Its arguments are (Infile, Encoding).  If fewer than two arguments
   * are given, a usage message is logged and the method returns.
   *
   * @param args Command-line arguments: input file, then encoding
   * @throws IOException If the input cannot be opened or closed
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 2) {
      log.error("Usage: CHTBTokenizer inputFile encoding");
      // Without both arguments there is nothing to do; do not fall through to args[1].
      return;
    }
    String encoding = args[1];

    // try-with-resources guarantees the input Reader is closed, even on failure.
    try (Reader in = IOUtils.readerFromString(args[0], encoding)) {
      Tokenizer<String> st = new CHTBTokenizer(in);
      while (st.hasNext()) {
        String s = st.next();
        EncodingPrintWriter.out.println(s, encoding);
      }
    }
  }

}