package edu.stanford.nlp.trees.international.pennchinese;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.process.Tokenizer;

import java.io.*;

/**
 * A simple tokenizer for tokenizing Penn Chinese Treebank files.  A
 * token is any parenthesis, node label, or terminal.  All SGML
 * content of the files is ignored.
 *
 * @author Roger Levy
 * @version 01/17/2003
 */
public class CHTBTokenizer extends AbstractTokenizer<String> {

  /** Lexer that does the actual scanning; one token type per {@code yylex()} call. */
  private final CHTBLexer lexer;

  /**
   * Constructs a new tokenizer from a Reader.  Note that getting
   * the bytes going into the Reader into Java-internal Unicode is
   * not the tokenizer's job.  This can be done by converting the
   * file with <code>ConvertEncodingThread</code>, or by specifying
   * the file's encoding explicitly in the Reader with
   * java.io.<code>InputStreamReader</code>.
   *
   * @param r Reader supplying the (already decoded) treebank text
   */
  public CHTBTokenizer(Reader r) {
    lexer = new CHTBLexer(r);
  }

  /**
   * Internally fetches the next token.
   *
   * <p>Everything the lexer reports as {@code CHTBLexer.IGNORE} is skipped.
   * An {@link IOException} raised by the lexer is deliberately not
   * propagated: it is treated like end of input, so callers cannot
   * distinguish a read failure from a normally exhausted stream.
   *
   * @return the next token in the token stream, or null if none exists
   *         (end of input, or an I/O error while reading).
   */
  @Override
  public String getNext() {
    try {
      int tokenType = lexer.yylex();
      // Skip over material the lexer marks as ignorable.
      while (tokenType == CHTBLexer.IGNORE) {
        tokenType = lexer.yylex();
      }
      if (tokenType == CHTBLexer.YYEOF) {
        return null;
      }
      return lexer.match();
    } catch (IOException ignored) {
      // Best-effort by design: a read failure ends the token stream
      // instead of surfacing as an exception.
      return null;
    }
  }

  /**
   * The main() method tokenizes a file in the specified Encoding
   * and prints it to standard output in the specified Encoding.
   * Its arguments are (Infile, Encoding).
   *
   * @param args {@code args[0]} is the input file path, {@code args[1]}
   *             is the name of the character encoding used both for
   *             reading the file and for writing the tokens
   * @throws IOException if the file cannot be opened or the encoding
   *                     name is not supported
   * @throws IllegalArgumentException if fewer than two arguments are given
   */
  public static void main(String[] args) throws IOException {
    if (args == null || args.length < 2) {
      throw new IllegalArgumentException(
          "Usage: java " + CHTBTokenizer.class.getName() + " <infile> <encoding>");
    }
    String path = args[0];
    String encoding = args[1];

    // Open the raw stream first so it is closed even when the encoding
    // name is rejected while building the InputStreamReader.
    InputStream fileStream = new FileInputStream(path);
    try {
      Reader in = new BufferedReader(new InputStreamReader(fileStream, encoding));
      Tokenizer<String> st = new CHTBTokenizer(in);
      while (st.hasNext()) {
        String s = st.next();
        EncodingPrintWriter.out.println(s, encoding);
      }
    } finally {
      fileStream.close();
    }
  }

}