package edu.stanford.nlp.trees.international.pennchinese;
import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.*;
/**
* A simple tokenizer for tokenizing Penn Chinese Treebank files. A
* token is any parenthesis, node label, or terminal. All SGML
* content of the files is ignored.
*
* @author Roger Levy
* @version 01/17/2003
*/
public class CHTBTokenizer extends AbstractTokenizer<String> {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(CHTBTokenizer.class);

  /** The lexer that this tokenizer pulls its tokens from. */
  private final CHTBLexer lexer;

  /**
   * Constructs a new tokenizer from a Reader. Note that getting
   * the bytes going into the Reader into Java-internal Unicode is
   * not the tokenizer's job. This can be done by converting the
   * file with {@code ConvertEncodingThread}, or by specifying
   * the file's encoding explicitly in the Reader with
   * java.io.{@code InputStreamReader}.
   *
   * @param r Reader
   */
  public CHTBTokenizer(Reader r) {
    lexer = new CHTBLexer(r);
  }

  /**
   * Internally fetches the next token. Lexer results equal to
   * {@code CHTBLexer.IGNORE} are skipped. If the lexer throws an
   * {@code IOException}, the problem is logged and the token stream
   * is treated as finished.
   *
   * @return The next token in the token stream, or null if none exists.
   */
  @Override
  protected String getNext() {
    try {
      int a;
      // Keep pulling from the lexer until it yields something other than IGNORE.
      do {
        a = lexer.yylex();
      } while (a == CHTBLexer.IGNORE);
      if (a == CHTBLexer.YYEOF) {
        return null;
      }
      return lexer.match();
    } catch (IOException ioe) {
      // Report the failure rather than hiding it, but keep the existing
      // contract of signalling "no more tokens" with null.
      log.error("CHTBTokenizer: I/O error while reading tokens; ending token stream: " + ioe);
      return null;
    }
  }

  /**
   * The main() method tokenizes a file in the specified Encoding
   * and prints it to standard output in the specified Encoding.
   * Its arguments are (Infile, Encoding). If fewer than two arguments
   * are supplied, a usage message is logged and nothing is tokenized.
   *
   * @param args the input file followed by the encoding name
   * @throws IOException if the input cannot be opened or closed
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 2) {
      log.error("Usage: CHTBTokenizer inputFile encoding");
      // Without both arguments there is nothing to do; bail out instead of
      // falling through to an ArrayIndexOutOfBoundsException on args[1].
      return;
    }
    String encoding = args[1];
    // try-with-resources makes sure the input reader is closed when done.
    try (Reader in = IOUtils.readerFromString(args[0], encoding)) {
      Tokenizer<String> st = new CHTBTokenizer(in);
      while (st.hasNext()) {
        String s = st.next();
        EncodingPrintWriter.out.println(s, encoding);
      }
    }
  }

}