package edu.stanford.nlp.trees.international.arabic;

import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.trees.PennTreebankTokenizer;
import edu.stanford.nlp.process.Tokenizer;

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;

/**
 * Builds a tokenizer for the Penn Arabic Treebank (ATB) using a
 * {@link java.io.StreamTokenizer}.
 * <p>
 * This implementation is current as of the following LDC catalog numbers:
 * LDC2008E61 (ATBp1v4), LDC2008E62 (ATBp2v3), and LDC2008E22 (ATBp3v3.1)
 * <p>
 * Lines introduced by the tokens {@code ;;} or {@code :::} are treated as
 * comments and skipped up to the end of the line.
 *
 * @author Christopher Manning
 * @author Spence Green
 */
public class ArabicTreebankTokenizer extends PennTreebankTokenizer {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ArabicTreebankTokenizer.class);

  /**
   * Creates a tokenizer that reads ATB trees from the given reader.
   *
   * @param r the character stream to tokenize
   */
  public ArabicTreebankTokenizer(Reader r) {
    super(r);

    // Required to support comments that appear in ATB3: end-of-line must be
    // reported as a token so that a comment can be skipped up to it.
    st.eolIsSignificant(true);
  }

  /**
   * Internally fetches the next token.
   * <p>
   * Comment lines (starting with {@code ;;} or {@code :::}) and end-of-line
   * tokens are skipped. If an {@link IOException} occurs while reading, a
   * message and stack trace are written to standard error and {@code null}
   * is returned.
   *
   * @return the next token in the token stream, or null if none exists.
   */
  @Override
  public String getNext() {
    try {
      while (true) {
        st.nextToken();
        int nextToken = st.ttype;

        switch (nextToken) {
          case java.io.StreamTokenizer.TT_WORD:
            // ";;" are comments in ATB3
            // ":::" are also escaped for backward compatibility with the
            // old Stanford ATB pipeline
            if (st.sval.equals(":::") || st.sval.equals(";;")) {
              if (skipToEndOfLine() == java.io.StreamTokenizer.TT_EOF) {
                // The comment was the last line and had no trailing newline.
                return null;
              }
              continue;
            }
            return st.sval;

          case java.io.StreamTokenizer.TT_NUMBER:
            return Double.toString(st.nval);

          case java.io.StreamTokenizer.TT_EOL:
            continue;

          case java.io.StreamTokenizer.TT_EOF:
            return null;

          default:
            // Any other token type is the character code of a single
            // ordinary character; return it as a one-character string.
            return String.valueOf((char) nextToken);
        }
      }
    } catch (IOException e) {
      System.err.printf("%s: Unknown exception in input stream\n", this.getClass().getName());
      e.printStackTrace();
    }

    return null;
  }

  /**
   * Consumes tokens until the end of the current line or the end of the
   * stream, whichever comes first.
   * <p>
   * Stopping at end-of-stream matters: once the underlying stream is
   * exhausted the tokenizer keeps reporting {@code TT_EOF}, so waiting only
   * for {@code TT_EOL} would never terminate on an unterminated last line.
   *
   * @return the token type that ended the skip, either
   *         {@link java.io.StreamTokenizer#TT_EOL} or
   *         {@link java.io.StreamTokenizer#TT_EOF}
   * @throws IOException if reading from the underlying stream fails
   */
  private int skipToEndOfLine() throws IOException {
    int type;
    do {
      st.nextToken();
      type = st.ttype;
    } while (type != java.io.StreamTokenizer.TT_EOL
        && type != java.io.StreamTokenizer.TT_EOF);
    return type;
  }

  /**
   * Tokenizes the file named by the first argument and prints every token
   * to standard output.
   *
   * @param args command-line arguments; {@code args[0]} is the path of the
   *             file to tokenize
   * @throws IOException if the file cannot be opened or closed
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 1) {
      System.err.printf("Usage: java %s filename%n", ArabicTreebankTokenizer.class.getName());
      System.exit(1);
    }

    // try-with-resources guarantees the file handle is released even if
    // tokenization fails part-way through.
    try (Reader reader = new FileReader(args[0])) {
      Tokenizer<String> att = new ArabicTreebankTokenizer(reader);
      while (att.hasNext()) {
        System.out.print(att.next());
      }
    }
  }
}