package edu.stanford.nlp.process;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.util.Iterator;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;

/**
 * A WhitespaceTokenizer is a tokenizer that splits on and discards only
 * whitespace characters.
 * This implementation returns Word objects. It has a parameter for whether
 * to make EOL a token or whether to treat EOL characters as whitespace.
 * If an EOL is a token, the class returns it as a Word with String value "\n".
 * <p/>
 * <i>Implementation note:</i> This was rewritten in Apr 2006 to discard the
 * old StreamTokenizer based implementation and to replace it with a
 * Unicode compliant JFlex-based version.
 *
 * @author Joseph Smarr (jsmarr@stanford.edu)
 * @author Teg Grenager (grenager@stanford.edu)
 * @author Roger Levy
 * @author Christopher Manning
 */
public class WhitespaceTokenizer<T extends HasWord> extends AbstractTokenizer<T> {

  /** The underlying lexer; stays {@code null} when the tokenizer was built with a null Reader. */
  private WhitespaceLexer lexer;

  /** Whether newline tokens are returned to the caller (true) or skipped (false). */
  private final boolean eolIsSignificant;

  /**
   * A factory which vends WhitespaceTokenizers.
   *
   * @author Christopher Manning
   */
  public static class WhitespaceTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> {

    /** Whether vended tokenizers return newlines as tokens; may be changed via {@link #setOptions}. */
    private boolean tokenizeNLs;

    /** The token factory handed to every tokenizer this factory creates. */
    private final LexedTokenFactory<T> factory;

    /**
     * Constructs a new TokenizerFactory that returns Word objects and
     * treats carriage returns as normal whitespace.
     * THIS METHOD IS INVOKED BY REFLECTION BY SOME OF THE JAVANLP
     * CODE TO LOAD A TOKENIZER FACTORY. IT SHOULD BE PRESENT IN A
     * TokenizerFactory.
     *
     * @return A TokenizerFactory that returns Word objects
     */
    public static TokenizerFactory<Word> newTokenizerFactory() {
      return new WhitespaceTokenizerFactory<>(new WordTokenFactory(), false);
    }

    /**
     * Creates a factory whose tokenizers do not return newlines as tokens.
     *
     * @param factory The token factory used to build tokens
     */
    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory) {
      this(factory, false);
    }

    /**
     * Creates a factory configured from an options string. The only option
     * read here is {@code tokenizeNLs}, which defaults to {@code false}.
     *
     * @param factory The token factory used to build tokens
     * @param options An options string parsed with
     *                {@code StringUtils.stringToProperties}
     */
    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory, String options) {
      this.factory = factory;
      Properties prop = StringUtils.stringToProperties(options);
      this.tokenizeNLs = PropertiesUtils.getBool(prop, "tokenizeNLs", false);
    }

    /**
     * Creates a factory with an explicit newline policy.
     *
     * @param factory The token factory used to build tokens
     * @param tokenizeNLs Whether vended tokenizers return newlines as tokens
     */
    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory, boolean tokenizeNLs) {
      this.factory = factory;
      this.tokenizeNLs = tokenizeNLs;
    }

    /**
     * Returns an iterator over the tokens read from {@code r}; this is the
     * same object that {@link #getTokenizer(Reader)} returns.
     *
     * @param r The Reader to tokenize
     * @return An iterator over the tokens
     */
    public Iterator<T> getIterator(Reader r) {
      return getTokenizer(r);
    }

    /**
     * Creates a tokenizer over {@code r} using this factory's current settings.
     *
     * @param r The Reader to tokenize
     * @return A new WhitespaceTokenizer
     */
    public Tokenizer<T> getTokenizer(Reader r) {
      return new WhitespaceTokenizer<>(factory, r, tokenizeNLs);
    }

    /**
     * Creates a tokenizer over {@code r}, letting {@code extraOptions}
     * override the newline policy for this one tokenizer only. The factory's
     * own setting is used as the default and is not modified.
     *
     * @param r The Reader to tokenize
     * @param extraOptions An options string; only {@code tokenizeNLs} is read
     * @return A new WhitespaceTokenizer
     */
    public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
      Properties prop = StringUtils.stringToProperties(extraOptions);
      boolean tokenizeNewlines =
          PropertiesUtils.getBool(prop, "tokenizeNLs", this.tokenizeNLs);
      return new WhitespaceTokenizer<>(factory, r, tokenizeNewlines);
    }

    /**
     * Updates this factory's newline policy from an options string. If the
     * string does not set {@code tokenizeNLs}, the current value is kept.
     *
     * @param options An options string; only {@code tokenizeNLs} is read
     */
    public void setOptions(String options) {
      Properties prop = StringUtils.stringToProperties(options);
      tokenizeNLs = PropertiesUtils.getBool(prop, "tokenizeNLs", tokenizeNLs);
    }

  } // end class WhitespaceTokenizerFactory


  /**
   * Creates a factory producing CoreLabel tokens, configured from an options string.
   *
   * @param options An options string passed to the factory constructor
   * @return A new WhitespaceTokenizerFactory for CoreLabel tokens
   */
  public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
    return new WhitespaceTokenizerFactory<>(new CoreLabelTokenFactory(), options);
  }

  /**
   * Creates a factory producing CoreLabel tokens that does not tokenize newlines.
   *
   * @return A new WhitespaceTokenizerFactory for CoreLabel tokens
   */
  public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory() {
    return new WhitespaceTokenizerFactory<>(new CoreLabelTokenFactory());
  }

  /**
   * Internally fetches the next token. Newline tokens are returned only when
   * EOL is significant; otherwise they are skipped.
   *
   * @return the next token in the token stream, or null if none exists.
   */
  @SuppressWarnings("unchecked")
  @Override
  protected T getNext() {
    if (lexer == null) {
      // Built with a null Reader: there is nothing to tokenize.
      return null;
    }
    T token = null;
    try {
      token = (T) lexer.next();
      while (token != null && token.word().equals(WhitespaceLexer.NEWLINE)) {
        if (eolIsSignificant) {
          return token;
        }
        // Newlines are plain whitespace in this mode: skip to the next token.
        token = (T) lexer.next();
      }
    } catch (IOException ignored) {
      // Deliberate best effort: a read failure ends the token stream.
      // Whatever is currently held in token (possibly null) is returned.
    }
    return token;
  }

  /**
   * Constructs a new WhitespaceTokenizer
   *
   * @param factory The token factory passed to the underlying lexer
   * @param r The Reader that is its source. May be null, in which case no
   *          lexer is created and the tokenizer yields no tokens.
   * @param eolIsSignificant Whether eol tokens should be returned.
   */
  public WhitespaceTokenizer(LexedTokenFactory factory, Reader r, boolean eolIsSignificant) {
    this.eolIsSignificant = eolIsSignificant;
    // The conditional below is perhaps currently needed in LexicalizedParser, since
    // it passes in a null arg while doing type-checking for sentence escaping
    // but StreamTokenizer barfs on that. But maybe shouldn't be here.
    if (r != null) {
      lexer = new WhitespaceLexer(r, factory);
    }
  }

  /**
   * Creates a tokenizer producing CoreLabel tokens that skips newlines.
   *
   * @param r The Reader to tokenize
   * @return A new WhitespaceTokenizer over {@code r}
   */
  public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r) {
    return new WhitespaceTokenizer<>(new CoreLabelTokenFactory(), r, false);
  }

  /**
   * Creates a tokenizer producing CoreLabel tokens.
   *
   * @param r The Reader to tokenize
   * @param tokenizeNLs Whether newlines are returned as tokens
   * @return A new WhitespaceTokenizer over {@code r}
   */
  public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r, boolean tokenizeNLs) {
    return new WhitespaceTokenizer<>(new CoreLabelTokenFactory(), r, tokenizeNLs);
  }

  /**
   * Creates a tokenizer producing Word tokens that skips newlines.
   *
   * @param r The Reader to tokenize
   * @return A new WhitespaceTokenizer over {@code r}
   */
  public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r) {
    return newWordWhitespaceTokenizer(r, false);
  }

  /**
   * Creates a tokenizer producing Word tokens.
   *
   * @param r The Reader to tokenize
   * @param eolIsSignificant Whether newlines are returned as tokens
   * @return A new WhitespaceTokenizer over {@code r}
   */
  public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r, boolean eolIsSignificant) {
    return new WhitespaceTokenizer<>(new WordTokenFactory(), r, eolIsSignificant);
  }

  /* ----
   * Sets the source of this Tokenizer to be the Reader r.

  private void setSource(Reader r) {
    lexer = new WhitespaceLexer(r);
  }
  ---- */

  /**
   * Creates a factory producing Word tokens that does not tokenize newlines.
   *
   * @return A new TokenizerFactory for Word tokens
   */
  public static TokenizerFactory<Word> factory() {
    return new WhitespaceTokenizerFactory<>(new WordTokenFactory(), false);
  }

  /**
   * Creates a factory producing Word tokens.
   *
   * @param eolIsSignificant Whether vended tokenizers return newlines as tokens
   * @return A new TokenizerFactory for Word tokens
   */
  public static TokenizerFactory<Word> factory(boolean eolIsSignificant) {
    return new WhitespaceTokenizerFactory<>(new WordTokenFactory(), eolIsSignificant);
  }

  /**
   * Reads a file from the argument and prints its tokens one per line.
   * This is mainly as a testing aid, but it can also be quite useful
   * standalone to turn a corpus into a one token per line file of tokens.
   * If the last argument is missing or is <code>-cr</code>, input is read
   * from standard input instead. A leading <code>-cr</code> makes newlines
   * significant; they are then printed as <code>***CR***</code>.
   * <p/>
   * Usage: <code>java edu.stanford.nlp.process.WhitespaceTokenizer filename
   * </code>
   *
   * @param args Command line arguments
   * @throws IOException If can't open files, etc.
   */
  public static void main(String[] args) throws IOException {
    boolean eolIsSignificant = (args.length > 0 && args[0].equals("-cr"));
    boolean readFromFile = (args.length > 0 && !args[args.length - 1].equals("-cr"));

    // Only the file stream is managed here, so it is closed even if tokenizing
    // fails. A null resource is permitted by try-with-resources and is simply
    // not closed, which leaves System.in open for the rest of the process.
    try (FileInputStream fileIn =
             readFromFile ? new FileInputStream(args[args.length - 1]) : null) {
      Reader reader = new InputStreamReader(fileIn != null ? fileIn : System.in, "UTF-8");
      WhitespaceTokenizer<Word> tokenizer =
          new WhitespaceTokenizer<>(new WordTokenFactory(), reader, eolIsSignificant);
      // Autoflush is on, so each println reaches System.out immediately.
      PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true);
      while (tokenizer.hasNext()) {
        Word w = tokenizer.next();
        if (w.value().equals(WhitespaceLexer.NEWLINE)) {
          pw.println("***CR***");
        } else {
          pw.println(w);
        }
      }
    }
  }

} // end class WhitespaceTokenizer