/** * */ package edu.berkeley.nlp.tokenizer; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import edu.berkeley.nlp.util.IOUtils; import edu.berkeley.nlp.util.Iterators; import edu.berkeley.nlp.util.StringUtils; /** * Similar to PTBLexer. However, instead of reading from a Reader this class is given a line * and returns a list of tokenized Strings. * @author petrov * */ public class PTBLineLexer extends PTBLexer implements LineTokenizer { public PTBLineLexer(){ super((java.io.Reader)null); } public List<String> tokenize(String line) { PTBTokenizer toker = new PTBTokenizer(new StringReader(line),true); List<?> elems = toker.tokenize(); List<String> toks = new ArrayList<String>(); for (Object o:elems) { toks.add(o.toString()); } return toks; } public List<String> tokenizeLine(String line) throws IOException{ LinkedList<String> tokenized = new LinkedList<String>(); int nEl = line.length(); char[] array = line.toCharArray(); yy_buffer = line.toCharArray();//new char[nEl+1]; //for(int i=0;i<nEl;i++) yy_buffer[i] = array[i]; //yy_buffer[nEl] = (char)YYEOF; yy_startRead = 0; yy_endRead = yy_buffer.length; yy_atBOL = true; yy_atEOF = false; yy_currentPos = yy_markedPos = yy_pushbackPos = 0; yyline = yychar = yycolumn = 0; yy_lexical_state = YYINITIAL; while(yy_markedPos<yy_endRead) tokenized.add(next()); return tokenized; } private boolean yy_refill() throws java.io.IOException { return true; } public static void main(String[] argv) { PTBLineLexer tokenizer = new PTBLineLexer(); try { for (String line : Iterators.able(IOUtils.lineIterator(argv[0]))) { final List<String> tokenizeLine = tokenizer.tokenizeLine(line); if (tokenizeLine.get(tokenizeLine.size() - 1) == null) tokenizeLine.remove(tokenizeLine.size() - 1); System.out.println(StringUtils.join(tokenizeLine)); } } catch (IOException e) { // TODO Auto-generated catch block throw new RuntimeException(e); } } }