/**
*
*/
package edu.berkeley.nlp.tokenizer;
/* The following code was generated by JFlex 1.3.5 on 12/2/02 9:02 PM */
import java.util.*;
import java.io.*;
/**
* Known limitation: this lexer doesn't handle rewriting "can not" to "cannot".
*/
/**
 * Detokenizing scanner: reads Penn-Treebank-style tokenized text and hands it
 * back piece by piece with the treebank spacing and escapes undone, so that
 * concatenating every string returned by {@link #next()} yields ordinary text.
 *
 * <p>Originally generated by <a href="http://www.jflex.de/">JFlex</a> 1.3.5
 * on 12/2/02 9:02 PM from the specification file
 * <tt>file:/nlp/u1/jsmarr/cs276a/src/googleling/PTB2TextLexer.flex</tt>.
 *
 * <p>Rewrites performed (as decoded from the tables below); everything else,
 * including single spaces, newlines and ordinary words, is returned unchanged:
 * <ul>
 * <li>space followed by one of <tt>! , : ; ?</tt>, by <tt>.</tt>, by
 *     <tt>...</tt>, or by an apostrophe plus one further character that is
 *     not a space, newline or apostrophe: the leading space is dropped;</li>
 * <li><tt>" n't"</tt> becomes <tt>"n't"</tt> and <tt>" N'T"</tt> becomes
 *     <tt>"N'T"</tt>;</li>
 * <li>two backquotes followed by a space, or a space followed by two
 *     apostrophes, become a double quote;</li>
 * <li>a single backquote followed by a space becomes a single backquote;</li>
 * <li><tt>"( "</tt> and <tt>"-LRB- "</tt> become <tt>"("</tt>;
 *     <tt>" )"</tt> and <tt>" -RRB-"</tt> become <tt>")"</tt>;</li>
 * <li><tt>"[ "</tt> and <tt>"-LCB- "</tt> become <tt>"["</tt>;
 *     <tt>" ]"</tt> and <tt>" -RCB-"</tt> become <tt>"]"</tt>;</li>
 * <li><tt>"$ "</tt> becomes <tt>"$"</tt> and <tt>" %"</tt> becomes
 *     <tt>"%"</tt>.</li>
 * </ul>
 */
class PTB2TextLexer {

    /** This character denotes the end of file */
    public static final int YYEOF = -1;

    /** initial size of the lookahead buffer */
    private static final int YY_BUFFERSIZE = 16384;

    /** lexical states */
    public static final int YYINITIAL = 0;

    /**
     * Run-length packed (count, class) pairs mapping every char value to its
     * character class; expanded by {@link #yy_unpack_cmap(String)}.
     */
    private static final String yycmap_packed =
        "\12\0\1\6\25\0\1\1\1\5\2\0\1\25\1\24\1\0\1\2"+
        "\1\17\1\20\2\0\1\5\1\13\1\4\13\0\1\5\1\5\3\0"+
        "\1\5\2\0\1\16\1\21\10\0\1\14\1\0\1\11\3\0\1\15"+
        "\1\0\1\12\6\0\1\22\1\0\1\23\2\0\1\3\15\0\1\7"+
        "\5\0\1\10\uff8b\0";

    /**
     * Translates characters to character classes
     */
    private static final char [] yycmap = yy_unpack_cmap(yycmap_packed);

    /**
     * Translates a state to a row index in the transition table.
     * Several states share a row (for example the all-dead row at 88).
     */
    private static final int yy_rowMap [] = {
          0,  22,  44,  66,  88, 110, 132, 154, 176, 198,
        220,  88, 242, 264, 286,  88,  88,  88,  88, 308,
        330,  88,  88,  88,  88, 352, 374, 396, 418, 440,
        462,  88,  88, 484, 506, 528, 550, 572, 594
    };

    /**
     * The packed transition table of the DFA (part 0):
     * (count, nextState + 1) pairs, so a packed value of 0 means "no transition".
     */
    private static final String yy_packed0 =
        "\1\2\1\3\1\2\1\4\2\2\1\5\4\2\1\6"+
        "\3\2\1\7\2\2\1\10\2\2\1\11\1\2\1\0"+
        "\4\2\1\0\17\2\2\0\1\12\1\0\1\13\1\14"+
        "\1\0\1\15\1\0\1\16\1\0\1\17\4\0\1\20"+
        "\2\0\1\21\1\22\1\0\1\2\1\23\1\2\1\24"+
        "\2\2\1\0\17\2\26\0\1\2\1\0\4\2\1\0"+
        "\5\2\1\25\12\2\1\26\4\2\1\0\20\2\1\27"+
        "\4\2\1\0\20\2\1\30\4\2\1\0\17\2\1\14"+
        "\1\0\1\31\3\14\1\0\17\14\4\0\1\32\23\0"+
        "\1\33\25\0\1\34\40\0\1\35\10\0\1\2\1\31"+
        "\4\2\1\0\20\2\1\0\4\2\1\0\6\2\1\36"+
        "\3\2\1\37\4\2\4\0\1\14\31\0\1\40\27\0"+
        "\1\41\30\0\1\42\3\0\1\43\4\0\1\2\1\0"+
        "\4\2\1\0\7\2\1\44\10\2\1\0\4\2\1\0"+
        "\7\2\1\45\7\2\16\0\1\46\25\0\1\47\7\0"+
        "\1\2\1\0\4\2\1\0\4\2\1\7\13\2\1\0"+
        "\4\2\1\0\4\2\1\10\12\2\13\0\1\20\25\0"+
        "\1\21\12\0";

    /**
     * The transition table of the DFA (-1 marks "no transition")
     */
    private static final int yytrans [] = yy_unpack();

    /* error codes */
    private static final int YY_UNKNOWN_ERROR = 0;
    private static final int YY_ILLEGAL_STATE = 1;
    private static final int YY_NO_MATCH = 2;
    private static final int YY_PUSHBACK_2BIG = 3;

    /* error messages for the codes above */
    private static final String YY_ERROR_MSG[] = {
        "Unkown internal scanner error",
        "Internal error: unknown state",
        "Error: could not match input",
        "Error: pushback value was too large"
    };

    /**
     * YY_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>.
     * Bit 1 marks an accepting state; bit 8 marks a state after which the
     * scanner stops looking for a longer match (see {@link #next()}).
     */
    private static final byte YY_ATTRIBUTE[] = {
        0, 1, 1, 1, 9, 1, 1, 1, 1, 0, 1, 9, 0, 0, 0, 9,
        9, 9, 9, 1, 1, 9, 9, 9, 9, 0, 0, 0, 0, 1, 1, 9,
        9, 0, 0, 1, 1, 0, 0
    };

    /** the input device */
    private java.io.Reader yy_reader;

    /** the current state of the DFA */
    private int yy_state;

    /** the current lexical state */
    private int yy_lexical_state = YYINITIAL;

    /** this buffer contains the current text to be matched and is
        the source of the yytext() string */
    private char yy_buffer[] = new char[YY_BUFFERSIZE];

    /** the textposition at the last accepting state */
    private int yy_markedPos;

    /** the textposition at the last state to be included in yytext */
    private int yy_pushbackPos;

    /** the current text position in the buffer */
    private int yy_currentPos;

    /** startRead marks the beginning of the yytext() string in the buffer */
    private int yy_startRead;

    /** endRead marks the last character in the buffer, that has been read
        from input */
    private int yy_endRead;

    /** number of newlines encountered up to the start of the matched text */
    private int yyline;

    /** the number of characters up to the start of the matched text */
    private int yychar;

    /**
     * the number of characters from the last newline up to the start of the
     * matched text
     */
    private int yycolumn;

    /**
     * yy_atBOL == true <=> the scanner is currently at the beginning of a line
     */
    private boolean yy_atBOL = true;

    /** yy_atEOF == true <=> the scanner is at the EOF */
    private boolean yy_atEOF;

    /* user code: */
    /*
     * Rules left commented out in the user-code section; they are NOT part of
     * the tables above:
     *
     *   "'T WAS"  { return("'TWAS"); }
     *   "'T was"  { return("'Twas"); }
     *   "'t was"  { return("'twas"); }
     *   "'T IS"   { return("'TIS"); }
     *   "'T is"   { return("'Tis"); }
     *   "'t is"   { return("'tis"); }
     */

    /**
     * Creates a new scanner.
     * There is also a java.io.InputStream version of this constructor.
     *
     * @param in the java.io.Reader to read input from.
     */
    PTB2TextLexer(java.io.Reader in) {
        this.yy_reader = in;
    }

    /**
     * Creates a new scanner.
     * There is also a java.io.Reader version of this constructor.
     * The stream is wrapped in a plain {@link java.io.InputStreamReader}, so
     * bytes are decoded with the platform's default charset.
     *
     * @param in the java.io.InputStream to read input from.
     */
    PTB2TextLexer(java.io.InputStream in) {
        this(new java.io.InputStreamReader(in));
    }

    /**
     * Unpacks the split, compressed DFA transition table.
     *
     * @return the unpacked transition table
     */
    private static int [] yy_unpack() {
        int [] trans = new int[616];
        yy_unpack(yy_packed0, 0, trans);
        return trans;
    }

    /**
     * Unpacks the compressed DFA transition table.
     *
     * @param packed the packed transition table
     * @param offset index in <code>trans</code> at which to start writing
     * @param trans  the array receiving the unpacked entries
     * @return the index one past the last entry written
     */
    private static int yy_unpack(String packed, int offset, int [] trans) {
        int i = 0;       /* index in packed string  */
        int j = offset;  /* index in unpacked array */
        int l = packed.length();
        while (i < l) {
            int count = packed.charAt(i++);
            int value = packed.charAt(i++);
            value--;  /* states are stored shifted by one so that 0 encodes -1 */
            do trans[j++] = value; while (--count > 0);
        }
        return j;
    }

    /**
     * Unpacks the compressed character translation table.
     *
     * @param packed the packed character translation table
     * @return the unpacked character translation table
     */
    private static char [] yy_unpack_cmap(String packed) {
        char [] map = new char[0x10000];
        int i = 0;  /* index in packed string  */
        int j = 0;  /* index in unpacked array */
        int l = packed.length();  /* derived from the data, not hard-coded */
        while (i < l) {
            int  count = packed.charAt(i++);
            char value = packed.charAt(i++);
            do map[j++] = value; while (--count > 0);
        }
        return map;
    }

    /**
     * Refills the input buffer.
     *
     * @return <code>false</code>, iff there was new input;
     *         <code>true</code> when the reader reported end of input.
     *
     * @exception java.io.IOException if any I/O-Error occurs
     */
    private boolean yy_refill() throws java.io.IOException {

        /* first: make room (if you can) by sliding the live text to index 0 */
        if (yy_startRead > 0) {
            System.arraycopy(yy_buffer, yy_startRead,
                             yy_buffer, 0,
                             yy_endRead-yy_startRead);

            /* translate stored positions */
            yy_endRead-= yy_startRead;
            yy_currentPos-= yy_startRead;
            yy_markedPos-= yy_startRead;
            yy_pushbackPos-= yy_startRead;
            yy_startRead = 0;
        }

        /* is the buffer big enough? */
        if (yy_currentPos >= yy_buffer.length) {
            /* if not: blow it up */
            char newBuffer[] = new char[yy_currentPos*2];
            System.arraycopy(yy_buffer, 0, newBuffer, 0, yy_buffer.length);
            yy_buffer = newBuffer;
        }

        /* finally: fill the buffer with new input */
        int numRead = yy_reader.read(yy_buffer, yy_endRead,
                                     yy_buffer.length-yy_endRead);

        if (numRead < 0) {
            return true;
        }
        yy_endRead+= numRead;
        return false;
    }

    /**
     * Closes the input stream and marks the scanner as being at end of file.
     *
     * @exception java.io.IOException if closing the reader fails
     */
    public final void yyclose() throws java.io.IOException {
        yy_atEOF = true;            /* indicate end of file */
        yy_endRead = yy_startRead;  /* invalidate buffer    */

        if (yy_reader != null)
            yy_reader.close();
    }

    /**
     * Closes the current stream, and resets the
     * scanner to read from a new input stream.
     *
     * All internal variables are reset, the old input stream
     * <b>cannot</b> be reused (internal buffer is discarded and lost).
     * Lexical state is set to <tt>YY_INITIAL</tt>.
     *
     * @param reader the new input stream
     * @exception java.io.IOException if closing the previous reader fails
     */
    public final void yyreset(java.io.Reader reader) throws java.io.IOException {
        yyclose();
        yy_reader = reader;
        yy_atBOL  = true;
        yy_atEOF  = false;
        yy_endRead = yy_startRead = 0;
        yy_currentPos = yy_markedPos = yy_pushbackPos = 0;
        yyline = yychar = yycolumn = 0;
        yy_lexical_state = YYINITIAL;
    }

    /**
     * Returns the current lexical state.
     */
    public final int yystate() {
        return yy_lexical_state;
    }

    /**
     * Enters a new lexical state
     *
     * @param newState the new lexical state
     */
    public final void yybegin(int newState) {
        yy_lexical_state = newState;
    }

    /**
     * Returns the text matched by the current regular expression, i.e. the
     * raw input of the last match before any rewrite applied by
     * {@link #next()}.
     */
    public final String yytext() {
        return new String( yy_buffer, yy_startRead, yy_markedPos-yy_startRead );
    }

    /**
     * Returns the character at position <tt>pos</tt> from the
     * matched text.
     *
     * It is equivalent to yytext().charAt(pos), but faster
     *
     * @param pos the position of the character to fetch.
     *            A value from 0 to yylength()-1.
     *
     * @return the character at position pos
     */
    public final char yycharat(int pos) {
        return yy_buffer[yy_startRead+pos];
    }

    /**
     * Returns the length of the matched text region.
     */
    public final int yylength() {
        return yy_markedPos-yy_startRead;
    }

    /**
     * Reports an error that occurred while scanning.
     *
     * In a wellformed scanner (no or only correct usage of
     * yypushback(int) and a match-all fallback rule) this method
     * will only be called with things that "Can't Possibly Happen".
     * If this method is called, something is seriously wrong
     * (e.g. a JFlex bug producing a faulty scanner etc.).
     *
     * Usual syntax/scanner level error handling should be done
     * in error fallback rules.
     *
     * @param errorCode the code of the error message to display; codes
     *                  outside the message table fall back to the
     *                  "unknown error" message
     */
    private void yy_ScanError(int errorCode) {
        String message;
        /* explicit range check instead of catching an index exception */
        if (errorCode >= 0 && errorCode < YY_ERROR_MSG.length) {
            message = YY_ERROR_MSG[errorCode];
        }
        else {
            message = YY_ERROR_MSG[YY_UNKNOWN_ERROR];
        }

        throw new Error(message);
    }

    /**
     * Pushes the specified amount of characters back into the input stream.
     *
     * They will be read again by the next call of the scanning method
     *
     * @param number the number of characters to be read again.
     *               This number must not be greater than yylength()!
     */
    private void yypushback(int number) {
        if ( number > yylength() )
            yy_ScanError(YY_PUSHBACK_2BIG);

        yy_markedPos -= number;
    }

    /**
     * Resumes scanning until the next regular expression is matched,
     * the end of input is encountered or an I/O-Error occurs.
     *
     * @return the next piece of detokenized text, or <code>null</code> once
     *         the end of input has been reached
     * @exception java.io.IOException if any I/O-Error occurs
     */
    public String next() throws java.io.IOException {
        int yy_input;
        int yy_action;

        // cached fields:
        int yy_currentPos_l;
        int yy_markedPos_l;
        int yy_endRead_l = yy_endRead;
        char [] yy_buffer_l = yy_buffer;
        char [] yycmap_l = yycmap;

        int [] yytrans_l = yytrans;
        int [] yy_rowMap_l = yy_rowMap;
        byte [] yy_attr_l = YY_ATTRIBUTE;

        while (true) {
            yy_markedPos_l = yy_markedPos;

            yy_action = -1;

            // the new match starts where the previous one ended
            yy_currentPos_l = yy_currentPos =
                yy_startRead = yy_markedPos_l;

            yy_state = yy_lexical_state;

            yy_forAction: {
                while (true) {

                    if (yy_currentPos_l < yy_endRead_l)
                        yy_input = yy_buffer_l[yy_currentPos_l++];
                    else if (yy_atEOF) {
                        yy_input = YYEOF;
                        break yy_forAction;
                    }
                    else {
                        // store back cached positions
                        yy_currentPos = yy_currentPos_l;
                        yy_markedPos = yy_markedPos_l;
                        boolean eof = yy_refill();
                        // get translated positions and possibly new buffer
                        yy_currentPos_l = yy_currentPos;
                        yy_markedPos_l = yy_markedPos;
                        yy_buffer_l = yy_buffer;
                        yy_endRead_l = yy_endRead;
                        if (eof) {
                            yy_input = YYEOF;
                            break yy_forAction;
                        }
                        else {
                            yy_input = yy_buffer_l[yy_currentPos_l++];
                        }
                    }
                    int yy_next = yytrans_l[ yy_rowMap_l[yy_state] + yycmap_l[yy_input] ];
                    if (yy_next == -1) break yy_forAction;
                    yy_state = yy_next;

                    int yy_attributes = yy_attr_l[yy_state];
                    if ( (yy_attributes & 1) == 1 ) {
                        // accepting state: remember it as the longest match so far
                        yy_action = yy_state;
                        yy_markedPos_l = yy_currentPos_l;
                        // bit 8: no longer match is possible from here
                        if ( (yy_attributes & 8) == 8 ) break yy_forAction;
                    }
                }
            }

            // store back cached position
            yy_markedPos = yy_markedPos_l;

            switch (yy_action) {

                case 32:
                    return "N'T";

                case 31:
                    return "n't";

                case 10:
                case 11:
                    // matched text begins with a space that must be dropped
                    return yytext().substring(1);

                // matched text is passed through unchanged
                case 1:
                case 2:
                case 3:
                case 4:
                case 5:
                case 6:
                case 7:
                case 8:
                case 19:
                case 20:
                case 29:
                case 30:
                case 35:
                case 36:
                    return yytext();

                case 24:
                    return "\"";

                case 23:
                    return "$";

                case 22:
                    return "[";

                case 21:
                    return "(";

                case 15:
                    return ")";

                case 16:
                    return "]";

                case 17:
                    return "%";

                case 18:
                    return "`";

                default:
                    if (yy_input == YYEOF && yy_startRead == yy_currentPos) {
                        // nothing left to scan
                        yy_atEOF = true;
                        return null;
                    }
                    // input was consumed but no rule matched; this always throws
                    yy_ScanError(YY_NO_MATCH);
            }
        }
    }

    /**
     * Runs the scanner over each input file named on the command line.
     *
     * The tokens produced by {@link #next()} are discarded; only problems
     * (missing file, I/O error, unexpected exception) are reported on
     * System.out. Each file is closed again once it has been scanned or
     * scanning has failed.
     *
     * @param argv   the command line, contains the filenames to run
     *               the scanner on.
     */
    public static void main(String argv[]) {
        if (argv.length == 0) {
            System.out.println("Usage : java PTB2TextLexer <inputfile>");
            return;
        }

        for (int i = 0; i < argv.length; i++) {
            PTB2TextLexer scanner = null;
            try {
                scanner = new PTB2TextLexer( new java.io.FileReader(argv[i]) );
                while ( !scanner.yy_atEOF ) scanner.next();
            }
            catch (java.io.FileNotFoundException e) {
                System.out.println("File not found : \""+argv[i]+"\"");
            }
            catch (java.io.IOException e) {
                System.out.println("IO error scanning file \""+argv[i]+"\"");
                System.out.println(e);
            }
            catch (Exception e) {
                System.out.println("Unexpected exception:");
                e.printStackTrace();
            }
            finally {
                // release the file handle even when scanning failed
                if (scanner != null) {
                    try {
                        scanner.yyclose();
                    }
                    catch (java.io.IOException e) {
                        System.out.println("IO error closing file \""+argv[i]+"\"");
                        System.out.println(e);
                    }
                }
            }
        }
    }
}