package LBJ2.nlp;

import LBJ2.parse.LineByLine;
import LBJ2.parse.LinkedVector;


/**
  * Use this parser to return <code>LinkedVector</code> objects representing
  * sentences given file names of POS bracket form files to parse.  These
  * files are expected to have one sentence per line, and the format of each
  * line is as follows: <br><br>
  *
  * <code>(pos1 spelling1) (pos2 spelling2) ... (posN spellingN)</code>
  * <br><br>
  *
  * It is also expected that there will be exactly one space between a part of
  * speech and the corresponding spelling and between a closing parenthesis
  * and an opening parenthesis.
  *
  * @author Nick Rizzolo
 **/
public class POSBracketToVector extends LineByLine
{
  /**
    * Creates the parser.
    *
    * @param file The file to parse.
   **/
  public POSBracketToVector(String file) { super(file); }


  /**
    * Retrieves the next <code>LinkedVector</code> from the file being
    * parsed.
    *
    * @return The result of {@link #parsePOSBracketForm(String)} applied to
    *         the next line, or <code>null</code> when <code>readLine()</code>
    *         returns <code>null</code>.
   **/
  public Object next() {
    String line = readLine();
    if (line == null) return null;
    return parsePOSBracketForm(line);
  }


  /**
    * Given a single line of textual input (containing all and only the words
    * in a single sentence) in the format shown above, this method parses and
    * returns a <code>LinkedVector</code>.  Each <code>Word</code> is created
    * with the offsets, relative to the original (untrimmed) line, of its
    * opening and closing parentheses.  A line that is empty or contains only
    * whitespace yields an empty <code>LinkedVector</code>.
    *
    * @param line A single line of text.
    * @return A <code>LinkedVector</code> representing the input text.
   **/
  public static LinkedVector parsePOSBracketForm(String line) {
    String trimmed = line.trim();
    String[] tokens = trimmed.split(" ");
    if (tokens.length == 0 || tokens.length == 1 && tokens[0].length() == 0)
      return new LinkedVector();

    // Tokens are taken from the trimmed text, so offsets are computed on the
    // trimmed text as well and then shifted back into the coordinates of the
    // original line.  Everything before the trimmed content is whitespace, so
    // the first match of the trimmed text is where that content begins.
    int offset = line.indexOf(trimmed);

    // Even tokens look like "(pos" and odd tokens look like "spelling)".
    int bracketEnd = endOfBracket(trimmed, 0);
    Word w =
      new Word(tokens[1].substring(0, tokens[1].length() - 1),
               tokens[0].substring(1),
               offset,
               offset + bracketEnd - 1);

    for (int i = 2; i < tokens.length; i += 2) {
      int start = bracketEnd + 1;
      bracketEnd = endOfBracket(trimmed, start);
      w.next =
        new Word(tokens[i + 1].substring(0, tokens[i + 1].length() - 1),
                 tokens[i].substring(1),
                 w,
                 offset + start,
                 offset + bracketEnd - 1);
      w = (Word) w.next;
    }

    // NOTE(review): w is the LAST word of the chain here; this relies on the
    // LinkedVector constructor locating the rest of the chain from any of its
    // members -- confirm against LinkedVector.
    return new LinkedVector(w);
  }


  /**
    * Finds where the <code>(pos spelling)</code> group starting at
    * <code>from</code> ends.
    *
    * @param text The text to search.
    * @param from The index of the group's opening parenthesis.
    * @return The index of the space that follows the group, or
    *         <code>text.length()</code> if the group is the last one, so that
    *         the returned value minus one is always the index of the group's
    *         final character.
   **/
  private static int endOfBracket(String text, int from) {
    int innerSpace = text.indexOf(' ', from);            // between pos and spelling
    int outerSpace = text.indexOf(' ', innerSpace + 1);  // after the closing ')'
    // indexOf reports -1 when no further space exists, i.e. for the last group.
    return outerSpace < 0 ? text.length() : outerSpace;
  }


  /**
    * Given textual input in the format shown below, this method parses and
    * returns the <code>Word</code> that the text represents.  Expected
    * format: <br><br>
    *
    * <code>(pos spelling)</code>
    *
    * @param text     Text representing a word in POS bracket form.
    * @param previous The word that came before this word in the sentence.
    * @return A <code>Word</code> represented by the input text or
    *         <code>null</code> if the input does not represent a
    *         <code>Word</code> (it is empty, is not wrapped in parentheses,
    *         or does not consist of exactly two space separated parts).
   **/
  public static Word parsePOSBracketForm(String text, Word previous) {
    // The length test keeps charAt() from throwing on an empty string.
    if (text.length() == 0
        || text.charAt(0) != '('
        || text.charAt(text.length() - 1) != ')')
      return null;

    String[] tokens = text.split(" ");
    if (tokens.length != 2) return null;

    return new Word(tokens[1].substring(0, tokens[1].length() - 1),
                    tokens[0].substring(1),
                    previous);
  }
}