// POSBracketToVector.java example
// (from the MinorThird-master source tree)
package LBJ2.nlp;

import LBJ2.parse.LineByLine;
import LBJ2.parse.LinkedVector;


/**
  * Use this parser to return <code>LinkedVector</code> objects representing
  * sentences given file names of POS bracket form files to parse.  These
  * files are expected to have one sentence per line, and the format of each
  * line is as follows: <br><br>
  *
  * <code>(pos1 spelling1) (pos2 spelling2) ... (posN spellingN)</code>
  * <br><br>
  *
  * It is also expected that there will be exactly one space between a part of
  * speech and the corresponding spelling and between a closing parenthesis
  * and an opening parenthesis.
  *
  * @author Nick Rizzolo
 **/
public class POSBracketToVector extends LineByLine
{
  /**
    * Creates the parser.
    *
    * @param file The file to parse.
   **/
  public POSBracketToVector(String file) { super(file); }


  /**
    * Retrieves the next <code>LinkedVector</code> from the files being
    * parsed.
    *
    * @return The sentence parsed from the next line read, or
    *         <code>null</code> when <code>readLine()</code> returns
    *         <code>null</code>.
   **/
  public Object next() {
    String line = readLine();
    if (line == null) return null;
    return parsePOSBracketForm(line);
  }


  /**
    * Given a single line of textual input (containing all and only the words
    * in a single sentence) in the format shown above, this method parses and
    * returns a <code>LinkedVector</code>.  Leading and trailing whitespace is
    * ignored; the start and end offsets handed to each <code>Word</code> are
    * inclusive character positions within the trimmed line, so the first
    * word always starts at offset 0 and each word's end offset is the
    * position of its closing parenthesis.
    *
    * @param line A single line of text.
    * @return A <code>LinkedVector</code> representing the input text; an
    *         empty <code>LinkedVector</code> if the line is blank.
    * @throws IllegalArgumentException If the line does not split into
    *         <code>(pos spelling)</code> pairs, i.e. it contains an odd
    *         number of space separated tokens.
   **/
  public static LinkedVector parsePOSBracketForm(String line) {
    // Tokens and offsets must be derived from the same string, otherwise
    // leading whitespace in the raw line shifts every offset.
    String text = line.trim();
    if (text.length() == 0) return new LinkedVector();

    String[] tokens = text.split(" ");
    if (tokens.length % 2 != 0)
      throw new IllegalArgumentException(
          "Malformed POS bracket line (expected \"(pos spelling)\" pairs): "
          + line);

    // A bracketed word "(pos spelling)" is two tokens joined by one space,
    // so its last character sits at start + len(pos token) + len(spelling
    // token).  Computing the end this way (instead of searching for the next
    // space) also works for the final word, which has no space after it.
    int start = 0;
    int end = start + tokens[0].length() + tokens[1].length();
    Word w = new Word(tokens[1].substring(0, tokens[1].length() - 1),
                      tokens[0].substring(1),
                      start,
                      end);

    for (int i = 2; i < tokens.length; i += 2) {
      // Skip the closing parenthesis' successor: the single separating space.
      start = end + 2;
      end = start + tokens[i].length() + tokens[i + 1].length();

      w.next =
        new Word(tokens[i + 1].substring(0, tokens[i + 1].length() - 1),
                 tokens[i].substring(1),
                 w,
                 start,
                 end);
      w = (Word) w.next;
    }

    // NOTE(review): the last word of the chain is passed here, as in the
    // original code; presumably LinkedVector follows the links back to the
    // first word -- confirm against LinkedVector's constructor.
    return new LinkedVector(w);
  }


  /**
    * Given textual input in the format shown below, this method parses and
    * returns the <code>Word</code> that the text represents.  Expected
    * format: <br><br>
    *
    * <code>(pos spelling)</code>
    *
    * @param text     Text representing a word in POS bracket form.
    * @param previous The word that came before this word in the sentence.
    * @return A <code>Word</code> represented by the input text or
    *         <code>null</code> if the input does not represent a
    *         <code>Word</code> (including <code>null</code> or empty text).
   **/
  public static Word parsePOSBracketForm(String text, Word previous) {
    // Guard first: charAt(0) on an empty string would throw rather than
    // yield the documented null result.
    if (text == null || text.length() == 0) return null;
    if (text.charAt(0) != '(' || text.charAt(text.length() - 1) != ')')
      return null;
    String[] tokens = text.split(" ");
    if (tokens.length != 2) return null;
    return new Word(tokens[1].substring(0, tokens[1].length() - 1),
                    tokens[0].substring(1),
                    previous);
  }
}