package LBJ2.nlp;

import LBJ2.parse.LinkedChild;


/**
  * Implementation of a word for natural language processing.  Please note
  * that in general, one can only count on the <code>form</code> and
  * <code>capitalized</code> fields described below having meaningful values.
  * The <code>form</code> field can be assumed to be filled in because it's
  * hard to imagine a situation in which a <code>Word</code> object should be
  * created without any knowledge of how that word appeared in text.  The
  * <code>capitalized</code> field is computed from the <code>form</code> by
  * this class' constructor.
  *
  * <p> <i>All other fields must be obtained or computed externally.  Space is
  * provided for them in this class' implementation as a convenience, since we
  * expect the user will make frequent use of these fields.</i>
  *
  * <p> This class extends from {@link LBJ2.parse.LinkedChild}.  Of course,
  * this means that objects of this class contain references to both the
  * previous and the next word in the sentence.  Constructors are available
  * that take the previous word as an argument, setting that reference.  Thus,
  * a useful technique for constructing all the words in a sentence will
  * involve code that looks like this (where <code>form</code> is a
  * {@link java.lang.String}):
  *
  * <blockquote>
  * <code>
  * Word current = new Word(form);<br>
  * <i>a loop of some sort</i><br>
  * {<br>
  * &nbsp;&nbsp;&nbsp;&nbsp;current.next = new Word(form, current);<br>
  * &nbsp;&nbsp;&nbsp;&nbsp;current = current.next;<br>
  * }<br>
  * </code>
  * </blockquote>
  *
  * @author Nick Rizzolo
 **/
public class Word extends LinkedChild
{
  /** The actual text from the corpus that represents the word. */
  public String form;
  /**
    * Whether or not the word is capitalized is determined automatically by
    * the constructor: it is <code>true</code> exactly when the form is
    * non-<code>null</code>, non-empty, and starts with an upper case
    * character.  It is not recomputed if <code>form</code> is reassigned
    * later.
   **/
  public boolean capitalized;
  /** Names the part of speech of this word. */
  public String partOfSpeech;
  /** The base form of the word. */
  public String lemma;
  /** An indication of the meaning or usage of this instance of this word. */
  public String wordSense;


  /**
    * When all that is known is the spelling of the word.
    *
    * @param f The actual text of the word.
   **/
  public Word(String f) { this(f, null, null); }

  /**
    * Sets the actual text and the part of speech.
    *
    * @param f   The actual text of the word.
    * @param pos A token representing the word's part of speech.
   **/
  public Word(String f, String pos) { this(f, pos, null); }

  /**
    * This constructor is useful when the sentence is being parsed forwards.
    *
    * @param f The actual text of the word.
    * @param p The word that came before this one in the sentence.
   **/
  public Word(String f, Word p) { this(f, null, p); }

  /**
    * This constructor is useful when the sentence is being parsed forwards.
    * Both offsets are set to <code>-1</code>, since no offset information is
    * supplied.
    *
    * @param f   The actual text of the word.
    * @param pos A token representing the word's part of speech.
    * @param p   The word that came before this one in the sentence.
   **/
  public Word(String f, String pos, Word p) { this(f, pos, p, -1, -1); }

  /**
    * When you have offset information.
    *
    * @param f     The actual text of the word.
    * @param start The offset into the parent document at which the first
    *              character of this word is found.
    * @param end   The offset into the parent document at which the last
    *              character of this word is found.
   **/
  public Word(String f, int start, int end)
  {
    this(f, null, null, start, end);
  }

  /**
    * When you have offset information.
    *
    * @param f     The actual text of the word.
    * @param pos   A token representing the word's part of speech.
    * @param start The offset into the parent document at which the first
    *              character of this word is found.
    * @param end   The offset into the parent document at which the last
    *              character of this word is found.
   **/
  public Word(String f, String pos, int start, int end)
  {
    this(f, pos, null, start, end);
  }

  /**
    * This constructor is useful when the sentence is being parsed forwards.
    *
    * @param f     The actual text of the word.
    * @param p     The word that came before this one in the sentence.
    * @param start The offset into the parent document at which the first
    *              character of this word is found.
    * @param end   The offset into the parent document at which the last
    *              character of this word is found.
   **/
  public Word(String f, Word p, int start, int end)
  {
    this(f, null, p, start, end);
  }

  /**
    * This constructor is useful when the sentence is being parsed forwards.
    *
    * @param f     The actual text of the word.
    * @param pos   A token representing the word's part of speech.
    * @param p     The word that came before this one in the sentence.
    * @param start The offset into the parent document at which the first
    *              character of this word is found.
    * @param end   The offset into the parent document at which the last
    *              character of this word is found.
   **/
  public Word(String f, String pos, Word p, int start, int end)
  {
    this(f, pos, null, null, p, start, end);
  }

  /**
    * This constructor is useful when the sentence is being parsed forwards.
    * Every other constructor of this class delegates here.
    *
    * @param f     The actual text of the word.
    * @param pos   A token representing the word's part of speech.
    * @param l     The base form of the word.
    * @param sense The sense of the word.
    * @param p     The word that came before this one in the sentence.
    * @param start The offset into the parent document at which the first
    *              character of this word is found.
    * @param end   The offset into the parent document at which the last
    *              character of this word is found.
   **/
  public Word(String f, String pos, String l, String sense, Word p,
              int start, int end)
  {
    super(p, start, end);
    form = f;
    // A null or empty form is tolerated; it is simply not capitalized.
    capitalized =
      f != null && f.length() > 0 && Character.isUpperCase(f.charAt(0));
    partOfSpeech = pos;
    // The result of this call is intentionally discarded.
    // NOTE(review): presumably this validates the token (failing or
    // complaining on an unknown tag) -- confirm against POS.fromToken.
    if (partOfSpeech != null) POS.fromToken(partOfSpeech);
    lemma = l;
    wordSense = sense;
  }


  /**
    * The string representation of a word is its POS bracket form, or, if the
    * part of speech is not available, it is just the spelling of the word.
    * Note that the POS bracket form of a word also entails displaying left
    * brackets (<code>"("</code>, <code>"["</code>, and <code>"{"</code>) as
    * <code>"-LRB-"</code> and right brackets (<code>")"</code>,
    * <code>"]"</code>, <code>"}"</code>) as <code>"-RRB-"</code>.
    *
    * <p> If the part of speech is available but <code>form</code> is
    * <code>null</code>, the spelling is rendered as the text
    * <code>"null"</code> (the usual result of string concatenation) instead
    * of throwing a <code>NullPointerException</code>.
    *
    * @return The POS bracket form of this word, or just the spelling of the
    *         word (possibly <code>null</code>) if the part of speech is not
    *         available.
   **/
  public String toString()
  {
    if (partOfSpeech == null) return form;

    String text = form;

    // Only a single-character form can be a bracket.  The null guard keeps
    // this method from crashing on a word whose form was never filled in.
    if (text != null && text.length() == 1)
    {
      // Read the character once, before "text" is possibly replaced, so the
      // right bracket test never inspects an already substituted token.
      char c = text.charAt(0);
      if ("([{".indexOf(c) != -1) text = "-LRB-";
      else if (")]}".indexOf(c) != -1) text = "-RRB-";
    }

    return "(" + partOfSpeech + " " + text + ")";
  }
}