package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.StringUtils;
import java.io.Serializable;
/** Represents a WordTag (in the sense that equality is defined
* on both components), where each half is represented by an
* int indexed by an Index. In this representation, -1 is
* used to represent the wildcard ANY value, and -2 is used
* to represent a STOP value (i.e., no more dependents).
*
* TODO: does that cause any problems regarding unseen words also being -1?
* TODO: any way to not have links to the Index in each object?
*
* @author Dan Klein
* @author Christopher Manning
*/
public class IntTaggedWord implements Serializable, Comparable<IntTaggedWord> {

  /** Integer code standing for the wildcard (any) word. */
  public static final int ANY_WORD_INT = -1;
  /** Integer code standing for the wildcard (any) tag. */
  public static final int ANY_TAG_INT = -1;
  /** Integer code standing for the STOP word. */
  public static final int STOP_WORD_INT = -2;
  /** Integer code standing for the STOP tag. */
  public static final int STOP_TAG_INT = -2;

  /** String form of the wildcard value. */
  public static final String ANY = ".*.";
  /** String form of the STOP value. */
  public static final String STOP = "STOP";

  /** Word id: a non-negative index, or one of the negative sentinel codes. */
  public final int word;
  /** Tag id, stored as a short: a non-negative index, or a negative sentinel code. */
  public final short tag;

  /**
   * Creates an IntTaggedWord directly from integer ids.
   *
   * @param word the word id
   * @param tag the tag id; it is narrowed to {@code short} with a plain cast,
   *     so values outside the {@code short} range are silently truncated
   */
  public IntTaggedWord(int word, int tag) {
    this.word = word;
    this.tag = (short) tag;
  }

  /**
   * Creates an IntTaggedWord from a String representation of the form
   * &lt;word&gt;&lt;splitChar&gt;&lt;tag&gt;. The string is split at the
   * <i>last</i> occurrence of {@code splitChar}, so the word part may itself
   * contain that character.
   *
   * @param s the combined word/tag string
   * @param splitChar the character separating word from tag
   * @param wordIndex index used to map the word string to an int
   * @param tagIndex index used to map the tag string to an int
   * @throws StringIndexOutOfBoundsException if {@code s} does not contain
   *     {@code splitChar}
   */
  public IntTaggedWord(String s, char splitChar,
                       Index<String> wordIndex, Index<String> tagIndex) {
    // Slightly wasteful: s.lastIndexOf(splitChar) is computed twice, once per
    // helper, because both values must be produced inside the this(...) call.
    this(extractWord(s, splitChar), extractTag(s, splitChar),
         wordIndex, tagIndex);
  }

  /**
   * Creates an IntTaggedWord given by the tagString and wordString.
   * The strings {@link #ANY} and {@link #STOP} map to the corresponding
   * negative sentinel codes; any other string is passed to
   * {@code addToIndex} on the matching index and the returned int is stored.
   *
   * @param wordString the word, or {@link #ANY} / {@link #STOP}
   * @param tagString the tag, or {@link #ANY} / {@link #STOP}
   * @param wordIndex index used for ordinary words
   * @param tagIndex index used for ordinary tags
   */
  public IntTaggedWord(String wordString, String tagString,
                       Index<String> wordIndex, Index<String> tagIndex) {
    // Word is resolved before tag, matching the original indexing order.
    this.word = wordToInt(wordString, wordIndex);
    this.tag = tagToShort(tagString, tagIndex);
  }

  /** Maps a word string to its int code, handling the ANY and STOP sentinels. */
  private static int wordToInt(String wordString, Index<String> wordIndex) {
    switch (wordString) {
      case ANY:
        return ANY_WORD_INT;
      case STOP:
        return STOP_WORD_INT;
      default:
        return wordIndex.addToIndex(wordString);
    }
  }

  /** Maps a tag string to its short code, handling the ANY and STOP sentinels. */
  private static short tagToShort(String tagString, Index<String> tagIndex) {
    switch (tagString) {
      case ANY:
        return (short) ANY_TAG_INT;
      case STOP:
        return (short) STOP_TAG_INT;
      default:
        return (short) tagIndex.addToIndex(tagString);
    }
  }

  /** Returns everything before the last occurrence of {@code splitChar}. */
  private static String extractWord(String s, char splitChar) {
    return s.substring(0, s.lastIndexOf(splitChar));
  }

  /** Returns everything after the last occurrence of {@code splitChar}. */
  private static String extractTag(String s, char splitChar) {
    return s.substring(s.lastIndexOf(splitChar) + 1);
  }

  /** @return the tag id, widened to int */
  public int tag() {
    return tag;
  }

  /** @return the word id */
  public int word() {
    return word;
  }

  /**
   * Returns the String form of the word.
   *
   * @param wordIndex index consulted when the word id is non-negative
   * @return the indexed string for ids &gt;= 0, {@link #ANY} for
   *     {@link #ANY_WORD_INT}, and {@link #STOP} for every other negative id
   */
  public String wordString(Index<String> wordIndex) {
    if (word >= 0) {
      return wordIndex.get(word);
    }
    return (word == ANY_WORD_INT) ? ANY : STOP;
  }

  /**
   * Returns the String form of the tag.
   *
   * @param tagIndex index consulted when the tag id is non-negative
   * @return the indexed string for ids &gt;= 0, {@link #ANY} for
   *     {@link #ANY_TAG_INT}, and {@link #STOP} for every other negative id
   */
  public String tagString(Index<String> tagIndex) {
    if (tag >= 0) {
      return tagIndex.get(tag);
    }
    return (tag == ANY_TAG_INT) ? ANY : STOP;
  }

  @Override
  public int hashCode() {
    return word ^ (tag << 16);
  }

  /** Two IntTaggedWords are equal iff both their word ids and tag ids match. */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof IntTaggedWord)) {
      return false;
    }
    IntTaggedWord other = (IntTaggedWord) o;
    return word == other.word && tag == other.tag;
  }

  /**
   * Orders by tag id first, then by word id.
   * Uses {@link Integer#compare(int, int)} rather than subtraction, because
   * the difference of two arbitrary int word ids can overflow and yield a
   * result with the wrong sign.
   *
   * @param that the object to compare against
   * @return a negative, zero, or positive value as this object is less than,
   *     equal to, or greater than {@code that}
   */
  @Override
  public int compareTo(IntTaggedWord that) {
    int byTag = Integer.compare(tag, that.tag);
    return (byTag != 0) ? byTag : Integer.compare(word, that.word);
  }

  private static final char[] charsToEscape = { '\"' };

  /**
   * Renders this item as {@code "tag" -> "word"}, with double quotes inside
   * either string escaped by a backslash.
   *
   * @param wordIndex index used to look up the word string
   * @param tagIndex index used to look up the tag string
   * @return the quoted tag, an arrow, and the quoted word
   */
  public String toLexicalEntry(Index<String> wordIndex,
                               Index<String> tagIndex) {
    String wordStr = wordString(wordIndex);
    String tagStr = tagString(tagIndex);
    return '\"' + StringUtils.escapeString(tagStr, charsToEscape, '\\') + "\" -> \"" + StringUtils.escapeString(wordStr, charsToEscape, '\\') + '\"';
  }

  /** @return the raw ids in the form {@code word/tag} */
  @Override
  public String toString() {
    return word + "/" + tag;
  }

  /**
   * @param wordIndex index used to look up the word string
   * @param tagIndex index used to look up the tag string
   * @return the strings in the form {@code word/tag}
   */
  public String toString(Index<String> wordIndex, Index<String> tagIndex) {
    return wordString(wordIndex) + '/' + tagString(tagIndex);
  }

  /**
   * Formats this item, optionally including the raw ids.
   *
   * @param arg if equal to {@code "verbose"}, each string is followed by its
   *     int id in square brackets; any other value (including {@code null})
   *     gives the plain {@code word/tag} form
   * @param wordIndex index used to look up the word string
   * @param tagIndex index used to look up the tag string
   * @return the formatted string
   */
  public String toString(String arg,
                         Index<String> wordIndex, Index<String> tagIndex) {
    // Constant-first equals so a null arg selects the plain format
    // instead of throwing a NullPointerException.
    if ("verbose".equals(arg)) {
      return (wordString(wordIndex) + '[' + word + "]/" +
              tagString(tagIndex) + '[' + tag + ']');
    }
    return toString(wordIndex, tagIndex);
  }

  /**
   * Converts this item to a String-based {@link TaggedWord}.
   *
   * @param wordIndex index used to look up the word string
   * @param tagIndex index used to look up the tag string
   * @return a new TaggedWord built from the word and tag strings
   */
  public TaggedWord toTaggedWord(Index<String> wordIndex,
                                 Index<String> tagIndex) {
    return new TaggedWord(wordString(wordIndex), tagString(tagIndex));
  }

  private static final long serialVersionUID = 1L;

} // end class IntTaggedWord