package semanticMarkup.ling.learn.utility;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import semanticMarkup.know.Stemmer;
import semanticMarkup.know.lib.WordNetPOSKnowledgeBase;
import semanticMarkup.ling.learn.knowledge.Constant;
public class WordFormUtility {
private WordNetPOSKnowledgeBase myWN;
private Map<String, String> numberRecords = new HashMap<String, String>(); // word->(p|s)
private Map<String, String> singularRecords = new HashMap<String, String>();// word->singular
private Map<String, String> POSRecords = new HashMap<String, String>(); // word->POSs
private Map<String, Integer> WORDS = new HashMap<String, Integer>();
private Hashtable<String, String> PLURALS = new Hashtable<String, String>();
// Porter Stemmer
private Stemmer myStemmer;
public WordFormUtility(WordNetPOSKnowledgeBase wn) {
this.myWN = wn;
this.myStemmer = new Stemmer();
}
/**
* 1) Check wordnet to gether information about a word
* 2) Save checked words in three hash tables, singularRecords (singular), numberRecords (number), and POSRecords (pos), respectively
*
* @param word
* The word to check
* @param mode
* The mode can be "signular", "number", "pos"
* @return 1) mode "singular": if a plural noun, return its singular form, otherwise, return itself.
* 2) mode "number": if a noun, return "p" [plural] or "s"[singular], else if not in WN "", otherwise "x".
* 3) mode "pos": return n [p,s], v, a, r, "" (not in WN).
*/
public String checkWN(String word, String mode) {
/**
* 0.0 If the word contains nothing but non-word characters, such as <>, return empty
* 0.1 Check singularRecordsprevious records
* 0.2 Special cases
* 1 Word not in WordNet
* 1.1
* 1.2
* 1.3
* 2. Word in WordNet
* 2.1 mode is singular or number
* 2.1.1
* 2.1.2
* 2.2 mode is pos
*/
//this.myWN.getMostLikleyPOS(word);
// If the word contains nothing but non-word characters, such as <>, return empty
word = word.replaceAll("\\W", "");
if (word.equals("")) {
return "";
}
// Check previous records
// singular case
String singular = null;
if (StringUtils.equals(mode, "singular")) {
singular = this.singularRecords.get(word);
}
if (singular != null) {
if (singular.matches("^.*\\w.*$")) {
return singular;
}
}
// number case
String number = null;
if (StringUtils.equals(mode, "number")) {
number = this.numberRecords.get(word);
}
if (number != null) {
if (number.matches("^.*\\w.*$")) {
return number;
}
}
// pos case
String pos = null;
if (StringUtils.equals(mode, "pos")) {
pos = this.POSRecords.get(word);
}
if (pos != null) {
if (pos.matches("^.*\\w.*$")) {
return pos;
}
}
// Case 0: special cases
if (word.equals("teeth")) {
this.numberRecords.put("teeth", "p");
this.singularRecords.put("teeth", "tooth");
return mode.equals("singular")?"tooth":"p";
}
if (word.equals("tooth")) {
this.numberRecords.put("tooth", "s");
this.singularRecords.put("tooth", "tooth");
return mode.equals("singular")?"tooth":"s";
}
if (word.equals("NUM")) {
return mode.equals("singular")?"NUM":"s";
}
if (word.equals("or")) {
return mode.equals("singular")?"or":"";
}
if (word.equals("and")) {
return mode.equals("singular")?"and":"";
}
// concentrically
if (word.matches("^.*[a-z]{3,}ly$")) {
if (mode.equals("singular")) {
return word;
}
if (mode.equals("number")) {
return "";
}
if (mode.equals("pos")) {
return "r";
}
}
// otherwise, call WordNet
// Case 1
if (!this.myWN.contains(word)) {// word not in WN
// boolean f = this.myWN.contains(word);
this.POSRecords.put(word, "");
String wordCopy = word;
word = word.replaceAll("ed$", "");
// Case 1.1
if (!word.equals(wordCopy)) {
if (this.myWN.contains(word)) {
// Case 1.1.1-1.1.3
if (mode.equals("singular")) {
return word;
}
if (mode.equals("number")) {
return "";
}
if (mode.equals("pos")) {
return "a";
}
}
}
word = wordCopy;
word = word.replace("^(" + Constant.PREFIX + ")+", "");
// Case 1.2
if (word.equals(wordCopy)) {
return mode.equals("singular") ? word : "";
}
// Case 1.3
else {
if (!this.myWN.contains(word)) {
return mode.equals("singular") ? wordCopy : "";
}
}
}
else { if (mode.equals("singular") || mode.equals("number")) {
// Case 2.1.1: not a noun
if (!this.myWN.isNoun(word)) {
return mode.equals("singular") ? word : "x";
}
List<String> stemList = myWN.getSingulars(word);
int maxLength = 100;
String sWord = "";
for (int i=0;i<stemList.size();i++) {
String stem = stemList.get(i);
if (stem.length()<maxLength) {
maxLength = stem.length();
sWord = stem;
}
}
this.singularRecords.put(word, sWord);
// Case 2.1.2: singular
if (sWord.equals(word)) {
this.numberRecords.put(word, "s");
return mode.equals("singular")?sWord:"s";
}
// Case 2.1.3: plural
else {
this.numberRecords.put(word, "p");
return mode.equals("singular")?sWord:"p";
}
}
// Case 2.2
else if (mode.equals("pos")) {
pos = "";
if (this.myWN.isNoun(word)){
pos = pos+"n";
}
if (this.myWN.isVerb(word)){
pos = pos+"v";
}
if (this.myWN.isAdjective(word)){
pos = pos+"a";
}
if (this.myWN.isAdverb(word)){
pos = pos+"r";
}
this.POSRecords.put(word, pos);
if ((this.myWN.isNoun(word))
&& (this.myWN.isVerb(word))
&& (word.matches("^.*(ed|ing)$"))) {
pos.replaceAll("n", "");
}
this.POSRecords.put(word, pos);
return pos;
}
}
return "";
}
/**
* Helper of method updateTable: Given a word, return [p] if it is a plural, [s] if it is singular
* @param w
* @return
*/
public String getNumber(String word) {
//remove non-word characters, such as <>
Matcher m = StringUtility.createMatcher(word, "\\W");
word = m.replaceAll("");
String number = checkWN(word, "number");
String rt = "";
// Case 1
rt = getNumberHelper1(number);
if (rt != null) {
return rt;
}
// Case 2
rt = getNumberHelper2(word);
if (rt != null) {
return rt;
}
// Case 3: return "s"
else {
return "s";
}
}
/**
* First Helper of method getNumber
*
* @param number
* @return if match return the right number, otherwise return null means not
* match
*/
public String getNumberHelper1(String number) {
if (number.matches("^.*[sp].*$")) {
return number;
}
// return "?" for "x"
else if (number.matches("^.*x.*$")) {
return "?";
}
else {
return null;
}
}
/**
* Second Helper of method getNumber
*
* @param word
* @return if match return the right number, otherwise return null means not
* match
*/
public String getNumberHelper2(String word) {
// Calyculi => 1. Calyculus, pappi => pappus
if (word.matches("^.*i$")) {
return "p";
}
else if (word.matches("^.*ss$")) {
return "s";
}
else if (word.matches("^.*ia$")) {
return "p";
}
else if (word.matches("^.*[it]um$")) {
return "s";
}
else if (word.matches("^.*ae$")) {
return "p";
}
// non-noun cases
else if (word.matches("^.*ous$")) {
return "?";
}
// non-noun cases
else if (word.matches("^[aiu]s$")) {
return "?";
}
// this case only handle three words: as, is, us
else if (word.matches("^.*us$")) {
return "s";
}
else if (word.matches("^.*es$") || word.matches("^.*s$")) {
return "p";
}
// non-noun cases
else if (word.matches("^.*ate$")) {
return "?";
}
else {
return null;
}
}
/**
* Get the singular form of a word
*
* @param word
* the word in query
* @return the singular form of the input word. If it the method fails to
* find any singular form of the word, return an empty string
*/
public String getSingular(String word) {
if (!word.matches("^.*\\w.*$")) {
return "";
}
if (word.equals("valves")) {
return "valve";
} else if (word.equals("media")) {
return "media";
} else if (word.equals("species")) {
return "species";
} else if (word.equals("axes")) {
return "axis";
} else if (word.equals("calyces")) {
return "calyx";
} else if (word.equals("frons")) {
return "frons";
} else if (word.equals("grooves")) {
return "groove";
} else if (word.equals("nerves")) {
return "nerve";
}
String singular = "";
if (getNumber(word).equals("p")) {
// Case 1
Pattern p = Pattern.compile("(^.*?[^aeiou])ies$");
Matcher m = p.matcher(word);
if (m.lookingAt()) {
singular = m.group(1) + "y";
}
else {
// Case 2
p = Pattern.compile("(^.*?)i$");
m = p.matcher(word);
if (m.lookingAt()) {
singular = m.group(1) + "us";
}
else {
// Case 3
p = Pattern.compile("(^.*?)ia$");
m = p.matcher(word);
if (m.lookingAt()) {
singular = m.group(1) + "ium";
}
else {
// Case 4
p = Pattern.compile("(^.*?(x|ch|sh|ss))es$");
m = p.matcher(word);
if (m.lookingAt()) {
singular = m.group(1);
}
else {
// Case 5
p = Pattern.compile("(^.*?)ves$");
m = p.matcher(word);
if (m.lookingAt()) {
singular = m.group(1) + "f";
}
else {
// Case 6
p = Pattern.compile("(^.*?)ices");
m = p.matcher(word);
if (m.lookingAt()) {
singular = m.group(1) + "ex";
}
else {
// Case 7.1
// pinnae ->pinna
p = Pattern.compile("(^.*?a)e$");
m = p.matcher(word);
if (m.lookingAt()) {
singular = m.group(1);
}
else {
// Case 7.2
// fruits->fruit
p = Pattern.compile("(^.*?)s$");
m = p.matcher(word);
if (m.lookingAt()) {
singular = m.group(1);
}
}
}
}
}
}
}
}
}
if (singular.matches("^.*\\w.*$")) {
return singular;
}
singular = checkWN(word, "singular");
if (singular.matches("^.*\\w.*$")) {
return singular;
}
return "";
}
/**
* Get all the plural forms of a word
*
* @param word
* the word in query
* @return a list of the plural forms of the word. If the method fails to
* find any plural forms of the method, return an empty list
*/
public List<String> getPlural(String word) {
if (word.matches("^(n|2n|x)$")) {
return new ArrayList<String>();
}
String plural = "";
if (this.PLURALS.containsKey(word)) {
plural = this.PLURALS.get(word);
if (plural.matches("^.*\\w+.*$")) {
String[] pArray = plural.split(" ");
List<String> pList = new ArrayList<String>();
Collections.addAll(pList, pArray);
return pList;
}
}
plural = getPluralSpecialCaseHelper(word);
if (!plural.equals("")) {
;
}
else {
plural = getPluralRuleHelper(word);
plural = plural+" "+word+"s";
}
plural=plural.replaceAll("^\\s+", "");
plural=plural.replaceAll("\\s+$", "");
String[] pls = plural.split(" ");
String plStr = "";
for (int i = 0; i < pls.length; i++) {
if (this.getWORDS().containsKey(pls[i])) {
if (this.getWORDS().get(pls[i]) >= 1) {
plStr = plStr + pls[i] + " ";
}
}
}
plStr = plStr.replaceAll("\\s+$", "");
this.PLURALS.put(word, plStr);
List<String> pList = new ArrayList<String>();
if (!plStr.equals("")) {
String[] pArray = plStr.split(" ");
Collections.addAll(pList, pArray);
}
return pList;
}
/**
* A helper method used by method getPlural. Help to apply a number of rules
*
* @param word
* @return if the word has plural form(s), return it(them); otherwise return ""
*/
public String getPluralRuleHelper(String word) {
String plural;
Pattern p;
Matcher m;
// Case 1
p = Pattern.compile("(^.*?)(ex|ix)$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "ices";
plural = plural + " " + m.group(1) + m.group(2) + "es";
return plural;
}
// Case 2
p = Pattern.compile("^.*(x|ch|ss|sh)$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = word + "es";
return plural;
}
// Case 3
p = Pattern.compile("(^.*?)([^aeiouy])y$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + m.group(2) + "ies";
return plural;
}
// Case 4
p = Pattern.compile("(^.*?)(?:([^f])fe|([oaelr])f)$");
m = p.matcher(word);
if (m.lookingAt()) {
String s1 = m.group(1);
String s2 = m.group(2);
String s3 = m.group(3);
if (s2 != null) {
plural = s1 + s2 + "ves";
} else {
plural = s1 + s3 + "ves";
}
return plural;
}
// Case 5
p = Pattern.compile("(^.*?)(x|s)is$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + m.group(2) + "es";
return plural;
}
// Case 6
p = Pattern.compile("(^.*?)([tidlv])um$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + m.group(2) + "a";
return plural;
}
// Case 7
p = Pattern.compile("(^.*?)(ex|ix)$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "ices";
return plural;
}
// Case 8
p = Pattern.compile("(^.*?[^t][^i])on$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "a";
return plural;
}
// Case 9
p = Pattern.compile("(^.*?)a$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "ae";
return plural;
}
// Case 10
p = Pattern.compile("(^.*?)man$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "men";
return plural;
}
// Case 11
p = Pattern.compile("(^.*?)child$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "children";
return plural;
}
// Case 12
p = Pattern.compile("(^.*)status$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "statuses";
return plural;
}
// Case 13
p = Pattern.compile("(^.+?)us$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "i";
plural = plural + " " + m.group(1) + "uses";
return plural;
}
// Case 14
p = Pattern.compile("^.*s$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = word + "es";
return plural;
}
return "";
}
/**
* A helper method used by method getPlural. Help to handle special cases.
*
* @param word
* @return if the word match any special case, return its plural form, return it; otherwise return ""
*/
public String getPluralSpecialCaseHelper(String word) {
String plural;
Pattern p;
Matcher m;
// Case 1
p = Pattern.compile("^.*series$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = word;
return plural;
}
// Case 2
p = Pattern.compile("(^.*?)foot$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "feet";
return plural;
}
// Case 3
p = Pattern.compile("(^.*?)tooth$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "teeth";
return plural;
}
// Case 4
p = Pattern.compile("(^.*?)alga$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "algae";
return plural;
}
// Case 5
p = Pattern.compile("(^.*?)genus$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "genera";
return plural;
}
// Case 6
p = Pattern.compile("(^.*?)corpus$");
m = p.matcher(word);
if (m.lookingAt()) {
plural = m.group(1) + "corpora";
return plural;
}
return "";
}
public List<String> getSingularPluralPair(String word1, String word2) {
List<String> pair = new ArrayList<String>();
String singular = "";
String plural = "";
// put the shorter word at first
int len1 = word1.length();
int len2 = word2.length();
if (len1 > len2) {
String temp_word = word1;
word1 = word2;
word2 = temp_word;
int temp_len = len1;
len1 = len2;
len2 = temp_len;
}
if ((word1.matches("^.*" + Constant.SENDINGS))
&& (word2.matches("^.*" + Constant.PENDINGS))) {
// case 1.1.1
if (word2.matches("^.*" + "es$")
&& word1.matches("^.*" + "is$")
&& Math.abs(len1 - len2) == 0) {
singular = word1;
plural = word2;
}
// case 1.5
else if (word1.matches("^.*" + "us$")
&& word2.matches("^.*" + "a$")
&& Math.abs(len1 - len2) < 2) {
singular = word1;
plural = word2;
}
}
else if ((word1.matches("^.*" + Constant.PENDINGS))
&& (word2.matches("^.*" + Constant.SENDINGS))) {
// case 1.1.2
if (word1.matches("^.*" + "es$")
&& word2.matches("^.*" + "is$")
&& Math.abs(len1 - len2) == 0) {
singular = word2;
plural = word1;
}
// case 1.2
else if (word1.matches("^.*" + "a$")
&& word2.matches("^.*" + "on$")
&& Math.abs(len1 - len2) < 2) {
singular = word2;
plural = word1;
}
// case 1.3
else if (word1.matches("^.*" + "a$")
&& word2.matches("^.*" + "um$")
&& Math.abs(len1 - len2) < 2) {
singular = word2;
plural = word1;
}
// case 1.4
else if (word1.matches("^.*" + "i$")
&& word2.matches("^.*" + "us$")
&& Math.abs(len1 - len2) < 2) {
singular = word2;
plural = word1;
}
} else {
// thicker, thickness; species, specimens; tomentulose, tomentulous;
// later laterals
if (word2.matches("^.*s$")) {
if (getSingularPluralPairHelper(word1, word2)) {
singular = word1;
plural = word2;
}
}
}
if ((!singular.equals("")) && (!plural.equals(""))) {
pair.add(singular);
pair.add(plural);
}
return pair;
}
// word2 has no other letters except those appearing in word1 or ies, and
// vice versa.
public boolean getSingularPluralPairHelper(String word1, String word2) {
int len1 = word1.length();
int len2 = word2.length();
if ((!word2.matches("^\\[^" + word1 + "yies" + "\\]*&"))
&& (!word1.matches("^\\[^" + word2 + "yies" + "\\]*&"))
&& (Math.abs(len1 - len2) > 0) && (Math.abs(len1 - len2) < 3)) {
return true;
} else {
return false;
}
}
public String getRoot(String word) {
String root;
this.myStemmer.add(word.toCharArray(), word.length());
this.myStemmer.stem();
root = this.myStemmer.toString();
return root;
}
public Map<String, Integer> getWORDS() {
return WORDS;
}
public void setWORDS(Map<String, Integer> w) {
WORDS = w;
}
}