package semanticMarkup.ling.learn.knowledge;

import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import semanticMarkup.know.lib.WordNetPOSKnowledgeBase;
import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.dataholder.WordPOSKey;
import semanticMarkup.ling.learn.dataholder.WordPOSValue;
import semanticMarkup.ling.learn.utility.LearnerUtility;

/**
 * Tags unknown words as boundary words ("b") by applying suffix-based
 * heuristics to every word still marked "unknown" in the data holder.
 *
 * @author Dongye
 *
 */
public class HeuristicNounLearnerUseSuffix implements IModule {

    /** Regex text of the first suffix rule; also echoed to the debug log. */
    private static final String PATTERN1 = "^[a-z_]+(" + Constant.SUFFIX + ")$";

    /** Regex text of the second rule (leading '.' or '_'), e.g. "_nerved". */
    private static final String PATTERN2 = "^[._.][a-z]+";

    // The patterns below are compiled once instead of once per word.
    private static final Pattern SUFFIXED_WORD = Pattern.compile(PATTERN1);
    private static final Pattern BASE_AND_SUFFIX = Pattern.compile("(.*?)("
            + Constant.SUFFIX + ")$");
    private static final Pattern PLAIN_TOKEN = Pattern
            .compile("^[a-zA-Z0-9_-]+$");
    private static final Pattern PREFIXED_WORD = Pattern.compile(PATTERN2);

    private final LearnerUtility myLearnerUtility;

    /**
     * @param learnerUtility
     *            utility object that supplies the WordNet knowledge base used
     *            by {@link #containSuffix}
     */
    public HeuristicNounLearnerUseSuffix(LearnerUtility learnerUtility) {
        this.myLearnerUtility = learnerUtility;
    }

    /**
     * Runs the module; delegates to {@link #posBySuffix(DataHolder)}.
     *
     * @param dataholderHandler
     *            the data holder to read unknown words from and to update
     */
    @Override
    public void run(DataHolder dataholderHandler) {
        this.posBySuffix(dataholderHandler);
    }

    /**
     * For each word tagged "unknown" in the unknown-word holder, applies both
     * suffix rules ({@link #posBySuffixCase1Helper} and
     * {@link #posBySuffixCase2Helper}).
     *
     * Suffixes the heuristic is aimed at include: -fid, -form, -ish, -less,
     * -like, -merous, -most, -shaped, -ous (all adjectival); the actual list
     * is whatever {@code Constant.SUFFIX} defines.
     *
     * @param dataholderHandler
     *            the data holder to read unknown words from and to update
     */
    public void posBySuffix(DataHolder dataholderHandler) {
        // NOTE(review): log4j is re-configured on every call here and in the
        // helpers below; kept as-is because nothing in this file shows another
        // place where logging is configured.
        PropertyConfigurator.configure("conf/log4j.properties");
        Logger myLogger = Logger.getLogger("learn.posBySuffix");
        myLogger.trace("Enter posBySuffix");

        Iterator<Map.Entry<String, String>> iterator = dataholderHandler
                .getUnknownWordHolder().entrySet().iterator();

        while (iterator.hasNext()) {
            Map.Entry<String, String> unknownWordEntry = iterator.next();
            String unknownWord = unknownWordEntry.getKey();
            String unknownWordTag = unknownWordEntry.getValue();

            if (unknownWordTag.equals("unknown")) {
                // The boolean results are intentionally not used; both rules
                // are always tried.
                posBySuffixCase1Helper(dataholderHandler, unknownWord);
                posBySuffixCase2Helper(dataholderHandler, unknownWord);
            }
        }

        myLogger.trace("Quite posBySuffix");
    }

    /**
     * Rule 1: if the word is lower-case letters/underscores ending in one of
     * the known suffixes, and {@link #containSuffix} accepts the base/suffix
     * split, records the word with tag "b" through
     * {@code DataHolder.updateDataHolder}.
     *
     * @param dataholderHandler
     *            the data holder to update
     * @param unknownWord
     *            the word to examine
     * @return true if the word was recorded as "b", false otherwise
     */
    public boolean posBySuffixCase1Helper(DataHolder dataholderHandler,
            String unknownWord) {
        PropertyConfigurator.configure("conf/log4j.properties");
        Logger myLogger = Logger.getLogger("learn.posBySuffix");

        myLogger.debug("Pattern1: " + PATTERN1);

        if (!SUFFIXED_WORD.matcher(unknownWord).matches()) {
            return false;
        }

        // Reluctant first group: the base is everything before the suffix
        // alternative that lets the whole word match.
        Matcher matcher = BASE_AND_SUFFIX.matcher(unknownWord);
        if (!(PLAIN_TOKEN.matcher(unknownWord).matches() && matcher.matches())) {
            return false;
        }

        myLogger.debug("posBySuffix - check word: " + unknownWord);
        String base = matcher.group(1);
        String suffix = matcher.group(2);

        if (this.containSuffix(dataholderHandler, unknownWord, base, suffix)) {
            myLogger.debug("Pass\n");
            dataholderHandler.updateDataHolder(unknownWord, "b", "*",
                    "wordpos", 0);
            myLogger.debug("posBySuffix - set word: " + unknownWord);
            return true;
        }

        myLogger.debug("Not Pass\n");
        return false;
    }

    /**
     * Rule 2: if the whole word is a single '.' or '_' followed by one or more
     * lower-case letters (e.g. "_nerved"), puts a ("word", "b") entry directly
     * into the word-POS holder.
     *
     * @param dataholderHandler
     *            the data holder to update
     * @param unknownWord
     *            the word to examine
     * @return true if the entry was stored, false otherwise
     */
    public boolean posBySuffixCase2Helper(DataHolder dataholderHandler,
            String unknownWord) {
        PropertyConfigurator.configure("conf/log4j.properties");
        Logger myLogger = Logger.getLogger("learn.posBySuffix");

        myLogger.debug("Pattern2: " + PATTERN2);

        if (PREFIXED_WORD.matcher(unknownWord).matches()) {
            dataholderHandler.getWordPOSHolder().put(
                    new WordPOSKey(unknownWord, "b"),
                    new WordPOSValue("*", 0, 0, null, null));
            myLogger.debug("posbysuffix set " + unknownWord
                    + " a boundary word\n");
            return true;
        }

        return false;
    }

    /**
     * Decides whether {@code suffix} really acts as a suffix of {@code word},
     * i.e. whether {@code word} is plausibly {@code base} + {@code suffix}.
     *
     * <ul>
     * <li>Empty {@code base}: always true.</li>
     * <li>Suffix "ly": true if WordNet contains the word and reports it as an
     * adverb, or if the base is a key of the unknown-word holder.</li>
     * <li>Suffix "er"/"est": evaluated only when WordNet contains the word;
     * true if the word is not an adjective but the base is.</li>
     * <li>Any other suffix: true if the word is solely an adjective in
     * WordNet, or the base is in WordNet, or the base is a key of the
     * unknown-word holder.</li>
     * </ul>
     *
     * Underscores are removed from the base before any lookup, so that
     * "cup_shaped" is checked with base "cup".
     *
     * @param dataholderHandler
     *            the data holder whose unknown-word holder is consulted
     * @param word
     *            the full word
     * @param base
     *            the part of the word preceding the suffix
     * @param suffix
     *            the candidate suffix
     * @return true if one of the rules above accepts the split, false
     *         otherwise
     */
    public boolean containSuffix(DataHolder dataholderHandler, String word,
            String base, String suffix) {
        PropertyConfigurator.configure("conf/log4j.properties");
        Logger myLogger = Logger.getLogger("learn.posBySuffix.containSuffix");
        myLogger.trace("Enter containSuffix");

        WordNetPOSKnowledgeBase myWN = this.myLearnerUtility
                .getWordNetPOSKnowledgeBase();

        // check base
        if (base.length() == 0) {
            myLogger.trace("case 0");
            return true;
        }

        // Strings are immutable: the stripped value must be captured. The
        // previous code called replaceAll and discarded the result, so bases
        // such as "cup_" (from "cup_shaped") were never normalised.
        String strippedBase = base.replaceAll("_", "");

        boolean wordInWN = myWN.contains(word);
        myLogger.trace(wordInWN ? "case 1.1" : "case 1.2");

        boolean baseInWN = myWN.contains(strippedBase);
        myLogger.trace(baseInWN ? "case 2.1" : "case 2.2");

        if (suffix.equals("ly")) {
            // adverb in WordNet, or base present in the unknown-word holder
            myLogger.trace("case 3.1");
            if (wordInWN && myWN.isAdverb(word)) {
                return true;
            }
            if (dataholderHandler.getUnknownWordHolder().containsKey(
                    strippedBase)) {
                return true;
            }
        } else if (suffix.equals("er") || suffix.equals("est")) {
            // comparative / superlative forms
            myLogger.trace("case 3.2");
            if (wordInWN) {
                boolean wordNotAdjective = !myWN.isAdjective(word);
                boolean baseIsAdjective = myWN.isAdjective(strippedBase);
                return wordNotAdjective && baseIsAdjective;
            }
        } else {
            // e.g. scalelike: base known to WordNet or to the unknown-word
            // holder, or the word has adjective as its sole WordNet POS
            myLogger.trace("case 3.3");
            if (myWN.isSoleAdjective(word)) {
                return true;
            }
            if (baseInWN) {
                return true;
            }
            if (dataholderHandler.getUnknownWordHolder().containsKey(
                    strippedBase)) {
                return true;
            }
        }

        return false;
    }
}