package semanticMarkup.ling.learn.knowledge;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.dataholder.SentenceStructure;
import semanticMarkup.ling.learn.utility.LearnerUtility;
import semanticMarkup.ling.learn.utility.StringUtility;

/**
 * Learns nouns based on some heuristics.
 *
 * <p>
 * {@link #run(DataHolder)} is the entry point. It first collects tagged nouns
 * ({@code word[s]} / {@code word[p]}) from the original sentences, then
 * applies the character heuristics to find further nouns and descriptors, and
 * finally records word/POS entries and singular-plural pairs in the
 * {@link DataHolder}.
 *
 * @author Dongye
 */
public class HeuristicNounLearnerUseMorphology implements IModule {

    /** A tagged noun such as {@code septum[s]}: group 1 is the word, group 2 the tag. */
    private static final Pattern TAGGED_NOUN = Pattern
            .compile("(\\w+)\\[([spn])\\]");

    /** The word (group 1) directly preceding "present" or "absent". */
    private static final Pattern PRESENT_ABSENT = Pattern
            .compile("^.*?(\\w+?)\\s+(present|absent).*$");

    /** Text enclosed in an italic element; group 2 is the content, group 3 the remainder. */
    private static final Pattern ITALIC_TAXON = Pattern
            .compile("(.*?)<\\s*i\\s*>\\s*([^<]*)\\s*<\\s*\\/\\s*i\\s*>(.*)");

    /** A word ending in {@code #s}, e.g. {@code Meckle#s}. */
    private static final Pattern HASH_POSSESSIVE = Pattern
            .compile("^.*\\b(\\w+#s)\\b.*$");

    /** A word (group 2) followed by a number; group 3 is the remainder. */
    private static final Pattern WORD_THEN_NUMBER = Pattern
            .compile("(.*?)\\s(\\w+)\\s+\\d+(.*)");

    private final LearnerUtility myLearnerUtility;

    /**
     * @param learnerUtility
     *            utility giving access to constants, tokenizer and word-form
     *            helpers used by every heuristic in this class
     */
    public HeuristicNounLearnerUseMorphology(LearnerUtility learnerUtility) {
        this.myLearnerUtility = learnerUtility;
    }

    /**
     * Runs all noun heuristics and stores the results in the data holder.
     *
     * @param dataholderHandler
     *            holder that supplies the sentences and receives the learned
     *            word/POS entries and singular-plural pairs
     */
    @Override
    public void run(DataHolder dataholderHandler) {
        PropertyConfigurator.configure("conf/log4j.properties");
        Logger myLogger = Logger.getLogger("learn.addHeuristicsNouns");
        myLogger.trace("Enter addHeuristicsNouns");

        Set<String> nouns = this.learnHeuristicsNouns(dataholderHandler);
        myLogger.debug("Nouns learned from heuristics:");
        myLogger.debug("\t" + nouns.toString());
        myLogger.debug("Total: " + nouns.size());

        List<Set<String>> results = this.characterHeuristics(dataholderHandler);
        Set<String> rnouns = results.get(0);
        Set<String> descriptors = results.get(1);

        addDescriptors(dataholderHandler, descriptors);
        addNouns(dataholderHandler, rnouns);

        myLogger.debug("Total: " + nouns.size());

        // Words matching any of these classes are never treated as nouns here.
        String excludedWords = "NUM|"
                + this.myLearnerUtility.getConstant().NUMBER + "|"
                + this.myLearnerUtility.getConstant().CLUSTERSTRING + "|"
                + this.myLearnerUtility.getConstant().CHARACTER + "|"
                + this.myLearnerUtility.getConstant().PROPERNOUN;

        myLogger.info("Learn singular-plural pair");
        for (String e : nouns) {
            myLogger.trace("Check Word: " + e);
            if (!e.matches("^.*\\w.*$")
                    || StringUtility.isMatchedWords(e, excludedWords)) {
                continue;
            }
            myLogger.trace("Pass");

            // same word may have two different pos tags
            for (String nounAndPOS : e.split("\\|")) {
                Matcher m = TAGGED_NOUN.matcher(nounAndPOS);
                if (!m.lookingAt()) {
                    continue;
                }
                String word = m.group(1);
                String pos = m.group(2);
                dataholderHandler.updateDataHolder(word, pos, "*", "wordpos",
                        0);

                if (pos.equals("p")) {
                    String singular = this.myLearnerUtility
                            .getWordFormUtility().getSingular(word);
                    if (singular != null && !singular.equals("")) {
                        dataholderHandler
                                .addSingularPluralPair(singular, word);
                    }
                }

                if (pos.equals("s")) {
                    List<String> pluralList = this.myLearnerUtility
                            .getWordFormUtility().getPlural(word);
                    for (String plural : pluralList) {
                        if (plural != null && !plural.equals("")) {
                            dataholderHandler.addSingularPluralPair(word,
                                    plural);
                        }
                    }
                }
            }
        }

        myLogger.trace("Quite addHeuristicsNouns");
    }

    /**
     * Records every descriptor that is not a forbidden word with POS tag
     * {@code "b"}.
     *
     * @param dataholderHandler
     *            holder to update
     * @param descriptors
     *            descriptors to record
     */
    public void addDescriptors(DataHolder dataholderHandler,
            Set<String> descriptors) {
        for (String descriptor : descriptors) {
            if (!StringUtility.isMatchedWords(descriptor,
                    this.myLearnerUtility.getConstant().FORBIDDEN)) {
                dataholderHandler.updateDataHolder(descriptor, "b", "",
                        "wordpos", 1);
            }
        }
    }

    /**
     * Records every noun that is not a forbidden word with POS tag
     * {@code "n"}.
     *
     * @param dataholderHandler
     *            holder to update
     * @param rnouns
     *            nouns to record
     */
    public void addNouns(DataHolder dataholderHandler, Set<String> rnouns) {
        for (String noun : rnouns) {
            if (!StringUtility.isMatchedWords(noun,
                    this.myLearnerUtility.getConstant().FORBIDDEN)) {
                dataholderHandler.updateDataHolder(noun, "n", "", "wordpos",
                        1);
            }
        }
    }

    /**
     * Learns tagged nouns from the original sentences: words preceding
     * "present"/"absent", words with a noun ending, and singular-plural pairs
     * found among words sharing a root. Singular-plural pairs are also added
     * to the data holder as they are found.
     *
     * @param dataholderHandler
     *            holder supplying the sentences and receiving the pairs
     * @return nouns learned by heuristics, each suffixed with {@code [s]} or
     *         {@code [p]}
     */
    public Set<String> learnHeuristicsNouns(DataHolder dataholderHandler) {
        PropertyConfigurator.configure("conf/log4j.properties");
        Logger myLogger = Logger
                .getLogger("learn.addHeuristicsNouns.learnHeuristicsNouns");

        Set<String> words = new HashSet<String>();
        Set<String> nouns = new HashSet<String>();

        List<String> sentences = new LinkedList<String>();
        for (int i = 0; i < dataholderHandler.getSentenceHolder().size(); i++) {
            String originalSentence = dataholderHandler.getSentenceHolder()
                    .get(i).getOriginalSentence();
            myLogger.trace("Original Sentence: " + originalSentence);
            sentences.add(StringUtility.strip(originalSentence));
        }

        // Collect present/absent nouns and the vocabulary of all sentences.
        for (String rawSentence : sentences) {
            String sentence = rawSentence.toLowerCase();
            String noun = this.getPresentAbsentNouns(sentence);
            if (!noun.equals("")) {
                nouns.add(noun);
            }

            List<String> tokens = this.myLearnerUtility.tokenizeText(sentence,
                    "all");
            for (String token : tokens) {
                if (StringUtility.isWord(token)) {
                    words.add(token);
                    myLogger.trace("Add a word into words: " + token);
                }
            }
        }

        // solve the problem: septa and septum are both s
        // Iterate over a snapshot: the set is modified inside the loop, and
        // modifying a HashSet while walking its own iterator throws
        // ConcurrentModificationException.
        for (String oldNoun : nouns.toArray(new String[0])) {
            String newNoun = this.getHeuristicsNounsHelper(oldNoun, nouns);
            if (!newNoun.equals(oldNoun)) {
                nouns.remove(oldNoun);
                nouns.add(newNoun);
            }
        }

        // Group all words by their root.
        Map<String, Set<String>> wordMap = new HashMap<String, Set<String>>();
        for (String word : words) {
            String root = myLearnerUtility.getWordFormUtility().getRoot(word);
            Set<String> sameRoot = wordMap.get(root);
            if (sameRoot == null) {
                sameRoot = new HashSet<String>();
                wordMap.put(root, sameRoot);
            }
            sameRoot.add(word);
        }

        myLogger.trace("WordMap:");
        for (Map.Entry<String, Set<String>> e : wordMap.entrySet()) {
            myLogger.trace(e.toString());
        }

        // Words with a noun ending are singular nouns; if the same root group
        // also holds word+"s" or word+"es", that form is the plural.
        myLogger.info("Learn singular-plural pair");
        Pattern nounEnding = Pattern.compile("^.*" + Constant.NENDINGS);
        for (Map.Entry<String, Set<String>> e : wordMap.entrySet()) {
            Set<String> wordSet = e.getValue();
            for (String word : wordSet) {
                if (!nounEnding.matcher(word).matches()) {
                    continue;
                }
                nouns.add(word + "[s]");
                if (wordSet.contains(word + "s")) {
                    nouns.add(word + "s" + "[p]");
                    dataholderHandler.addSingularPluralPair(word, word + "s");
                }
                if (wordSet.contains(word + "es")) {
                    nouns.add(word + "es" + "[p]");
                    dataholderHandler
                            .addSingularPluralPair(word, word + "es");
                }
            }
        }

        // Within a root group that has no verb-ending word, test every pair of
        // words for a singular/plural relation.
        Pattern verbEnding = Pattern.compile("^.*" + Constant.VENDINGS);
        for (Map.Entry<String, Set<String>> wordMapEntry : wordMap.entrySet()) {
            Set<String> wordSet = wordMapEntry.getValue();

            boolean hasVending = false;
            for (String tempWord : wordSet) {
                if (verbEnding.matcher(tempWord).matches()) {
                    hasVending = true;
                    break;
                }
            }

            // at least two words without verb endings
            if (hasVending || wordSet.size() <= 1) {
                continue;
            }
            String[] candidates = wordSet.toArray(new String[0]);
            for (int i = 0; i < candidates.length; i++) {
                for (int j = i + 1; j < candidates.length; j++) {
                    List<String> pair = myLearnerUtility.getWordFormUtility()
                            .getSingularPluralPair(candidates[i],
                                    candidates[j]);
                    if (pair.size() == 2) {
                        String singular = pair.get(0);
                        String plural = pair.get(1);
                        nouns.add(singular + "[s]");
                        nouns.add(plural + "[p]");
                        dataholderHandler.addSingularPluralPair(singular,
                                plural);
                    }
                }
            }
        }

        myLogger.debug("Nouns: " + nouns);

        return nouns;
    }

    /**
     * Helper for the "septa and septum are both s" problem: a noun tagged
     * singular that ends in "a" is re-tagged plural (septa[s] => septa[p])
     * when the bare word is contained in {@code words}.
     *
     * @param oldNoun
     *            tagged noun, e.g. {@code septa[s]}
     * @param words
     *            set searched for the untagged word
     * @return the re-tagged noun, or {@code oldNoun} unchanged
     */
    public String getHeuristicsNounsHelper(String oldNoun, Set<String> words) {
        String newNoun = oldNoun;

        if (oldNoun.matches("^.*a\\[s\\]$")) {
            String noun = oldNoun.replaceAll("\\[s\\]", "");
            if (words.contains(noun)) {
                newNoun = noun + "[p]";
            }
        }

        return newNoun;
    }

    /**
     * Any word preceding "present"/"absent" is taken to be a noun, unless it
     * is "and", "or", "to", a stop word, or an adverb.
     *
     * @param text
     *            the content to learn from
     * @return the noun tagged with {@code [p]} or {@code [s]}, or the empty
     *         string if none was found
     */
    public String getPresentAbsentNouns(String text) {
        PropertyConfigurator.configure("conf/log4j.properties");
        Logger myLogger = Logger
                .getLogger("learn.addHeuristicsNouns.learnHeuristicsNouns.getPresentAbsentNouns");

        String pachecked = "and|or|to";

        Matcher matcher = PRESENT_ABSENT.matcher(text);
        if (!matcher.lookingAt()) {
            return "";
        }

        String word = matcher.group(1);
        boolean excluded = word.matches("\\b(" + pachecked + ")\\b")
                || word.matches("\\b("
                        + this.myLearnerUtility.getConstant().STOP + ")\\b")
                || word.matches("\\b(always|often|seldom|sometimes|[a-z]+ly)\\b");
        if (excluded) {
            return "";
        }

        myLogger.trace("present/absent " + word);
        boolean looksPlural = word.matches("^.*" + Constant.PENDINGS)
                || word.matches("^.*[^s]s$") || word.matches("teeth");
        if (looksPlural && !word.matches(Constant.SENDINGS)) {
            return word + "[p]";
        }
        return word + "[s]";
    }

    /**
     * Discovers nouns and descriptors according to a set of rules, applied to
     * every sentence in the holder. As a side effect the stored sentence has
     * its italic tags and {@code #} characters removed, and the nouns found
     * are written to the heuristic noun table under the categories "organ",
     * "acronyms", "propernouns" and "taxonnames".
     *
     * @param dataholderHandler
     *            holder supplying the sentences and receiving the results
     * @return a linked list, whose first element is a set of nouns, and second
     *         element is a set of descriptors
     */
    public List<Set<String>> characterHeuristics(DataHolder dataholderHandler) {
        PropertyConfigurator.configure("conf/log4j.properties");
        Logger myLogger = Logger
                .getLogger("learn.addHeuristicsNouns.characterHeuristics");

        Set<String> taxonNames = new HashSet<String>();
        Set<String> nouns = new HashSet<String>();
        Set<String> anouns = new HashSet<String>();
        Set<String> pnouns = new HashSet<String>();
        Set<String> descriptors = new HashSet<String>();
        Map<String, Boolean> descriptorMap = new HashMap<String, Boolean>();

        int sent_num = dataholderHandler.getSentenceHolder().size();
        for (int i = 0; i < sent_num; i++) {
            SentenceStructure sent = dataholderHandler.getSentenceHolder()
                    .get(i);
            String source = sent.getSource();
            String sentence = sent.getSentence();
            String originalSentence = sent.getOriginalSentence();

            myLogger.trace("Source: " + source);
            myLogger.trace("Sentence: " + sentence);
            myLogger.trace("Original Sentence: " + originalSentence);

            originalSentence = StringUtility.trimString(originalSentence);

            // noun rule 0: taxon names
            // Accumulate across sentences; plain assignment would keep only
            // the names found in the last sentence.
            taxonNames.addAll(this.getTaxonNameNouns(originalSentence));

            // Strip italic tags now that taxon names have been extracted.
            sentence = sentence.replaceAll("<\\s*/?\\s*i\\s*>", "");
            originalSentence = originalSentence.replaceAll(
                    "<\\s*/?\\s*i\\s*>", "");
            dataholderHandler.getSentenceHolder().get(i).setSentence(sentence);

            // noun rule 0.5: Meckle#s cartilage
            nouns.addAll(this.getNounsMecklesCartilage(originalSentence));
            sentence = sentence.replaceAll("#", "");
            dataholderHandler.getSentenceHolder().get(i).setSentence(sentence);

            // noun rule 2: end of sentence nouns
            // (a|an|the|some|any|this|that|those|these) noun$
            nouns.addAll(this.getNounsRule2(originalSentence));

            // noun rule 3: proper nouns and acronyms
            Set<String> capitalized = this
                    .getNounsRule3Helper(originalSentence);
            for (String token : capitalized) {
                if (token.matches("^.*[A-Z].+$")
                        && (!token.matches("^.*-\\w+ed$"))) {
                    String lower = token.toLowerCase();
                    // All upper-case/digits => acronym, otherwise proper noun.
                    if (token.matches("^[A-Z0-9]+$")) {
                        anouns.add(lower);
                    } else {
                        pnouns.add(lower);
                    }
                    nouns.add(lower);
                }
            }

            // noun rule 1: sources with 1 _ are character statements, 2 _ are
            // descriptions
            nouns.addAll(getNounsRule1(dataholderHandler, source,
                    originalSentence, descriptorMap));

            // noun rule 4: non-stop/prep followed by a number: epibranchial 4
            nouns.addAll(this.getNounsRule4(originalSentence));

            // remove puncts for descriptor rules
            originalSentence = StringUtility.removePunctuation(
                    originalSentence, "-");

            // Descriptor rule 1: single term descriptions are descriptors
            descriptors.addAll(this.getDescriptorsRule1(source,
                    originalSentence, nouns));

            // Descriptor rule 2: (is|are) red: isDescriptor
            descriptors.addAll(this.getDescriptorsRule2(dataholderHandler,
                    originalSentence, descriptorMap));
        }

        nouns = this.filterOutDescriptors(nouns, descriptors);
        anouns = this.filterOutDescriptors(anouns, descriptors);
        pnouns = this.filterOutDescriptors(pnouns, descriptors);

        dataholderHandler.add2HeuristicNounTable(nouns, "organ");
        dataholderHandler.add2HeuristicNounTable(anouns, "acronyms");
        dataholderHandler.add2HeuristicNounTable(pnouns, "propernouns");
        dataholderHandler.add2HeuristicNounTable(taxonNames, "taxonnames");

        nouns.addAll(anouns);
        nouns.addAll(pnouns);
        nouns.addAll(taxonNames);

        List<Set<String>> results = new LinkedList<Set<String>>();
        results.add(nouns);
        results.add(descriptors);

        return results;
    }

    /**
     * Filters out descriptors from nouns, and returns the remaining nouns.
     * Nouns that start with a preposition or stop word are dropped as well.
     * The input sets are not modified.
     *
     * @param rNouns
     *            set of nouns
     * @param rDescriptors
     *            set of descriptors
     * @return new set of lower-cased nouns that are not descriptors
     */
    public Set<String> filterOutDescriptors(Set<String> rNouns,
            Set<String> rDescriptors) {
        Pattern stopOrPreposition = Pattern.compile("\\b("
                + this.myLearnerUtility.getConstant().PREPOSITION + "|"
                + this.myLearnerUtility.getConstant().STOP + ")\\b",
                Pattern.CASE_INSENSITIVE);

        Set<String> filtedNouns = new HashSet<String>();
        for (String rawNoun : rNouns) {
            String noun = rawNoun.toLowerCase();
            if (!stopOrPreposition.matcher(noun).lookingAt()
                    && !rDescriptors.contains(noun)) {
                filtedNouns.add(noun);
            }
        }

        return filtedNouns;
    }

    /**
     * Nouns rule 0: get taxon names enclosed in italic ({@code <i>}) elements.
     * Both the full enclosed text and each of its whitespace-separated parts
     * are returned.
     *
     * @param oSent
     *            original sentence, still containing the italic tags
     * @return set of taxon names
     */
    public Set<String> getTaxonNameNouns(String oSent) {
        Set<String> taxonNames = new HashSet<String>();
        String copy = oSent;

        while (true) {
            Matcher matcher = ITALIC_TAXON.matcher(copy);
            if (!matcher.lookingAt()) {
                break;
            }
            String taxonName = matcher.group(2);
            // NOTE(review): an empty italic element ends the scan, so later
            // italic elements in the same sentence are not examined.
            if (taxonName.length() == 0) {
                break;
            }
            taxonNames.add(taxonName);
            for (String part : taxonName.split("\\s+")) {
                taxonNames.add(part);
            }
            copy = matcher.group(3);
        }

        return taxonNames;
    }

    /**
     * Nouns rule 0.5: Meckle#s cartilage. For a word of the form
     * {@code xxx#s} the lower-cased forms {@code xxx#s}, {@code xxxs} and
     * {@code xxx} are returned.
     *
     * @param oSent
     *            original sentence
     * @return set of nouns, empty if the sentence has no such word
     */
    public Set<String> getNounsMecklesCartilage(String oSent) {
        Set<String> nouns = new HashSet<String>();

        Matcher m = HASH_POSSESSIVE.matcher(oSent);
        if (m.lookingAt()) {
            String noun = m.group(1).toLowerCase();
            nouns.add(noun);

            noun = noun.replaceAll("#", "");
            nouns.add(noun);

            noun = noun.replaceAll("s$", "");
            nouns.add(noun);
        }

        return nouns;
    }

    /**
     * Nouns rule 1: a sentence consisting of a single term, whose source does
     * not match {@code .xml_<something>_}, is a noun unless the term is a
     * descriptor.
     *
     * @param dataholderHandler
     *            holder supplying the sentences used by the descriptor check
     * @param source
     *            source identifier of the sentence
     * @param originalSentence
     *            original sentence
     * @param descriptorMap
     *            cache of descriptor decisions, may be updated
     * @return set holding the lower-cased term, or an empty set
     */
    public Set<String> getNounsRule1(DataHolder dataholderHandler,
            String source, String originalSentence,
            Map<String, Boolean> descriptorMap) {
        Set<String> nouns = new HashSet<String>();

        boolean isCharacterStatement = !source.matches("^.*\\.xml_\\S+_.*$");
        boolean isSingleTerm = !originalSentence.matches("^.*\\s.*$");
        if (isCharacterStatement
                && isSingleTerm
                && !this.isDescriptor(dataholderHandler, originalSentence,
                        descriptorMap)) {
            nouns.add(originalSentence.toLowerCase());
        }

        return nouns;
    }

    /**
     * Nouns rule 2: a word following a determiner or ordinal (a, an, the,
     * some, any, this, second ... tenth) and followed by end of sentence, an
     * opening bracket, or a preposition is a noun. When a word follows and
     * the candidate is "length", "width", "presence" or ends in "tion", the
     * candidate is skipped.
     *
     * @param oSent
     *            original sentence; {@code null} yields an empty set
     * @return set of lower-cased nouns
     */
    public Set<String> getNounsRule2(String oSent) {
        // NOTE(review): PREPOSITION is spliced in without a group here, while
        // other methods wrap it in parentheses; if it is a bare alternation
        // the \b anchors bind only to its first and last alternative. Left
        // as is -- confirm against Constant before changing.
        String regex = "(.*?)\\b(a|an|the|some|any|this|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth) +(\\w+)\\s*($|\\(|\\[|\\{|\\b"
                + this.myLearnerUtility.getConstant().PREPOSITION
                + "\\b)(.*)";
        Pattern pattern = Pattern.compile(regex);

        Set<String> nouns = new HashSet<String>();
        String copy = oSent;

        while (copy != null) {
            Matcher m = pattern.matcher(copy);
            if (!m.lookingAt()) {
                break;
            }
            String t = m.group(3);
            String prep = m.group(4);
            copy = m.group(5);

            if (prep.matches("^.*\\w.*$")
                    && t.matches("^.*\\b(length|width|presence|\\w+tion)\\b.*$")) {
                continue;
            }
            nouns.add(t.toLowerCase());
        }

        return nouns;
    }

    /**
     * Helper of nouns rule 3: collects tokens that contain an upper-case
     * letter followed by at least one more character and do not end in
     * {@code -<word>ed}. The sentence is split at brackets, and the first
     * token of every segment is ignored.
     *
     * @param sentence
     *            sentence to scan
     * @return set of matching tokens in their original case
     */
    public Set<String> getNounsRule3Helper(String sentence) {
        Set<String> nouns = new HashSet<String>();

        for (String rawSegment : sentence.split("[()\\[\\]\\{\\}]")) {
            String seg = StringUtility.removePunctuation(rawSegment, "-");
            String[] tokens = seg.split("\\s+");

            // #ignore the first word in character statements--this is normally
            // capitalized
            for (int j = 1; j < tokens.length; j++) {
                String token = tokens[j];
                if (token.matches("^.*[A-Z].+$")
                        && (!token.matches("^.*-\\w+ed$"))) {
                    nouns.add(token);
                }
            }
        }

        return nouns;
    }

    /**
     * Noun rule 4: non-stop/prep followed by a number: epibranchial 4
     *
     * @param oSent
     *            original sentence; {@code null} yields an empty set
     * @return a set of lower-cased nouns
     */
    public Set<String> getNounsRule4(String oSent) {
        Set<String> nouns = new HashSet<String>();
        if (oSent == null) {
            return nouns;
        }

        Pattern stopOrPreposition = Pattern.compile("\\b("
                + this.myLearnerUtility.getConstant().PREPOSITION + "|"
                + this.myLearnerUtility.getConstant().STOP + ")\\b");

        String copy = oSent;
        while (copy != null) {
            Matcher m = WORD_THEN_NUMBER.matcher(copy);
            if (!m.lookingAt()) {
                break;
            }
            String t = m.group(2);
            copy = m.group(3);

            if (!stopOrPreposition.matcher(t).matches()) {
                nouns.add(t.toLowerCase());
            }
        }

        return nouns;
    }

    /**
     * Descriptor rule 1: a single-term sentence whose source matches
     * {@code .xml_<something>_} is a descriptor, unless the term is already
     * among the nouns.
     *
     * @param source
     *            source identifier of the sentence
     * @param sentence
     *            sentence, expected to have punctuation removed
     * @param nouns
     *            nouns learned so far
     * @return set holding the lower-cased term, or an empty set
     */
    public Set<String> getDescriptorsRule1(String source, String sentence,
            Set<String> nouns) {
        Set<String> descriptors = new HashSet<String>();

        boolean isDescription = source.matches("^.*\\.xml_\\S+_.*$");
        if (isDescription && !sentence.matches("^.*\\s.*$")
                && !nouns.contains(sentence)) {
            descriptors.add(sentence.toLowerCase());
        }

        return descriptors;
    }

    /**
     * Descriptor rule 2: (is|are) red: isDescriptor. Every lower-cased token
     * of the sentence that {@link #isDescriptor} accepts is returned.
     *
     * @param dataholderHandler
     *            holder supplying the sentences used by the descriptor check
     * @param sentence
     *            sentence to tokenize on whitespace
     * @param descriptorMap
     *            cache of descriptor decisions, may be updated
     * @return set of lower-cased descriptors
     */
    public Set<String> getDescriptorsRule2(DataHolder dataholderHandler,
            String sentence, Map<String, Boolean> descriptorMap) {
        Set<String> descriptors = new HashSet<String>();

        for (String rawToken : sentence.split("\\s+")) {
            String token = rawToken.toLowerCase();
            if (isDescriptor(dataholderHandler, token, descriptorMap)) {
                descriptors.add(token);
            }
        }

        return descriptors;
    }

    /**
     * Check if the term is a descriptor, i.e. whether any original sentence
     * in the holder contains a form of "to be" directly followed by the term.
     *
     * <p>
     * NOTE(review): the cache is consulted with the term as given but written
     * with the lower-cased term, so a mixed-case term never hits the cache and
     * its result is stored under the lower-case key.
     *
     * @param dataholderHandler
     *            holder supplying the sentences to search
     * @param term
     *            term to check; matched literally
     * @param descriptorMap
     *            descriptors have already learned
     * @return a boolean value indicating whether the term is a descriptor.
     *         This result will be stored in the descriptorMap for future use
     */
    public boolean isDescriptor(DataHolder dataholderHandler, String term,
            Map<String, Boolean> descriptorMap) {
        if (descriptorMap.containsKey(term)) {
            return descriptorMap.get(term).booleanValue();
        }

        // Compile once per term instead of once per sentence.
        Pattern pattern = toBeFollowedBy(term);
        for (int i = 0; i < dataholderHandler.getSentenceHolder().size(); i++) {
            String originalSentence = dataholderHandler.getSentenceHolder()
                    .get(i).getOriginalSentence();
            if (pattern.matcher(originalSentence).matches()) {
                descriptorMap.put(term.toLowerCase(), true);
                return true;
            }
        }

        descriptorMap.put(term.toLowerCase(), false);
        return false;
    }

    /**
     * Check if the term matches the sentence, i.e. the sentence contains
     * " is ", " are ", " was ", " were ", " be " or " being " directly
     * followed by the term. A match is recorded in the descriptorMap under
     * the lower-cased term.
     *
     * @param sentence
     *            sentence to search
     * @param term
     *            term to look for; matched literally
     * @param descriptorMap
     *            cache of descriptor decisions, updated on a match
     * @return a boolean value indicating whether the term matches the sentence
     */
    public boolean isMatched(String sentence, String term,
            Map<String, Boolean> descriptorMap) {
        if (toBeFollowedBy(term).matcher(sentence).matches()) {
            descriptorMap.put(term.toLowerCase(), true);
            return true;
        }
        return false;
    }

    /**
     * Builds the pattern for a form of "to be" followed by the given term.
     * The term is quoted so that punctuation in it cannot be read as regex
     * syntax or raise a PatternSyntaxException.
     */
    private static Pattern toBeFollowedBy(String term) {
        return Pattern.compile("^.*" + " (is|are|was|were|be|being) "
                + Pattern.quote(term) + ".*$");
    }
}