package semanticMarkup.ling.learn.knowledge; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; import semanticMarkup.ling.learn.Configuration; import semanticMarkup.ling.learn.auxiliary.StringAndInt; import semanticMarkup.ling.learn.dataholder.DataHolder; import semanticMarkup.ling.learn.dataholder.SentenceStructure; import semanticMarkup.ling.learn.utility.LearnerUtility; /** * This module does rule based learning first for easy cases, then does instance * based learning for the remaining unsolved cases. * * This module can run in two different modes: 1) start; 2) normal. * * @author Dongye * */ public class CoreBootstrappingLearner implements IModule { private LearnerUtility myLearnerUtility; private String status; private Configuration myConfiguration; public CoreBootstrappingLearner(LearnerUtility learnerUtility, Configuration configuration) { this.myLearnerUtility = learnerUtility; this.status = null; this.myConfiguration = configuration; } @Override public void run(DataHolder dataholderHandler) { this.discover(dataholderHandler, this.status); } public void setStatus(String status){ this.status = status; } /** * * @param status * "start" or "normal" * @return */ public int discover(DataHolder dataholderHandler, String status) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.discover"); myLogger.trace("Enter Discover - Status: " + status); int newDisc = 0; // dataholderHandler.printHolder(DataHolder.SENTENCE); for (int i = 0; i < dataholderHandler.getSentenceHolder().size(); i++) { SentenceStructure sentEntry = dataholderHandler.getSentenceHolder() .get(i); // sentid String thisSentence = sentEntry.getSentence(); String thisLead = sentEntry.getLead(); String thisTag = sentEntry.getTag(); String thisStatus = sentEntry.getStatus(); // if (!(thisTag == null || !thisTag.equals("ignore") // myLogger.debug("Tag: "+thisTag); if ((!StringUtils.equals(thisTag, "ignore") || (thisTag == null)) && thisStatus.equals(status)) { myLogger.debug("Sentence #: " + i); myLogger.debug("Lead: " + thisLead); myLogger.debug("Tag: " + thisTag); myLogger.debug("Sentence: " + thisSentence); // tag is not null if (isMarked(dataholderHandler.getSentenceHolder().get(i))) { myLogger.debug("Not Pass"); continue; } // tag is null else { myLogger.debug("Pass"); } String[] startWords = thisLead.split("\\s+"); myLogger.debug("startWords: " + startWords.toString()); String pattern = buildPattern(dataholderHandler, startWords); if (pattern != null) { myLogger.debug("Build pattern [" + pattern + "] from starting words [" + thisLead + "]"); // IDs of untagged sentences that match the pattern Set<Integer> matched = matchPattern(dataholderHandler, pattern, status, false); int round = 0; int numNew = 0; do { numNew = ruleBasedLearn(dataholderHandler, matched); newDisc = newDisc + numNew; myLogger.trace("Round: " + round); round++; } while (numNew > 0); } else { myLogger.debug("Build no pattern from starting words [" + thisLead + "]"); } } } myLogger.trace("Return " + newDisc); myLogger.trace("Quite discover"); return newDisc; } /** * build a pattern based on existing checked word set, and the start words * * @param startWords * @return a pattern. If no pattern is generated, return null */ public String buildPattern(DataHolder dataholderHandler, String[] startWords) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.discover.buildPattern"); myLogger.trace("Enter buildPattern"); myLogger.trace("Start Words: " + startWords); Set<String> newWords = new HashSet<String>(); String temp = ""; String prefix = "\\w+\\s"; String pattern = ""; Set<String> checkedWords = dataholderHandler.checkedWordSet; myLogger.trace("checkedWords: " + checkedWords); for (int i = 0; i < startWords.length; i++) { String word = startWords[i]; // This is not very sure, need to make sure - Dongye if ((!word.matches("[\\p{Punct}0-9]")) && (!checkedWords.contains(word))) { temp = temp + word + "|"; newWords.add(word); } } myLogger.trace("temp: " + temp); // no new words if (temp.length() == 0) { myLogger.trace("No new words"); myLogger.trace("Return null"); myLogger.trace("Quite buildPattern"); myLogger.trace("\n"); return null; } else { // remove the last char, which is a '|' temp = temp.substring(0, temp.length() - 1); } temp = "\\b(?:" + temp + ")\\b"; pattern = "^" + temp + "|"; for (int j = 0; j < this.myConfiguration.getNumLeadWords() - 1; j++) { temp = prefix + temp; pattern = pattern + "^" + temp + "|"; } myLogger.trace("Pattern: " + pattern); pattern = pattern.substring(0, pattern.length() - 1); pattern = "(?:" + pattern + ").*$"; checkedWords.addAll(newWords); dataholderHandler.checkedWordSet = checkedWords; myLogger.trace("Return Pattern: " + pattern); myLogger.trace("Quite buildPattern"); myLogger.trace("\n"); return pattern; } /** * Find the IDs of the sentences that matches the pattern * * @param pattern * @param status * @param hasTag * @return a set of sentence IDs of the sentences that matches the pattern */ public Set<Integer> matchPattern(DataHolder dataholderHandler, String pattern, String status, boolean hasTag) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.discover.matchPattern"); myLogger.trace("Enter matchPattern"); myLogger.trace("Pattern: " + pattern); myLogger.trace("Status: " + status); myLogger.trace("HasTag: " + hasTag); Set<Integer> matchedIDs = new HashSet<Integer>(); for (int i = 0; i < dataholderHandler.getSentenceHolder().size(); i++) { SentenceStructure sent = dataholderHandler.getSentenceHolder().get( i); String thisSentence = sent.getSentence(); String thisStatus = sent.getStatus(); String thisTag = sent.getTag(); boolean a = hasTag; boolean b = (thisTag == null); if ((a ^ b) && (StringUtils.equals(status, thisStatus))) { Pattern p = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); Matcher m = p.matcher(thisSentence); if (m.lookingAt()) { myLogger.debug("Push Sentence #" + i); myLogger.debug("Sentence: " + thisSentence); myLogger.debug("Status: " + thisStatus); myLogger.debug("Tag: " + thisTag); myLogger.debug("\n"); matchedIDs.add(i); } } } myLogger.trace("Return IDs: " + matchedIDs); myLogger.trace("Quite matchPattern"); myLogger.trace("\n"); return matchedIDs; } /** * return a positive number if anything new is learned from @source sentences * by applying rules and clues to grow %NOUNS and %BDRY and to confirm tags * create and maintain decision tables * * @param matched * @return */ public int ruleBasedLearn(DataHolder dataholderHandler, Set<Integer> matched) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.discover.ruleBasedLearn"); myLogger.trace("Enter ruleBasedLearn"); myLogger.trace("Matched IDs: " + matched); int sign = 0; Iterator<Integer> iter = matched.iterator(); while (iter.hasNext()) { int sentID = iter.next().intValue(); SentenceStructure sentence = dataholderHandler.getSentenceHolder() .get(sentID); if (!isMarked(sentence)) { StringAndInt tagAndNew = null; String tag = null; int numNew = 0; tagAndNew = this.myLearnerUtility.learnTerms(dataholderHandler, sentID); tag = tagAndNew.getString(); numNew = tagAndNew.getInt(); this.myLearnerUtility.tagSentence(dataholderHandler, this.myConfiguration.getMaxTagLength(), sentID, tag); sign = sign + numNew; } } myLogger.trace("Return: " + sign); myLogger.trace("Quit ruleBaseLearn"); myLogger.trace("\n"); return sign; } /** * A helper of method discover(). Check if the tag of the i-th sentence is * NOT null * * @param sentence * the sentence to check * @return if the tag of the i-th sentence is NOT null, returns true; * otherwise returns false */ public boolean isMarked(SentenceStructure sentence) { String thisTag = sentence.getTag(); if (thisTag != null) { return true; } else { return false; } } }