package semanticMarkup.ling.learn.knowledge; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Set; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; import semanticMarkup.ling.learn.Configuration; import semanticMarkup.ling.learn.auxiliary.SentenceLeadLengthComparator; import semanticMarkup.ling.learn.dataholder.DataHolder; import semanticMarkup.ling.learn.dataholder.SentenceStructure; import semanticMarkup.ling.learn.utility.LearnerUtility; import semanticMarkup.ling.learn.utility.StringUtility; /** * Do bootstrapping learning using clues such as shared subject different * boundary and one lead word. * * @author Dongye * */ public class AdditionalBootstrappingLearner implements IModule { private LearnerUtility myLearnerUtility; private Configuration myConfiguration; public AdditionalBootstrappingLearner(LearnerUtility learnerUtility, Configuration configuration) { this.myLearnerUtility = learnerUtility; this.myConfiguration = configuration; } @Override public void run(DataHolder dataholderHandler) { this.additionalBootstrapping(dataholderHandler); } public void additionalBootstrapping(DataHolder dataholderHandler) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.additionalBootStrapping"); myLogger.trace("[additionalBootStrapping]Start"); // dataholderHandler.printHolder(DataHolder.SENTENCE); int flag = 0; do { myLogger.trace(String.format("Enter one do-while loop iteration")); flag = 0; // warmup markup int cmReturn = wrapupMarkup(dataholderHandler); myLogger.trace(String .format("wrapupMarkup() returned %d", cmReturn)); flag += cmReturn; // one lead word markup Set<String> tags = dataholderHandler.getCurrentTags(); myLogger.trace(tags.toString()); int omReturn = oneLeadWordMarkup(dataholderHandler, tags); myLogger.trace(String.format("oneLeadWordMarkup() returned %d", omReturn)); flag += omReturn; // doit markup int dmReturn = this.myLearnerUtility.doItMarkup(dataholderHandler, this.myConfiguration.getMaxTagLength()); myLogger.trace(String.format("doItMarkup() returned %d", dmReturn)); flag += dmReturn; myLogger.trace(String.format("Quite this iteration with flag = %d", flag)); } while (flag > 0); myLogger.trace("[additionalBootStrapping]End"); } /** * In the sentence collections, search for such sentence, whose lead is * among the tags passed in, and add the lead into word POS collections as a * noun * * @param tags * a set of all tags in the tagged sentences in the sentence * collection * @return the numbet of updates made */ public int oneLeadWordMarkup(DataHolder dataholderHandler, Set<String> tags) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger .getLogger("learn.additionalBootStrapping.oneLeadWordMarkup"); // String tags = StringUtility.joinList("|", tags); int sign = 0; myLogger.trace(String.format("Enter (%s)", tags)); Iterator<SentenceStructure> iter = dataholderHandler .getSentenceHolder().iterator(); while (iter.hasNext()) { SentenceStructure sentence = iter.next(); int ID = sentence.getID(); String tag = sentence.getTag(); String lead = sentence.getLead(); if ((tag == null) && (!(StringUtility.createMatcher(lead, ".* .*").find()))) { if (tags.contains(lead)) { this.myLearnerUtility.tagSentence(dataholderHandler, this.myConfiguration.getMaxTagLength(), ID, lead); myLogger.trace(String.format( "updateDataHolder(%s, n, -, wordpos, 1)", lead)); sign += dataholderHandler.updateDataHolder(lead, "n", "-", "wordpos", 1); } } } myLogger.trace("Return: " + sign); return 0; } /** * for the remaining of sentences that do not have a tag yet, look for lead * word co-ocurrance, use the most freq. co-occured phrases as tags e.g. * plication induplicate (n times) and plication reduplicate (m times) => * plication is the tag and a noun e.g. stigmatic scar basal (n times) and * stigmatic scar apical (m times) => stigmatic scar is the tag and scar is * a noun. what about externally like A; externally like B, functionally * staminate florets, functionally staminate xyz? * * @return */ public int wrapupMarkup(DataHolder dataholderHandler) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger .getLogger("learn.additionalBootStrapping.wrapupMarkup"); myLogger.trace("Enter"); int sign = 0; Set<Integer> checkedIDs = new HashSet<Integer>(); List<SentenceStructure> sentenceList = new LinkedList<SentenceStructure>(); for (int id1 = 0; id1 < dataholderHandler.getSentenceHolder().size(); id1++) { SentenceStructure sentence = dataholderHandler.getSentenceHolder() .get(id1); String tag = sentence.getTag(); String lead = sentence.getLead(); if ((tag == null) && (StringUtility.createMatcher(lead, ".* .*").find())) { sentenceList.add(sentence); } } SentenceLeadLengthComparator myComparator = new SentenceLeadLengthComparator( false); Collections.sort(sentenceList, myComparator); Iterator<SentenceStructure> iter1 = sentenceList.iterator(); while (iter1.hasNext()) { SentenceStructure sentence = iter1.next(); int ID1 = sentence.getID(); String lead = sentence.getLead(); // if this sentence has been checked, pass if (checkedIDs.contains(ID1)) { continue; } List<String> words = new ArrayList<String>(); words.addAll(Arrays.asList(lead.split("\\s+"))); List<String> sharedHead = new ArrayList<String>(); sharedHead.addAll(words.subList(0, words.size() - 1)); String match = StringUtility.joinList(" ", sharedHead); Set<SentenceStructure> sentenceSet = new HashSet<SentenceStructure>(); for (int index = 0; index < dataholderHandler.getSentenceHolder() .size(); index++) { SentenceStructure thisSentence = dataholderHandler .getSentenceHolder().get(index); String thisLead = thisSentence.getLead(); String tag = thisSentence.getTag(); String pTemp = "^" + match + " [\\S]+$"; myLogger.trace(thisLead); myLogger.trace(pTemp); // if ((tag==null) && StringUtility.isMatchedNullSafe(pTemp, // thisLead)) { if ((tag == null) && StringUtility.isMatchedNullSafe(thisLead, pTemp)) { if (!StringUtils.equals(thisLead, lead)) { sentenceSet.add(thisSentence); } } } if (sentenceSet.size() > 1) { String ptn = this.myLearnerUtility.getPOSptn(dataholderHandler, sharedHead); String wnPOS = this.myLearnerUtility.getWordFormUtility() .checkWN(sharedHead.get(sharedHead.size() - 1), "pos"); myLogger.trace("ptn: " + ptn); myLogger.trace("wnPOS: " + wnPOS); if ((StringUtility.createMatcher(ptn, "[nsp]$").find()) || ((StringUtility.createMatcher(ptn, "\\?$").find()) && (StringUtility .createMatcher(wnPOS, "n").find()))) { Iterator<SentenceStructure> iter2 = sentenceSet.iterator(); while (iter2.hasNext()) { SentenceStructure thisSentence = iter2.next(); int ID = thisSentence.getID(); String thisLead = thisSentence.getLead(); List<String> words2 = new ArrayList<String>(); words2.addAll(Arrays.asList(thisLead.split("\\s+"))); // case 1 boolean case1 = false; boolean case2 = false; case1 = words2.size() > sharedHead.size(); if (case1) { List<String> checkWord = new ArrayList<String>(); checkWord.add(words2.get(sharedHead.size())); case2 = StringUtility.createMatcher( this.myLearnerUtility.getPOSptn(dataholderHandler, checkWord), "[psn]").find(); } if (case1 && case2) { myLogger.trace("Case 1"); String nb = words2.size() >= sharedHead.size() + 2 ? words2 .get(sharedHead.size() + 1) : ""; words2 = StringUtility.stringArraySplice(words2, 0, sharedHead.size() + 1); String nmatch = StringUtility.joinList(" ", words2); this.myLearnerUtility.tagSentence(dataholderHandler, this.myConfiguration.getMaxTagLength(),ID, nmatch); myLogger.trace(String.format("tag (%d, %s)", ID, nmatch)); this.myLearnerUtility.tagSentence(dataholderHandler, this.myConfiguration.getMaxTagLength(),ID1, match); myLogger.trace(String.format("tag (%d, %s)", ID1, match)); String updatedWord = words2.get(words2.size() - 1); int update1 = dataholderHandler.updateDataHolder( updatedWord, "n", "-", "wordpos", 1); sign += update1; myLogger.trace(String.format("update (%s)", updatedWord)); if (!StringUtils.equals(nb, "")) { int update2 = dataholderHandler .updateDataHolder(nb, "b", "", "wordpos", 1); sign += update2; myLogger.trace(String.format("update (%s)", nb)); } updatedWord = words.get(words.size() - 1); int update3 = dataholderHandler.updateDataHolder( words.get(words.size() - 1), "b", "", "wordpos", 1); sign += update3; myLogger.trace(String.format("update (%s)", updatedWord)); } // case 2 else { myLogger.trace("Case 2"); String b = words2.size() >= sharedHead.size() + 1 ? words2 .get(sharedHead.size()) : ""; this.myLearnerUtility.tagSentence(dataholderHandler, this.myConfiguration.getMaxTagLength(),ID, match); this.myLearnerUtility.tagSentence(dataholderHandler, this.myConfiguration.getMaxTagLength(),ID1, match); // if (sharedHead.get(sharedHead.size() - // 1).equals("tissue")) { // System.out.println(); // } int update1 = dataholderHandler.updateDataHolder( sharedHead.get(sharedHead.size() - 1), "n", "-", "wordpos", 1); sign += update1; if (!StringUtils.equals(b, "")) { int update2 = dataholderHandler .updateDataHolder(b, "b", "", "wordpos", 1); sign += update2; } int update3 = dataholderHandler.updateDataHolder( words.get(words.size() - 1), "b", "", "wordpos", 1); sign += update3; } checkedIDs.add(ID); } } else { Iterator<SentenceStructure> iter2 = sentenceSet.iterator(); while (iter2.hasNext()) { SentenceStructure thisSentence = iter2.next(); int ID = thisSentence.getID(); checkedIDs.add(ID); } } } else { checkedIDs.add(ID1); } } myLogger.trace("Return " + sign); return sign; } }