package semanticMarkup.ling.learn.utility; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; import semanticMarkup.know.lib.WordNetPOSKnowledgeBase; import semanticMarkup.ling.Token; import semanticMarkup.ling.learn.auxiliary.GetNounsAfterPtnReturnValue; import semanticMarkup.ling.learn.auxiliary.KnownTagCollection; import semanticMarkup.ling.learn.auxiliary.POSInfo; import semanticMarkup.ling.learn.auxiliary.StringAndInt; import semanticMarkup.ling.learn.dataholder.DataHolder; import semanticMarkup.ling.learn.dataholder.ModifierTableValue; import semanticMarkup.ling.learn.dataholder.SentenceStructure; import semanticMarkup.ling.learn.dataholder.WordPOSKey; import semanticMarkup.ling.learn.dataholder.WordPOSValue; import semanticMarkup.ling.learn.knowledge.Constant; import semanticMarkup.ling.transform.ITokenizer; public class LearnerUtility { private ITokenizer mySentenceDetector; private ITokenizer mytokenizer; private WordFormUtility myWordFormUtility; private WordNetPOSKnowledgeBase myWordNetPOS; private Constant myConstant; public LearnerUtility(ITokenizer sentenceDetector, ITokenizer tokenizer, WordNetPOSKnowledgeBase wordNetPOS) { this.myConstant = new Constant(); this.mySentenceDetector = sentenceDetector; this.mytokenizer = tokenizer; this.myWordFormUtility = new WordFormUtility(wordNetPOS); this.myWordNetPOS = wordNetPOS; } public Constant getConstant(){ return this.myConstant; } public ITokenizer getTokenizer(){ return this.mytokenizer; } public ITokenizer getSentenceDetector(){ return this.mySentenceDetector; } public WordFormUtility getWordFormUtility(){ return 
this.myWordFormUtility; } public WordNetPOSKnowledgeBase getWordNetPOSKnowledgeBase(){ return this.myWordNetPOS; } // populate sentence utilities /** * Given a file name, return its type * * @param fileName * @return return 1 if it is a file of character file, or 2 if it is a * description file, otherwise return 0 */ public int getType(String fileName) { // remove pdf.xml fileName = fileName.replaceAll(".*\\.xml_", ""); // remove all non_ charaters fileName = fileName.replaceAll("[^_]", ""); // a character file if (fileName.length() == 0) { return 1; } // a description file if (fileName.length() == 1) { return 2; } return 0; } /** * replace '.', '?', ';', ':', '!' within brackets by some special markers, * to avoid split within brackets during sentence segmentation * * @param text * @return */ public String hideMarksInBrackets(String text) { if (text == null || text == "") { return text; } String hide = ""; int lRound = 0; int lSquare = 0; int lCurly = 0; for (int i = 0; i < text.length(); i++) { char c = text.charAt(i); switch (c) { case '(': lRound++; hide = hide + c; break; case ')': lRound--; hide = hide + c; break; case '[': lSquare++; hide = hide + c; break; case ']': lSquare--; hide = hide + c; break; case '{': lCurly++; hide = hide + c; break; case '}': lCurly--; hide = hide + c; break; default: if (lRound + lSquare + lCurly > 0) { if (c == '.') { hide = hide + "[DOT] "; } else if (c == '?') { hide = hide + "[QST] "; } else if (c == ';') { hide = hide + "[SQL] "; } else if (c == ':') { hide = hide + "[QLN] "; } else if (c == '!') { hide = hide + "[EXM] "; } else { hide = hide + c; } } else { hide = hide + c; } } } return hide; } /** * Put all words in this sentence into the words map * * @param sent * @param words * a map mapping all words already known to their counts * @return a new map of all words, including words in sent */ public Map<String, Integer> getAllWords(String sentence, Map<String, Integer> words) { List<String> tokens = 
this.tokenizeText(sentence, "all"); for (String token: tokens) { if (words.containsKey(token)) { int count = words.get(token); count = count + 1; words.put(token, count); } else { words.put(token, 1); } } return words; } /** * returns the first n words of the sentence * * @param sent * the sentence * @param n * number of words to be returned * @return the first n words of the sentence. If the number of words in the * sentence is less than n, return all of them. */ public List<String> getFirstNWords(String sentence, int n) { List<String> nWords = new ArrayList<String>(); if (sentence == null || sentence == "") { return nWords; } List<String> tokens = this.tokenizeText(sentence, "firstseg"); int minL = tokens.size() > n ? n : tokens.size(); for (int i = 0; i < minL; i++) { nWords.add(tokens.get(i)); } return nWords; } /** * Restore '.', '?', ';', ':', '!' within brackets * * @param text * @return the restored string */ public String restoreMarksInBrackets(String text) { if (text == null || text == "") { return text; } // restore "." from "[DOT]" text = text.replaceAll("\\[\\s*DOT\\s*\\]", "."); // restore "?" from "[QST]" text = text.replaceAll("\\[\\s*QST\\s*\\]", "?"); // restore ";" from "[SQL]" text = text.replaceAll("\\[\\s*SQL\\s*\\]", ";"); // restore ":" from "[QLN]" text = text.replaceAll("\\[\\s*QLN\\s*\\]", ":"); // restore "." 
from "[DOT]" text = text.replaceAll("\\[\\s*EXM\\s*\\]", "!"); return text; } /** * Add space before and after all occurence of the regex in the string str * * @param str * @param regex * @return */ public String addSpace(String str, String regex) { if (str == null || str == "" || regex == null || regex == "") { return str; } Matcher matcher = Pattern.compile("(^.*)(" + regex + ")(.*$)").matcher( str); if (matcher.lookingAt()) { str = addSpace(matcher.group(1), regex) + " " + matcher.group(2) + " " + addSpace(matcher.group(3), regex); return str; } else { return str; } } public List<String> tokenizeText(String sentence, String mode) { if (StringUtils.equals(mode, "firstseg")) { sentence = getSentenceHead(sentence); } else { ; } String[] tempWords = sentence.split("\\s+"); List<String> words = new ArrayList<String>(); words.addAll(Arrays.asList(tempWords)); return words; } /** * Get the portion in the input sentence before any of ,:;.[(, or any * preposition word, if any * * @param sentence * the input sentence * @return the portion in the head */ public String getSentenceHead(String sentence) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger .getLogger("learn.populateSentence.getFirstNWords.getHead"); if (sentence == null) { return sentence; } else if (sentence.equals("")) { return sentence; } else { String head = ""; int end = sentence.length(); String pattern1 = " [,:;.\\[(]"; String pattern2 = "\\b" + "(" + this.myConstant.PREPOSITION + ")" + "\\s"; myLogger.trace("Pattern1: " + pattern1); myLogger.trace("Pattern2: " + pattern2); Pattern p1 = Pattern.compile(pattern1); Pattern p2 = Pattern.compile(pattern2); Matcher m1 = p1.matcher(sentence); Matcher m2 = p2.matcher(sentence); boolean case1 = m1.find(); boolean case2 = m2.find(); if (case1 || case2) { // case 1 if (case1) { int temp1 = m1.end(); end = temp1 < end ? temp1 : end; end = end - 1; } // case 2 else { int temp2 = m2.end(); end = temp2 < end ? 
temp2 : end; } head = sentence.substring(0, end - 1); } else { head = sentence; } myLogger.trace("Return: " + head); return head; } } /** * Segment a text into sentences using the OpenNLP sentence detector. Note * how dot after any abbreviations is handled: to avoid segmenting at * abbreviations, the dots of abbreviations are first replaced by a special * mark before the text is segmented. Then after the segmentation, they are * restored back. * * @param text * @return List of Sentence */ public List<Token> segmentSentence(String text) { List<Token> sentences; //hide abbreviations text = this.hideAbbreviations(text); // do sentence segmentation sentences = this.mySentenceDetector.tokenize(text); // restore Abbreviations for (Token sentence: sentences){ String contentHideAbbreviations = sentence.getContent(); String contentRestoreAbbreviations = this.restoreAbbreviations(contentHideAbbreviations); sentence.setContent(contentRestoreAbbreviations); } return sentences; } /** * replace the dot (.) mark of abbreviations in the text by a special mark * ([DOT]) * * @param text * @return the text after replacement */ public String hideAbbreviations(String text) { String pattern = "(^.*)(" +Constant.PEOPLE_ABBR +"|"+Constant.ARMY_ABBR +"|"+Constant.INSTITUTES_ABBR +"|"+Constant.COMPANIES_ABBR +"|"+Constant.PLACES_ABBR +"|"+Constant.MONTHS_ABBR +"|"+Constant.MISC_ABBR +"|"+Constant.BOT1_ABBR +"|"+Constant.BOT2_ABBR +"|"+Constant.LATIN_ABBR +")(\\.)(.*$)"; //pattern = "(^.*)(jr|abc)(\\.)(.*$)"; Pattern p = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); Matcher m; m= p.matcher(text); while (m.matches()){ String head = m.group(1); String abbr = m.group(2); String dot = m.group(3); String remaining = m.group(4); dot = "[DOT]"; text= head+abbr+dot+remaining; m=p.matcher(text); } return text; } /** * restore the dot (.) 
mark of abbreviations in the text from special mark * ([DOT]) * * @param text * @return the text after replacement */ public String restoreAbbreviations(String text) { String pattern = "(^.*)(" +Constant.PEOPLE_ABBR +"|"+Constant.ARMY_ABBR +"|"+Constant.INSTITUTES_ABBR +"|"+Constant.COMPANIES_ABBR +"|"+Constant.PLACES_ABBR +"|"+Constant.MONTHS_ABBR +"|"+Constant.MISC_ABBR +"|"+Constant.BOT1_ABBR +"|"+Constant.BOT2_ABBR +"|"+Constant.LATIN_ABBR +")(\\[DOT\\])(.*$)"; //pattern = "(^.*)(jr|abc)(\\.)(.*$)"; Pattern p = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); Matcher m; m= p.matcher(text); while (m.matches()){ String head = m.group(1); String abbr = m.group(2); String dot = m.group(3); String remaining = m.group(4); dot = "."; text= head+abbr+dot+remaining; m=p.matcher(text); } return text; } /** * Convert a collection of words to a string of those words separated by "|" * * @param c * collection of words * @return string of pattern. If the collection is null or empty, return an * empty string */ public String Iterable2Pattern(Iterable<String> words) { if (words == null) { return ""; } List<String> wordList = new LinkedList<String>(); for (String word: words) { word = this.addDoubleBackslash(word); wordList.add(word); } String pattern = StringUtils.join(wordList, "|"); // pattern = this.addDoubleBackslash(pattern); // testRunner("\\\\", "abc\\abc"); // testRunner("\\(", "abc(abc"); // testRunner("\\)", "abc)abc"); // testRunner("\\[", "abc[abc"); // testRunner("\\]", "abc]abc"); // testRunner("\\{", "abc{abc"); // testRunner("\\}", "abc}abc"); // testRunner("\\.", "abc.abc"); // testRunner("\\|", "abc|abc"); // testRunner("\\+", "abc+abc"); // testRunner("\\*", "abc*abc"); // testRunner("\\?", "abc?abc"); // testRunner("\\d+", "01138"); // [-\\\\\\(\\)\\[\\]\\{\\}\\.\\|\\+\\*\\?] 
// // stops.addAll(Arrays.asList(new String[] { "NUM", "(", "[", "{", // ")", "]", "}", "d+" })); return pattern; } /** * Convert a pattern with words separated by "|" to a set * * @param pattern * the pattern * @return a set. If the input is null or empty string, return a empty set */ public static Set<String> Pattern2Set(String pattern) { Set<String> set = new HashSet<String>(); if (StringUtils.equals(pattern, null) || StringUtils.equals(pattern, "")) { return (set); } set.addAll(Arrays.asList(pattern.split("|"))); return set; } /** * tag words with all o n m b tags that are applicable to the words * * @param mode * "singletag" or "multitags" * @param type * "sentence" or "orginal" */ public void tagAllSentences (DataHolder dataholderHandler, String mode, String type) { List<StringAndInt> idAndSentenceList = new LinkedList<StringAndInt>(); Iterator<SentenceStructure> sentenceIter = dataholderHandler.getSentenceHolder().iterator(); if (StringUtils.equals(mode, "original")) { while (sentenceIter.hasNext()) { SentenceStructure sentence = sentenceIter.next(); int thisID = sentence.getID(); String thisOriginalSentence = sentence.getOriginalSentence(); idAndSentenceList.add(new StringAndInt(thisOriginalSentence, thisID)); } } else { while (sentenceIter.hasNext()) { SentenceStructure sentence = sentenceIter.next(); int thisID = sentence.getID(); String thisSentence = sentence.getSentence(); idAndSentenceList.add(new StringAndInt(thisSentence, thisID)); } } KnownTagCollection myKnownTags = this.getKnownTags(dataholderHandler, mode); Iterator<StringAndInt> idAndSentenceListIter = idAndSentenceList.iterator(); while (idAndSentenceListIter.hasNext()) { StringAndInt idAndSentence = idAndSentenceListIter.next(); int thisID = idAndSentence.getInt(); if (thisID == 127) { System.out.println(); } String thisSentence = idAndSentence.getString(); thisSentence = tagAllSentencesHelper(thisSentence); thisSentence = annotateSentence(thisSentence, myKnownTags, 
dataholderHandler.getBMSWords()); SentenceStructure targetSentence = dataholderHandler.getSentence(thisID); if (StringUtils.equals(mode, "original")) { targetSentence.setOriginalSentence(thisSentence); } else { targetSentence.setSentence(thisSentence); } } } /** * Helper of tagAllSentencesHelper method * @param text * @return text after processing */ public String tagAllSentencesHelper(String text) { text = text.replaceAll("<\\S+?>", ""); text = text.toLowerCase(); // cup_shaped, 3_nerved, 3-5 (-7)_nerved // Matcher m2 = StringUtility.createMatcher("\\s*-\\s*([a-z])", text); // while (m2.find()) { // String group1 = m2.group(1); // text = m2.replaceFirst("_"+group1); // m2 = StringUtility.createMatcher("\\s*-\\s*([a-z])", text); // } //$b =~ s#\b(_[a-z]+)\b#(?\:\\b\\d+)$1#g; #_nerved => (?:\b\d+)_nerved // $sent =~ s#\s*-\s*([a-z])#_$1#g; text = StringUtility.replaceAllBackreference(text, "\\s*-\\s*([a-z])", "_$1"); // add space around nonword char text = StringUtility.replaceAllBackreference(text, "(\\W)", " $1 "); // multiple spaces => 1 space text = text.replaceAll("\\s+", " "); // trim text = text.replaceAll("^\\s*", ""); text = text.replaceAll("\\s*$", ""); return text; } public String annotateSentence(String sentence, KnownTagCollection knownTags, Set<String> NONS) { // get known tags Set<String> boundaryMarks; Set<String> boundaryWords; Set<String> modifiers; Set<String> nouns; Set<String> organs; Set<String> properNouns; if (knownTags.boundaryMarks == null) { boundaryMarks = new HashSet<String>(); } else { boundaryMarks = knownTags.boundaryMarks; } if (knownTags.boundaryWords == null) { boundaryWords = new HashSet<String>(); } else { boundaryWords = knownTags.boundaryWords; } if (knownTags.modifiers == null) { modifiers = new HashSet<String>(); } else { modifiers = knownTags.modifiers; } if (knownTags.nouns== null) { nouns = new HashSet<String>(); } else { nouns = knownTags.nouns; } if (knownTags.organs == null) { organs = new HashSet<String>(); } else { 
organs = knownTags.organs; } if (knownTags.properNouns == null) { properNouns = new HashSet<String>(); } else { properNouns = knownTags.properNouns; } // preprocessing 1 List<String> bDeleteList = new LinkedList<String>(); List<String> bAddList = new LinkedList<String>(); Iterator<String> bIter = boundaryWords.iterator(); while(bIter.hasNext()) { String oldWord = bIter.next(); if (oldWord.charAt(0)=='_') { String newWord = "(?\\:\\b\\d+)"+oldWord; bDeleteList.add(oldWord); bAddList.add(newWord); } } boundaryWords.removeAll(bDeleteList); boundaryWords.addAll(bAddList); nouns = StringUtility.setSubtraction(nouns, NONS); organs = StringUtility.setSubtraction(organs, NONS); // preprocessing 2 Set<String> tagSet = new HashSet<String>(); tagSet.addAll(Arrays.asList("Z O N M B".split(" "))); properNouns = StringUtility.setSubtraction(properNouns, tagSet); organs = StringUtility.setSubtraction(organs, tagSet); nouns = StringUtility.setSubtraction(nouns, tagSet); modifiers = StringUtility.setSubtraction(modifiers, tagSet); boundaryWords = StringUtility.setSubtraction(boundaryWords, tagSet); boundaryMarks = StringUtility.setSubtraction(boundaryMarks, tagSet); // insert tags sentence = annotateSentenceHelper(sentence, properNouns, "Z", true); // System.out.println(sentence); sentence = annotateSentenceHelper(sentence, organs, "O", true); // System.out.println(sentence); // if (sentence.equals("<O>extent</O> of dermal cranial covering")) { // System.out.println(); // } sentence = annotateSentenceHelper(sentence, nouns, "N", true); // System.out.println(sentence); sentence = annotateSentenceHelper(sentence, modifiers, "M", true); sentence = annotateSentenceHelper(sentence, boundaryWords, "B", true); sentence = annotateSentenceHelper(sentence, boundaryMarks, "B", false); sentence = annotateSentenceHelper2(sentence); return sentence; } public String annotateSentenceHelper(String sentence, Set<String> words, String tag, boolean isWithBoundaryWord) { 
PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.annotateSentence"); if (words.size() != 0) { if (isWithBoundaryWord) { sentence = StringUtility.replaceAllBackreference( sentence, String.format("\\b(%s)\\b", this.Iterable2Pattern(words)), String.format("<%s>$1</%s>", tag, tag)); } else { // String pattern = String.format("(%s)", LearnerUtility.Collection2Pattern(words)); // Matcher m1 = StringUtility.createMatcher("(\\]|\\}|\\(|\\)|\\{|\\[)", "word ]abc"); // boolean b1 = m1.find(); //// Matcher m2 = StringUtility.createMatcher("(]|}|(|)|{|[)", "word (abc)"); //// boolean b2 = m2.find(); String regex = String.format("(%s)", this.Iterable2Pattern(words)); String replacement = String.format("<%s>$1</%s>", tag, tag); myLogger.trace("Sentence: "+sentence); myLogger.trace("Words: "+words); myLogger.trace("Regex: "+regex); myLogger.trace("Replacement: "+replacement); sentence = StringUtility.replaceAllBackreference(sentence, regex, replacement); } } return sentence; } public String annotateSentenceHelper2(String sentence){ if (StringUtility.createMatcher(sentence, "").find()) { sentence = StringUtility.replaceAllBackreference(sentence, "<(\\w)>\\s*</$1>", ""); } Matcher m = StringUtility .createMatcher(sentence, "<(\\w)>\\s*</(\\1)>"); while (m.find()) { sentence = m.replaceFirst(""); m = StringUtility.createMatcher(sentence, "<(\\w)>\\s*</(\\1)>"); } sentence = StringUtility.replaceAllBackreference(sentence, "(?:<[^<]+>)+("+this.myConstant.FORBIDDEN+")(?:</[^<]+>)+", "$1"); return sentence; } /** * * @param mode * can be either "singletag" or "multitags" */ public KnownTagCollection getKnownTags(DataHolder dataholderHandler, String mode) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.getKnownTags"); myLogger.trace("Enter (mode: "+mode+")"); KnownTagCollection knownTags = null; Set<String> nouns = new HashSet<String>(); // nouns Set<String> organs = new 
HashSet<String>(); // organs Set<String> modifiers = new HashSet<String>(); // modifiers Set<String> boundaryWords = new HashSet<String>(); // boundary words Set<String> boundaryMarks = new HashSet<String>(); // boundary marks Set<String> properNouns = new HashSet<String>(); // proper nouns // get nouns Set<String> nounSet = new HashSet<String>(); Set<String> psWordSet = new HashSet<String>(); // set of nouns psWordSet = this.getPSWords(dataholderHandler); nounSet.addAll(psWordSet); // if the mode is "singletag", then get additional nouns from tags if (StringUtils.equalsIgnoreCase(mode, "singletag")) { nounSet.addAll(this.getOrgans(dataholderHandler)); } else { // do nothing } nouns.addAll(nounSet); myLogger.trace("Get nouns: "+nouns.toString()); // get organs if(StringUtils.equals(mode, "multitags")){ Set<String> organSet = this.getOrgans(dataholderHandler); organs.addAll(organSet); myLogger.trace("Get organs: "+organs.toString()); } // get modifiers Set<String> modifierSet = new HashSet<String>(); modifierSet = this.getModifiers(dataholderHandler); if(StringUtils.equals(mode, "singletag")){ Iterator<String> mIter = modifierSet.iterator(); while (mIter.hasNext()) { String m = mIter.next(); if (!psWordSet.contains(m)) { modifiers.add(m); } } }else{ modifiers.addAll(modifierSet); } // get boundary words and marks List<Set<String>> result = this.getBoundaries(dataholderHandler); boundaryWords = result.get(0); boundaryMarks = result.get(1); // get proper nouns properNouns = this.getProperNouns(dataholderHandler); // put all known tags into one KnownTagCollection object knownTags = new KnownTagCollection(nouns, organs, modifiers, boundaryWords, boundaryMarks, properNouns); return knownTags; } /** * A helper of method getKnownTags(). Get a set of all nouns from the * word-POS collection. 
* * @return a set of nouns */ public Set<String> getPSWords(DataHolder dataholderHandler) { Set<String> psSet = new HashSet<String>(); // set of p and s // get a set of all nouns from the word-POS collection Iterator<Entry<WordPOSKey, WordPOSValue>> iterWordPOS = dataholderHandler .getWordPOSHolder().entrySet().iterator(); while (iterWordPOS.hasNext()) { Entry<WordPOSKey, WordPOSValue> entry = iterWordPOS.next(); String POS = entry.getKey().getPOS(); if ((StringUtils.equals(POS, "s")) || (StringUtils.equals(POS, "p"))) { String word = entry.getKey().getWord(); if (word != null) { if (StringUtility.createMatcher(word, "^[a-zA-Z0-9_-]+$") .find()) { psSet.add(word); } } } } return psSet; } /** * A helper of method getKnownTags(). Get a set of o from tags in sentence * collections * * @return a set of o */ public Set<String> getOrgans(DataHolder dataholderHandler) { Set<String> oSet = new HashSet<String>(); // set of organs Iterator<SentenceStructure> iterSentence = dataholderHandler .getSentenceHolder().iterator(); while (iterSentence.hasNext()) { SentenceStructure sentence = iterSentence.next(); String tag = sentence.getTag(); if (tag != null) { if ((!StringUtils.equals(tag, "ignore")) && (!StringUtility.createMatcher(tag, ".* .*").find()) && (!StringUtility.createMatcher(tag, ".*\\[.*").find())) { if (StringUtility.createMatcher(tag, "^[a-zA-Z0-9_-]+$").find()) { oSet.add(tag); } } } } return oSet; } /** * Get modifier words from modifier collection. 
*
     * Only non-null keys made up of letters, digits, "_" or "-" are included.
     *
     * @return a set of modifier words
     */
    public Set<String> getModifiers(DataHolder dataholderHandler) {
        Set<String> mSet = new HashSet<String>(); // set of modifiers

        Iterator<Entry<String, ModifierTableValue>> iter = dataholderHandler
                .getModifierHolder().entrySet().iterator();
        while (iter.hasNext()) {
            Entry<String, ModifierTableValue> entry = iter.next();
            String word = entry.getKey();
            if (word != null) {
                if (StringUtility.createMatcher(word, "^[a-zA-Z0-9_-]+$")
                        .find()) {
                    mSet.add(word);
                }
            }
        }

        return mSet;
    }

    /**
     * Get boundary words and marks, i.e. the entries of the word-POS
     * collection whose POS is "b".
     *
     * @return a list of two elements. The first element is a set of boundary
     *         words, and second element is a set of boundary marks.
     */
    public List<Set<String>> getBoundaries(DataHolder dataholderHandler) {
        Set<String> bWords = new HashSet<String>();
        Set<String> bMarks = new HashSet<String>();
        List<Set<String>> result = new LinkedList<Set<String>>();

        Iterator<Entry<WordPOSKey, WordPOSValue>> iter = dataholderHandler
                .getWordPOSHolderIterator();
        while (iter.hasNext()) {
            Entry<WordPOSKey, WordPOSValue> entry = iter.next();
            String word = entry.getKey().getWord();
            String POS = entry.getKey().getPOS();

            if (word != null && POS != null) {
                if (StringUtils.equals(POS, "b")) {
                    // a single regex special character is a boundary mark
                    String pattern = "^(-|\\\\|\\(|\\)|\\[|\\]|\\{|\\}|\\.|\\||\\+|\\*|\\?)$";
                    if (StringUtility.isMatchedNullSafe(word, pattern)) {
                        bMarks.add(word);
                    }
                    // NOTE(review): this branch requires the word NOT to match
                    // "\w" and yet to consist of [a-zA-Z0-9_-]; whether it can
                    // ever add anything depends on the matching semantics of
                    // StringUtility.isMatchedNullSafe - confirm.
                    else if ((!(StringUtility.isMatchedNullSafe(word, "\\w")))
                            && (!StringUtils.equals(word, "/"))) {
                        if (StringUtility.createMatcher(word, "^[a-zA-Z0-9_-]+$").find()) {
                            bMarks.add(word);
                        }
                    } else {
                        if (StringUtility.isMatchedNullSafe(word, "^[a-zA-Z0-9_-]+$")) {
                            bWords.add(word);
                        }
                    }
                }
            }
        }

        result.add(bWords);
        result.add(bMarks);

        return result;
    }

    /**
     * Get the proper nouns from the word-POS collection, i.e. the entries
     * whose POS is "z". Only non-null words made up of letters, digits, "_"
     * or "-" are included.
     *
     * @return a set of the proper nouns
     */
    public Set<String> getProperNouns(DataHolder dataholderHandler) {
        Set<String> pNouns = new HashSet<String>();

        Iterator<Entry<WordPOSKey, WordPOSValue>> iter = dataholderHandler
                .getWordPOSHolder().entrySet().iterator();
        while (iter.hasNext()) {
            Entry<WordPOSKey, WordPOSValue> entry = iter.next();
            String word = entry.getKey().getWord();
            String POS = entry.getKey().getPOS();

            // same null guard as getPSWords / getModifiers
            if (word != null && StringUtils.equals(POS, "z")) {
                if (StringUtility.createMatcher(word, "^[a-zA-Z0-9_-]+$").find()) {
                    pNouns.add(word);
                }
            }
        }

        return pNouns;
    }

    /**
     * Escape a word so that it can be used literally inside a regular
     * expression. Only a word that consists of exactly one of
     * {@code \ ( ) [ ] { } . | + * ?} or of the string {@code d+} is changed:
     * it gets a single backslash prepended. Every other word is returned
     * unchanged.
     *
     * @param word
     *            the word to escape
     * @return the escaped word
     */
    private String addDoubleBackslash(String word) {
        // the replacement "\\\\$1" is a literal backslash followed by group 1
        word = word.replaceAll("^(\\\\|\\(|\\)|\\[|\\]|\\{|\\}|\\.|\\||\\+|\\*|\\?|d\\+)$", "\\\\$1");
        return word;
    }
word.replaceAll("^(\\\\|\\(|\\)|\\[|\\]|\\{|\\}|\\.|\\||\\+|\\*|\\?|d\\+)$", "\\\\$1"); //// word = word.replaceAll("^(d\\+)$", "\\\\$1"); // // return word; // } private static boolean testRunner(String regex, String str) { boolean isMatched = false; Pattern p = Pattern.compile(regex); Matcher m = p.matcher(str); isMatched = m.find(); System.out.println(isMatched); return isMatched; } public String getSentencePtn(DataHolder dataholderHandler, Set<String> token, int limit, List<String> words) { Set<String> typeModifierPtns = dataholderHandler.getTypeModifierPattern(); String ptn = ""; int counter = 0; String regex = String.format("\\b(%s)\\b",StringUtils.join(token, "|")); Iterator<String> wordIter = words.iterator(); while (wordIter.hasNext()) { if (counter > limit - 1) { break; } counter++; String word = wordIter.next(); if (StringUtility.isEntireMatchedNullSafe(word, regex)) { ptn = ptn + "&"; } else { if (word == null) { ptn = ptn + "q"; } else { Matcher m1 = StringUtility.createMatcher(word, "([,:;\\.])"); Matcher m2 = StringUtility.createMatcher(word, "<(\\w)>"); if (m1.find()) { String g1 = m1.group(1); ptn = ptn + g1; } else if (m2.find()){ String g1 = m2.group(1); String tag = g1; if (StringUtils.equals(tag, "M") && typeModifierPtns.contains(word)) { ptn = ptn + "t"; } else { ptn = ptn + tag.toLowerCase(); } } else if (StringUtils.equals(this.getWordFormUtility().getNumber(word), "p")) { ptn = ptn + "p"; } else { ptn = ptn + "q"; } } } } return ptn; } public String getParentSentenceTag(int sentenceID) { // TODO Auto-generated method stub return null; } // doItMarkup /** * skip and/or cases skip leads with $stop words * * @return number of updates */ public int doItMarkup(DataHolder dataholderHandler, int maxLength) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger .getLogger("learn.additionalBootStrapping.doItMarkup"); myLogger.trace("Enter"); int sign = 0; // for (int i=0;i<myDataHolder.getSentenceHolder().size();i++) { 
Iterator<SentenceStructure> iter = dataholderHandler.getSentenceHolder().iterator(); while (iter.hasNext()) { SentenceStructure sentenceObject = iter.next(); String tag = sentenceObject.getTag(); if (doItMarkupHelper(tag)) { int ID = sentenceObject.getID(); String lead = sentenceObject.getLead(); String sentence = sentenceObject.getSentence(); // case 1 if (doItMarkupCase1Helper(sentence)) { myLogger.trace(String.format("sent #%d: case 1", ID)); continue; } // case 2 if (doItMarkupCase2Helper(lead)) { myLogger.trace(String.format("sent #%d: case 2", ID)); continue; } StringAndInt tagAndSign = learnTerms(dataholderHandler, ID); String doItTag = tagAndSign.getString(); int doItSign = tagAndSign.getInt(); sign = doItSign; // case 3 if (StringUtility.createMatcher(doItTag, "\\w").find()) { myLogger.trace(String.format("sent #%d: case 3", ID)); this.tagSentence(dataholderHandler, maxLength, ID, doItTag); } } } myLogger.trace("Return: " + sign); return sign; } public boolean doItMarkupHelper(String tag) { boolean flag = false; flag = (tag == null) || (StringUtils.equals(tag, "")) || (StringUtils.equals(tag, "unknown")); return flag; } public boolean doItMarkupCase1Helper(String sentence) { boolean flag = false; flag = StringUtility.createMatcher(sentence, "^.{0,40} (nor|or|and|\\/)").find(); return flag; } public boolean doItMarkupCase2Helper(String lead) { boolean flag = false; flag = StringUtility.createMatcher(lead, "\\b(" + getConstant().STOP + ")\\b").find(); return flag; } public boolean tagSentence(DataHolder dataholderHandler, int maxLength, int sentenceID, String tag) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.tagSentence"); myLogger.trace(String.format("Enter (%d, %s)", sentenceID, tag)); // case 1 if (!StringUtility.createMatcher(tag, "\\w+").find()) { myLogger.trace("\t:tag is not a word. 
Return"); return false; } else { // case 2 if (StringUtility.createMatcher(tag, "^(" + getConstant().STOP + ")\\b") .find()) { myLogger.trace(String .format("\t:tag %s starts with a stop word, ignore tagging requrest", tag)); return false; } else { // case 3 if (tag.length() > maxLength) { tag = tag.substring(0, maxLength); myLogger.debug(String.format("\ttag: %s longer than %d)", tag, maxLength)); } else { ; } SentenceStructure sentence = dataholderHandler.getSentence(sentenceID); sentence.setTag(tag); myLogger.debug(String.format( "\t:mark up sentence #%d with tag %s", sentenceID, tag)); return true; } } } /** * Update wordpos table (on certainty) when a sentence is tagged for the * first time. Note: 1) this update should not be done when a POS is looked * up, because we may lookup a POS for the same example multiple times. 2) * if the tag need to be adjusted (not by doit function), also need to * adjust certainty counts. * * @param sentID * the ID of the sentence * @return a pair of (tag, sign) */ public StringAndInt learnTerms(DataHolder dataholderHandler, int sentID) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger .getLogger("learn.discover.ruleBasedLearn.doIt"); myLogger.trace("Enter doIt"); myLogger.trace("sentence ID: " + sentID); SentenceStructure sentEntry = dataholderHandler.getSentenceHolder() .get(sentID); String thisSentence = sentEntry.getSentence(); String thisLead = sentEntry.getLead(); StringAndInt returnValue = this.doItCaseHandle(dataholderHandler, thisSentence, thisLead); myLogger.trace("Return Tag: " + returnValue.getString() + ", sign: " + returnValue.getInt()); myLogger.trace("Quit doIt"); myLogger.trace("\n"); return returnValue; } /** * * @param thisSentence * @param thisLead * @return */ public StringAndInt doItCaseHandle(DataHolder dataholderHandler, String thisSentence, String thisLead) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger 
.getLogger("learn.discover.ruleBasedLearn.doIt.doItCaseHandle"); myLogger.trace("Enter doItCaseHandle"); myLogger.trace("Sentence: " + thisSentence); myLogger.trace("Lead: " + thisLead); if (thisSentence == null || thisLead == null) { return null; } int sign = 0; String tag = ""; List<String> words = Arrays.asList(thisLead.split("\\s+")); String ptn = this.getPOSptn(dataholderHandler, words); myLogger.trace("ptn: " + ptn); Pattern p2 = Pattern.compile("ps"); Matcher m2 = p2.matcher(ptn); Pattern p3 = Pattern.compile("p(\\?)"); Matcher m3 = p3.matcher(ptn); Pattern p4 = Pattern.compile("[psn](b)"); Matcher m4 = p4.matcher(ptn); Pattern p5 = Pattern.compile("([psn][psn]+)"); Matcher m5 = p5.matcher(ptn); Pattern p6A = Pattern.compile("b[?b]([psn])$"); Matcher m6A = p6A.matcher(ptn); Pattern p6B = Pattern.compile("[?b]b([psn])$"); Matcher m6B = p6B.matcher(ptn); boolean case6A = m6A.find(); boolean case6B = m6B.find(); Pattern p7 = Pattern.compile("^s(\\?)$"); Matcher m7 = p7.matcher(ptn); Pattern p10 = Pattern.compile("^\\?(b)"); Matcher m10 = p10.matcher(ptn); // Case 1: single word case if (ptn.matches("^[pns]$")) { myLogger.trace("Case 1"); tag = words.get(0); sign = sign + dataholderHandler.updateDataHolder(tag, ptn, "-", "wordpos", 1); myLogger.debug("Directly markup with tag: " + tag + "\n"); } // Case 2: "ps" else if (m2.find()) { myLogger.trace("Case 2"); myLogger.debug("Found [ps] pattern\n"); int start = m2.start(); int end = m2.end(); String pWord = words.get(start); String sWord = words.get(end - 1); List<String> tempWords = StringUtility.stringArraySplice(words, 0, start + 1); tag = StringUtility.joinList(" ", tempWords); myLogger.debug("\tdetermine the tag: " + tag); int returnedSign = 0; returnedSign = dataholderHandler.updateDataHolder(pWord, "p", "-", "wordpos", 1); sign += returnedSign; myLogger.trace(String.format( "updateDataHolder(%s, p, -, wordpos, 1), returned: %d", pWord, returnedSign)); returnedSign = dataholderHandler.updateDataHolderNN(0, 
tempWords.size(), tempWords); sign += returnedSign; myLogger.trace(String.format( "updateDataHolderNN(0, %d, %s), returned: %d", tempWords.size(), tempWords.toString(), returnedSign)); returnedSign = dataholderHandler.updateDataHolder(sWord, "b", "", "wordpos", 1); sign += returnedSign; myLogger.trace(String.format( "updateDataHolder(%s, b, , wordpos, 1), returned: %d", sWord, returnedSign)); } // Case 3: "p(\\?)" else if (m3.find()) { myLogger.trace("Case 3"); myLogger.debug("Found [p?] pattern"); // int start = m3.start(1); int end = m3.end(1); String secondMatchedWord = words.get(end - 1); // case 3.1 if (StringUtils.equals(this.myWordFormUtility.getNumber(secondMatchedWord), "p")) { myLogger.trace("Case 3.1"); tag = secondMatchedWord; sign = sign + dataholderHandler.updateDataHolder(tag, "p", "-", "wordpos", 1); dataholderHandler .add2Holder( DataHolder.ISA, Arrays.asList(new String[] { tag, words.get(end - 2) })); myLogger.debug("\t:[p p] pattern: determine the tag: " + tag); } // case 3.2 else { myLogger.trace("Case 3.2"); List<String> wordsCopy = new ArrayList<String>(words); // $i is just end-1 List<String> tempWords = StringUtility.stringArraySplice(words, 0, end - 1); tag = StringUtility.joinList(" ", tempWords); myLogger.debug("\t:determine the tag: " + tag); myLogger.debug("\t:updates on POSs"); int temp = 0; temp = dataholderHandler.updateDataHolder( wordsCopy.get(end - 1), "b", "", "wordpos", 1); sign += temp; myLogger.debug("\t:updateDataHolder1 returns " + temp); temp = dataholderHandler.updateDataHolder( wordsCopy.get(end - 2), "p", "-", "wordpos", 1); sign += temp; myLogger.debug("\t:updateDataHolder2 returns " + temp); temp = dataholderHandler.updateDataHolderNN(0, tempWords.size(), tempWords); sign += temp; myLogger.debug("\t:updateDataHolder returns " + temp); } } // case 4: "[psn](b)" else if (m4.find()) { myLogger.trace("Case 4"); Pattern p41 = Pattern.compile("^sbp"); Matcher m41 = p41.matcher(ptn); if (m41.find()) { myLogger.trace("\tCase 
4.1"); myLogger.debug("Found [sbp] pattern"); List<String> wordsCopy = new ArrayList<String>(words); tag = StringUtility.joinList(" ", StringUtility.stringArraySplice(wordsCopy, 0, 3)); myLogger.trace("\t:determine the tag: " + tag); } else { myLogger.trace("\tCase 4.2"); myLogger.debug("Found [[psn](b)] pattern"); int index = m4.start(1); // get tag, which is the words prior to the b word (exclusive) List<String> wordsTemp = StringUtility.stringArraySplice(words, 0, index); tag = StringUtility.joinList(" ", wordsTemp); myLogger.trace("Tag: " + tag); // update the b word sign += dataholderHandler.updateDataHolder(words.get(index), "b", "", "wordpos", 1); myLogger.trace(String.format( "updateDataHolder (%s, b, , wordpos, 1)", words.get(index))); sign += dataholderHandler.updateDataHolder( words.get(index - 1), ptn.substring(index - 1, index), "-", "wordpos", 1); myLogger.trace(String.format( "updateDataHolder (%s, %s, -, wordpos, 1)", words.get(index - 1), ptn.substring(index - 1, index))); sign += dataholderHandler.updateDataHolderNN(0, wordsTemp.size(), wordsTemp); myLogger.trace(String.format("updateDataHolderNN (0, %d, %s)", wordsTemp.size(), wordsTemp.toString())); myLogger.debug("\t:determine the tag: " + tag); myLogger.debug("\t:updates on POSs"); } } // case 5: "pp" else if (m5.find()) { myLogger.debug("Case 5: Found [[psn][psn]+] pattern"); int start = m5.start(1); int end = m5.end(1); List<String> copyWords = new ArrayList<String>(); copyWords.addAll(words); GetNounsAfterPtnReturnValue returnedValue = this.getNounsAfterPtn(dataholderHandler, thisSentence, end); List<String> moreNoun = new LinkedList<String>(); List<String> morePtn = new LinkedList<String>(); String bWord = ""; moreNoun.addAll(returnedValue.getNouns()); morePtn.addAll(returnedValue.getNounPtn()); bWord = returnedValue.getBoundaryWord(); List<POSInfo> t; if (StringUtility.createMatcher(ptn, "pp").find()) { myLogger.trace("Case 5.1"); String morePtnStr = StringUtility.joinList("", morePtn); 
Pattern p511 = Pattern.compile("/^p*(s)"); Matcher m511 = p511.matcher(morePtnStr); Pattern p512 = Pattern.compile("^(p+)"); Matcher m512 = p512.matcher(morePtnStr); if (m511.find()) { myLogger.trace("Case 5.1.1"); // find last p word, and reset it to "b" int sAfterPIndex = m511.start(1); int lastPIndex = sAfterPIndex - 1; String sWord = moreNoun.get(sAfterPIndex); String lastPWord = lastPIndex >= 0 ? moreNoun .get(lastPIndex) : ""; bWord = lastPWord; if (StringUtils.equals(lastPWord, "")) { tag = words.get(ptn.lastIndexOf("p")); } else { tag = lastPWord; } sign += dataholderHandler.updateDataHolder(sWord, "b", "", "wordpos", 1); } else if (m512.find()) { myLogger.trace("Case 5.1.2"); tag = moreNoun.get(m512.end(1) - 1); } else { myLogger.trace("Case 5.1.3"); int lastPIndex = ptn.lastIndexOf("p"); tag = words.get(lastPIndex); } t = dataholderHandler.checkPOSInfo(tag); } else { myLogger.trace("Case 5.2"); List<String> tempWords = new LinkedList<String>(); tempWords .addAll(StringUtility.stringArraySplice(words, 0, end)); tag = StringUtility.joinList(" ", tempWords); if (moreNoun.size() > 0) { tag = tag + " " + StringUtility.joinList(" ", moreNoun); } t = dataholderHandler.checkPOSInfo( tag.substring(tag.lastIndexOf(" ") + 1, tag.length())); } if (t.size() > 0) { String pos = t.get(0).getPOS(); // String role = t.get(0).getRole(); // int certiantyU = t.get(0).getCertaintyU(); // int certiantyL = t.get(0).getCertaintyL(); if (StringUtility.createMatcher(pos, "[psn]").find()) { // case 5.x myLogger.debug("Case 5.x: relax this condition"); List<String> tWords = new LinkedList<String>(); tWords.addAll(Arrays.asList(thisSentence.split(" "))); sign += dataholderHandler.updateDataHolder(bWord, "b", "", "wordpos", 1); ptn = ptn.substring(start, end); String tempPtn = ptn + StringUtility.joinList("", morePtn); for (int k = start; k < tempPtn.length(); k++) { if (k != tempPtn.length() - 1) { sign += dataholderHandler.updateDataHolder( tWords.get(k), tempPtn.substring(k, k + 
1), "_", "wordpos", 1); } else { sign += dataholderHandler.updateDataHolder( tWords.get(k), tempPtn.substring(k, k + 1), "-", "wordpos", 1); } } if (tWords.size() > 1) { sign += dataholderHandler.updateDataHolderNN(0, tempPtn.length(), tWords); } } } myLogger.debug("\t:determine the tag: " + tag); } // case 6: "b[?b]([psn])$" or "[?b]b([psn])$" else if (case6A || case6B) { myLogger.debug("Case 6: Found [b?[psn]$] or [[?b]b([psn])$] pattern"); int end = -1; // the index of noun if (case6A) { end = m6A.end(1) - 1; } else { end = m6B.end(1) - 1; } GetNounsAfterPtnReturnValue tempReturnValue = this .getNounsAfterPtn(dataholderHandler, thisSentence, end + 1); // List<String> moreNouns = tempReturnValue.getNouns(); List<String> morePtn = tempReturnValue.getNounPtn(); String bWord = tempReturnValue.getBoundaryWord(); List<String> sentenceHeadWords = tokenizeText(thisSentence, "firstseg"); end += morePtn.size(); List<String> tempWords = StringUtility.stringArraySplice( sentenceHeadWords, 0, end + 1); tag = StringUtility.joinList(" ", tempWords); myLogger.debug("\t:updates on POSs"); if (StringUtility.createMatcher(bWord, "\\w").find()) { sign += dataholderHandler.updateDataHolder(bWord, "b", "", "wordpos", 1); } String allPtn = "" + ptn; allPtn = allPtn + StringUtility.joinList("", morePtn); // from the index of noun for (int i = 2; i < allPtn.length(); i++) { // case 6.1: last ptn if (i != allPtn.length() - 1) { myLogger.trace("Case 6.1"); sign += dataholderHandler.updateDataHolder( sentenceHeadWords.get(i), allPtn.substring(i, i + 1), "_", "wordpos", 1); } // case 6.2: not last ptn else { myLogger.trace("Case 6.2"); sign += dataholderHandler.updateDataHolder( sentenceHeadWords.get(i), allPtn.substring(i, i + 1), "-", "wordpos", 1); } } myLogger.debug("\t:determine the tag: " + tag); } // case 7: "^s(\\?)$" else if (m7.find()) { myLogger.trace("Case 7"); String singularWord = words.get(0); String questionedWord = words.get(1); String wnPOS = 
this.myWordFormUtility.checkWN( questionedWord, "pos"); if (StringUtility.createMatcher(wnPOS, "p").find()) { myLogger.trace("Case 7.1"); tag = singularWord + " " + questionedWord; myLogger.debug("\t:determine the tag: " + tag); myLogger.debug("\t:updates on POSs"); String questionedPOS = this.myWordFormUtility.getNumber(singularWord); sign += dataholderHandler.updateDataHolder(questionedWord, questionedPOS, "-", "wordpos", 1); } else { myLogger.trace("Case 7.2"); tag = words.get(0); myLogger.debug("\t:determine the tag: " + tag); myLogger.debug("\t:updates on POSs"); sign += dataholderHandler.updateDataHolder(questionedWord, "b", "", "wordpos", 1); sign += dataholderHandler.updateDataHolder(singularWord, "s", "-", "wordpos", 1); } } // case 8: "^bs$" else if (StringUtility.createMatcher(ptn, "^bs$").find()) { myLogger.trace("Case 8"); tag = StringUtility.joinList(" ", words); sign += dataholderHandler.updateDataHolder(words.get(0), "b", "", "wordpos", 1); sign += dataholderHandler.updateDataHolder(words.get(1), "s", "-", "wordpos", 1); } // case 9: ^bp$ else if (StringUtility.createMatcher(ptn, "^bp$").find()) { myLogger.trace("Case 9"); tag = StringUtility.joinList(" ", words); sign += dataholderHandler.updateDataHolder(words.get(0), "b", "", "wordpos", 1); sign += dataholderHandler.updateDataHolder(words.get(1), "p", "-", "wordpos", 1); } // case 10: "^\\?(b)" else if (m10.find()) { myLogger.trace("Case 10"); myLogger.trace("Found [?(b)] pattern"); int index = m10.start(1); sign += dataholderHandler.updateDataHolder(words.get(index), "b", "", "wordpos", 1); myLogger.trace(String.format( "updateDataHolder (%s, b, , wordpos, 1)", words.get(index))); List<String> wordsTemp = StringUtility.stringArraySplice(words, 0, index); tag = StringUtility.joinList(" ", wordsTemp); String word = words.get(index - 1); // the "?" 
word myLogger.trace("Tag: " + tag); myLogger.trace("Word: " + word); if (!isFollowedByNoun(dataholderHandler, thisSentence, thisLead)) { myLogger.trace("Case 10.1"); String wnP1 = this.myWordFormUtility.checkWN(word, "pos"); myLogger.trace("wnP1: " + wnP1); String wnP2 = ""; if (!StringUtility.createMatcher(wnP1, "\\w").find()) { wnP2 = this.myWordFormUtility.getNumber(word); } myLogger.trace("wnP2: " + wnP2); if (StringUtility.createMatcher(wnP1, "[ar]").find()) { wnP1 = ""; } if ((StringUtility.createMatcher(wnP1, "[psn]").find()) || (StringUtility.createMatcher(wnP2, "[ps]").find())) { myLogger.trace("Case 10.1.1"); myLogger.debug("\t:determine the tag: " + tag); myLogger.debug("\t:updates on POSs"); sign += dataholderHandler.updateDataHolder(word, "n", "-", "wordpos", 1); myLogger.trace(String.format( "updateDataHolder(%s, n, -, wordpos, 1)", word)); sign += dataholderHandler.updateDataHolderNN(0, wordsTemp.size() - 1, wordsTemp); myLogger.trace(String.format( "updateDataHolderNN(%d, %d, %s)", 0, wordsTemp.size() - 1, wordsTemp)); } else { myLogger.trace("Case 10.1.2"); myLogger.debug("\t:" + tag + " is adv/adj or modifier. skip."); tag = ""; } } else { myLogger.trace("Case 10.2"); myLogger.debug(String.format( "\t:%s is adv/adj or modifier. skip.", tag)); tag = ""; } } else { myLogger.trace("\tCase 0"); myLogger.trace(String.format("Pattern [%s] is not processed", ptn)); } StringAndInt returnValue = new StringAndInt(tag, sign); myLogger.trace("Return: " + returnValue.toString()); return returnValue; } public int doItCase7Helper(String regex, String ptn) { Matcher m = StringUtility.createMatcher(ptn, regex); if (m.find()) { int start = m.start(); return start + 1; } else { return -1; } } /** * The length of the ptn must be the same as the number of words in words. * If certainty is < 50%, replace POS with ?. 
* * @param words * @return */ public String getPOSptn(DataHolder dataholderHandler, List<String> words) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger .getLogger("learn.discover.ruleBasedLearn.doIt.getPOSptn"); myLogger.trace("Enter getPOSptn"); myLogger.trace("Words: " + words.toString()); String ptn = ""; String POS = ""; double certainty; for (int i = 0; i < words.size(); i++) { String word = words.get(i); myLogger.trace("\tCheck word: " + word); List<POSInfo> POSInfoList = dataholderHandler.checkPOSInfo(word); if (POSInfoList.size() >= 0) { if (POSInfoList.size() == 0) { myLogger.trace("\t\tThe word is not in WordPOS holder"); POS = "?"; } else { POSInfo p = POSInfoList.get(0); POS = p.getPOS(); if (p.getCertaintyU() == 0) { certainty = 1.0; } else { double certaintyU = (double) p.getCertaintyU(); double certaintyL = (double) p.getCertaintyL(); certainty = certaintyU / certaintyL; } myLogger.trace(String.format("\t\tCertaintyU: %d", p.getCertaintyU())); myLogger.trace(String.format("\t\tCertaintyL: %d", p.getCertaintyL())); myLogger.trace(String .format("\t\tCertainty: %f", certainty)); if ((!StringUtils.equals(POS, "?")) && (certainty <= 0.5)) { myLogger.info("\t\tThis POS has a certainty less than 0.5. 
It is ignored."); POS = "?"; } } ptn = ptn + POS; myLogger.trace("\t\tAdd pos: " + POS); } else { myLogger.error("Error: checkPOSInfo gave invalid return value"); } } myLogger.trace("Return ptn: " + ptn); myLogger.trace("Quite getPOSptn"); return ptn; } public GetNounsAfterPtnReturnValue getNounsAfterPtn(DataHolder dataholderHandler, String sentence, int startWordIndex) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.getNounsAfterPattern"); myLogger.trace(String .format("enter (%s, %d)", sentence, startWordIndex)); String bWord = ""; List<String> nouns = new ArrayList<String>(); List<String> nounPtn = new ArrayList<String>(); List<String> tempWords = new ArrayList<String>(); tempWords.addAll(tokenizeText(sentence, "firstseg")); List<String> words = StringUtility.stringArraySplice(tempWords, startWordIndex, tempWords.size()); myLogger.trace("words: " + words); String ptn = this.getPOSptn(dataholderHandler, words); myLogger.trace("ptn: " + ptn); if (ptn != null) { Matcher m1 = StringUtility.createMatcher(ptn, "^([psn]+)"); Matcher m2 = StringUtility.createMatcher(ptn, "^(\\?+)"); boolean case1 = false; boolean case2 = false; int end = -1; if (m1.find()) { case1 = true; end = m1.end(1); } if (m2.find()) { case2 = true; end = m2.end(1); } if (case1 || case2) { myLogger.trace("end: " + end); if (end < words.size()) { bWord = words.get(end); } List<String> nWords = new ArrayList<String>(); nWords.addAll(StringUtility.stringArraySplice(words, 0, end)); for (int i = 0; i < nWords.size(); i++) { String p = ptn.substring(i, i + 1); p = StringUtils.equals(p, "?") ? 
this.myWordFormUtility.checkWN(nWords.get(i), "pos") : p; if (StringUtility.createMatcher(p, "^[psn]+$").find()) { nouns.add(nWords.get(i)); nounPtn.add(p); } else { bWord = nWords.get(i); break; } } } } GetNounsAfterPtnReturnValue returnValue = new GetNounsAfterPtnReturnValue( nouns, nounPtn, bWord); myLogger.trace("return " + returnValue); return (returnValue); } /** * Check if a lead is followed by a noun without any proposition in between * in the sentence * * @param thisSentence * the sentence * @param thisLead * the lead * @return true if lead is followed by a N without any proposition in * between */ public boolean isFollowedByNoun(DataHolder dataholderHandler, String sentence, String lead) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger .getLogger("learn.discover.ruleBasedLearn.doIt.isFollowedByNoun"); myLogger.trace(String.format("(%s, %s)", sentence, lead)); // null case if (sentence == null || lead == null) { myLogger.trace("Return false"); return false; } if (StringUtils.equals(sentence, "")) { myLogger.trace("Return false"); return false; } // remove lead from sentence sentence = sentence.replaceFirst("^" + lead, ""); myLogger.trace("Sentence after remove lead: " + sentence); // List<String> nouns = this.myDataHolder.getWordByPOS("ps"); Set<String> POSTags = new HashSet<String>(); POSTags.add("p"); POSTags.add("s"); Set<String> nouns = dataholderHandler.getWordsFromWordPOSByPOSs(POSTags); if (nouns.size() == 0) { myLogger.trace("Return false"); return false; } // String pattern1 = StringUtility.joinList("|", nouns); String pattern1 = StringUtils.join(nouns, "|"); pattern1 = "(.*?)\\b(" + pattern1 + ")" + "\\b"; myLogger.trace("Pattern: " + pattern1); Pattern p1 = Pattern.compile(pattern1); Matcher m1 = p1.matcher(sentence); String inBetweenPart = ""; if (m1.find()) { inBetweenPart = m1.group(1); String pattern2 = "\\b(" + this.myConstant.PREPOSITION + ")\\b"; Pattern p2 = Pattern.compile(pattern2); Matcher m2 = 
p2.matcher(inBetweenPart); if (!m2.find()) { myLogger.trace("Return true"); return true; } } myLogger.trace("Return false"); return false; } }