package semanticMarkup.ling.learn.knowledge; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; import semanticMarkup.ling.learn.auxiliary.POSInfo; import semanticMarkup.ling.learn.dataholder.DataHolder; import semanticMarkup.ling.learn.dataholder.SentenceStructure; import semanticMarkup.ling.learn.utility.LearnerUtility; import semanticMarkup.ling.learn.utility.StringUtility; /** * Find Modifier/Organ for the same Ox: M1 Ox, M2 Ox Example: inner phyllaries, middle phyllaries #Find Mx/Oy where Ox != Oy Example: inner florets # ==>inner/middle = type modifier # Find TM C (character) patterns => TM = adjective nouns # outer and middle => outer is adject noun # outer and mid => mid is adject noun #===> infer more boundary words/structure: outer [ligules], inner [fertile] * @author Dongye * */ public class AdjectiveSubjectBootstrappingLearner implements IModule { private LearnerUtility myLearnerUtility; private String learningMode; private int maxTagLength; public AdjectiveSubjectBootstrappingLearner(LearnerUtility learnerUtility, String learningMode, int maxTagLength) { this.myLearnerUtility = learnerUtility; this.learningMode = learningMode; this.maxTagLength = maxTagLength; } @Override public void run(DataHolder dataholderHandler) { if (StringUtils.equals(this.learningMode, "adj")) { // myLogger.info("Bootstrapping on adjective subjects"); adjectiveSubjectBootstrapping(dataholderHandler, this.maxTagLength); } else { int v = 0; do { v = 0; this.handleAndOr(dataholderHandler); } while (v > 0); } } public void adjectiveSubjectBootstrapping(DataHolder dataholderHandler, int maxTagLength) { int flag = 0; int count = 0; do { // tag all sentences this.myLearnerUtility.tagAllSentences(dataholderHandler, "singletag", "sentence"); // adjective subject markup: may discover new modifier, new boundary, and new nouns int res1 = this.adjectiveSubjects(dataholderHandler); flag += res1; // work on tag='andor' clauses, move to the main bootstrapping int res2 = discoverNewModifiers(dataholderHandler); flag += res2; int res3 = this.handleAndOr(dataholderHandler); flag += res3; dataholderHandler.untagSentences(); int res4 = this.myLearnerUtility.doItMarkup(dataholderHandler, maxTagLength); } while (flag > 0); // reset unsolvable andor to NULL for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) { String tag = sentenceItem.getTag(); if (StringUtils.equals(tag, "andor")) { sentenceItem.setTag(null); } } // cases releazed from andor[m&mn] may be marked by adjectivesubjects this.myLearnerUtility.tagAllSentences(dataholderHandler, "singletag", "sentence"); this.adjectiveSubjects(dataholderHandler); } /** * works on annotated sentences that starts with a M in all non-ignored * sentences, find sentences that starts with a modifer <m> followed by a * boundary word <b>. (note, if the <B> is a punct mark, this sentence * should be tagged as ditto) Use the context to find the tag, use the * modifier as the modifie (markup process, no new discovery). for * "modifier unknown" pattern, check WNPOS of the "unknown" to decide if * "unknown" is a structure name (if it is a pl) or a boundary word (may * have new discoveries). Works on sentences, not leads * * @param dataholderHandler * @return # of updates */ public int adjectiveSubjects(DataHolder dataholderHandler) { Set<String> typeModifiers = new HashSet<String>(); // Part 1: collect evidence for the usage of "modifier boundry": typeModifiers = adjectiveSubjectsPart1(dataholderHandler, typeModifiers); for (String typeModifier : typeModifiers) { if (dataholderHandler.getModifierHolder().containsKey(typeModifier)) { dataholderHandler.getModifierHolder().get(typeModifier) .setIsTypeModifier(true); } } // Part 2: process "typemodifier unknown" patterns int flag = adjectiveSubjectsPart2(dataholderHandler, typeModifiers); return flag; } public Set<String> adjectiveSubjectsPart1(DataHolder dataholderHandler, Set<String> typeModifiers) { for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) { String sentenceCopy = ""+sentenceItem.getSentence(); String tag = sentenceItem.getTag(); if (!StringUtils.equals(tag, "ignore") || tag == null) { Pattern p = Pattern.compile(".*?<M>(\\S+)</M> <B>[^,.]+</B> (.*)"); Matcher m = p.matcher(sentenceCopy); while (m.find()) { sentenceCopy = m.group(2); String temp = m.group(1); temp = temp.replaceAll("<\\S+?>", ""); if (!typeModifiers.contains(temp)) { typeModifiers.add(temp); } } } } return typeModifiers; } public int adjectiveSubjectsPart2(DataHolder dataholderHandler, Set<String> typeModifiers) { String pos = null; int flag = 0; for (SentenceStructure sentenceItem : dataholderHandler .getSentenceHolder()) { int sentenceID = sentenceItem.getID(); String sentence = sentenceItem.getSentence(); String tag = sentenceItem.getTag(); String pattern = "<M>\\S*(" + StringUtils.join(typeModifiers, "|") + ")\\S*</M> .*"; int count = 0; if (((tag == null) || StringUtils.equals(tag, "") || StringUtils .equals(tag, "unknown")) && adjectiveSubjectsPart2Helper1(sentence, typeModifiers)) { if (sentence != null) { String sentenceCopy = sentence + ""; String regex = "(.*?)((?:(\\S+)\\s*(?:and|or|nor|and / or|or / and)\\s*)*(?:<M>\\S+</M>\\s*)+) (\\S+)\\s*(.*)"; Pattern p = Pattern.compile(regex); Matcher m = p.matcher(sentenceCopy); while (m.find()) { int knownPOS = 0; String start = m.group(1); String modifier = m.group(2); String newModifier = m.group(3); String word = m.group(4); sentenceCopy = m.group(5); // case 1 if (!this.myLearnerUtility.getConstant().forbiddenWords .contains(word)) { count++; continue; } // case 2 if (StringUtility.isMatchedNullSafe( newModifier.toUpperCase(), "<N>") || StringUtility.isMatchedNullSafe( start.toUpperCase(), "<N>")) { count++; continue; } // case 3 boolean c3 = this.myLearnerUtility.getConstant().prepositionWords.contains(word); if (count == 0 && ((StringUtility.isMatchedNullSafe(word, "[;,]") || c3) || (StringUtility.isMatchedNullSafe(word, "[.;,]") && !StringUtility.isMatchedNullSafe(sentence, "\\w")))) { // case 3.1 // start with a <[BM]>, followed by a <[BM]> if ((StringUtility.isMatchedNullSafe(word, "\\b(with|without|of)\\b")) && ((StringUtility.isMatchedNullSafe(modifier, "^(<M>)?<B>(<M>)?\\w+(</M)?</B>(</M>)? (?:and|or|nor|and / or|or / and)?\\s*(<[BM]>)+\\w+(</[BM]>)+\\s*$")) || (StringUtility.isMatchedNullSafe(modifier, "^(<[BM]>)+\\w+(</[BM]>)+$")))) { dataholderHandler.tagSentenceWithMT(sentenceID, sentenceCopy, "", "ditto", "adjectivesubject[ditto]"); count++; continue; } // case 3.2 // modifier={<M>outer</M> <M><B>pistillate</B></M>} word= <B>,</B> sentence= <N>corollas</N>.... // make the last modifier b else { if (modifier != null) { Pattern p2 = Pattern .compile("^(.*) (\\S+)$"); Matcher m2 = p2.matcher(modifier); if (m2.find()) { modifier = m2.group(1); String b = m2.group(2); String bCopy = "" + b; b = b.replaceAll("<\\S+?>", ""); dataholderHandler.updateDataHolder(b,"b", "", "wordpos", 1); tag = dataholderHandler.getParentSentenceTag(sentenceID); List<String> modifierAndTag = dataholderHandler.getMTFromParentTag(tag); String modifier2 = modifierAndTag.get(0); tag = modifierAndTag.get(1); modifier = modifier.replaceAll( "<\\S+?>", ""); if (StringUtility.isMatchedNullSafe(modifier2, "\\w")) { modifier = modifier + " " + modifier2; } dataholderHandler.tagSentenceWithMT( sentenceID, sentence, modifier, tag, "adjectivesubject[M-B,]"); count++; continue; } } } } // case 4 // get new modifier from modifiers like // "mid and/or <m>distal</m>" if (!StringUtility.isMatchedNullSafe(newModifier,"<") && StringUtility.isMatchedNullSafe(newModifier, "\\w") && StringUtility.isMatchedNullSafe(start,",(?:</B>)?\\s*$")) { flag += dataholderHandler.updateDataHolder(newModifier, "m", "", "modifiers", 1); // print "find a modifier [E0]: $newm\n" if $debug; } // case 5 // pos = "N"/"B" if (word != null) { Pattern p5 = Pattern.compile("([A-Z])>(<([A-Z])>)?(.*?)<"); Matcher m5 = p5.matcher(word); if (m5.find()) { String g1 = m5.group(1); String g2 = m5.group(2); String g3 = m5.group(3); String g4 = m5.group(4); String t1 = g1; String t2 = g3; word = g4; pos = t1 + t2; // if <N><B>, decide on one tag if (pos.length() > 1) { if (StringUtility.isMatchedNullSafe(sentence, "^\\s*<B>[,;:]<\\/B>\\s*<N>") ||StringUtility.isMatchedNullSafe(sentence, "^\\s*<B>\\.<\\/B>\\s*$")){ pos = "B"; } else { pos = "N"; } } knownPOS = 1; } else { List<POSInfo> POSs = dataholderHandler.checkPOSInfo(word); pos = POSs.get(0).getPOS(); } } pos = StringUtils.equals(pos, "?") ? this.myLearnerUtility.getWordFormUtility().getNumber(word) : pos; // part 6 // markup sentid, update pos for word, new modifier if (StringUtils.equals(pos, "p") || StringUtils.equals(pos, "N")) { if (knownPOS != 0) { flag += dataholderHandler.updateDataHolder(word, "p", "-", "wordpos", 1); // /print "update [$word] pos: p\n" if (!$knownpos) && $debug; } if (count == 0 && (StringUtility.isMatchedNullSafe(start, "^\\S+\\s?(?:and |or |and \\/ or |or \\/ and )?$") ||start.length() == 0)) { modifier = start + modifier; modifier = modifier.replaceAll("<\\S+?>", ""); word = word.replaceAll("<\\S+?>", ""); dataholderHandler.tagSentenceWithMT(sentenceID, sentence, modifier, tag, "adjectivesubject[M-N]"); // new modifier start = start.replaceAll("\\s*(and |or |and \\/ or |or \\/ and )\\s*", ""); start = start.replaceAll("<\\S+?>", ""); while (StringUtility.isMatchedNullSafe(start, "^("+this.myLearnerUtility.getConstant().STOP+")\\b")) { start = start.replaceAll("^("+this.myLearnerUtility.getConstant().STOP+")\\b\\s*", ""); } if (start.length() > 0) { flag += dataholderHandler.updateDataHolder(start, "m", "", "modifiers", 1); //print "find a modifier [E]: $start\n" if $debug; } } } // not p else { if (knownPOS != 0) { // update pos for word, markup sentid (get tag // from context), new modifier flag += dataholderHandler.updateDataHolder(word, "b", "", "wordpos", 1); // print "update [$word] pos: b\n" if $debug; } if (count == 0 && (StringUtility.isMatchedNullSafe(start, "^\\S+\\s?(?:and |or |and \\/ or |or \\/ and )?$") ||start.length() == 0)) { while (StringUtility.isMatchedNullSafe(start, "^("+this.myLearnerUtility.getConstant().STOP+"|"+this.myLearnerUtility.getConstant().FORBIDDEN+"|\\w+ly)\\b")) { start = start.replaceAll("^("+this.myLearnerUtility.getConstant().STOP+"|"+this.myLearnerUtility.getConstant().FORBIDDEN+"|\\w+ly)\\b\\s*", ""); } modifier = start + modifier; modifier = modifier.replaceAll("<\\S+?>", ""); tag = dataholderHandler.getParentSentenceTag(sentenceID); List<String> modifierAndTag = dataholderHandler.getMTFromParentTag(tag); String newM = modifierAndTag.get(0); tag = modifierAndTag.get(1); if (StringUtility.isMatchedNullSafe(newM, "\\w")) { modifier = modifier + " " + newM; } dataholderHandler.tagSentenceWithMT(sentenceID, sentence, modifier, tag, "adjectivesubject[M-B]"); // new modifier start = start.replaceAll("\\s*(and |or |and \\/ or |or \\/ and )\\s*", ""); start = start.replaceAll("<\\S+?>", ""); if (start.length() > 0) { if (!StringUtility.isMatchedNullSafe(start, "ly\\s*$") && !StringUtility.isMatchedNullSafe(start, "\\b(" + this.myLearnerUtility.getConstant().STOP + "|" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b")) { flag += dataholderHandler.updateDataHolder(word, "m", "", "modifiers", 1); // print "find a modifier [F]: $start\n" if $debug; } } } } count++; } } } } return flag; } public boolean adjectiveSubjectsPart2Helper1(String sentence, Set<String> typeModifiers) { String pattern = "<M>\\S*(" + StringUtils.join(typeModifiers, "|") + ")\\S*</M> .*"; return StringUtility.isMatchedNullSafe(sentence, pattern); } /** * Discover new modifiers using and/or pattern. * For "modifier and/or unknown boundary" pattern or * "unknown and/or modifier boundary" pattern, make "unknown" a modifier * * @param dataholderHandler * @return */ public int discoverNewModifiers(DataHolder dataholderHandler) { int sign = 0; // "modifier and/or unknown boundary" pattern for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) { String sentenceTag = sentenceItem.getTag(); String sentence = sentenceItem.getSentence(); int sentenceID = sentenceItem.getID(); if ((!StringUtility.isMatchedNullSafe(sentenceTag, "ignore") || sentenceTag == null) && StringUtility.isMatchedNullSafe(sentence, "<M>[^\\s]+</M> (or|and|and / or|or / and) .*")){ String POS = ""; // if "<m>xxx</m> (and|or) yyy (<b>|\d)" pattern appears at the // beginning or is right after the 1st word of the sentence, // mark up the sentence, add yyy as a modifier if (sentence != null) { Pattern p1 = Pattern.compile("^(?:\\w+\\s)?<M>(\\S+)<\\/M> (and|or|nor|and \\/ or|or \\/ and) ((?:<[^M]>)*[^<]+(?:<\\/[^M]>)*) <B>[^,;:\\.]"); Matcher m1 = p1.matcher(sentence); if (m1.find()) { String g1 = m1.group(1); String g2 = m1.group(2); String g3 = m1.group(3); String modifier = g1 +" "+ g2+" "+ g3; String newM = g3; if (!StringUtility.isMatchedNullSafe(newM, "\\b("+this.myLearnerUtility.getConstant().STOP+")\\b")) { modifier = modifier.replaceAll("<\\S+?>", ""); if (newM != null) { Pattern p11 = Pattern.compile("(.*?>)(\\w+)<\\/"); Matcher m11 = p11.matcher(newM); if (m11.find()) { newM = m11.group(2); POS = m11.group(1); } } // update N to M: retag sentences tagged as $newm, remove [s] record from wordpos if (StringUtility.isMatchedNullSafe(POS, "<N>")) { sign += dataholderHandler.changePOS(newM, "s", "m", "", 1); } // B else { sign += dataholderHandler.updateDataHolder(newM, "m", "", "modifiers", 1); } // print "find a modifier [A]: $newm\n" if $debug; String tag = dataholderHandler.getParentSentenceTag(sentenceID); List<String> modifierAndTag = dataholderHandler.getMTFromParentTag(tag); String m = modifierAndTag.get(0); tag = modifierAndTag.get(1); if (StringUtility.isMatchedNullSafe(m, "\\w")) { modifier = modifier + " "+m; } dataholderHandler.tagSentenceWithMT(sentenceID, sentence, modifier, tag, "discovernewmodifiers"); } } // if the pattern appear in the middle of the sentence, add yyy as modifier else { Pattern p2 = Pattern.compile("<M>(\\S+)<\\/M> (and|or|nor|and \\/ or|or \\/ and) (\\w+) <B>[^,;:\\.]"); Matcher m2 = p2.matcher(sentence); if (m2.find()) { String newM = m2.group(3); sign += dataholderHandler.updateDataHolder(newM, "m", "", "modifiers", 1); // print "find a modifier[B]: $newm\n" if $debug; } } } } } // "unknown and/or modifier boundary" for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) { String sentence = sentenceItem.getSentence(); String sentenceTag = sentenceItem.getTag(); if ((!StringUtility.isMatchedNullSafe(sentenceTag, "ignore") || sentenceTag == null) && StringUtility.isMatchedNullSafe(sentence, "[^\\w]+ (and|or|nor|and / or|or / and) <M>[^\\w]+</M> .*")) { int sentenceID = sentenceItem.getID(); String POS = ""; // if "xxx (and|or|nor) <m>yyy</m> (<b>|\d)" pattern appear at the beginning or is right after the 1st word of the sentence, mark up the sentence, add yyy as a modifier if (sentence != null) { Pattern p3 = Pattern.compile("^(?:\\w+\\s)?((?:<[^M]>)*[^<]+(?:<\\/[^M]>)*) (and|or|nor|and \\/ or|or \\/ and) <M>(\\S+)<\\/M> <B>[^:;,\\.]"); Matcher m3 = p3.matcher(sentence); if (m3.find()) { String g1 = m3.group(1); String g2 = m3.group(2); String g3 = m3.group(3); String modifier = g1 + " " + g2 + " " + g3; String newM = g1; modifier = modifier.replaceAll("<\\S+?>", ""); if (newM != null) { Pattern p31 = Pattern.compile("(.*?>)(\\w+)<\\/"); Matcher m31 = p31.matcher(newM); if (m31.find()) { // N or B newM = m31.group(2); POS = m31.group(1); } } if (StringUtility.isMatchedNullSafe(POS, "<N>")) { // update N to M sign += dataholderHandler.changePOS(newM, "s", "m", "", 1); // update $newm to m } else { // B sign += dataholderHandler.updateDataHolder(newM, "m", "", "modifiers", 1); } // print "find a modifier [C]: $newm\n" if $debug; String tag = dataholderHandler.getParentSentenceTag(sentenceID); List<String> modifierAndTag = dataholderHandler.getMTFromParentTag(tag); String m = modifierAndTag.get(0); tag = modifierAndTag.get(1); if (StringUtility.isMatchedNullSafe(m, "\\w")) { modifier = modifier +" "+m; } dataholderHandler.tagSentenceWithMT(sentenceID, sentence, modifier, tag, "discovernewmodifiers"); } else { Pattern p32 = Pattern.compile("(\\w+) (and|or|nor|and \\/ or|or \\/ and) <M>(\\S+)<\\/M> <B>[^,:;\\.]"); Matcher m32 = p32.matcher(sentence); // if the pattern appear in the middle of the sentence, add yyy as modifier if (m32.find()) { String newM = m32.group(1); sign += dataholderHandler.updateDataHolder(newM, "m", "", "modifiers", 1); } //print "find a modifier [D]: $newm\n" if $debug; } } } } return sign; } public int handleAndOr(DataHolder dataholderHandler) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.handleAndOr"); myLogger.info("to match pattern " + Constant.ANDORPTN); List<SentenceStructure> sentenceItems = dataholderHandler .getSentencesByTagPattern("^andor$"); int sign = 0; for (SentenceStructure sentenceItem : sentenceItems) { int sentenceID = sentenceItem.getID(); String sentence = sentenceItem.getSentence(); // myLogger.trace(Constant.SEGANDORPTN); // myLogger.trace(Constant.ANDORPTN); int result = this.andOrTag(dataholderHandler, sentenceID, sentence, Constant.SEGANDORPTN, Constant.ANDORPTN); sign = sign + result; } return sign; } public int andOrTag(DataHolder dataholderHandler, int sentenceID, String sentence, String sPattern, String wPattern) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.andOrTag"); myLogger.trace("Enter"); int sign = 0; List<String> mPatterns = new ArrayList<String>(); List<String> sPatterns = new ArrayList<String>(); List<String> mSegments = new ArrayList<String>(); List<String> sSegments = new ArrayList<String>(); Set<String> token = new HashSet<String>(); token.addAll(Arrays.asList("and or nor".split(" "))); token.add("\\"); token.add("and / or"); String strToken = "(" + StringUtils.join(token, " ") + ")"; int limit = 80; List<String> words = new ArrayList<String>(); words.addAll(Arrays.asList(sentence.split(" "))); String pattern = this.myLearnerUtility.getSentencePtn( dataholderHandler, token, limit, words); pattern = pattern.replaceAll("t", "m"); myLogger.info(String.format("Andor pattern %s for %s", pattern, words.toString())); if (pattern == null) { return -1; } // Matcher m1 = StringUtility.createMatcher(pattern, wPattern); Matcher m2 = StringUtility.createMatcher(pattern, "^b+&b+[,:;.]"); if (sentenceID == 163) { System.out.println(); } List<List<String>> res = this.andOrTagCase1Helper(pattern, wPattern, words, token); if (res != null) { mPatterns = res.get(0); mSegments = res.get(1); sPatterns = res.get(2); sSegments = res.get(3); List<String> tagAndModifier1 = res.get(4); List<String> tagAndModifier2 = res.get(5); List<String> update1 = res.get(6); List<String> update2 = res.get(7); if (tagAndModifier1.size() > 0) { String modifier = tagAndModifier1.get(0); String tag = tagAndModifier1.get(1); dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "", tag, "andor[n&n]"); myLogger.trace("tagSentenceWithMT(" + sentenceID + ", " + sentence + ", , " + tag + ", andor[n&n]"); } else { myLogger.debug(String.format( "Andor can not determine a tag or modifier for %d: %s", sentenceID, sentence)); } if (tagAndModifier2.size() > 0) { String modifier = tagAndModifier2.get(0); String tag = tagAndModifier2.get(1); dataholderHandler.tagSentenceWithMT(sentenceID, sentence, modifier, tag, "andor[m&mn]"); myLogger.trace("tagSentenceWithMT(" + sentenceID + ", " + sentence + ", " + modifier + ", " + tag + ", andor[m&mn]"); } else { myLogger.debug(String.format( "Andor can not determine a tag or modifier for %d: %s", sentenceID, sentence)); } if (update1.size() > 0) { String newBoundaryWord = update1.get(0); sign = sign + dataholderHandler.updateDataHolder(newBoundaryWord, "b", "", "wordpos", 1); } if (update2.size() > 0) { for (String tempWord : update2) { sign = sign + dataholderHandler.updateDataHolder(tempWord, "p", "-", "wordpos", 1); } } } else if (m2.find()) { myLogger.trace("Case 2"); dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "", "ditto", "andor"); } else { myLogger.trace("Case 3"); myLogger.trace("[andortag]Andor can not determine a tag or modifier for " + sentenceID + ": " + sentence); } myLogger.trace("Return " + sign + "\n"); return sign; } public List<List<String>> andOrTagCase1Helper(String pattern, String wPattern, List<String> words, Set<String> token) { PropertyConfigurator.configure("conf/log4j.properties"); Logger myLogger = Logger.getLogger("learn.andOrTag"); List<String> mPatterns = new ArrayList<String>(); List<String> sPatterns = new ArrayList<String>(); List<String> mSegments = new ArrayList<String>(); List<String> sSegments = new ArrayList<String>(); List<String> update1 = new ArrayList<String>(); List<String> update2 = new ArrayList<String>(); List<String> tagAndModifier1 = new ArrayList<String>(); List<String> tagAndModifier2 = new ArrayList<String>(); String strToken = "(" + StringUtils.join(token, " ") + ")"; Matcher m1 = StringUtility.createMatcher(pattern, wPattern); if (m1.find()) { myLogger.trace("Case 1"); if (pattern.equals("n&qqnbq")) { // System.out.println(); } int start1 = m1.start(1); int end1 = m1.end(1); int start2 = m1.start(2); int end2 = m1.end(2); int start3 = m1.start(3); int end3 = m1.end(3); int start4 = m1.start(4); int end4 = m1.end(4); int start5 = m1.start(5); int end5 = m1.end(5); // System.out.println(pattern); // System.out.println(start1); // System.out.println(); String earlyGroupsPattern = start1 == -1 ? "" : pattern.substring( 0, start1); String[] patterns = earlyGroupsPattern.split("s*<B>,<\\/B>\\s*"); String earlyGroupsWords = start1 == -1 ? "" : StringUtils.join( words.subList(0, start1), " "); String[] segments = earlyGroupsWords.split("\\s*<B>,<\\/B>s*"); String secondLastModifierPattern = m1.group(1); String secondLastModifierWords = secondLastModifierPattern == null ? "" : StringUtils.join(words.subList(start1, end1), " "); String sencondLastStructurePattern = m1.group(2); String secondLastStructureWords = sencondLastStructurePattern == null ? "" : StringUtils.join(words.subList(start2, end2), " "); String lastModifierPattern = m1.group(3); String lastModifierWords = lastModifierPattern == null ? "" : StringUtils.join(words.subList(start3, end3), " "); String lastStructurePattern = m1.group(4); String lastStructureWords = lastStructurePattern == null ? "" : StringUtils.join(words.subList(start4, end4), " "); String endSegmentPattern = m1.group(5); String endSegmentWords = endSegmentPattern == null ? "" : StringUtils.join(words.subList(start5, end5), " "); int bIndex = start5; // matching pattern with original text if (!(patterns.length == 1 && StringUtils.equals(patterns[0], ""))) { for (int i = 0; i < patterns.length; i++) { Pattern p = Pattern.compile("sPattern"); Matcher m10 = p.matcher(patterns[i]); if (m10.find()) { String g1 = m10.group(1); mPatterns.add(g1); String g2 = m10.group(2); sPatterns.add(g2); List<String> w = new ArrayList<String>( Arrays.asList(segments[i].split(" "))); String m = StringUtils.join(w.subList(0, m10.end(1)), " "); if (StringUtility.isMatchedNullSafe(m, "\\b(although|but|when|if|where)\\b")) { return null; } mSegments.add(m); sSegments.add(StringUtils.join( w.subList(m10.end(1), w.size()), " ")); } else { myLogger.info("wrong segment: " + patterns[i] + "=>" + segments[i] + "\n"); return null; } } } if (secondLastModifierPattern != null) mPatterns.add(secondLastModifierPattern); if (!StringUtils.equals(secondLastModifierWords, "")) mSegments.add(secondLastModifierWords); if (sencondLastStructurePattern != null) sPatterns.add(sencondLastStructurePattern); if (!StringUtils.equals(secondLastStructureWords, "")) sSegments.add(secondLastStructureWords); if (lastModifierPattern != null) mPatterns.add(lastModifierPattern); if (!StringUtils.equals(lastModifierWords, "")) mSegments.add(lastModifierWords); if (lastStructurePattern != null) sPatterns.add(lastStructurePattern); if (!StringUtils.equals(lastStructureWords, "")) sSegments.add(lastStructureWords); // find the modifier and the tag for sentenceID // case 1.1 if (this.countStructures(sPatterns) > 1) { // compound subject involving multiple structures: mn,mn,&mn => // use all but bounary as the tag, modifier=""; String tag = StringUtils.join(words.subList(0, bIndex), " "); String modifier = ""; tag = tag.replaceAll("<\\S+?>", ""); if (tag != null) { String regex11 = "\\b(" + StringUtils.join(token, "|") + ")\\b"; Matcher m11 = StringUtility.createMatcher(tag, regex11); if (m11.find()) { String conj = m11.group(1); tag = tag.replaceAll(",", " " + conj + " "); tag = tag.replaceAll("\\s+", " "); tag = tag.replaceAll("(" + conj + " )+", "$1"); tag = tag.replaceAll("^\\s+", ""); tag = tag.replaceAll("\\s+$", ""); // dataholderHandler.tagSentenceWithMT(sentenceID, // sentence, "", tag, "andor[n&n]"); tagAndModifier1.add(""); tagAndModifier1.add(tag); } // else { // myLogger.debug(String.format("Andor can not determine a tag or modifier for %d: %s", // sentenceID, sentence)); // } } // case 1.2 else if (this.countStructures(sPatterns) == 1) { // m&mn => connect all modifiers as the modifier, and the n // as the tag int i = 0; for (i = 0; i < sPatterns.size(); i++) { if (StringUtility.isMatchedNullSafe(sPatterns.get(i), "\\w")) { break; } } tag = sSegments.get(i); tag = tag.replaceAll("<\\S+?>", ""); modifier = StringUtils.join(mSegments, " "); modifier = modifier.replaceAll("<\\S+?>", ""); tag = StringUtility.trimString(tag); modifier = StringUtility.trimString(modifier); String myStop = this.myLearnerUtility.getConstant().STOP; myStop = myStop.replaceAll( String.format("\\b%s\\b", token), ""); myStop = myStop.replaceAll("\\s+$", ""); if (StringUtility.isMatchedNullSafe(modifier, "\\b" + strToken + "\\b") && StringUtility.isEntireMatchedNullSafe(modifier, "\\b(" + myStop + "|to)\\b")) { // case 1.2.1 List<String> wordsTemp = new ArrayList<String>(); wordsTemp.addAll(Arrays.asList(tag.split("\\s+"))); modifier = modifier + " " + StringUtils.join(wordsTemp.subList(0, wordsTemp.size() - 1), " "); tag = wordsTemp.get(wordsTemp.size() - 1); // dataholderHandler.tagSentenceWithMT(sentenceID, // sentence, modifier, tag, "andor[m&mn]"); tagAndModifier2.add(modifier); tagAndModifier2.add(tag); } // else { // myLogger.debug(String.format("Andor can not determine a tag or modifier for %d: %s", // sentenceID, sentence)); // } } // case 1.3 else { myLogger.debug("Andor can not determine a tag or modifier"); } int q = -1; if (endSegmentPattern != null) { Matcher m13 = StringUtility.createMatcher( endSegmentPattern, "q"); if (m13.find()) { q = m13.start(); } } if (q >= 0) { String newBoundaryWord = endSegmentWords.split(" ")[q]; if (StringUtility.isMatchedNullSafe(newBoundaryWord, "\\w")) { update1.add(newBoundaryWord); // sign = sign + // dataholderHandler.updateDataHolder(newBoundaryWord, // "b", "", "wordpos", 1); } } // structure patterns and segments: $nptn = // "((?:[np],?)*&?[np])"; #grouped #must present, no q allowed // mark all ps "p" for (int i = 0; i < sPatterns.size(); i++) { String sPatternI = sPatterns.get(i); sPatternI = sPatternI.replaceAll("(.)", "$1 "); sPatternI = StringUtility.trimString(sPatternI); String[] ps = sPatternI.split(" "); String[] ts = sSegments.get(i).split("\\s+"); for (int j = 0; j < ps.length; j++) { if (StringUtils.equals(ps[j], "p")) { ts[j] = StringUtility.trimString(ts[j]); update2.add(ts[j]); // sign = sign // + dataholderHandler.updateDataHolder(ts[j], // "p", "-", "wordpos", 1); } } } } List<List<String>> res = new ArrayList<List<String>>(); res.add(mPatterns); res.add(mSegments); res.add(sPatterns); res.add(sSegments); res.add(tagAndModifier1); res.add(tagAndModifier2); res.add(update1); res.add(update2); return res; } else { return null; } } public int countStructures(List<String> patterns) { int count = 0; for (String pattern : patterns) { if (StringUtility.isMatchedNullSafe(pattern, "\\w")) { count++; } } return count; } }