package semanticMarkup.ling.learn.knowledge; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import org.apache.commons.lang3.StringUtils; import semanticMarkup.ling.learn.dataholder.DataHolder; import semanticMarkup.ling.learn.dataholder.SentenceStructure; import semanticMarkup.ling.learn.utility.LearnerUtility; import semanticMarkup.ling.learn.utility.StringUtility; /** * Identify and annotate clauses in which a comma is used to mean "and" by the * compound subject * * @author Dongye * */ public class CommaAsAndAnnotator implements IModule { private LearnerUtility myLearnerUtility; public CommaAsAndAnnotator(LearnerUtility learnerUtility) { this.myLearnerUtility = learnerUtility; } @Override public void run(DataHolder dataholderHandler) { // TODO Auto-generated method stub } /** * comma used for 'and': seen in TreatiseH, using comma for 'and' as in * "adductor , diductor scars clearly differentiated ;", which is the same * as "adductor and diductor scars clearly differentiated ;". ^m*n+,m*n+ or * m*n+,m*n+;$, or m,mn. Clauses dealt in commaand do not contain "and/or". * andortag() deals with clauses that do. * * @param dataholderHandler */ public void commaAnd(DataHolder dataholderHandler) { // cover m,mn // last + =>* // "(?:<[A-Z]*[NO]+[A-Z]*>[^<]+?<\/[A-Z]*[NO]+[A-Z]*>\\s*)+" String nPhrasePattern = "(?:<[A-Z]*[NO]+[A-Z]*>[^<]+?<\\/[A-Z]*[NO]+[A-Z]*>\\s*)+"; // add last \\s* // "(?:<[A-Z]*M[A-Z]*>[^<]+?<\/[A-Z]*M[A-Z]*>\\s*)" String mPhrasePattern = "(?:<[A-Z]*M[A-Z]*>[^<]+?<\\/[A-Z]*M[A-Z]*>\\s*)"; // "(?:<[A-Z]*B[A-Z]*>[,:\.;<]<\/[A-Z]*B[A-Z]*>)" String bPattern = "(?:<[A-Z]*B[A-Z]*>[,:.;<]<\\/[A-Z]*B[A-Z]*>)"; String commaPattern = "<B>,</B>"; String phrasePattern = mPhrasePattern + "\\s*" + nPhrasePattern; String pattern = phrasePattern + "\\s+" + commaPattern + "\\s+(?:" + phrasePattern + "| |" + commaPattern + ")+"; String pattern1 = "^(" + pattern + ")"; String pattern2 = "(.*?)(" + pattern + ")\\s*" + bPattern + "\\$"; // changed last * to + String pattern3 = "^((?:" + mPhrasePattern + "\\s+)+" + commaPattern + "\\s+(?:" + mPhrasePattern + "|\\s*|" + commaPattern + ")+" + mPhrasePattern + "+\\s*" + nPhrasePattern + ")"; for (SentenceStructure sentenceItem : dataholderHandler .getSentenceHolder()) { int sentenceID = sentenceItem.getID(); String sentence = sentenceItem.getSentence(); String sentenceCopy = "" + sentence; sentenceCopy = sentenceCopy.replaceAll("></?", ""); Matcher m1 = StringUtility.createMatcher(sentenceCopy, pattern1); Matcher m2 = StringUtility.createMatcher(sentenceCopy, pattern2); Matcher m3 = StringUtility.createMatcher(sentenceCopy, pattern3); // case 1 if (m1.find()) { String tag = m1.group(1); tag = tag.replaceAll(",", "and"); tag = tag.replaceAll("</?\\S+?>", ""); tag = StringUtility.trimString(tag); // case 1.1 if (!StringUtility.isMatchedNullSafe(tag, " and$")) { dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "", tag, "commaand[CA1]"); } } // case 2 else if (m2.find()) { String g1 = m2.group(1); String tag = m2.group(2); if (!StringUtility.isMatchedNullSafe(g1, "\\b(" + this.myLearnerUtility.getConstant().PREPOSITION + ")\\b") && !StringUtility.isMatchedNullSafe(g1, "<N>")) { tag = tag.replaceAll(",", "and"); tag = tag.replaceAll("</?\\S+?>", ""); tag = StringUtility.trimString(tag); // case 2.1.1 if (!StringUtility.isMatchedNullSafe(tag, " and$")) { dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "", tag, "commaand[CA2]"); } } } // case 3 else if (m3.find()) { String tag = m3.group(1); String g1 = m3.group(1); // case 3.1 if (!StringUtility.isMatchedNullSafe(g1, "\\b(" + this.myLearnerUtility.getConstant().PREPOSITION + ")\\b")) { tag = tag.replaceAll(",", "and"); tag = tag.replaceAll("</?\\S+?>", ""); tag = StringUtility.trimString(tag); // case 3.1.1 if (!StringUtility.isMatchedNullSafe(tag, " and$")) { String[] tagWords = tag.split("\\s+"); List<String> tagWordsList = new ArrayList<String>( Arrays.asList(tagWords)); tag = tagWordsList.get(tagWordsList.size() - 1); String modifier = StringUtils.join(tagWordsList .subList(0, tagWordsList.size() - 1), " "); dataholderHandler.tagSentenceWithMT(sentenceID, sentence, modifier, tag, "commaand[CA3]"); } } } } } }