package semanticMarkup.ling.learn.knowledge;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.dataholder.ModifierTableValue;
import semanticMarkup.ling.learn.dataholder.SentenceStructure;
import semanticMarkup.ling.learn.utility.StringUtility;

/**
 * Sentences that are tagged with a common substructure, such as blades or
 * margins, need to be modified with their parent structure.
 *
 * @author Dongye
 */
public class CommonSubstructureAnnotator implements IModule {

    /** Tag value of sentences that must not be processed. */
    private static final String IGNORE_TAG = "ignore";

    /**
     * Parent tag value that is treated as "no usable parent".
     * NOTE(review): presumably the default returned by
     * {@code DataHolder.getParentSentenceTag} - confirm against DataHolder.
     */
    private static final String PARENT_TAG_PLACEHOLDER = "[parenttag]";

    public CommonSubstructureAnnotator() {
        // nothing to initialise
    }

    @Override
    public void run(DataHolder dataholderHandler) {
        this.commonSubstructure(dataholderHandler);
    }

    /**
     * Re-tags every sentence whose tag is a common substructure so that its
     * modifier also carries the parent structure.
     *
     * <p>
     * A sentence is processed when its tag is non-null, is not "ignore",
     * matches one of the tags returned by
     * {@link #getCommonStructures(DataHolder)} (optionally wrapped in square
     * brackets), does not contain "[" (i.e. was not already inferred from a
     * parent tag, e.g. mid/[phyllaries]) and its modifier does not already
     * contain a structure word.
     *
     * @param dataholderHandler
     *            data holder providing the sentences, word POS and modifier
     *            tables; updated through {@code tagSentenceWithMT}
     */
    public void commonSubstructure(DataHolder dataholderHandler) {
        Set<String> commonTags = this.getCommonStructures(dataholderHandler);
        if (commonTags.isEmpty()) {
            // An empty alternation would match empty / bracket-only tags.
            return;
        }

        // Quote every tag so that regex metacharacters inside a tag are
        // matched literally.
        Set<String> quotedTags = new HashSet<String>();
        for (String commonTag : commonTags) {
            quotedTags.add(Pattern.quote(commonTag));
        }
        // Regex: ^\[?(tag1|tag2|...)\]?$
        String tagPattern = "^\\[?(" + StringUtils.join(quotedTags, "|")
                + ")\\]?$";

        for (SentenceStructure sentenceItem : dataholderHandler
                .getSentenceHolder()) {
            String tag = sentenceItem.getTag();
            if (tag == null || StringUtils.equals(tag, IGNORE_TAG)
                    || !StringUtility.isMatchedNullSafe(tag, tagPattern)) {
                continue;
            }

            // The tag was already inferred from a parent tag:
            // mid/[phyllaries]
            if (StringUtility.isMatchedNullSafe(tag, "\\[")) {
                continue;
            }

            String modifier = StringUtils.defaultString(sentenceItem
                    .getModifier());
            // The common substructure is already modified by a structure.
            if (isModifierContainsStructure(dataholderHandler, modifier)) {
                continue;
            }

            int sentenceID = sentenceItem.getID();
            String sentence = sentenceItem.getSentence();

            String rawParentTag = dataholderHandler
                    .getParentSentenceTag(sentenceID);
            // Test the placeholder BEFORE the brackets are stripped,
            // otherwise the comparison can never succeed.
            if (rawParentTag == null
                    || StringUtils.equals(rawParentTag,
                            PARENT_TAG_PLACEHOLDER)) {
                continue;
            }

            String parentStructure = rawParentTag.replaceAll("[\\[\\]]", "");
            if (parentStructure.trim().length() == 0) {
                // nothing to add to the modifier
                continue;
            }
            String parentRegex = Pattern.quote(parentStructure);
            if (StringUtility.isMatchedNullSafe(modifier, parentRegex)
                    || StringUtility.isMatchedNullSafe(tag, parentRegex)) {
                // parent structure is already present in modifier or tag
                continue;
            }

            // remove any overlapped words btw parentStructure and tag
            String pTag = rawParentTag.replaceAll(
                    "\\b" + Pattern.quote(tag) + "\\b", "");
            pTag = StringUtils.defaultString(StringUtility.trimString(pTag));
            pTag = pTag.replaceAll("\\s+", " ");
            modifier = StringUtils.defaultString(StringUtility
                    .trimString(modifier));

            String newModifier;
            if (isTypeModifier(dataholderHandler, modifier)) {
                // cauline/base => cauline [leaf] / base
                newModifier = modifier + " " + pTag;
            } else {
                // main marginal/spine => [leaf blade] main marginal/spine
                newModifier = pTag + " " + modifier;
            }
            // avoids a dangling blank when the modifier was empty
            newModifier = newModifier.trim();

            dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
                    newModifier, tag, "commonsubstructure");
        }
    }

    /**
     * Tells whether the last word of the modifier is registered as a type
     * modifier in the modifier holder.
     *
     * @param dataholderHandler
     *            data holder providing the modifier holder
     * @param modifier
     *            modifier phrase, words separated by whitespace; may be null
     * @return true if the last word is in the modifier holder and flagged as
     *         a type modifier; false otherwise, including for a null or
     *         blank modifier
     */
    public boolean isTypeModifier(DataHolder dataholderHandler, String modifier) {
        if (modifier == null) {
            return false;
        }
        String[] words = modifier.split("\\s+");
        if (words.length == 0) {
            // a whitespace-only input splits into an empty array
            return false;
        }
        String word = words[words.length - 1];
        if (!dataholderHandler.getModifierHolder().containsKey(word)) {
            return false;
        }
        // Look up the same key that was just tested (the last word), not
        // the whole modifier phrase.
        ModifierTableValue modifierItem = dataholderHandler
                .getModifierHolder().get(word);
        return modifierItem != null && modifierItem.getIsTypeModifier();
    }

    /**
     * Tells whether any word of the modifier is a word with POS "p" or "s"
     * in the word POS holder.
     *
     * @param dataholderHandler
     *            data holder providing the word POS lookup
     * @param modifier
     *            modifier phrase, words separated by whitespace; may be null
     * @return true if at least one word is a "p"/"s" word; false otherwise,
     *         including for a null modifier
     */
    public boolean isModifierContainsStructure(DataHolder dataholderHandler,
            String modifier) {
        if (modifier == null) {
            return false;
        }
        // The lookup does not depend on the word, so do it once.
        Set<String> POSTags = new HashSet<String>(Arrays.asList("p", "s"));
        Set<String> PSWords = dataholderHandler
                .getWordsFromWordPOSByPOSs(POSTags);
        for (String word : modifier.split("\\s+")) {
            if (PSWords.contains(word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Find tags with more than one different structure modifier.
     *
     * <p>
     * Structures are the words from the word POS holder that are "p"/"s" but
     * not "b". Sentences whose tag is null, empty, "ignore", or contains a
     * space or "[" are not considered.
     *
     * @param dataholderHandler
     *            data holder providing the sentences and the word POS lookup
     * @return the set of tags that occur with at least two distinct
     *         structure modifiers; never null
     */
    public Set<String> getCommonStructures(DataHolder dataholderHandler) {
        // Get structures.
        // Structures are just words from WordPOS holder that are P/S but not B
        Set<String> PSTags = new HashSet<String>(Arrays.asList("s", "p"));
        Set<String> BTags = new HashSet<String>(Arrays.asList("b"));
        Set<String> PSWords = dataholderHandler
                .getWordsFromWordPOSByPOSs(PSTags);
        Set<String> BWords = dataholderHandler.getWordsFromWordPOSByPOSs(BTags);
        Set<String> allStructures = StringUtility.setSubtraction(PSWords,
                BWords);

        // Map each tag to the structure modifiers it occurs with
        Map<String, Set<String>> tagToModifiers = new HashMap<String, Set<String>>();
        for (SentenceStructure sentenceItem : dataholderHandler
                .getSentenceHolder()) {
            String tag = sentenceItem.getTag();
            String modifier = sentenceItem.getModifier();

            // A null or empty tag cannot be a common substructure; skipping
            // it also keeps null keys and empty alternatives out of the
            // result.
            if (StringUtils.isEmpty(tag)
                    || StringUtils.equals(tag, IGNORE_TAG)
                    || StringUtility.isMatchedNullSafe(tag, " ")
                    || StringUtility.isMatchedNullSafe(tag, "\\[")) {
                continue;
            }
            if (modifier == null || !allStructures.contains(modifier)) {
                continue;
            }

            Set<String> modifiers = tagToModifiers.get(tag);
            if (modifiers == null) {
                modifiers = new HashSet<String>();
                tagToModifiers.put(tag, modifiers);
            }
            modifiers.add(modifier);
        }

        // Add all tags with more than 1 structure into the common tags
        // collection
        Set<String> commonTags = new HashSet<String>();
        Iterator<Map.Entry<String, Set<String>>> iter = tagToModifiers
                .entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry<String, Set<String>> entry = iter.next();
            if (entry.getValue().size() > 1) {
                commonTags.add(entry.getKey());
            }
        }
        return commonTags;
    }
}