package semanticMarkup.ling.learn.knowledge;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import semanticMarkup.ling.learn.auxiliary.KnownTagCollection;
import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.dataholder.SentenceStructure;
import semanticMarkup.ling.learn.utility.LearnerUtility;
import semanticMarkup.ling.learn.utility.StringUtility;
/**
 * Learning module that tries to assign a modifier and a tag to sentences that
 * still have no tag, based on the pattern string that
 * {@code LearnerUtility.getSentencePtn} produces for the sentence's words.
 *
 * <p>The pattern string is treated as having one character per word, so match
 * offsets in the pattern are used directly as indexes into the word list.
 * NOTE(review): the meaning of the individual pattern letters
 * ({@code m, t, q, b, n, p}) is defined by {@code getSentencePtn}, which is
 * not visible here - confirm there before changing any of the regexes.
 */
public class POSBasedAnnotator implements IModule {

    /** Matches inline markup such as {@code <X>} / {@code </X>} that is stripped from stored modifiers and tags. */
    private static final String MARKUP_REGEX = "<\\S+?>";

    /** Splits off the first word of a space separated string: group 1 is the word, group 2 the remainder. */
    private static final String LEADING_WORD_REGEX = "(?:^| )(\\w+) (.*)";

    /** Captures (group 1) whatever follows the first {@code ,}, {@code :} or {@code ;} of a string. */
    private static final String AFTER_PUNCTUATION_REGEX = ".*?[,:;](.*)";

    private final LearnerUtility myLearnerUtility;
    private final Logger myLogger;

    /**
     * Creates the module and configures log4j from {@code conf/log4j.properties}.
     *
     * @param learnerUtility utility used for pattern generation, sentence
     *                       annotation and access to the word-list constants
     */
    public POSBasedAnnotator(LearnerUtility learnerUtility) {
        this.myLearnerUtility = learnerUtility;
        PropertyConfigurator.configure("conf/log4j.properties");
        myLogger = Logger.getLogger("learn.unknownWordBootstrapping");
    }

    /**
     * Repeatedly re-annotates the untagged sentences and runs
     * {@link #CaseHandler} on each of them, until a full pass over the
     * sentence holder reports no new data holder updates.
     *
     * @param dataholderHandler data holder whose sentences are processed
     */
    @Override
    public void run(DataHolder dataholderHandler) {
        Set<String> token = new HashSet<String>();
        token.add("################################");

        int sign;
        do {
            sign = 0;
            this.tagUnknownSentences(dataholderHandler, "singletag");
            for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) {
                if (sentenceItem.getTag() != null) {
                    continue;
                }
                List<String> words = new ArrayList<String>(
                        Arrays.asList(sentenceItem.getSentence().split("\\s+")));
                String ptn = this.myLearnerUtility.getSentencePtn(
                        dataholderHandler, token, words.size() + 1, words);
                sign += CaseHandler(dataholderHandler, sentenceItem, words, ptn);
            }
        } while (sign > 0);
    }

    /**
     * Tags one sentence according to the first of three pattern cases that
     * applies to {@code ptn}:
     * <ol>
     * <li>{@code ^[qmb][,;:\.]$}: tagged "ditto" with an empty modifier
     * (rule {@code remainnulltag-[R0]});</li>
     * <li>a run of {@code [np]} followed by a boundary: modifier and tag
     * come from {@link #getModifierAndTagForCase2}
     * (rule {@code remainnulltag-[R1]});</li>
     * <li>otherwise, whatever {@link #getModifierAndTagForCase3} yields
     * (rule {@code remainnulltag-[R2]}).</li>
     * </ol>
     *
     * @param dataholderHandler data holder that receives the updates
     * @param sentenceItem      the sentence being tagged
     * @param words             the sentence split into words
     * @param ptn               pattern string for {@code words}
     * @return the sum of the values returned by the
     *         {@code updateDataHolder} calls made for this sentence; only
     *         case 2 makes such calls, so the other cases return 0
     */
    public int CaseHandler(DataHolder dataholderHandler,
            SentenceStructure sentenceItem, List<String> words, String ptn) {
        if (StringUtility.isMatchedNullSafe(ptn, "^[qmb][,;:\\.]$")) {
            myLogger.trace("Case 1");
            dataholderHandler.tagSentenceWithMT(sentenceItem.getID(),
                    sentenceItem.getSentence(), "", "ditto",
                    "remainnulltag-[R0]");
            return 0;
        }

        // Case 2 has two alternative patterns; the first one takes precedence.
        Matcher m21 = StringUtility.createMatcher(ptn,
                "^([mtqb]*)([np]+)((?<=p)q)");
        Matcher m22 = StringUtility.createMatcher(ptn,
                "^([mtqb]*)([np]+)(,|;|:|\\.|b)");
        Matcher case2Match = m21.find() ? m21 : (m22.find() ? m22 : null);
        if (case2Match != null) {
            myLogger.trace("Case 2");
            return handleCase2(dataholderHandler, sentenceItem, words, case2Match);
        }

        // Case 3 is only computed when cases 1 and 2 do not apply.
        List<String> modifierAndTagCase3 = this.getModifierAndTagForCase3(ptn, words);
        if (modifierAndTagCase3 != null) {
            myLogger.trace("Case 3");
            dataholderHandler.tagSentenceWithMT(sentenceItem.getID(),
                    sentenceItem.getSentence(), modifierAndTagCase3.get(0),
                    modifierAndTagCase3.get(1), "remainnulltag-[R2]");
        }
        return 0;
    }

    /**
     * Applies case 2 for a pattern match whose groups are (1) the leading
     * run, (2) the {@code [np]+} run and (3) the boundary.
     *
     * @return the sum of the {@code updateDataHolder} results, or 0 when
     *         {@link #getModifierAndTagForCase2} rejects the sentence
     */
    private int handleCase2(DataHolder dataholderHandler,
            SentenceStructure sentenceItem, List<String> words, Matcher match) {
        String boundary = words.get(match.start(3));
        String leadingWords = StringUtils.join(words.subList(0, match.end(1)), " ");

        List<String> modifierAndTag = this.getModifierAndTagForCase2(
                leadingWords, match.start(2), match.end(2), words);
        if (modifierAndTag == null || modifierAndTag.size() != 2) {
            return 0;
        }
        String modifier = modifierAndTag.get(0);
        String tag = modifierAndTag.get(1);

        int sign = 0;
        // update on q and p
        if (StringUtility.isMatchedNullSafe(tag, "<")) {
            sign += dataholderHandler.updateDataHolder(tag, "p", "-", "wordpos", 1);
        }
        // nontagged words in modifier
        for (String untagged : getModifiersForUntag(modifier)) {
            sign += dataholderHandler.updateDataHolder(untagged, "m", "", "modifiers", 1);
        }
        // update boundary
        if (StringUtility.isMatchedNullSafe(boundary, "<")) {
            sign += dataholderHandler.updateDataHolder(boundary, "b", "", "wordpos", 1);
        }

        dataholderHandler.tagSentenceWithMT(sentenceItem.getID(),
                sentenceItem.getSentence(),
                modifier.replaceAll(MARKUP_REGEX, ""),
                tag.replaceAll(MARKUP_REGEX, ""),
                "remainnulltag-[R1]");
        return sign;
    }

    /**
     * Derives modifier and tag for case 3, i.e. a pattern of the form
     * {@code ^([^qpn,;:]*)([pn]+)[tmb]}.
     *
     * <p>The words covered by group 1 form the leading part of the modifier;
     * the words covered by group 2 are the tag words. The last tag word is
     * the tag, all preceding tag words are appended to the modifier. Inline
     * markup is stripped from both results.
     *
     * @param ptn   pattern string for {@code words}
     * @param words the sentence split into words
     * @return a two element list {@code [modifier, tag]}, or {@code null}
     *         when the pattern does not match or the leading words match
     *         the FORBIDDEN or STOP word patterns
     */
    public List<String> getModifierAndTagForCase3(String ptn, List<String> words) {
        Matcher m = StringUtility.createMatcher(ptn,
                "^([^qpn,;:]*)([pn]+)[tmb]");
        if (!m.find()) {
            return null;
        }

        String lStr = StringUtils.join(words.subList(0, m.end(1)), " ");
        String forbiddenPattern = String.format("\\b(%s)", this.myLearnerUtility.getConstant().FORBIDDEN);
        String stopPattern = String.format("\\b(%s)\\b", this.myLearnerUtility.getConstant().STOP);
        if (StringUtility.isMatchedNullSafe(lStr, forbiddenPattern)
                || StringUtility.isMatchedNullSafe(lStr, stopPattern)) {
            return null;
        }

        List<String> tagWords = words.subList(m.start(2), m.end(2));

        // Keep only what follows the first , : or ; in the leading words.
        // The group must come from a matcher over the words themselves: using
        // the pattern matcher's group here would put pattern letters, not
        // words, into the modifier.
        Matcher afterPunctuation = StringUtility.createMatcher(lStr, AFTER_PUNCTUATION_REGEX);
        if (afterPunctuation.find()) {
            lStr = afterPunctuation.group(1);
        }

        // All tag words but the last extend the modifier (space separated, as
        // in case 2); the last tag word is the tag itself.
        String modifier = lStr;
        if (tagWords.size() > 1) {
            modifier = modifier + " "
                    + StringUtils.join(tagWords.subList(0, tagWords.size() - 1), " ");
        }
        String tag = tagWords.get(tagWords.size() - 1);

        List<String> modifierAndTag = new ArrayList<String>(2);
        modifierAndTag.add(modifier.replaceAll(MARKUP_REGEX, "").trim());
        modifierAndTag.add(tag.replaceAll(MARKUP_REGEX, ""));
        return modifierAndTag;
    }

    /**
     * Derives modifier and tag for case 2 from the tag words
     * {@code words[start, end)}: the last tag word is the tag, any preceding
     * tag words are appended to {@code modifier}.
     *
     * @param modifier the leading words, joined by single spaces
     * @param start    index of the first tag word (inclusive)
     * @param end      index after the last tag word (exclusive)
     * @param words    the sentence split into words
     * @return a two element list {@code [modifier, tag]} (markup is NOT
     *         stripped here), or {@code null} when {@code modifier} matches
     *         the PREPOSITION word pattern
     */
    public List<String> getModifierAndTagForCase2(String modifier, int start, int end, List<String> words) {
        String prepositionPattern =
                String.format("\\b(%s)\\b", this.myLearnerUtility.getConstant().PREPOSITION);
        if (StringUtility.isMatchedNullSafe(modifier, prepositionPattern)) {
            return null;
        }

        // get tag and modifier
        List<String> tagWords = words.subList(start, end);
        int lastIndex = tagWords.size() - 1;
        String fullModifier = modifier;
        if (lastIndex > 0) {
            fullModifier = (modifier + " " + StringUtils.join(tagWords.subList(0, lastIndex), " "))
                    .replaceAll("\\s*$", "");
        }

        List<String> modifierAndTag = new ArrayList<String>(2);
        modifierAndTag.add(fullModifier);
        modifierAndTag.add(tagWords.get(lastIndex));
        return modifierAndTag;
    }

    /**
     * Collects the words of {@code modifier} that consist only of word
     * characters, by repeatedly splitting the first such word (which must be
     * followed by a space) off the front of the remaining string. Because a
     * trailing space is required, the final word of the string is never
     * collected.
     *
     * @param modifier space separated modifier string, may be {@code null}
     * @return the collected words in order; an empty list for an empty
     *         string; {@code null} when {@code modifier} is {@code null}
     */
    public List<String> getModifiersForUntag(String modifier) {
        if (modifier == null) {
            return null;
        }
        List<String> modifiers = new ArrayList<String>();
        if (modifier.equals("")) {
            return modifiers;
        }

        String remaining = modifier;
        Matcher next = StringUtility.createMatcher(remaining, LEADING_WORD_REGEX);
        while (next.find()) {
            modifiers.add(next.group(1));
            remaining = next.group(2);
            next = StringUtility.createMatcher(remaining, LEADING_WORD_REGEX);
        }
        return modifiers;
    }

    /**
     * Re-annotates every sentence that has no tag and whose lead does not
     * match {@code "similar to .*"}: existing inline markup is removed, the
     * sentence is annotated again with the currently known tags, and the
     * result is stored back into the sentence.
     *
     * @param dataholderHandler data holder providing the sentences, known
     *                          tags and BMS words
     * @param mode              mode passed through to
     *                          {@code LearnerUtility.getKnownTags}
     */
    public void tagUnknownSentences(DataHolder dataholderHandler, String mode) {
        KnownTagCollection knownTags = myLearnerUtility.getKnownTags(dataholderHandler, mode);
        Iterator<SentenceStructure> sentenceIter = dataholderHandler.getSentenceHolderIterator();

        while (sentenceIter.hasNext()) {
            SentenceStructure sentenceItem = sentenceIter.next();
            boolean untagged = sentenceItem.getTag() == null;
            if (!untagged
                    || StringUtility.isMatchedNullSafe(sentenceItem.getLead(), "similar to .*")) {
                continue;
            }
            String plain = sentenceItem.getSentence().replaceAll(MARKUP_REGEX, "");
            String annotated = this.myLearnerUtility.annotateSentence(
                    plain, knownTags, dataholderHandler.getBMSWords());
            sentenceItem.setSentence(annotated);
        }
    }

    /**
     * Intentionally empty entry point; kept so existing launch
     * configurations that reference it keep working.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // no-op
    }
}