package semanticMarkup.ling.learn.knowledge;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.dataholder.SentenceStructure;
import semanticMarkup.ling.learn.utility.LearnerUtility;
import semanticMarkup.ling.learn.utility.StringUtility;
/**
* Annotate any clause having a pronoun or a character instead of an organ name
* as its subject by "ditto".
*
* @author Dongye
*
*/
public class PronounCharactersAnnotator implements IModule {
private LearnerUtility myLearnerUtility;
public PronounCharactersAnnotator(LearnerUtility learnerUtility) {
this.myLearnerUtility = learnerUtility;
}
@Override
public void run(DataHolder dataholderHandler) {
// TODO Auto-generated method stub
}
public void pronounCharacterSubject(DataHolder dataholderHandler) {
for (SentenceStructure sentenceItem : dataholderHandler
.getSentenceHolder()) {
int sentenceID = sentenceItem.getID();
String lead = sentenceItem.getLead();
String sentence = sentenceItem.getSentence();
String modifier = sentenceItem.getModifier();
String tag = sentenceItem.getTag();
List<String> mt = pronounCharacterSubjectHelper(lead, sentence,
modifier, tag);
if (mt != null) {
dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
modifier, tag,
"pronouncharactersubject[character subject]");
}
}
// preposition cases
String prepositionPattern = String
.format("^(%s)", this.myLearnerUtility.getConstant().PREPOSITION);
for (SentenceStructure sentenceItem : dataholderHandler
.getSentenceHolder()) {
int sentenceID = sentenceItem.getID();
String lead = sentenceItem.getLead();
String modifier = sentenceItem.getModifier();
String tag = sentenceItem.getTag();
String sentence = sentenceItem.getSentence();
boolean case1 = (StringUtils.equals(tag, "ignore"));
boolean case2 = (tag == null);
boolean case3 = StringUtility.isMatchedNullSafe(tag,
prepositionPattern + " ");
if ((case1 || case2) && case3) {
dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "",
"", "pronouncharactersubject[proposition subject]");
}
}
// pronoun cases
String pronounPattern = String.format("(%s)", this.myLearnerUtility.getConstant().PRONOUN);
for (SentenceStructure sentenceItem : dataholderHandler
.getSentenceHolder()) {
int sentenceID = sentenceItem.getID();
String lead = sentenceItem.getLead();
String modifier = sentenceItem.getModifier();
String tag = sentenceItem.getTag();
String sentence = sentenceItem.getSentence();
boolean case1 = StringUtility.isMatchedNullSafe(tag,
String.format("(^| )%s( |\\$)", pronounPattern));
boolean case2 = StringUtility.isMatchedNullSafe(modifier,
String.format("(^| )%s( |\\$)", pronounPattern));
if (case1 || case2) {
modifier = modifier.replaceAll("\\b(" + this.myLearnerUtility.getConstant().PRONOUN
+ ")\\b", "");
tag = tag.replaceAll("\\b(" + this.myLearnerUtility.getConstant().PRONOUN + ")\\b", "");
modifier = modifier.replaceAll("\\s+", " ");
tag = tag.replaceAll("\\s+", " ");
if (!StringUtility.isMatchedNullSafe(tag, "\\w")
|| StringUtility.isMatchedNullSafe(tag, "ditto")) {
tag = dataholderHandler.getParentSentenceTag(sentenceID);
}
modifier = modifier.replaceAll("(^\\s*|\\s*$)", "");
tag = tag.replaceAll("(^\\s*|\\s*$)", "");
List<String> mt = dataholderHandler.getMTFromParentTag(tag);
String m = mt.get(0);
tag = mt.get(1);
if (StringUtility.isMatchedNullSafe(m, "\\w")) {
modifier = modifier + m;
dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
modifier, tag,
"pronouncharactersubject[pronoun subject]");
}
}
}
// correct to missed N
for (SentenceStructure sentenceItem : dataholderHandler
.getSentenceHolder()) {
int sentenceID = sentenceItem.getID();
String lead = sentenceItem.getLead();
String modifier = sentenceItem.getModifier();
String tag = sentenceItem.getTag();
String sentence = sentenceItem.getSentence();
List<String> mt = this.pronounCharacterSubjectHelper4(lead,
sentence, modifier, tag);
if (mt != null) {
modifier = mt.get(0);
tag = mt.get(1);
dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
modifier, tag,
"pronouncharactersubject[correct to missed N]");
}
}
}
public List<String> pronounCharacterSubjectHelper4(String lead,
String sentence, String modifier, String tag) {
boolean case1 = (StringUtils.equals(tag, "ignore"));
boolean case2 = (tag == null);
boolean case3 = !StringUtility.isMatchedNullSafe(tag, " (and|nor|or) ");
boolean case4 = !StringUtility.isMatchedNullSafe(sentence, "\\[");
boolean case5 = false;
if (sentence != null) {
Pattern p = Pattern.compile("^[^N]*<N>" + tag);
Matcher m = p.matcher(sentence);
if (m.find()) {
case5 = true;
}
}
if ((case1 || case2) && case3 && case4 && case5) {
if (sentence != null) {
sentence = sentence.replaceAll("></?", "");
Pattern p = Pattern
.compile("^(\\S*) ?<N>([^<]+)<\\/N> <[MB]+>(\\S+)<\\/[MB]+> \\S*\\b"
+ tag + "\\b\\S*");
Matcher m2 = p.matcher(sentence);
if (m2.find()) {
modifier = m2.group(1);
tag = m2.group(2);
String g3 = m2.group(3);
if (!StringUtility.isMatchedNullSafe(g3, "\\bof\\b")) {
modifier = modifier.replaceAll("<\\S+?>", "");
tag = tag.replaceAll("<\\S+?>", "");
modifier = modifier.replaceAll("(^\\s*|\\s*$)", "");
tag = tag.replaceAll("(^\\s*|\\s*$)", "");
List<String> mt = new ArrayList<String>();
mt.add(modifier);
mt.add(tag);
return mt;
}
}
}
}
return null;
}
public List<String> pronounCharacterSubjectHelper(String lead,
String sentence, String modifier, String tag) {
String t = "(?:<\\/?[A-Z]+>)?";
boolean b1 = !StringUtils.equals(tag, "ignore");
boolean b2 = (tag == null);
boolean b3 = StringUtility.isMatchedNullSafe(lead, "(^| )("
+ this.myLearnerUtility.getConstant().CHARACTER + ")( |$)");
boolean b4 = StringUtility.isMatchedNullSafe(tag, "(^| )("
+ this.myLearnerUtility.getConstant().CHARACTER + ")( |$)");
if (((b1 || b2) && b3) || b4) {
sentence = sentence.replaceAll("></?", "");
if (sentence != null) {
String pattern1 = String
.format("^.*?%s\\b(%s)\\b%s %s(?:of)%s (.*?)(<[NO]>([^<]*?)<\\/[NO]> ?)+ ",
t, this.myLearnerUtility.getConstant().CHARACTER, t, t, t);
Matcher m1 = StringUtility.createMatcher(sentence, pattern1);
String pattern2 = String
.format("^(.*?)((?:<\\/?[BM]+>\\w+?<\\/?[BM]+>\\s*)*)%s\\b(%s)\\b%s",
t, this.myLearnerUtility.getConstant().CHARACTER, t);
Matcher m2 = StringUtility.createMatcher(sentence, pattern2);
// case 1.1
if (m1.find()) {
tag = m1.group(4);
modifier = sentence.substring(m1.start(2), m1.start(4));
String s2 = m1.group(2);
String s3 = m1.group(3);
if ((!StringUtility.isMatchedNullSafe(s2,
String.format("\\b(%s)\\b", this.myLearnerUtility.getConstant().PREPOSITION)))
&& (!StringUtility.isMatchedNullSafe(s3, String
.format("\\b(%s|\\d)\\b", this.myLearnerUtility.getConstant().STOP)))) {
modifier = modifier.replaceAll("<\\S+?>", "");
modifier = modifier.replaceAll("(^\\s*|\\s*$)", "");
tag = tag.replaceAll("<\\S+?>", "");
tag = tag.replaceAll("(^\\s*|\\s*$)", "");
} else {
modifier = "";
tag = "ditto";
}
}
// case 1.2
else if (m2.find()) {
String text = m2.group(1);
if ((!StringUtility.isMatchedNullSafe(text, "\\b("
+ this.myLearnerUtility.getConstant().STOP + "|\\d+)\\b"))
&& (StringUtility.isMatchedNullSafe(text, "\\w"))
&& (!StringUtility
.isMatchedNullSafe(text, "[,:;.]"))) {
text = text.replaceAll("<\\S+?>", "");
// $text =~ s#(^\s*|\s*$)##g;
// $text =~ s#[[:punct:]]##g;
text = text.replaceAll("(^\\s*|\\s*$)", "");
text = text.replaceAll("\\p{Punct}", "");
String[] textArray = text.split("\\s+");
// List<String> textList = new LinkedList<String>();
// textList.addAll(Arrays.asList(textArray));
if (textArray.length >= 1) {
tag = textArray[textArray.length - 1];
String pattern = "<[NO]>" + tag + "</[NO]>";
if (StringUtility.isMatchedNullSafe(sentence,
pattern)) {
// 1.2.1.1
text = text.replaceAll(tag, "");
modifier = text;
} else {
// 1.2.1.2
modifier = "";
tag = "ditto";
}
}
} else {
// 1.2.2
modifier = "";
tag = "ditto";
}
}
// case 1.3
else if (StringUtility.isMatchedNullSafe(sentence, "\\b("
+ this.myLearnerUtility.getConstant().CHARACTER + ")\\b")) {
modifier = "";
tag = "ditto";
}
}
List<String> mt = new ArrayList<String>(2);
mt.add(modifier);
mt.add(tag);
return mt;
} else {
return null;
}
}
}