package semanticMarkup.ling.learn.knowledge;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import semanticMarkup.ling.learn.auxiliary.KnownTagCollection;
import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.dataholder.SentenceStructure;
import semanticMarkup.ling.learn.utility.LearnerUtility;
import semanticMarkup.ling.learn.utility.StringUtility;
/**
* Tag unknown words by inferring that any word before a plural noun is a
* modifier, and that any word after a plural noun is a boundary word.
*
* @author Dongye
*
*/
public class UnknownWordBootstrappingLearner implements IModule {
private LearnerUtility myLearnerUtility;
public UnknownWordBootstrappingLearner(LearnerUtility learnerUtility) {
this.myLearnerUtility = learnerUtility;
}
@Override
public void run(DataHolder dataholderHandler) {
unknownWordBootstrapping(dataholderHandler);
}
public void unknownWordBootstrapping(DataHolder dataholderHandler) {
PropertyConfigurator.configure("conf/log4j.properties");
Logger myLogger = Logger.getLogger("learn.unknownWordBootstrapping");
myLogger.trace("[unknownWordBootstrapping]Start");
unknownWordBootstrappingPreprocessing(dataholderHandler);
unknownWordBootstrappingMain(dataholderHandler);
unknownWordBootstrappingPostprocessing(dataholderHandler);
myLogger.trace("[unknownWordBootstrapping]End");
}
public void unknownWordBootstrappingPreprocessing(DataHolder dataholderHandler) {
this.myLearnerUtility.tagAllSentences(dataholderHandler, "singletag", "sentence");
}
public void unknownWordBootstrappingMain(DataHolder dataholderHandler) {
PropertyConfigurator.configure("conf/log4j.properties");
Logger myLogger = Logger.getLogger("learn.unknownWordBootstrapping.main");
String plMiddle = "(ee)";
int newInt = 0;
do {
newInt = 0;
Set<String> organs = new HashSet<String>();
Set<String> boundaries = new HashSet<String>();
Set<String> modifiers = new HashSet<String>();
Set<String> allWords = new HashSet<String>();
String wordPattern = "(("+ Constant.PLENDINGS + "|ium)$)|"+plMiddle;
String flagPattern = "^unknown$";
Set<String> words = dataholderHandler.getWordsFromUnknownWord(wordPattern, true, flagPattern, true);
for (String word: words){
if (word.equals("teeth")) {
System.out.println();
}
if ((StringUtility.isMatchedNullSafe(word, "ium$"))
&& (!this.myLearnerUtility.getConstant().singularExceptions
.contains(word))) {
dataholderHandler.updateDataHolder(word, "s", "-", "wordpos", 1);
if (isValidWord(word)) {
organs.add(word);
myLogger.debug("find a [s] " + word);
}
}
else {
boolean c1 = dataholderHandler.isExistSentence(true, "(^| )"+word+" (<B>|" + this.myLearnerUtility.getConstant().FORBIDDEN + ")");
boolean c2 = StringUtils.equals(this.myLearnerUtility.getWordFormUtility().getNumber(word), "p");
boolean c3 = isVerbEnding(dataholderHandler, word);
if (c1 && c2 && (!c3)) {
dataholderHandler.updateDataHolder(word, "p", "-",
"wordpos", 1);
if (isValidWord(word)) {
organs.add(word);
myLogger.debug("find a [p] " + word);
}
}
}
}
// Part 2
if (organs.size() > 0) {
// find word <q> and make q a b
String organsPattern = StringUtils.join(organs, "|");
String pattern21 = "(^| )(" + organsPattern + ") [^<]";
Set<SentenceStructure> sentences21 = dataholderHandler
.getTaggedSentenceByPattern(pattern21);
for (SentenceStructure sentenceItem : sentences21) {
String sentence = sentenceItem.getSentence();
if (sentence != null) {
Pattern p21 = Pattern.compile("\\b(" + organsPattern
+ ") (\\w+)");
Matcher m21 = p21.matcher(sentence);
if (m21.find()) {
String tempWord = m21.group(2);
dataholderHandler.updateDataHolder(tempWord, "b",
"", "wordpos", 1);
if (!this.myLearnerUtility.getConstant().forbiddenWords
.contains(tempWord)) {
boundaries.add(tempWord);
if (tempWord.equals("anterolaterally")) {
System.out.println();
}
myLogger.debug("find a [b] " + tempWord);
}
}
}
}
// then find <q> $word, and make q a modifier
String pattern22 = "[^<]+ (" + organsPattern + ") ";
Set<SentenceStructure> sentences22 = dataholderHandler
.getTaggedSentenceByPattern(pattern22);
for (SentenceStructure sentenceItem : sentences22) {
String sentence = sentenceItem.getSentence();
if (sentence != null) {
Pattern p22 = Pattern.compile("(^|,<\\/b>)([\\w ]*?) ("
+ organsPattern + ")\\b");
Matcher m22 = p22.matcher(sentence);
if (m22.find()) {
String tempWords = m22.group(2);
// if (!this.myLearnerUtility.getConstant().forbiddenWords
// .contains(tempWords)) {
if (!(StringUtility.isMatchedNullSafe(tempWords,
"\\b(" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b"))) {
String[] tempWordsArray = tempWords
.split("\\s+");
if (tempWordsArray.length <= 2) {
for (String tempWord : tempWordsArray) {
dataholderHandler.updateDataHolder(
tempWord, "m", "", "modifiers",
1);
if (this.isValidWord(tempWord)) {
modifiers.add(tempWord);
myLogger.debug("find a [m] "
+ tempWord);
}
}
}
}
}
}
}
}
// Part 3
allWords.addAll(organs);
allWords.addAll(boundaries);
allWords.addAll(modifiers);
if ( (newInt>0) && (allWords.size()>0)) {
String allWordsPattern = StringUtils.join(allWords, "|");
String pattern3 = "(^| )(" + allWordsPattern + ") ";
Set<SentenceStructure> sentences = dataholderHandler.getTaggedSentenceByPattern(pattern3);
for (SentenceStructure sentenceItem: sentences) {
if (sentenceItem.getID()==133) {
System.out.println();
}
String sentence = sentenceItem.getSentence();
KnownTagCollection myKnownTags = new KnownTagCollection(null, organs, null, boundaries, null, null);
sentence = this.myLearnerUtility.annotateSentence(sentence, myKnownTags, dataholderHandler.getBMSWords());
sentenceItem.setSentence(sentence);
}
}
} while (newInt > 0);
}
/**
* Determine if a word has verb ending
*
* @param dataholderHandler
* the dataholder handler
* @param word
* the word to check
* @return true if the word has verb ending; false otherwise
*/
public boolean isVerbEnding(DataHolder dataholderHandler, String word) {
String pWord = word;
String sWord = this.myLearnerUtility.getWordFormUtility().getSingular(
pWord);
// case 1
if (StringUtility.isMatchedNullSafe(sWord, "e$")) {
sWord = StringUtility.chop(sWord);
}
// case 2
else {
if (sWord == null) {
;
} else {
Matcher m2 = StringUtility.createMatcher(sWord, "([^aeiou])$");
if (m2.find()) {
sWord = sWord + m2.group(1) + "?";
}
}
}
sWord = "(^|_)" + sWord + "ing";
if (dataholderHandler.isWordExistInUnknownWord(sWord + "$", true, null,
false)) {
return true;
}
return false;
}
public void unknownWordBootstrappingPostprocessing(DataHolder dataholderHandler) {
// pistillate_zone
// get all nouns from wordPOS holder
Set<String> POSTags = new HashSet<String>();
POSTags.add("p");
POSTags.add("s");
Set<String> nouns = dataholderHandler.getWordsFromWordPOSByPOSs(
POSTags);
// get boudaries
Set<String> boundaries = new HashSet<String>();
Set<String> words = dataholderHandler.getWordsFromUnknownWord("^.*_.*$", true,
"^unknown$", true);
Iterator<String> wordIter = words.iterator();
String pattern = "_(" + StringUtils.join(nouns, "|") + ")$";
while (wordIter.hasNext()) {
String word = wordIter.next();
Pattern p1 = Pattern.compile("^[a-zA-Z0-9_-]+$");
Matcher m1 = p1.matcher(word);
Pattern p2 = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
Matcher m2 = p2.matcher(word);
if (m1.matches() && (!m2.matches())) {
if (!StringUtility.createMatcher(word,
"\\b(" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b").find()) {
boundaries.add(word);
}
dataholderHandler.updateDataHolder(word, "b", "", "wordpos", 1);
}
}
// if the boundaries is not empty
if (boundaries.size() > 0) {
Iterator<SentenceStructure> iter = dataholderHandler
.getSentenceHolderIterator();
while (iter.hasNext()) {
SentenceStructure sentenceItem = iter.next();
String tag = sentenceItem.getTag();
String sentence = sentenceItem.getSentence();
int sentenceID = sentenceItem.getID();
if ((!(StringUtils.equals(tag, "ignore")) || (tag == null))
&& (StringUtility.createMatcher(sentence, "(^| )("
+ StringUtils.join(boundaries, "|") + ") ")
.find())) {
KnownTagCollection tags = new KnownTagCollection(null,
null, null, boundaries, null, null);
sentence = this.myLearnerUtility.annotateSentence(sentence, tags, dataholderHandler.getBMSWords());
SentenceStructure updatedSentence = dataholderHandler.getSentence(sentenceID);
updatedSentence.setSentence(sentence);
}
}
}
}
private boolean isValidWord(String word) {
if (!this.myLearnerUtility.getConstant().forbiddenWords.contains(word)) {
return true;
} else
return false;
}
}