package semanticMarkup.ling.learn.dataholder;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import semanticMarkup.ling.learn.Configuration;
import semanticMarkup.ling.learn.auxiliary.POSInfo;
import semanticMarkup.ling.learn.auxiliary.StringPair;
import semanticMarkup.ling.learn.knowledge.Constant;
import semanticMarkup.ling.learn.utility.StringUtility;
import semanticMarkup.ling.learn.utility.WordFormUtility;
public class DataHolder {
// All unique words in the input treatments, each mapped to an Integer
// (NOTE(review): presumably an occurrence count - confirm against callers)
public Map<String, Integer> allWords;
// Words that are singular nouns, boundary words, or modifiers
private Set<String> BMSWords;
// Data holders ("tables"). Each table is paired with a byte ID constant;
// add2Holder() and printHolder() use those IDs to select a table
// (not every ID is handled by both methods).
// Table discounted: (word, POS) key -> new POS
private Map<DiscountedKey, String> discountedTable;
public static final byte DISCOUNTED = 2;
// Table heuristicnoun: term -> type
private Map<String, String> heuristicNounTable;
public static final byte HEURISTICNOUN = 1;
// Table isA: sequential key (size + 1 at insertion time) -> (instance, class)
private Map<Integer, IsAValue> isATable;
public static final byte ISA = 3;
// Table modifier: word -> (count, isTypeModifier)
private Map<String, ModifierTableValue> modifierTable;
public static final byte MODIFIER = 4;
// Table sentence: sentences in insertion order
private List<SentenceStructure> sentenceTable = new LinkedList<SentenceStructure>();
// ID handed to the next sentence added through addSentence()
private int sentenceCount;
//private Map<Integer, Sentence> sentenceCollection;
public static final byte SENTENCE = 5;
// Table singularPlural: (singular, plural) pairs
private Set<SingularPluralPair> singularPluralTable;
public static final byte SINGULAR_PLURAL = 6;
// Table termCategory
private Set<StringPair> termCategoryTable;
public static final byte TERM_CATEGORY = 7;
// Table unknownword: word -> flag
private Map<String, String> unknownWordTable;
public static final byte UNKNOWNWORD = 8;
// Table wordpos: (word, POS) -> (role, certaintyU, certaintyL, savedFlag, savedID)
private Map<WordPOSKey, WordPOSValue> wordPOSTable;
public static final byte WORDPOS = 9;
// Table wordrole
private Map<StringPair, String> wordRoleTable;
public static final byte WORDROLE = 10;
// Other data
// Leading three words of sentences
public Set<String> checkedWordSet;
// Collaborators supplied through the constructor
private Configuration myConfiguration;
private Constant myConstant;
private WordFormUtility myWordFormUtility;
/**
 * Creates an empty data holder: stores the three collaborators and
 * initialises every table to a fresh, empty collection.
 *
 * @param myConfiguration
 *            configuration object kept for later use
 * @param myConstant
 *            constants (stop words, plural endings, ...) used by the learning methods
 * @param myWordFormUtility
 *            word-form helper used by the learning methods
 */
public DataHolder(Configuration myConfiguration, Constant myConstant, WordFormUtility myWordFormUtility) {
    // Collaborators
    this.myConfiguration = myConfiguration;
    this.myConstant = myConstant;
    this.myWordFormUtility = myWordFormUtility;

    // Word-level bookkeeping
    allWords = new HashMap<String, Integer>();
    BMSWords = new HashSet<String>();
    checkedWordSet = new HashSet<String>();

    // Sentence table and its ID counter
    sentenceTable = new LinkedList<SentenceStructure>();
    sentenceCount = 0;

    // Map-backed tables
    discountedTable = new HashMap<DiscountedKey, String>();
    heuristicNounTable = new HashMap<String, String>();
    isATable = new HashMap<Integer, IsAValue>();
    modifierTable = new HashMap<String, ModifierTableValue>();
    unknownWordTable = new HashMap<String, String>();
    wordPOSTable = new HashMap<WordPOSKey, WordPOSValue>();
    wordRoleTable = new HashMap<StringPair, String>();

    // Set-backed tables
    singularPluralTable = new HashSet<SingularPluralPair>();
    termCategoryTable = new HashSet<StringPair>();
}
/**
 * Two data holders are equal when they are of the same class and their
 * discounted, heuristic-noun, modifier, sentence, singular-plural,
 * unknown-word and word-POS tables and their allWords maps are equal.
 * The isA, term-category and word-role tables are not compared.
 *
 * @param obj
 *            object to compare with
 * @return true if the compared tables are all equal
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) {
        return true;
    }
    if (obj == null) {
        return false;
    }
    if (this.getClass() != obj.getClass()) {
        return false;
    }

    final DataHolder other = (DataHolder) obj;

    // Compare table by table, bailing out on the first difference.
    if (!discountedTable.equals(other.discountedTable)) {
        return false;
    }
    if (!heuristicNounTable.equals(other.heuristicNounTable)) {
        return false;
    }
    if (!modifierTable.equals(other.modifierTable)) {
        return false;
    }
    if (!sentenceTable.equals(other.sentenceTable)) {
        return false;
    }
    if (!singularPluralTable.equals(other.singularPluralTable)) {
        return false;
    }
    if (!unknownWordTable.equals(other.unknownWordTable)) {
        return false;
    }
    if (!wordPOSTable.equals(other.wordPOSTable)) {
        return false;
    }
    return allWords.equals(other.allWords);
}
// ---- Holder accessors ----
// Every accessor below hands out the live collection backing the table,
// not a copy, so changes made by the caller are seen by this holder.

/** @return the discounted table: (word, POS) key mapped to the new POS */
public Map<DiscountedKey, String> getDiscountedHolder() {
    return discountedTable;
}

/** @return the modifier table: word mapped to its modifier record */
public Map<String, ModifierTableValue> getModifierHolder() {
    return modifierTable;
}

/** @return the heuristic-noun table: term mapped to its type */
public Map<String, String> getHeuristicNounHolder() {
    return heuristicNounTable;
}

/** @return the sentence table, in insertion order */
public List<SentenceStructure> getSentenceHolder() {
    return sentenceTable;
}

/** @return the set of singular-plural pairs */
public Set<SingularPluralPair> getSingularPluralHolder() {
    return singularPluralTable;
}

/** @return the term-category table */
public Set<StringPair> getTermCategoryHolder() {
    return termCategoryTable;
}

/** @return the unknown-word table: word mapped to its flag */
public Map<String, String> getUnknownWordHolder() {
    return unknownWordTable;
}

/** @return the word-POS table */
public Map<WordPOSKey, WordPOSValue> getWordPOSHolder() {
    return wordPOSTable;
}

/** @return the word-role table */
public Map<StringPair, String> getWordRoleHolder() {
    return wordRoleTable;
}
// ---- Add-to utilities ----
/**
 * Adds one record, given as a list of string fields, to the table selected
 * by {@code holderID}.
 *
 * Handled IDs: DISCOUNTED, ISA, MODIFIER, SENTENCE, SINGULAR_PLURAL,
 * UNKNOWNWORD and WORDPOS. Any other ID is silently ignored.
 *
 * @param holderID
 *            one of the table ID constants of this class
 * @param args
 *            the fields of the record, in the order expected by the
 *            matching add2...Holder method
 */
public void add2Holder(byte holderID, List<String> args){
    switch (holderID) {
    case DISCOUNTED:
        discountedTable = add2DiscountedHolder(discountedTable, args);
        break;
    case ISA:
        isATable = add2IsAHolder(isATable, args);
        break;
    case MODIFIER:
        add2ModifierHolder(args);
        break;
    case SENTENCE:
        sentenceTable = add2SentenceHolder(sentenceTable, args);
        break;
    case SINGULAR_PLURAL:
        singularPluralTable = add2SingularPluralHolder(singularPluralTable, args);
        break;
    case UNKNOWNWORD:
        unknownWordTable = add2UnknowWordHolder(unknownWordTable, args);
        break;
    case WORDPOS:
        add2WordPOSHolder(args);
        break;
    default:
        // No loader exists for the remaining IDs; nothing to do.
        break;
    }
}
/**
 * Registers every term of the given set in the heuristic-noun table under
 * the given type. A term that is already present has its type overwritten.
 *
 * @param terms
 *            set of terms
 * @param type
 *            type of the terms
 * @return the number of terms processed
 */
public int add2HeuristicNounTable(Set<String> terms, String type) {
    int processed = 0;
    for (String term : terms) {
        this.getHeuristicNounHolder().put(term, type);
        processed++;
    }
    return processed;
}
/**
 * Appends an (instance, class) record to the given isA holder. The record
 * is stored under the key {@code size + 1}, where size is the holder's size
 * before insertion.
 *
 * @param isAHolder
 *            the holder to add to
 * @param args
 *            element 0 is the instance, element 1 is the class
 * @return the same holder that was passed in
 */
public Map<Integer, IsAValue> add2IsAHolder (Map<Integer, IsAValue> isAHolder, List<String> args) {
    final String instance = args.get(0);
    final String cls = args.get(1);
    final int nextKey = isAHolder.size() + 1;
    isAHolder.put(nextKey, new IsAValue(instance, cls));
    return isAHolder;
}
/**
 * Adds (or overwrites) a modifier record built from string fields.
 *
 * @param args
 *            element 0 is the word, element 1 is the count as a decimal
 *            integer, element 2 is "true" when the word is a type modifier
 *            (any other value, including null, means false)
 * @throws NumberFormatException
 *             if element 1 is not a parsable integer
 */
public void add2ModifierHolder(List<String> args) {
    String word = args.get(0);
    // Integer.parseInt replaces the deprecated Integer(String) constructor
    // and avoids the needless boxing/unboxing round trip.
    int count = Integer.parseInt(args.get(1));
    boolean isTypeModifier = StringUtils.equals(args.get(2), "true");

    this.modifierTable.put(word, new ModifierTableValue(count, isTypeModifier));
}
/**
 * Adds (or overwrites) the modifier record of a word.
 *
 * @param word
 *            the modifier
 * @param count
 *            count stored for the modifier
 * @param isTypeModifier
 *            whether the word is a type modifier
 */
public void addToModifierHolder(String word, int count, boolean isTypeModifier) {
    final ModifierTableValue record = new ModifierTableValue(count, isTypeModifier);
    modifierTable.put(word, record);
}
/**
 * Stores a (word, flag) record in the given unknown-word holder, replacing
 * any flag previously stored for the word.
 *
 * @param unknownWordHolder
 *            the holder to add to
 * @param args
 *            element 0 is the word, element 1 is its flag
 * @return the same holder that was passed in
 */
public Map<String, String> add2UnknowWordHolder(Map<String, String> unknownWordHolder, List<String> args){
    final String word = args.get(0);
    final String flag = args.get(1);
    unknownWordHolder.put(word, flag);
    return unknownWordHolder;
}
/**
 * Adds (or overwrites) a word-POS record built from string fields.
 *
 * @param args
 *            in order: word, POS, role, certaintyU, certaintyL, savedFlag,
 *            savedID; the two certainty fields must be decimal integers
 * @throws NumberFormatException
 *             if a certainty field is not a parsable integer
 */
public void add2WordPOSHolder(List<String> args){
    String word = args.get(0);
    String POS = args.get(1);
    String role = args.get(2);
    // Integer.parseInt replaces the deprecated Integer(String) constructor
    // and avoids the needless boxing/unboxing round trip.
    int certaintyU = Integer.parseInt(args.get(3));
    int certaintyL = Integer.parseInt(args.get(4));
    String savedFlag = args.get(5);
    String savedID = args.get(6);

    this.wordPOSTable.put(
            new WordPOSKey(word, POS),
            new WordPOSValue(role, certaintyU, certaintyL, savedFlag, savedID));
}
/**
 * Adds (or overwrites) the record stored for the (word, POS) key.
 *
 * @param word
 *            the word
 * @param POS
 *            the POS tag of the word
 * @param role
 *            role stored in the record
 * @param certaintyU
 *            certaintyU stored in the record
 * @param certaintyL
 *            certaintyL stored in the record
 * @param savedFlag
 *            saved flag stored in the record
 * @param savedID
 *            saved ID stored in the record
 */
public void addToWordPOSHolder(String word, String POS, String role, int certaintyU, int certaintyL, String savedFlag, String savedID) {
    final WordPOSKey key = new WordPOSKey(word, POS);
    final WordPOSValue record = new WordPOSValue(role, certaintyU, certaintyL, savedFlag, savedID);
    wordPOSTable.put(key, record);
}
/**
 * Adds a (singular, plural) pair to the given holder.
 *
 * @param singularPluralHolder
 *            the holder to add to
 * @param args
 *            element 0 is the singular form, element 1 is the plural form
 * @return the same holder that was passed in
 */
public Set<SingularPluralPair> add2SingularPluralHolder(Set<SingularPluralPair> singularPluralHolder, List<String> args){
    final String singular = args.get(0);
    final String plural = args.get(1);
    final SingularPluralPair pair = new SingularPluralPair(singular, plural);
    singularPluralHolder.add(pair);
    return singularPluralHolder;
}
/**
 * Stores a discounted record in the given holder, replacing any record
 * already stored for the same (word, POS) key.
 *
 * @param discountedHolder
 *            the holder to add to
 * @param args
 *            element 0 is the word, element 1 is its POS, element 2 is the
 *            new POS
 * @return the same holder that was passed in
 */
public Map<DiscountedKey, String> add2DiscountedHolder(Map<DiscountedKey, String> discountedHolder, List<String> args){
    final String word = args.get(0);
    final String POS = args.get(1);
    final String newPOS = args.get(2);
    final DiscountedKey key = new DiscountedKey(word, POS);
    discountedHolder.put(key, newPOS);
    return discountedHolder;
}
/**
 * Builds a sentence from eight string fields and appends it through
 * {@link #addSentence}.
 *
 * Note that the sentence is always appended to this holder's own sentence
 * table; the {@code sentenceTable} argument is not written to directly and
 * is simply handed back.
 *
 * @param sentenceTable
 *            list returned unchanged to the caller
 * @param args
 *            in order: source, sentence, original sentence, lead, status,
 *            tag, modifier, type
 * @return the {@code sentenceTable} argument
 */
public List<SentenceStructure> add2SentenceHolder(List<SentenceStructure> sentenceTable,
        List<String> args) {
    this.addSentence(
            args.get(0),  // source
            args.get(1),  // sentence
            args.get(2),  // original sentence
            args.get(3),  // lead
            args.get(4),  // status
            args.get(5),  // tag
            args.get(6),  // modifier
            args.get(7)); // type
    return sentenceTable;
}
// ---- Iterator utilities ----
/**
 * @return an iterator over the entries of the modifier table
 */
public Iterator<Entry<String, ModifierTableValue>> getModifierHolderIterator() {
    return this.getModifierHolder().entrySet().iterator();
}
/**
 * @return an iterator over the sentence table, in insertion order
 */
public Iterator<SentenceStructure> getSentenceHolderIterator(){
    return this.getSentenceHolder().iterator();
}
/**
 * Collects the distinct tags of all sentences. A null tag, if any sentence
 * has one, is included as a null element.
 *
 * @return a new set holding the tags
 */
public Set<String> getSentenceTags() {
    final Set<String> distinctTags = new HashSet<String>();
    for (Iterator<SentenceStructure> it = this.getSentenceHolderIterator(); it.hasNext();) {
        distinctTags.add(it.next().getTag());
    }
    return distinctTags;
}
/**
 * @return an iterator over the entries of the word-POS table
 */
public Iterator<Entry<WordPOSKey, WordPOSValue>> getWordPOSHolderIterator(){
    return wordPOSTable.entrySet().iterator();
}
/**
 * Convenience overload: wraps the arguments in a key and a value and
 * delegates to {@link #updateWordPOS(WordPOSKey, WordPOSValue)}.
 *
 * @param word
 *            the word
 * @param POS
 *            the POS tag of the word
 * @param role
 *            role stored in the record
 * @param certaintyU
 *            certaintyU stored in the record
 * @param certaintyL
 *            certaintyL stored in the record
 * @param savedFlag
 *            saved flag stored in the record
 * @param savedID
 *            saved ID stored in the record
 * @return the result of the delegated call
 */
public boolean updateWordPOS(String word, String POS, String role,
        int certaintyU, int certaintyL, String savedFlag, String savedID) {
    return this.updateWordPOS(
            new WordPOSKey(word, POS),
            new WordPOSValue(role, certaintyU, certaintyL, savedFlag, savedID));
}
/**
 * Stores {@code value} under {@code key} in the word-POS table unless an
 * equal value is already stored there, and traces what happened.
 *
 * @param key
 *            (word, POS) key
 * @param value
 *            record to store
 * @return false when the key already maps to an equal value (nothing is
 *         changed); true when the record was replaced or newly added
 */
public boolean updateWordPOS(WordPOSKey key, WordPOSValue value) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.updateWordPOS");

    final boolean isKnownKey = this.wordPOSTable.containsKey(key);

    // Same record already stored: report "no change".
    if (isKnownKey && this.wordPOSTable.get(key).equals(value)) {
        myLogger.trace(String.format(
                "Updated [%s, %s] in WordPOS holder: No update",
                key.toString(), value.toString()));
        return false;
    }

    // Either a different record replaces the old one, or the key is new.
    this.wordPOSTable.put(key, value);
    final String messageFormat = isKnownKey
            ? "Updated [%s, %s] in WordPOS holder: Updated"
            : "Updated [%s, %s] in WordPOS holder: Added New";
    myLogger.trace(String.format(messageFormat, key.toString(), value.toString()));
    return true;
}
/**
 * Removes the record stored under {@code key} from the word-POS table and
 * traces the removal.
 *
 * @param key
 *            (word, POS) key to remove
 * @return true if the key was present and has been removed; false if the
 *         table did not contain the key
 */
public boolean removeWordPOS(WordPOSKey key) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.updateWordPOS");

    if (!this.wordPOSTable.containsKey(key)) {
        return false;
    }

    WordPOSValue oldValue = this.wordPOSTable.remove(key);
    // The message used to read "... Added New" (copied from updateWordPOS).
    // Passing the objects themselves lets %s render null safely instead of
    // throwing on toString().
    myLogger.trace(String.format(
            "Removed [%s, %s] from WordPOS holder", key, oldValue));
    return true;
}
/**
 * @return an iterator over the (word, flag) entries of the unknown-word table
 */
public Iterator<Entry<String, String>> getUnknownWordHolderIterator(){
    final Set<Entry<String, String>> entries = unknownWordTable.entrySet();
    return entries.iterator();
}
// ---- Output utilities ----
/**
 * Logs the whole content of the table selected by {@code holderID}.
 * Supported IDs are SENTENCE, SINGULAR_PLURAL, UNKNOWNWORD and WORDPOS;
 * any other ID does nothing.
 *
 * @param holderID
 *            one of the table ID constants of this class
 */
public void printHolder(byte holderID) {
    final int size;
    switch (holderID) {
    case SENTENCE:
        size = this.sentenceTable.size();
        break;
    case SINGULAR_PLURAL:
        size = this.singularPluralTable.size();
        break;
    case UNKNOWNWORD:
        size = this.unknownWordTable.size();
        break;
    case WORDPOS:
        size = this.wordPOSTable.size();
        break;
    default:
        return;
    }
    // Print everything: indices 0 .. size-1.
    printHolder(holderID, 0, size - 1);
}
/**
 * Logs the entries with positions {@code startIndex..endIndex} (inclusive)
 * of the table selected by {@code holderID}, followed by a "Total" line.
 *
 * Supported IDs are SENTENCE, SINGULAR_PLURAL (printed in sorted order),
 * UNKNOWNWORD and WORDPOS (both printed in the iteration order of their
 * map); for any other ID only the "Total" line is logged.
 *
 * @param holderID
 *            one of the table ID constants of this class
 * @param startIndex
 *            position of the first entry to print
 * @param endIndex
 *            position of the last entry to print
 * @throws IndexOutOfBoundsException
 *             for SENTENCE, if the range reaches outside the sentence table
 */
public void printHolder(byte holderID, int startIndex, int endIndex){
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.printHolder");

    if (holderID == DataHolder.SENTENCE) {
        for (int i = startIndex; i<=endIndex; i++) {
            SentenceStructure sentence = this.sentenceTable.get(i);
            myLogger.info("Index: "+i);
            myLogger.info(sentence.toString());
        }
    }

    if (holderID == DataHolder.SINGULAR_PLURAL) {
        myLogger.info("==SingularPlural Table==");

        // Copy into an ArrayList: it is sorted and then read by index.
        List<SingularPluralPair> singularPluralPairList =
                new ArrayList<SingularPluralPair>(this.singularPluralTable);
        Collections.sort(singularPluralPairList);

        int first = Math.max(startIndex, 0);
        for (int i = first; i < singularPluralPairList.size() && i <= endIndex; i++) {
            SingularPluralPair entry = singularPluralPairList.get(i);
            myLogger.info("Index: " + i);
            myLogger.info("Singular: " + entry.getSingular());
            myLogger.info("Plural: " + entry.getPlural());
            myLogger.info("\n");
        }
    }

    if (holderID == DataHolder.UNKNOWNWORD) {
        int index = 0;
        Iterator<Entry<String, String>> iter = this.unknownWordTable.entrySet().iterator();
        // Stop as soon as the range is exhausted.
        while (iter.hasNext() && index <= endIndex) {
            // Always advance the iterator, also for skipped positions;
            // otherwise the wrong entries are printed and the loop spins
            // without consuming anything once index leaves the range.
            Entry<String, String> entry = iter.next();
            if (index >= startIndex) {
                myLogger.info("Index: " + index);
                myLogger.info("Key: " + entry.getKey());
                myLogger.info("Value: " + entry.getValue());
                myLogger.info("\n");
            }
            index++;
        }
    }

    if (holderID == DataHolder.WORDPOS) {
        int index = 0;
        Iterator<Entry<WordPOSKey, WordPOSValue>> iter = this.getWordPOSHolderIterator();
        while (iter.hasNext() && index <= endIndex) {
            // Same rule as above: consume one entry per position.
            Entry<WordPOSKey, WordPOSValue> entry = iter.next();
            if (index >= startIndex) {
                myLogger.info(entry.toString());
                myLogger.info("\n");
            }
            index++;
        }
    }

    myLogger.info("Total: "+(endIndex-startIndex+1)+"\n");
}
// ---- Class methods ----
/**
 * Looks up a sentence by its ID with a linear scan of the sentence table.
 *
 * @param ID
 *            ID of the sentence
 * @return the first sentence whose ID equals {@code ID}, or null if there
 *         is none
 */
public SentenceStructure getSentence(int ID) {
    for (SentenceStructure candidate : this.sentenceTable) {
        if (candidate.getID() == ID) {
            return candidate;
        }
    }
    return null;
}
/********************************************/
/** Unsupervised Learning Methods **/
/********************************************/
/**
 * Sets the flag of a word in the unknown-word table (adding the word if it
 * is not there yet) and traces the change.
 *
 * @param word
 *            the word
 * @param flag
 *            the flag to store for the word
 */
public void updateUnknownWord(String word, String flag){
    PropertyConfigurator.configure( "conf/log4j.properties" );
    final Logger log = Logger.getLogger("dataholder.updateUnkownWord");

    unknownWordTable.put(word, flag);

    final String message = String.format("Added (%s, %s) into UnknownWord holder", word, flag);
    log.trace(message);
}
/**
 * Appends a new sentence to the sentence table. The sentence receives the
 * current value of the sentence counter as its ID, and the counter is then
 * advanced by one. All fields of the new sentence are traced.
 *
 * @param source
 *            source of the sentence
 * @param sentence
 *            the sentence
 * @param originalSentence
 *            the original sentence
 * @param lead
 *            lead of the sentence
 * @param status
 *            status of the sentence
 * @param tag
 *            tag of the sentence
 * @param modifier
 *            modifier of the sentence
 * @param type
 *            type of the sentence
 */
public void addSentence(String source, String sentence,
        String originalSentence, String lead, String status, String tag,
        String modifier, String type) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    final Logger log = Logger.getLogger("dataholder.addSentence");

    final int id = this.sentenceCount;
    final SentenceStructure added = new SentenceStructure(id, source, sentence,
            originalSentence, lead, status, tag, modifier, type);
    this.sentenceCount = id + 1;
    this.sentenceTable.add(added);

    // Label/value pairs, traced one per line in this order.
    final String[][] fields = {
            { "\tSource: ", source },
            { "\tSentence: ", sentence },
            { "\tOriginal Sentence: ", originalSentence },
            { "\tLead: ", lead },
            { "\tStatus: ", status },
            { "\tTag: ", tag },
            { "\tModifier: ", modifier },
            { "\tType: ", type },
    };
    log.trace("Added Sentence: ");
    for (String[] field : fields) {
        log.trace(field[0] + field[1]);
    }
    log.trace("Quite\n");
}
/**
 * Collects all entries of the word-POS table whose key holds the given
 * word, whatever their POS, and traces the result.
 *
 * @param word
 *            the word to look for (compared null-safely)
 * @return a new list with the matching entries; empty if there are none
 */
public List<Entry<WordPOSKey,WordPOSValue>> getWordPOSEntries(String word) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.getWordPOSEntries");

    final List<Entry<WordPOSKey, WordPOSValue>> matches =
            new ArrayList<Entry<WordPOSKey, WordPOSValue>>();
    for (Iterator<Map.Entry<WordPOSKey, WordPOSValue>> it = this.getWordPOSHolderIterator(); it.hasNext();) {
        final Map.Entry<WordPOSKey, WordPOSValue> candidate = it.next();
        final String candidateWord = candidate.getKey().getWord();
        if (StringUtils.equals(candidateWord, word)) {
            matches.add(candidate);
        }
    }

    myLogger.trace("Get WordPOS Entries of word: " + word);
    myLogger.trace(StringUtils.join(matches, ",\n"));
    return matches;
}
/**
 * Collects all entries of the word-POS table whose key holds the given
 * word and whose POS is one of the given POS tags.
 *
 * @param word
 *            the word to look for (compared null-safely)
 * @param POSs
 *            the accepted POS tags; null is treated as "no tag accepted"
 * @return a new list with the matching entries; empty if there are none
 */
public List<Entry<WordPOSKey,WordPOSValue>> getWordPOSEntriesByWordPOS(String word, Set<String> POSs) {
    List<Entry<WordPOSKey, WordPOSValue>> result = new ArrayList<Entry<WordPOSKey, WordPOSValue>>();
    if (POSs == null) {
        return result;
    }

    Iterator<Map.Entry<WordPOSKey, WordPOSValue>> iter = this.getWordPOSHolderIterator();
    while (iter.hasNext()) {
        Map.Entry<WordPOSKey, WordPOSValue> wordPOSEntry = iter.next();
        WordPOSKey key = wordPOSEntry.getKey();
        // The POS lives in the key. The entry's value is a WordPOSValue and
        // can never be an element of a Set<String>, so testing it against
        // POSs made this method return an empty list every time.
        if (StringUtils.equals(key.getWord(), word)
                && POSs.contains(key.getPOS())) {
            result.add(wordPOSEntry);
        }
    }

    return result;
}
/**
 * Checks whether the word occurs in the singular-plural table, either as
 * the singular or as the plural form of some pair.
 *
 * @param word
 *            the word to check
 * @return true if some pair contains the word; false otherwise
 */
public boolean isInSingularPluralPair(String word) {
    for (SingularPluralPair pair : this.singularPluralTable) {
        final boolean isSingular = pair.getSingular().equals(word);
        if (isSingular || pair.getPlural().equals(word)) {
            return true;
        }
    }
    return false;
}
/**
 * Collects the words of the word-POS table whose POS tag is one of the
 * given tags.
 *
 * @param POSTags
 *            the POS tags of the words searched for; null yields an empty
 *            result
 * @return a new set of matching words
 */
public Set<String> getWordsFromWordPOSByPOSs(Set<String> POSTags) {
    final Set<String> matchingWords = new HashSet<String>();
    if (POSTags != null) {
        for (Iterator<Entry<WordPOSKey, WordPOSValue>> it = this.getWordPOSHolderIterator(); it.hasNext();) {
            final WordPOSKey key = it.next().getKey();
            if (POSTags.contains(key.getPOS())) {
                matchingWords.add(key.getWord());
            }
        }
    }
    return matchingWords;
}
/**
 * Collects the words of the unknown-word table that satisfy every enabled
 * criterion. A criterion that is disabled always passes; when both are
 * disabled the result is empty.
 *
 * @param wordPattern
 *            pattern the word must match
 * @param isWordPatternChecked
 *            whether the word pattern is applied
 * @param flagPattern
 *            pattern the flag must match
 * @param isFlagPatternChecked
 *            whether the flag pattern is applied
 * @return a new set of matching words
 */
public Set<String> getWordsFromUnknownWord(String wordPattern, boolean isWordPatternChecked,
        String flagPattern, boolean isFlagPatternChecked) {
    final Set<String> matchingWords = new HashSet<String>();
    final boolean nothingToCheck = !(isWordPatternChecked || isFlagPatternChecked);
    if (nothingToCheck) {
        return matchingWords;
    }

    for (Iterator<Entry<String, String>> it = this.getUnknownWordHolderIterator(); it.hasNext();) {
        final Entry<String, String> record = it.next();
        final String word = record.getKey();

        // Both criteria are always evaluated, in this order.
        final boolean wordOk = getWordsFromUnknownWordByPatternsHelper(
                wordPattern, isWordPatternChecked, word);
        final boolean flagOk = getWordsFromUnknownWordByPatternsHelper(
                flagPattern, isFlagPatternChecked, record.getValue());

        if (wordOk && flagOk) {
            matchingWords.add(word);
        }
    }
    return matchingWords;
}
/**
 * Evaluates one criterion of an unknown-word query.
 *
 * @param pattern
 *            the pattern of the criterion
 * @param isPatternChecked
 *            whether the criterion is enabled
 * @param text
 *            the text tested against the pattern
 * @return true if the criterion is disabled; false if it is enabled but
 *         the pattern is null; otherwise the result of
 *         {@code StringUtility.isMatchedNullSafe(text, pattern)}
 */
private boolean getWordsFromUnknownWordByPatternsHelper(String pattern,
        boolean isPatternChecked, String text) {
    if (!isPatternChecked) {
        return true;
    }
    if (pattern == null) {
        return false;
    }
    return StringUtility.isMatchedNullSafe(text, pattern);
}
/**
 * Checks whether the unknown-word table holds at least one record that
 * satisfies every enabled criterion. A criterion that is disabled always
 * passes; when both are disabled the answer is false.
 *
 * @param wordPattern
 *            pattern the word must match
 * @param isWordPatternChecked
 *            whether the word pattern is applied
 * @param flagPattern
 *            pattern the flag must match
 * @param isFlagPatternChecked
 *            whether the flag pattern is applied
 * @return true if a matching record exists; false otherwise
 */
public boolean isWordExistInUnknownWord(String wordPattern,
        boolean isWordPatternChecked, String flagPattern,
        boolean isFlagPatternChecked) {
    if (!(isWordPatternChecked || isFlagPatternChecked)) {
        return false;
    }

    for (Iterator<Entry<String, String>> it = this.getUnknownWordHolderIterator(); it.hasNext();) {
        final Entry<String, String> record = it.next();

        // Both criteria are always evaluated, in this order.
        final boolean wordOk = getWordsFromUnknownWordByPatternsHelper(
                wordPattern, isWordPatternChecked, record.getKey());
        final boolean flagOk = getWordsFromUnknownWordByPatternsHelper(
                flagPattern, isFlagPatternChecked, record.getValue());

        if (wordOk && flagOk) {
            return true;
        }
    }
    return false;
}
/**
 * Checks whether the data holder contains a sentence matching the pattern.
 *
 * @param isTagged
 *            if true, sentences whose tag is "ignore" are skipped (a null
 *            tag is accepted); if false, every sentence is considered
 * @param pattern
 *            pattern to match the sentence text against
 * @return true if a considered sentence matches the pattern; false
 *         otherwise
 */
public boolean isExistSentence(boolean isTagged, String pattern) {
    for (Iterator<SentenceStructure> it = getSentenceHolderIterator(); it.hasNext();) {
        final SentenceStructure candidate = it.next();
        final String tag = candidate.getTag();

        // StringUtils.equals is null-safe, so a null tag is never "ignore".
        final boolean isTagGood = !isTagged || !StringUtils.equals(tag, "ignore");
        if (!isTagGood) {
            continue;
        }

        if (StringUtility.isMatchedNullSafe(candidate.getSentence(), pattern)) {
            return true;
        }
    }
    return false;
}
/**
 * Collects all sentences whose tag is not "ignore" (a null tag is
 * accepted) and whose text matches the given pattern.
 *
 * @param pattern
 *            pattern to match the sentence text against
 * @return a new set with the matching sentences; empty if there are none
 */
public Set<SentenceStructure> getTaggedSentenceByPattern(String pattern) {
    final Set<SentenceStructure> matches = new HashSet<SentenceStructure>();
    for (Iterator<SentenceStructure> it = getSentenceHolderIterator(); it.hasNext();) {
        final SentenceStructure candidate = it.next();

        // StringUtils.equals is null-safe, so a null tag is never "ignore".
        if (StringUtils.equals(candidate.getTag(), "ignore")) {
            continue;
        }
        if (StringUtility.isMatchedNullSafe(candidate.getSentence(), pattern)) {
            matches.add(candidate);
        }
    }
    return matches;
}
/**
 * Deletes every entry of the word-POS table that meets all enabled
 * criteria.
 *
 * NOTE(review): when neither criterion is enabled the method returns true
 * without touching the table. That return value is kept from the previous
 * implementation for backward compatibility - confirm whether "delete
 * everything" or "return false" was intended.
 *
 * @param isWordChecked
 *            whether the word criterion is applied
 * @param word
 *            the word an entry must have (compared null-safely)
 * @param isPOSChecked
 *            whether the POS criterion is applied
 * @param POS
 *            the POS tag an entry must have (compared null-safely)
 * @return true if at least one entry has been deleted (or if no criterion
 *         was enabled, see the note above); false otherwise
 */
public boolean deleteWordPOS(boolean isWordChecked, String word,
        boolean isPOSChecked, String POS) {
    if ((!isWordChecked) && (!isPOSChecked)) {
        return true;
    }

    boolean isDeleted = false;
    Iterator<Entry<WordPOSKey, WordPOSValue>> iter = this
            .getWordPOSHolderIterator();
    while (iter.hasNext()) {
        WordPOSKey key = iter.next().getKey();

        boolean isWordPass = !isWordChecked
                || StringUtils.equals(word, key.getWord());
        // This used to branch on isPOSPass (always false at that point)
        // instead of isPOSChecked, so the POS criterion was never applied.
        boolean isPOSPass = !isPOSChecked
                || StringUtils.equals(POS, key.getPOS());

        if (isWordPass && isPOSPass) {
            // Actually delete the entry; it used to be only counted.
            // Iterator.remove is the safe way to delete while iterating.
            iter.remove();
            isDeleted = true;
        }
    }

    return isDeleted;
}
/**
 * Replaces the tag of every sentence whose current tag matches the
 * pattern. A null tag is matched by a null pattern.
 *
 * @param tagPattern
 *            pattern the current tag must match
 * @param newTag
 *            the tag to set on matching sentences
 * @return true if at least one sentence has been re-tagged; false
 *         otherwise
 */
public boolean updateSentenceTag(String tagPattern, String newTag){
    boolean anyRetagged = false;
    for (Iterator<SentenceStructure> it = this.getSentenceHolderIterator(); it.hasNext();) {
        final SentenceStructure candidate = it.next();
        if (!updateSentenceTagHelper(candidate.getTag(), tagPattern)) {
            continue;
        }
        candidate.setTag(newTag);
        anyRetagged = true;
    }
    return anyRetagged;
}
/**
 * Decides whether a tag is selected by a tag pattern.
 *
 * @param tag
 *            the tag of a sentence
 * @param tagPattern
 *            the pattern to test the tag against
 * @return true if both arguments are null; otherwise the result of
 *         {@code StringUtility.isMatchedNullSafe(tag, tagPattern)}
 */
private boolean updateSentenceTagHelper (String tag, String tagPattern) {
    final boolean bothNull = (tag == null) && (tagPattern == null);
    return bothNull || StringUtility.isMatchedNullSafe(tag, tagPattern);
}
/**
 * Collects, in table order, all sentences whose tag matches the pattern.
 *
 * @param tagPattern
 *            pattern the tag of a sentence must match
 * @return a new list with the matching sentences; empty if there are none
 */
public List<SentenceStructure> getSentencesByTagPattern(String tagPattern) {
    final List<SentenceStructure> matches = new LinkedList<SentenceStructure>();
    for (Iterator<SentenceStructure> it = this.getSentenceHolderIterator(); it.hasNext();) {
        final SentenceStructure candidate = it.next();
        if (StringUtility.isMatchedNullSafe(candidate.getTag(), tagPattern)) {
            matches.add(candidate);
        }
    }
    return matches;
}
/**
 * Counts the sentences that satisfy every enabled criterion. A criterion
 * that is disabled always passes, so with both disabled every sentence is
 * counted.
 *
 * @param isModifierUsed
 *            whether the modifier pattern is applied
 * @param mPattern
 *            pattern the modifier of a sentence must match
 * @param isTagUsed
 *            whether the tag pattern is applied
 * @param tPattern
 *            pattern the tag of a sentence must match
 * @return the number of matching sentences
 */
public int getSentenceCount(boolean isModifierUsed, String mPattern,
        boolean isTagUsed, String tPattern) {
    int matching = 0;
    final Iterator<SentenceStructure> it = this.sentenceTable.iterator();
    while (it.hasNext()) {
        final SentenceStructure candidate = it.next();

        // Both criteria are always evaluated, in this order.
        final boolean modifierOk = !isModifierUsed
                || StringUtility.isMatchedNullSafe(candidate.getModifier(), mPattern);
        final boolean tagOk = !isTagUsed
                || StringUtility.isMatchedNullSafe(candidate.getTag(), tPattern);

        if (!(modifierOk && tagOk)) {
            continue;
        }
        matching++;
    }
    return matching;
}
/**
 * Adds the singular form and the plural form of a word as one pair to the
 * singular-plural table. The debug message is logged whether or not the
 * pair was new.
 *
 * @param sgl
 *            singular form
 * @param pl
 *            plural form
 * @return true if the pair was not in the table yet and has been added;
 *         false otherwise
 */
public boolean addSingularPluralPair(String sgl, String pl) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    final Logger log = Logger.getLogger("dataholder.addsingularpluralpair");

    final boolean isNewPair = singularPluralTable.add(new SingularPluralPair(sgl, pl));
    final String message = String.format("Added singular-plural pair (%s, %s)", sgl, pl);
    log.debug(message);

    return isNewPair;
}
// ---- Unknown-word table utility ----
/**
 * Stores the tag of a word in the unknown-word table, replacing any tag
 * stored for it before. Nothing is logged.
 *
 * @param word
 *            the word
 * @param tag
 *            the tag (flag) to store for the word
 */
public void addUnknown(String word, String tag) {
    final Map<String, String> table = this.unknownWordTable;
    table.put(word, tag);
}
// ---- Modifier table utility ----
/**
 * Takes a word and either inserts it into the modifier table or, if it is
 * already there, raises its count by {@code increment}.
 *
 * Words that match the stop pattern, end in "ly" after at least one word
 * character, or contain no word character at all are rejected.
 *
 * @param newWord
 *            the candidate modifier
 * @param increment
 *            amount added to the count of a word already in the table; a
 *            newly inserted word always starts with count 1
 * @return 1 if the word has been newly inserted; 0 if it was rejected or
 *         only its count was raised
 */
public int addModifier(String newWord, int increment) {
    final boolean isStopOrLyWord = newWord.matches("(" + myConstant.STOP + "|^.*\\w+ly$)");
    if (isStopOrLyWord || !newWord.matches("^.*\\w.*$")) {
        return 0;
    }

    if (!this.modifierTable.containsKey(newWord)) {
        this.modifierTable.put(newWord, new ModifierTableValue(1, false));
        return 1;
    }

    // Known word: only the count changes, which does not count as an update.
    final ModifierTableValue record = this.modifierTable.get(newWord);
    record.setCount(record.getCount() + increment);
    return 0;
}
/**
 * Picks one of bPOS and otherPOS and returns it.
 *
 * Every sentence whose tag is not "ignore" (a null tag is accepted) is
 * searched, case-insensitively, for a word with a plural ending directly
 * followed by a space and {@code newWord}. As soon as such a preceding
 * word is confirmed as plural by the word-form utility, bPOS wins.
 *
 * NOTE(review): {@code newWord} is inserted into the regular expression
 * unquoted, so regex metacharacters in it are interpreted - confirm that
 * callers only pass plain words.
 *
 * @param newWord
 *            the word whose POS is in conflict
 * @param bPOS
 *            returned if the word follows a plural noun in some sentence
 * @param otherPOS
 *            returned otherwise
 * @return bPOS or otherPOS, as described above
 */
public String resolveConflict(String newWord, String bPOS, String otherPOS) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.updateDataHolder.resolveConflict");
    myLogger.trace("Enter resolveConflict");

    // Compiled at most once, and only when a sentence is actually searched.
    Pattern pluralThenWord = null;

    // For-each walks the list through its iterator; indexed get(i) is
    // linear on the LinkedList backing the sentence table.
    for (SentenceStructure sentence : this.getSentenceHolder()) {
        String tag = sentence.getTag();
        if (tag != null && tag.equals("ignore")) {
            continue;
        }

        if (pluralThenWord == null) {
            String regex = "^.*?([a-z]+(" + myConstant.PLENDINGS + ")) ("
                    + newWord + ").*$";
            pluralThenWord = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        }

        Matcher m = pluralThenWord.matcher(sentence.getOriginalSentence());
        if (m.find()) {
            String plural = m.group(1).toLowerCase();
            if ("p".equals(this.myWordFormUtility.getNumber(plural))) {
                myLogger.trace("Quite resolveConflict, return: " + bPOS);
                return bPOS;
            }
        }
    }

    myLogger.trace("Quite resolveConflict, return: "+otherPOS);
    return otherPOS;
}
/**
 * Discounts an existing POS of a word, without establishing the suggested
 * POS.
 *
 * Steps:
 * 1. Look up the flag of newWord in the unknown-word table.
 * 2. Collect every word of that table that has the same flag.
 * 3. For each collected word that has an (word, oldPOS) entry in the
 *    word-POS table:
 *    - if its certaintyU is at most 1 AND mode is "all": remove the entry,
 *      reset the word's flag to "unknown", remove every singular-plural
 *      pair containing the word when oldPOS contains 's' or 'p', and
 *      record (word, oldPOS) -> newPOS in the discounted table;
 *    - otherwise: lower certaintyU of the entry by one.
 *
 * NOTE(review): the removal condition combines the two tests with AND,
 * while the description of {@code mode} below suggests "all" alone should
 * remove the POS - confirm which one is intended.
 *
 * @param newWord
 *            the word whose flag selects the words to discount
 * @param oldPOS
 *            the POS to discount
 * @param newPOS
 *            the suggested POS, recorded in the discounted table
 * @param mode
 *            "byone" - reduce certainty 1 by 1. "all" - remove this POS
 */
public void discountPOS(String newWord, String oldPOS, String newPOS,
        String mode) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.updateDataHolder.discountPOS");
    myLogger.trace("Enter discountPOS");

    // Snapshot of all words sharing the flag of newWord; the table itself
    // is modified further down.
    final String flag = this.unknownWordTable.get(newWord);
    final List<String> sameFlagWords = new ArrayList<String>();
    for (Map.Entry<String, String> unknown : this.unknownWordTable.entrySet()) {
        if (unknown.getValue().equals(flag)) {
            sameFlagWords.add(unknown.getKey());
        }
    }
    myLogger.debug(sameFlagWords.toString());

    for (String word : sameFlagWords) {
        final WordPOSKey key = new WordPOSKey(word, oldPOS);
        if (!this.wordPOSTable.containsKey(key)) {
            continue;
        }

        final WordPOSValue record = this.wordPOSTable.get(key);
        final int certainty = record.getCertaintyU();
        final boolean dropPOS = certainty <= 1 && mode.equals("all");

        if (!dropPOS) {
            // Only lower the certainty by one.
            record.setCertiantyU(certainty - 1);
            this.updateWordPOS(key, record);
            continue;
        }

        this.removeWordPOS(key);
        this.updateUnknownWord(word, "unknown");

        if (oldPOS.matches("^.*[sp].*$")) {
            // Two passes: first find the pairs mentioning the word, then
            // remove them from the table.
            final List<SingularPluralPair> doomed = new ArrayList<SingularPluralPair>();
            for (SingularPluralPair pair : this.singularPluralTable) {
                if (pair.getSingular().equals(word)
                        || pair.getPlural().equals(word)) {
                    doomed.add(pair);
                }
            }
            for (SingularPluralPair pair : doomed) {
                this.singularPluralTable.remove(pair);
            }
        }

        this.discountedTable.put(new DiscountedKey(word, oldPOS), newPOS);
    }

    myLogger.trace("Quite discountPOS");
}
/**
 * Given the old role and a new role of a word, decides which role wins.
 *
 * Rules, applied in this order:
 * 1. old role is "*": the new role wins;
 * 2. new role is "*": the old role wins;
 * 3. old role is empty: the new role wins;
 * 4. new role is empty: the old role wins;
 * 5. the roles differ: the result is "+";
 * 6. otherwise the (identical) old role is returned.
 *
 * @param oldRole
 *            the role stored so far; must not be null
 * @param newRole
 *            the role suggested now
 * @return oldRole, newRole or "+", as described above
 */
public String mergeRole(String oldRole, String newRole) {
    // "*" always yields to the other role.
    if (oldRole.equals("*")) {
        return newRole;
    }
    if (newRole.equals("*")) {
        return oldRole;
    }

    // So does the empty role.
    if (oldRole.equals("")) {
        return newRole;
    }
    if (newRole.equals("")) {
        return oldRole;
    }

    // Two real roles: keep the role if they agree, otherwise "+".
    return oldRole.equals(newRole) ? oldRole : "+";
}
/**
 * Find the tag of the sentence of which the clause {@code sentID} is a part.
 *
 * <p>Algorithm:
 * <ol>
 * <li>Read the original sentence of {@code sentID}. A parent is searched for
 * only when, after optional leading whitespace, it does not start with an
 * upper-case letter.</li>
 * <li>Scan the sentences with an index smaller than {@code sentID}, in
 * ascending order, and stop at the first one whose tag is null or is not
 * {@code "ignore"} and whose original sentence either starts with an
 * upper-case letter or ends with a colon.</li>
 * <li>If that sentence's modifier contains a word character, the result is
 * {@code modifier + " " + tag} with all square brackets removed; otherwise
 * the result is its tag (empty when the tag is null).</li>
 * </ol>
 *
 * <p>When no earlier sentence qualifies, the parent tag stays empty. The tag
 * of whichever sentence happened to be scanned last is never returned, and a
 * null tag or null modifier no longer raises a NullPointerException.
 *
 * @param sentID
 *            index of the clause in the sentence table
 * @return {@code "[" + tag + "]"} when a parent tag containing a word
 *         character was found, otherwise {@code "[parenttag]"}
 */
public String getParentSentenceTag(int sentID) {
    String originalSentence = this.sentenceTable.get(sentID)
            .getOriginalSentence();
    String parentTag = "";

    if (originalSentence.matches("^\\s*[^A-Z].*$")) {
        // The sentence table is a linked list: iterate instead of calling
        // get(i) for every index, which would rescan the list each time.
        int index = 0;
        for (SentenceStructure sentence : this.sentenceTable) {
            if (index >= sentID) {
                break;
            }
            index++;

            String tag = sentence.getTag();
            String oSentence = sentence.getOriginalSentence();
            boolean notIgnored = (tag == null) || !tag.matches("ignore");
            boolean looksLikeParent = oSentence.matches("^[A-Z].*$")
                    || oSentence.matches("^.*:\\s*$");
            if (!(notIgnored && looksLikeParent)) {
                continue;
            }

            // Only a qualifying sentence may contribute to the result.
            String candidate = (tag == null) ? "" : tag;
            String modifier = sentence.getModifier();
            if (modifier != null && modifier.matches("^.*\\w.*$")) {
                candidate = (modifier + " " + candidate).replaceAll("[\\[\\]]", "");
            }
            parentTag = candidate;
            break;
        }
    }

    return parentTag.matches("^.*\\w.*$") ? "[" + parentTag + "]" : "[parenttag]";
}
/**
 * Split a parent tag into its modifier and its tag.
 *
 * <p>Two shapes are recognised, each consisting of exactly two words
 * separated by whitespace: the bracketed form {@code [modifier tag]} and the
 * bare form {@code modifier tag}. Any other input yields two empty strings.
 *
 * @param tag
 *            the parent tag to split
 * @return a list with two elements: the modifier first, then the tag
 */
public List<String> getMTFromParentTag(String tag) {
    // Bracketed shape first, then the bare shape; the first match wins.
    String[] shapes = { "^\\[(\\w+)\\s+(\\w+)\\]$", "^(\\w+)\\s+(\\w+)$" };

    String modifier = "";
    String newTag = "";
    for (String shape : shapes) {
        Matcher matcher = Pattern.compile(shape).matcher(tag);
        if (matcher.lookingAt()) {
            modifier = matcher.group(1);
            newTag = matcher.group(2);
            break;
        }
    }

    List<String> pair = new ArrayList<String>();
    pair.add(modifier);
    pair.add(newTag);
    return pair;
}
/**
 * Strip leading "-ly" words from a modifier for as long as each of them is
 * recorded with POS "b" in the WordPOS table.
 *
 * <p>Stripping stops at the first leading word that either does not end in
 * "ly" or has no ("word", "b") entry in the WordPOS table.
 *
 * @param modifier
 *            the modifier to clean; may be null
 * @return the remaining modifier, or null when {@code modifier} is null
 */
public String tagSentWithMTRemoveLyEndingBoundary(String modifier) {
    if (modifier == null) {
        return null;
    }

    Pattern leadingLyWord = Pattern.compile("^(\\w+ly)\\s*(.*)$");
    String remaining = modifier;
    Matcher matcher = leadingLyWord.matcher(remaining);
    // group(1) is the leading -ly word, group(2) is everything after it
    while (matcher.lookingAt()
            && this.wordPOSTable.containsKey(new WordPOSKey(matcher.group(1), "b"))) {
        remaining = matcher.group(2);
        matcher = leadingLyWord.matcher(remaining);
    }
    return remaining;
}
/**
 * Record a word with its POS and role, and also record its singular or
 * plural counterpart when the word is not yet part of a singular-plural pair.
 *
 * <p>The word is first normalised with {@code StringUtility.processWord}.
 * Empty and forbidden words are ignored. A POS of "n" is replaced by the
 * value of {@code myWordFormUtility.getNumber(word)} before the word is
 * marked. Counterparts are marked with role "*" and increment 0 because they
 * are inferred rather than seen in the text.
 *
 * @param word
 *            the word to record
 * @param pos
 *            the POS of the word
 * @param role
 *            the role of the word
 * @param table
 *            the table to update, passed through to {@link #markKnown}
 * @param increment
 *            the increment passed through to {@link #markKnown} for the word
 *            itself
 * @return the sum of the values returned by all {@link #markKnown} calls, or
 *         0 for an empty or forbidden word
 */
public int updateDataHolder(String word, String pos, String role, String table,
        int increment) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.updateDataHolder");
    myLogger.trace(String.format("Enter (%s, %s, %s, %s, %d)", word, pos, role, table, increment));

    word = StringUtility.processWord(word);

    // nothing to learn from an empty or a forbidden word
    if (word.length() < 1
            || word.matches("\\b(?:" + myConstant.FORBIDDEN + ")\\b")) {
        return 0;
    }

    // a generic noun is refined by its grammatical number
    if (pos.equals("n")) {
        pos = this.myWordFormUtility.getNumber(word);
    }

    int result = markKnown(word, pos, role, table, increment);
    myLogger.trace("result1: " + result);

    // the counterpart is already known: nothing more to do
    if (this.isInSingularPluralPair(word)) {
        myLogger.trace("Return: "+result+"\n");
        return result;
    }

    myLogger.trace("Search for singular-plural pair of word: " + word);
    if (pos.equals("p")) {
        // plural word: derive and record its singular form
        myLogger.trace("Case 1");
        String plural = word;
        String singular = this.myWordFormUtility.getSingular(plural);
        myLogger.trace(String.format("Get singular form of %s: %s", plural,
                singular));
        result += this.markKnown(singular, "s", "*", table, 0);
        myLogger.trace("result2: " + result);
        this.addSingularPluralPair(singular, plural);
        myLogger.trace(String.format("Added (%s, %s)", singular, plural));
    } else if (pos.equals("s")) {
        // singular word: derive and record every plural form
        myLogger.trace("Case 2");
        String singular = word;
        for (String plural : this.myWordFormUtility.getPlural(singular)) {
            if (plural.matches("^.*\\w.*$")) {
                result += this.markKnown(plural, "p", "*", table, 0);
                myLogger.trace("result3: " + result);
            }
            this.addSingularPluralPair(singular, plural);
            myLogger.trace(String.format("Added (%s, %s)", singular, plural));
        }
    } else {
        myLogger.trace("Nothing added");
    }

    myLogger.trace("Return: "+result+"\n");
    return result;
}
/**
 * Mark a word with its POS and role, then use it to learn unknown words that
 * differ from it only by a prefix or by singular/plural form.
 *
 * <p>Forbidden words are ignored. Stop words are recorded but nothing is
 * inferred from them. For every other word:
 * <ul>
 * <li>Case 1: if the word starts with a known prefix, unknown words made of
 * an optional other prefix plus a singular/plural variation of the remainder
 * are recorded.</li>
 * <li>Case 2: if the word starts with a lower-case letter, unknown words made
 * of a known prefix plus a variation of the word (2.1), or of anything
 * followed by "_" plus a variation of the word (2.2), are recorded.</li>
 * </ul>
 * Inferred words are recorded with role "*" and increment 0.
 *
 * @param word
 *            the word to mark
 * @param pos
 *            the pos of the word
 * @param role
 *            the role of the word
 * @param table
 *            which table to mark, passed through to {@link #processNewWord}
 * @param increment
 *            the increment used for the word itself
 * @return the sum of the values returned by all {@link #processNewWord}
 *         calls, or 0 for a forbidden word
 */
public int markKnown(String word, String pos, String role, String table,
        int increment) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.updateDataHolder.markKnown");
    myLogger.trace("Enter markKnown");

    // forbidden word
    if (word.matches("\\b(?:" + myConstant.FORBIDDEN + ")\\b")) {
        return 0;
    }

    // the word itself is always processed; a stop word goes no further
    boolean isStopWord = word.matches("^(" + myConstant.STOP + ")$");
    int sign = processNewWord(word, pos, role, table, word, increment);
    if (isStopWord) {
        return sign;
    }

    // Case 1: the word carries a prefix, learn its prefix-less relatives
    Matcher prefixMatcher = Pattern.compile(
            "^(" + myConstant.PREFIX + ")(\\S+).*$").matcher(word);
    if (prefixMatcher.lookingAt()) {
        myLogger.trace("Case 1");
        String prefix = prefixMatcher.group(1);
        String remainder = prefixMatcher.group(2);
        String otherPrefix = StringUtility.removeFromWordList(prefix, myConstant.PREFIX);
        String spWords = "("
                + StringUtility.escape(this.singularPluralVariations(remainder,
                        this.getSingularPluralHolder())) + ")";
        String pattern = "^(" + otherPrefix + ")?" + spWords + "$";

        for (Map.Entry<String, String> entry : this.getUnknownWordHolder().entrySet()) {
            String candidate = entry.getKey();
            if (candidate.matches(pattern) && entry.getValue().equals("unknown")) {
                sign += processNewWord(candidate, pos, "*", table, word, 0);
                myLogger.trace("Case 1.1");
                myLogger.trace("by removing prefix of " + word + ", know "
                        + candidate + " is a [" + pos + "]");
            }
        }
    }

    // Case 2: word starts with a lower case letter
    if (word.matches("^[a-z].*$")) {
        myLogger.trace("Case 2");

        // case 2.1: word=shrubs, pattern = (pre|sub)shrubs
        String prefixedVariations = "("
                + StringUtility.escape(this.singularPluralVariations(word,
                        this.getSingularPluralHolder())) + ")";
        String prefixedPattern = "^(" + myConstant.PREFIX + ")" + prefixedVariations + "$";
        for (Map.Entry<String, String> entry : this.getUnknownWordHolder().entrySet()) {
            String candidate = entry.getKey();
            if (candidate.matches(prefixedPattern) && entry.getValue().equals("unknown")) {
                sign += processNewWord(candidate, pos, "*", table, word, 0);
                myLogger.debug("Case 2.1");
                myLogger.debug("by adding a prefix to " + word
                        + ", know " + candidate + " is a [" + pos + "]");
            }
        }

        // case 2.2: word_$spwords; the variations are computed again after
        // the case 2.1 pass, as before
        String joinedVariations = "("
                + StringUtility.escape(this.singularPluralVariations(word,
                        this.getSingularPluralHolder())) + ")";
        String joinedPattern = "^.*_" + joinedVariations + "$";
        for (Map.Entry<String, String> entry : this.getUnknownWordHolder().entrySet()) {
            String candidate = entry.getKey();
            if (candidate.matches(joinedPattern) && entry.getValue().equals("unknown")) {
                sign += processNewWord(candidate, pos, "*", table, word, 0);
                myLogger.debug("Case 2.2");
                myLogger.debug("by adding a prefix to " + word
                        + ", know " + candidate + " is a [" + pos + "]");
            }
        }
    }

    return sign;
}
/**
 * Handle a word on behalf of {@link #markKnown}: update its flag in the
 * unknown-word holder, then record it in the requested table.
 *
 * @param newWord
 *            the word to record
 * @param pos
 *            the POS of the word (used for the "wordpos" table only)
 * @param role
 *            the role of the word (used for the "wordpos" table only)
 * @param table
 *            which table to update: "wordpos" or "modifiers"
 * @param flag
 *            the flag stored for the word in the unknown-word holder
 * @param increment
 *            the increment passed to the table update
 * @return the value returned by {@code updatePOS} or {@code addModifier},
 *         or 0 when {@code table} is neither "wordpos" nor "modifiers"
 */
public int processNewWord(String newWord, String pos, String role,
        String table, String flag, int increment) {
    // the word's flag in the unknown-word holder is updated first
    this.updateUnknownWord(newWord, flag);

    // then the word goes into the requested data holder
    if (table.equals("wordpos")) {
        return updatePOS(newWord, pos, role, increment);
    }
    if (table.equals("modifiers")) {
        return this.addModifier(newWord, increment);
    }
    return 0;
}
/**
 * Update the POS of a word in the WordPOS holder.
 *
 * <p>A word matching the NUM / number / cluster-string / character pattern
 * is never recorded with POS n, s or p (0 is returned). Otherwise:
 * <ul>
 * <li>Case 1: the word has no entry yet, so a new entry is added.</li>
 * <li>Case 2.1: the word has an entry, the old and new POS differ and one of
 * them is "b". {@code resolveConflict} picks the winner. If the winner
 * differs from the old POS, {@link #changePOS} is applied; otherwise the
 * existing entry (old POS) gets its role and certaintyU updated.</li>
 * <li>Case 2.2: any other existing entry. The roles are merged with
 * {@link #mergeRole} and the entry for (word, newPOS) is written with the
 * increased certaintyU.</li>
 * </ul>
 * Finally the certaintyL of every entry of the word is set to the sum of the
 * certaintyU values of all entries of the word.
 *
 * @param newWord
 *            the word to update
 * @param newPOS
 *            the POS observed for the word
 * @param newRole
 *            the role observed for the word
 * @param increment
 *            the amount added to certaintyU
 * @return 1 when a new entry was added, the value returned by
 *         {@link #changePOS} when the POS was changed, otherwise 0
 */
public int updatePOS(String newWord, String newPOS, String newRole, int increment) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.updateDataHolder.updatePOS");
    myLogger.trace("Enter updatePOS");
    myLogger.trace("Word: "+newWord+", POS: "+newPOS);

    int n = 0;

    String regex = "^.*(\\b|_)(NUM|" + myConstant.NUMBER + "|"
            + myConstant.CLUSTERSTRING + "|" + myConstant.CHARACTER + ")\\b.*$";
    boolean case1 = newWord.matches(regex);
    boolean case2 = newPOS.matches("[nsp]");
    if (case1 && case2) {
        myLogger.trace("Case 0");
        myLogger.trace("Quite updatePOS");
        return 0;
    }

    List<Entry<WordPOSKey, WordPOSValue>> entryList = getWordPOSEntries(newWord);
    int certaintyU = 0;

    if (entryList.size() == 0) {
        // case 1: the word does not exist, add it
        myLogger.trace("Case 1");
        certaintyU += increment;
        this.updateWordPOS(new WordPOSKey(newWord, newPOS),
                new WordPOSValue(newRole, certaintyU, 0, null, null));
        n = 1;
        myLogger.trace(String.format("\t: new [%s] pos=%s, role =%s, certaintyU=%d", newWord, newPOS, newRole, certaintyU));
    } else {
        // case 2: the word already exists, update it
        myLogger.trace("Case 2");
        Entry<WordPOSKey, WordPOSValue> targetWordPOS = entryList.get(0);
        String oldPOS = targetWordPOS.getKey().getPOS();
        String oldRole = targetWordPOS.getValue().getRole();
        certaintyU = targetWordPOS.getValue().getCertaintyU();

        boolean posDiffers = !oldPOS.equals(newPOS);
        boolean boundaryInvolved = oldPOS.equals("b") || newPOS.equals("b");
        if (posDiffers && boundaryInvolved) {
            // case 2.1: conflict between "b" and another POS
            myLogger.trace("Case 2.1");
            String otherPOS = newPOS.equals("b") ? oldPOS : newPOS;
            String resolvedPOS = this.resolveConflict(newWord, "b", otherPOS);
            boolean newPOSWins = (resolvedPOS != null) && !resolvedPOS.equals(oldPOS);

            if (newPOSWins) {
                newRole = newRole.equals("*") ? "" : newRole;
                n = n + changePOS(newWord, oldPOS, resolvedPOS, newRole, increment);
            } else {
                // The old POS wins, so the existing entry is the one to
                // update. Using oldPOS (instead of the resolved value) keeps
                // this correct when the conflict could not be resolved and
                // resolveConflict returned null.
                newRole = oldRole.equals("*") ? newRole : oldRole;
                certaintyU = certaintyU + increment;
                this.updateWordPOS(newWord, oldPOS, newRole, certaintyU, 0, null, null);
                myLogger.debug(String.format("\t: update [%s (%s):a] role: %s=>%s, certaintyU=%d\n",
                        newWord, oldPOS, oldRole, newRole, certaintyU));
            }
        } else {
            // case 2.2: no conflict involving "b", update role and certaintyU
            myLogger.trace("Case 2.2");
            newRole = this.mergeRole(oldRole, newRole);
            certaintyU += increment;
            this.updateWordPOS(newWord, newPOS, newRole, certaintyU, 0, null, null);
            myLogger.debug(String.format("\t: update [%s (%s):b] role: %s => %s, certaintyU=%d\n",
                    newWord, newPOS, oldRole, newRole, certaintyU));
        }
    }

    // certaintyL of a word = sum of certaintyU over all of its entries
    int certaintyL = 0;
    Iterator<Map.Entry<WordPOSKey, WordPOSValue>> sumIter = this.getWordPOSHolderIterator();
    while (sumIter.hasNext()) {
        Map.Entry<WordPOSKey, WordPOSValue> e = sumIter.next();
        if (e.getKey().getWord().equals(newWord)) {
            certaintyL += e.getValue().getCertaintyU();
        }
    }
    // The sum must be stored as certaintyL. Writing it to certaintyU, as
    // was done before, overwrote the per-entry counts with the total.
    Iterator<Map.Entry<WordPOSKey, WordPOSValue>> storeIter = this.getWordPOSHolderIterator();
    while (storeIter.hasNext()) {
        Map.Entry<WordPOSKey, WordPOSValue> e = storeIter.next();
        if (e.getKey().getWord().equals(newWord)) {
            e.getValue().setCertiantyL(certaintyL);
        }
    }

    myLogger.debug(String.format("\t: total occurance of [%s] = %d\n", newWord, certaintyL));
    myLogger.trace("Return: " + n);
    return n;
}
/**
 * Correct the POS of a word from {@code oldPOS} to {@code newPOS}.
 *
 * <p>Three transitions are handled (both POS values are lower-cased first):
 * <ul>
 * <li>Case 1, "s" to "m": the old POS is discounted, the word is marked as a
 * modifier, and every sentence tagged with the word is re-tagged from its
 * parent sentence tag.</li>
 * <li>Case 2, "s" to "b": the old POS is discounted, a (word, newPOS) entry
 * is added when missing, and every sentence tagged with the word is reset to
 * the tag "NULL".</li>
 * <li>Case 3, "b" to "s": the old POS is discounted and a (word, newPOS)
 * entry is added when missing.</li>
 * </ul>
 * For any other combination no entry is changed. In all cases, when the sum
 * of certaintyU over the entries of the word is positive, it is stored as
 * the certaintyL of each of those entries.
 *
 * @param newWord
 *            the word whose POS changes
 * @param oldPOS
 *            the POS to replace
 * @param newPOS
 *            the POS to establish
 * @param newRole
 *            the role stored with a newly added entry
 * @param increment
 *            the amount added to the certaintyU carried over to the new entry
 * @return in case 1 the value returned by {@link #markKnown}; in cases 2 and
 *         3 the value 1; otherwise 0
 */
public int changePOS(String newWord, String oldPOS, String newPOS,
        String newRole, int increment) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.updateDataHolder.changePOS");
    myLogger.trace("Enter changePOS");
    myLogger.trace("newWord: "+newWord);
    myLogger.trace("oldPOS: "+oldPOS);
    myLogger.trace("newPOS: "+newPOS);
    myLogger.trace("newRole: "+newRole);

    oldPOS = oldPOS.toLowerCase();
    newPOS = newPOS.toLowerCase();

    int sign = 0;

    if (oldPOS.equals("s") && newPOS.equals("m")) {
        // case 1: oldPOS is "s" AND newPOS is "m"
        myLogger.trace("Case 1");
        this.discountPOS(newWord, oldPOS, newPOS, "all");
        sign += markKnown(newWord, "m", "", "modifiers", increment);

        // For all the sentences tagged with the word, re-tag by finding
        // their parent tag.
        for (int i = 0; i < this.getSentenceHolder().size(); i++) {
            SentenceStructure sent = this.getSentenceHolder().get(i);
            // null-safe: sentences without a tag are simply skipped
            if (!StringUtils.equals(sent.getTag(), newWord)) {
                continue;
            }
            int sentID = i;
            String sentence = sent.getSentence();

            String oldModifier = sent.getModifier();
            String modifier = (oldModifier == null ? "" : oldModifier) + " " + newWord;
            // Strings are immutable: the result has to be assigned back
            modifier = modifier.replaceAll("^\\s*", "");

            // getMTFromParentTag returns [modifier, tag], i.e. index 0 and 1
            List<String> pair = this.getMTFromParentTag(this.getParentSentenceTag(sentID));
            String parentModifier = pair.get(0);
            String parentTag = pair.get(1);
            if (parentModifier.matches("^.*\\w.*$")) {
                modifier = modifier + " " + parentModifier;
            }
            this.tagSentenceWithMT(sentID, sentence, modifier, parentTag, "changePOS[n->m:parenttag]");
        }
    } else if (oldPOS.equals("s") && newPOS.equals("b")) {
        // case 2: oldPOS is "s" AND newPOS is "b"
        myLogger.trace("Case 2");
        int certaintyU = 0;

        // find (newWord, oldPOS): carry its certaintyU over, then discount it
        WordPOSKey newOldKey = new WordPOSKey(newWord, oldPOS);
        if (this.getWordPOSHolder().containsKey(newOldKey)) {
            WordPOSValue v = this.getWordPOSHolder().get(newOldKey);
            certaintyU = v.getCertaintyU();
            certaintyU += increment;
            this.discountPOS(newWord, oldPOS, newPOS, "all");
        }

        // find (newWord, newPOS): add it when missing
        WordPOSKey newNewKey = new WordPOSKey(newWord, newPOS);
        if (!this.getWordPOSHolder().containsKey(newNewKey)) {
            this.add2Holder(DataHolder.WORDPOS,
                    Arrays.asList(new String [] {newWord, newPOS, newRole, Integer.toString(certaintyU), "0", "", ""}));
        }

        myLogger.debug("\t: change ["+newWord+"("+oldPOS+" => "+newPOS+")] role=>"+newRole+"\n");
        sign++;

        // all sentences tagged with the word are reset
        for (int i = 0; i < this.getSentenceHolder().size(); i++) {
            String thisTag = this.getSentenceHolder().get(i).getTag();
            String thisSent = this.getSentenceHolder().get(i).getSentence();
            if (StringUtils.equals(thisTag, newWord)) {
                this.tagSentenceWithMT(i, thisSent, "", "NULL", "changePOS[s->b: reset to NULL]");
            }
        }
    } else if (oldPOS.equals("b") && newPOS.equals("s")) {
        // case 3: oldPOS is "b" AND newPOS is "s"
        myLogger.trace("Case 3");
        int certaintyU = 0;

        // find (newWord, oldPOS): carry its certaintyU over, then discount it
        WordPOSKey newOldKey = new WordPOSKey(newWord, oldPOS);
        if (this.getWordPOSHolder().containsKey(newOldKey)) {
            WordPOSValue v = this.getWordPOSHolder().get(newOldKey);
            certaintyU = v.getCertaintyU();
            certaintyU += increment;
            this.discountPOS(newWord, oldPOS, newPOS, "all");
        }

        // find (newWord, newPOS): add it when missing. The presence check
        // has to look at the NEW key, as case 2 does; checking the old key
        // skipped the insertion whenever the old entry survived discounting.
        WordPOSKey newNewKey = new WordPOSKey(newWord, newPOS);
        if (!this.getWordPOSHolder().containsKey(newNewKey)) {
            this.updateWordPOS(newNewKey, new WordPOSValue(newRole,
                    certaintyU, 0, "", ""));
        }

        myLogger.debug("\t: change ["+newWord+"("+oldPOS+" => "+newPOS+")] role=>"+newRole+"\n");
        sign++;
    }

    // certaintyL of a word = sum of certaintyU over all of its entries
    int sum_certaintyU = this.getSumCertaintyU(newWord);
    if (sum_certaintyU > 0) {
        Iterator<Map.Entry<WordPOSKey, WordPOSValue>> iter2 = this.getWordPOSHolderIterator();
        while (iter2.hasNext()) {
            Map.Entry<WordPOSKey, WordPOSValue> e = iter2.next();
            if (e.getKey().getWord().equals(newWord)) {
                e.getValue().setCertiantyL(sum_certaintyU);
            }
        }
    }

    myLogger.trace("Return: "+sign);
    myLogger.trace("Quite changePOS\n");
    return sign;
}
/**
 * Store a modifier and a tag on the sentence {@code sentID}.
 *
 * <p>Steps:
 * <ol>
 * <li>Return immediately when {@code getSentence(sentID)} is null.</li>
 * <li>Pre-process a non-null modifier with
 * {@link #tagSentWithMTPreProcessing}, remove leading -ly boundary words
 * with {@link #tagSentWithMTRemoveLyEndingBoundary}, and trim it.</li>
 * <li>Pre-process and trim a non-null tag, then cut it to the configured
 * maximum tag length.</li>
 * <li>Write tag and modifier to the sentence. A null tag or modifier is
 * stored as null.</li>
 * </ol>
 *
 * @param sentID
 *            index of the sentence in the sentence table
 * @param sentence
 *            the sentence text; only logged
 * @param modifier
 *            the modifier to store; may be null
 * @param tag
 *            the tag to store; may be null
 * @param label
 *            a label identifying the caller; only logged
 */
public void tagSentenceWithMT(int sentID, String sentence, String modifier,
        String tag, String label) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.updateDataHolder.tagSentenceWithWT");
    myLogger.trace(String.format("Enter (%d, %s, %s, %s, %s)", sentID,
            sentence, modifier, tag, label));

    if (this.getSentence(sentID) == null) {
        return;
    }

    if (modifier != null) {
        // modifier preprocessing
        modifier = this.tagSentWithMTPreProcessing(modifier);
        // Remove any -ly ending word which is a "b" in the WordPOS, from
        // the modifier
        modifier = this.tagSentWithMTRemoveLyEndingBoundary(modifier);
        modifier = StringUtility.removeAll(modifier, "(^\\s*|\\s*$)");
    }

    if (tag != null) {
        tag = this.tagSentWithMTPreProcessing(tag);
        tag = StringUtility.removeAll(tag, "(^\\s*|\\s*$)");
    }
    if (tag != null && tag.length() > this.myConfiguration.getMaxTagLength()) {
        tag = tag.substring(0, this.myConfiguration.getMaxTagLength());
    }

    // One write is enough. The former trailing loop repeated this same
    // assignment on the same sentence once per sentence in the table.
    SentenceStructure target = this.sentenceTable.get(sentID);
    target.setTag(tag);
    target.setModifier(modifier);

    myLogger.trace(label);
    myLogger.trace("Quite tagSentenceWithMT");
}
/**
 * Clean a modifier or a tag before it is stored on a sentence.
 *
 * <p>In order: markup of the form {@code <...>} is deleted; stop and
 * forbidden words are removed from the beginning; stop words, forbidden
 * words and words ending in "ly" are removed from the end; pronouns are
 * removed everywhere. The three word removals are delegated to
 * {@code StringUtility.removeAllRecursive}.
 *
 * @param text
 *            the modifier or tag to clean; may be null
 * @return the cleaned text, or null when {@code text} is null
 */
public String tagSentWithMTPreProcessing(String text) {
    if (text == null) {
        return null;
    }

    String stopOrForbidden = myConstant.STOP + "|" + myConstant.FORBIDDEN;

    // drop markup such as <tag>
    String cleaned = text.replaceAll("<\\S+?>", "");
    // remove stop and forbidden words from the beginning
    cleaned = StringUtility.removeAllRecursive(cleaned,
            "^(" + stopOrForbidden + ")\\b\\s*");
    // remove stop words, forbidden words and -ly words from the ending
    cleaned = StringUtility.removeAllRecursive(cleaned,
            "\\s*\\b(" + stopOrForbidden + "|\\w+ly)$");
    // remove all pronoun words
    cleaned = StringUtility.removeAllRecursive(cleaned,
            "\\b(" + myConstant.PRONOUN + ")\\b");

    return cleaned;
}
/**
 * Add up the certaintyU values of all WordPOS entries of a word.
 *
 * @param word
 *            the word to look up
 * @return the sum of certaintyU over every entry whose word equals
 *         {@code word}; 0 when there is no such entry
 */
public int getSumCertaintyU(String word) {
    int total = 0;
    for (Iterator<Map.Entry<WordPOSKey, WordPOSValue>> entries = this.getWordPOSHolderIterator();
            entries.hasNext();) {
        Map.Entry<WordPOSKey, WordPOSValue> entry = entries.next();
        if (entry.getKey().getWord().equals(word)) {
            total += entry.getValue().getCertaintyU();
        }
    }
    return total;
}
/**
 * Build a "|"-separated alternation of a word and its singular/plural
 * counterparts.
 *
 * <p>The word itself always comes first. For every pair whose singular
 * equals the word, its non-empty plural is appended; for every pair whose
 * plural equals the word, its non-empty singular is appended. Trailing "|"
 * characters are removed from the result.
 *
 * @param word
 *            the word to expand
 * @param singularPluralHolder
 *            the singular-plural pairs to search
 * @return all variations of the word, joined with "|"
 */
public String singularPluralVariations(String word, Set<SingularPluralPair> singularPluralHolder) {
    StringBuilder variations = new StringBuilder();
    variations.append(word).append("|");

    for (SingularPluralPair pair : singularPluralHolder) {
        String singular = pair.getSingular();
        String plural = pair.getPlural();
        if (singular.equals(word) && !plural.equals("")) {
            variations.append(plural).append("|");
        }
        if (plural.equals(word) && !singular.equals("")) {
            variations.append(singular).append("|");
        }
    }

    // drop the separator(s) left at the end
    return StringUtility.removeAll(variations.toString(), "\\|+$");
}
/**
 * Mark the words between the start index and the end index as modifiers if
 * they are valid words.
 *
 * <p>The words are taken with {@code StringUtility.stringArraySplice}. The
 * last of the spliced words is never marked; every other word that passes
 * {@link #updateDataHolderNNConditionHelper} is recorded as a modifier
 * ("m") in the "modifiers" table with an increment of 1.
 *
 * @param start
 *            the start index
 * @param end
 *            the end index
 * @param words
 *            a list of words
 * @return the sum of the values returned by {@link #updateDataHolder}
 */
public int updateDataHolderNN(int start, int end, List<String> words) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.updateDataHolderNN");
    myLogger.trace(String.format("Enter (%d, %d, %s)", start, end,
            words.toString()));

    List<String> candidates = StringUtility.stringArraySplice(words, start, end);
    int update = 0;

    // every spliced word except the last one is a modifier candidate
    for (int index = 0; index + 1 < candidates.size(); index++) {
        String candidate = candidates.get(index);
        myLogger.trace("Check N: " + candidate);
        if (!this.updateDataHolderNNConditionHelper(candidate)) {
            continue;
        }
        myLogger.trace("Update N: " + candidate);
        int gained = this.updateDataHolder(candidate, "m", "", "modifiers", 1);
        update += gained;
        myLogger.trace("Return: " + gained);
    }

    myLogger.trace("Return: " + update + "\n");
    return update;
}
/**
 * A helper of method updateDataHolderNN. Decide whether a word may be marked
 * as a modifier.
 *
 * @param word
 *            the word to check
 * @return true when the word contains no stop word, does not end in "ly"
 *         (optionally followed by whitespace), and contains no forbidden
 *         word; false otherwise
 */
public boolean updateDataHolderNNConditionHelper(String word) {
    // contains a stop word
    if (word.matches("^.*\\b("+myConstant.STOP+")\\b.*$")) {
        return false;
    }
    // ends in -ly
    if (word.matches("^.*ly\\s*$")) {
        return false;
    }
    // contains a forbidden word
    return !word.matches("^.*\\b("+myConstant.FORBIDDEN+")\\b.*$");
}
/**
 * Return (POS, role, certaintyU, certaintyL) of a word.
 *
 * <p>Leading and trailing whitespace is removed from the word first. A word
 * that starts with a digit is reported as a single entry with POS "b", an
 * empty role and both certainties set to 1, without consulting the WordPOS
 * holder. Otherwise one entry is returned per WordPOS entry of the word,
 * sorted by the natural ordering of {@code POSInfo} and then reversed.
 *
 * @param word
 *            the word to check
 * @return entries of (POS, role, certaintyU, certaintyL) of the word in a
 *         list; empty when the word is not in the WordPOS holder
 */
public List<POSInfo> checkPOSInfo(String word) {
    PropertyConfigurator.configure( "conf/log4j.properties" );
    Logger myLogger = Logger.getLogger("dataholder.checkPOSInfo");
    myLogger.trace("Enter ("+word+")");

    List<POSInfo> POSInfoList = new ArrayList<POSInfo>();

    // trim both ends
    word = StringUtility.removeAll(word, "^\\s*");
    word = StringUtility.removeAll(word, "\\s+$");

    // anything starting with a digit is reported as a boundary word
    if (word.matches("^\\d+.*$")) {
        POSInfoList.add(new POSInfo(word, "b", "", 1, 1));
        myLogger.trace("Reture: "+POSInfoList);
        return POSInfoList;
    }

    // collect every entry recorded for the word
    for (Iterator<Map.Entry<WordPOSKey, WordPOSValue>> entries = this.getWordPOSHolderIterator();
            entries.hasNext();) {
        Map.Entry<WordPOSKey, WordPOSValue> entry = entries.next();
        if (!entry.getKey().getWord().equals(word)) {
            continue;
        }
        WordPOSValue value = entry.getValue();
        POSInfoList.add(new POSInfo(word, entry.getKey().getPOS(), value.getRole(),
                value.getCertaintyU(), value.getCertaintyL()));
    }

    // something was found: sort ascending, then reverse into descending order
    if (!POSInfoList.isEmpty()) {
        Collections.sort(POSInfoList);
        Collections.reverse(POSInfoList);
    }

    myLogger.trace("Reture: "+POSInfoList);
    return POSInfoList;
}
/**
 * Returns the BMS word set of this data holder (per the field's declaration
 * comment: words that are singular nouns, boundary words, and modifiers).
 *
 * <p>The internal set itself is returned, not a copy, so changes made
 * through the returned reference are visible to this data holder.
 *
 * @return the BMS word set
 */
public Set<String> getBMSWords() {
return this.BMSWords;
}
/**
 * Collect the words of the modifier holder that are flagged as type
 * modifiers.
 *
 * @return a new set holding the key of every modifier entry whose value
 *         reports {@code getIsTypeModifier()} as true
 */
public Set<String> getTypeModifierPattern() {
    Set<String> typeModifiers = new HashSet<String>();
    for (Iterator<Entry<String, ModifierTableValue>> entries = this.getModifierHolderIterator();
            entries.hasNext();) {
        Entry<String, ModifierTableValue> entry = entries.next();
        if (!entry.getValue().getIsTypeModifier()) {
            continue;
        }
        typeModifiers.add(entry.getKey());
    }
    return typeModifiers;
}
/**
 * Get the set of all sentence tags which are not "ignore".
 *
 * <p>A sentence without a tag contributes a {@code null} element, because a
 * null tag is not equal to "ignore".
 *
 * <p>The sentence table is a linked list, so it is walked with an iterator
 * rather than by index; indexed access would rescan the list for every
 * sentence.
 *
 * @return a new set of tags
 */
public Set<String> getCurrentTags() {
    Set<String> tags = new HashSet<String>();
    for (SentenceStructure sentence : this.sentenceTable) {
        String tag = sentence.getTag();
        if (!StringUtils.equals(tag, "ignore")) {
            tags.add(tag);
        }
    }
    return tags;
}
/**
 * Strip markup of the form {@code <...>} from the text of every sentence
 * whose tag is null or "ignore" and whose text matches "&lt;" according to
 * {@code StringUtility.isMatchedNullSafe}.
 *
 * <p>The former second replacement, {@code replaceAll("'", "\\'")}, has been
 * removed. In a replacement string a backslash only escapes the character
 * after it, so that call replaced every apostrophe with an apostrophe and
 * changed nothing.
 */
public void untagSentences(){
    for (SentenceStructure sentenceItem : this.sentenceTable) {
        String tag = sentenceItem.getTag();
        boolean untagged = (tag == null) || StringUtils.equals(tag, "ignore");
        String sentence = sentenceItem.getSentence();
        if (untagged && StringUtility.isMatchedNullSafe(sentence, "<")) {
            sentenceItem.setSentence(sentence.replaceAll("<\\S+?>", ""));
        }
    }
}
// add2Holder
/**
 * Add every word of a set to the WordPOS holder with the given POS, an empty
 * role, zero certainties and no saved flag or saved ID.
 *
 * <p>Previously only the first word returned by the set's iterator was
 * added, and an empty set raised a NoSuchElementException. All words are
 * added now; an empty or null set adds nothing.
 *
 * @param words
 *            the words to add; may be empty or null
 * @param POS
 *            the POS recorded for each word
 */
public void addWords2WordPOSHolder(Set<String> words, String POS) {
    if (words == null) {
        return;
    }
    for (String word : words) {
        this.add2WordPOSHolder(word, POS, "", 0, 0, null, null);
    }
}
/**
 * Write a (word, POS) entry to the WordPOS holder unless an equal entry is
 * already stored.
 *
 * @param word
 *            the word of the entry key
 * @param POS
 *            the POS of the entry key
 * @param role
 *            the role of the entry value
 * @param certaintyU
 *            the certaintyU of the entry value
 * @param certaintyL
 *            the certaintyL of the entry value
 * @param savedFlag
 *            the saved flag of the entry value
 * @param savedID
 *            the saved ID of the entry value
 * @return false when the key is already present with an equal value (nothing
 *         is written); true when the entry was written
 */
public boolean add2WordPOSHolder(String word, String POS, String role, int certaintyU, int certaintyL, String savedFlag, String savedID) {
    WordPOSKey key = new WordPOSKey(word, POS);
    WordPOSValue value = new WordPOSValue(role, certaintyU, certaintyL, savedFlag, savedID);

    boolean alreadyStored = this.wordPOSTable.containsKey(key)
            && this.wordPOSTable.get(key).equals(value);
    if (alreadyStored) {
        return false;
    }

    this.updateWordPOS(key, value);
    return true;
}
public void writeToFile(String dir, String fileNamePrefix) {
if (fileNamePrefix == null) {
fileNamePrefix = "";
}
if (!StringUtils.equals(fileNamePrefix, "")) {
fileNamePrefix = fileNamePrefix + "_";
}
// writer
PrintWriter writer = null;
// Discounted Holder
try {
String fullPath = dir + "/" + fileNamePrefix + "Discounted.csv";
File file = new File(fullPath);
file.getParentFile().mkdirs();
writer = new PrintWriter(fullPath, "UTF-8");
// write header
writer.println("word, discounted POS, possible new POS");
// write content
Iterator<Entry<DiscountedKey, String>> iter = this.discountedTable.entrySet().iterator();
while (iter.hasNext()) {
Entry<DiscountedKey, String> discountedEntry = iter.next();
writer.println(String.format("%s, %s, %s",
discountedEntry.getKey().getWord(),
discountedEntry.getKey().getPOS(),
discountedEntry.getValue()
));
}
writer.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// HeuristicNouns Holder
try {
String fullPath = dir + "/" + fileNamePrefix + "HeuristicNouns.csv";
File file = new File(fullPath);
file.getParentFile().mkdirs();
writer = new PrintWriter(fullPath, "UTF-8");
// write header
writer.println("word, type");
// write content
Iterator<Entry<String, String>> iter = this.heuristicNounTable.entrySet().iterator();
while (iter.hasNext()) {
Entry<String, String> heuristicNounEntry = iter.next();
writer.println(String.format("%s, %s",
heuristicNounEntry.getKey(),
heuristicNounEntry.getValue()
));
}
writer.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// IsA Holder
try {
String fullPath = dir + "/" + fileNamePrefix + "IsA.csv";
File file = new File(fullPath);
file.getParentFile().mkdirs();
writer = new PrintWriter(fullPath, "UTF-8");
// write header
writer.println("ID, instance, class");
// write content
Iterator<Entry<Integer, IsAValue>> iter = this.isATable.entrySet().iterator();
while (iter.hasNext()) {
Entry<Integer, IsAValue> isAEntry = iter.next();
writer.println(String.format("%d, %s, %s",
isAEntry.getKey(),
isAEntry.getValue().getInstance(),
isAEntry.getValue().getCls()
));
}
writer.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// Modifiers Holder
try {
String fullPath = dir + "/" + fileNamePrefix + "Modifiers.csv";
File file = new File(fullPath);
file.getParentFile().mkdirs();
writer = new PrintWriter(fullPath, "UTF-8");
// write header
writer.println("word, count, is type modifier");
// write content
Iterator<Entry<String, ModifierTableValue>> iter = this.modifierTable.entrySet().iterator();
while (iter.hasNext()) {
Entry<String, ModifierTableValue> modifierEntry = iter.next();
writer.println(String.format("%s, %d, %b",
modifierEntry.getKey(),
modifierEntry.getValue().getCount(),
modifierEntry.getValue().getIsTypeModifier()
));
}
writer.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// Sentence Holder
try {
String fullPath = dir + "/" + fileNamePrefix + "Sentence.csv";
File file = new File(fullPath);
file.getParentFile().mkdirs();
writer = new PrintWriter(fullPath, "UTF-8");
// write header
writer.println("sentence ID, source, sentence, original sentence, lead, status, tag, modifier, type");
// write content
for (SentenceStructure sentenceItem : this.sentenceTable) {
writer.println(String.format("%d, %s, %s, %s, %s, %s, %s, %s, %s",
sentenceItem.getID(),
sentenceItem.getSource(),
sentenceItem.getSentence(),
sentenceItem.getOriginalSentence(),
sentenceItem.getLead(),
sentenceItem.getStatus(),
sentenceItem.getTag(),
sentenceItem.getModifier(),
sentenceItem.getType()
));
}
writer.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// SingularPlural Holder
try {
String fullPath = dir + "/" + fileNamePrefix + "SingularPlural.csv";
File file = new File(fullPath);
file.getParentFile().mkdirs();
writer = new PrintWriter(fullPath, "UTF-8");
// write header
writer.println("singular, plural");
// write content
for (SingularPluralPair pair : this.singularPluralTable) {
writer.println(String.format("%s, %s",
pair.getSingular(),
pair.getPlural()
));
}
writer.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// TermCategory Holder
try {
String fullPath = dir + "/" + fileNamePrefix + "TermCategory.csv";
File file = new File(fullPath);
file.getParentFile().mkdirs();
writer = new PrintWriter(fullPath, "UTF-8");
// write header
writer.println("term, category");
// write content
for (StringPair pair : this.termCategoryTable) {
writer.println(String.format("%s, %s",
pair.getHead(),
pair.getTail()
));
}
writer.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// UnknownWord Holder
try {
String fullPath = dir + "/" + fileNamePrefix + "UnknownWords.csv";
File file = new File(fullPath);
file.getParentFile().mkdirs();
writer = new PrintWriter(fullPath, "UTF-8");
// write header
writer.println("word, flag");
// write content
Iterator<Entry<String, String>> iter = this.unknownWordTable.entrySet().iterator();
while (iter.hasNext()) {
Entry<String, String> unknownWordEntry = iter.next();
writer.println(String.format("%s, %s",
unknownWordEntry.getKey(),
unknownWordEntry.getValue()
));
}
writer.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// WordPOS holder
try {
String fullPath = dir + "/" + fileNamePrefix + "WordPOS.csv";
File file = new File(fullPath);
file.getParentFile().mkdirs();
writer = new PrintWriter(fullPath, "UTF-8");
// write header
writer.println("word, POS, role, certaintyU, certiantyL, saved_flag, saveedID");
// write content
Iterator<Entry<WordPOSKey, WordPOSValue>> iter = this
.getWordPOSHolderIterator();
while (iter.hasNext()) {
Entry<WordPOSKey, WordPOSValue> wordPOSItem = iter.next();
writer.println(String.format("%s, %s, %s, %d, %d, %s, %s",
wordPOSItem.getKey().getWord(),
wordPOSItem.getKey().getPOS(),
wordPOSItem.getValue().getRole(),
wordPOSItem.getValue().getCertaintyU(),
wordPOSItem.getValue().getCertaintyL(),
wordPOSItem.getValue().getSavedFlag(),
wordPOSItem.getValue().getSavedID()
));
}
writer.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// WordRole holder
try {
String fullPath = dir + "/" + fileNamePrefix + "WordRole.csv";
File file = new File(fullPath);
file.getParentFile().mkdirs();
writer = new PrintWriter(fullPath, "UTF-8");
// write header
writer.println("word, semantic role, saved ID");
// write content
Iterator<Entry<StringPair, String>> iter = this.wordRoleTable.entrySet().iterator();
while (iter.hasNext()) {
Entry<StringPair, String> wordRoleItem = iter.next();
writer.println(String.format("%s, %s, %s",
wordRoleItem.getKey().getHead(),
wordRoleItem.getKey().getTail(),
wordRoleItem.getValue()
));
}
writer.close();
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/**
 * Returns the set currently stored in the {@code checkedWordSet} field.
 * The stored reference itself is returned; no copy is made.
 *
 * @return the value of the {@code checkedWordSet} field
 */
public Set<String> getCheckedWordSet() {
    Set<String> currentSet = this.checkedWordSet;
    return currentSet;
}
/**
 * Replaces the set stored in the {@code checkedWordSet} field.
 * The given reference is stored as-is; no copy is made.
 *
 * @param wordSet the set to store in the {@code checkedWordSet} field
 */
public void setCheckedWordSet(Set<String> wordSet) {
    checkedWordSet = wordSet;
}
}