package semanticMarkup.ling.learn;
import static org.junit.Assert.assertEquals;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import semanticMarkup.core.Treatment;
import semanticMarkup.know.IGlossary;
import semanticMarkup.know.lib.WordNetPOSKnowledgeBase;
import semanticMarkup.knowledge.KnowledgeBase;
import semanticMarkup.ling.learn.auxiliary.GetNounsAfterPtnReturnValue;
import semanticMarkup.ling.learn.auxiliary.KnownTagCollection;
import semanticMarkup.ling.learn.auxiliary.POSInfo;
import semanticMarkup.ling.learn.auxiliary.SentenceLeadLengthComparator;
import semanticMarkup.ling.learn.auxiliary.StringAndInt;
import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.dataholder.ModifierTableValue;
import semanticMarkup.ling.learn.dataholder.SentenceStructure;
import semanticMarkup.ling.learn.dataholder.WordPOSKey;
import semanticMarkup.ling.learn.dataholder.WordPOSValue;
import semanticMarkup.ling.learn.knowledge.AdditionalBootstrappingLearner;
import semanticMarkup.ling.learn.knowledge.AdjectiveSubjectBootstrappingLearner;
import semanticMarkup.ling.learn.knowledge.AdjectiveVerifier;
import semanticMarkup.ling.learn.knowledge.AndOrTagSetter;
import semanticMarkup.ling.learn.knowledge.AnnotationNormalizer;
import semanticMarkup.ling.learn.knowledge.CommaAsAndAnnotator;
import semanticMarkup.ling.learn.knowledge.CommonSubstructureAnnotator;
import semanticMarkup.ling.learn.knowledge.Constant;
import semanticMarkup.ling.learn.knowledge.CoreBootstrappingLearner;
import semanticMarkup.ling.learn.knowledge.DittoAnnotator;
import semanticMarkup.ling.learn.knowledge.FiniteSetsLoader;
import semanticMarkup.ling.learn.knowledge.HeuristicNounLearnerUseMorphology;
import semanticMarkup.ling.learn.knowledge.IgnorePatternAnnotator;
import semanticMarkup.ling.learn.knowledge.IgnoredFinalizer;
import semanticMarkup.ling.learn.knowledge.Initializer;
import semanticMarkup.ling.learn.knowledge.ModifierTagSeparator;
import semanticMarkup.ling.learn.knowledge.NMBResolver;
import semanticMarkup.ling.learn.knowledge.NullSentenceTagger;
import semanticMarkup.ling.learn.knowledge.POSBasedAnnotator;
import semanticMarkup.ling.learn.knowledge.PatternBasedAnnotator;
import semanticMarkup.ling.learn.knowledge.PhraseClauseAnnotator;
import semanticMarkup.ling.learn.knowledge.PronounCharactersAnnotator;
import semanticMarkup.ling.learn.knowledge.HeuristicNounLearnerUseSuffix;
import semanticMarkup.ling.learn.knowledge.UnknownWordBootstrappingLearner;
import semanticMarkup.ling.learn.utility.LearnerUtility;
import semanticMarkup.ling.learn.utility.StringUtility;
import semanticMarkup.ling.transform.ITokenizer;
public class Learner {
// NOTE(review): always null and not referenced in the visible part of this
// class; looks like a leftover placeholder -- confirm before relying on it.
private static final Set<String> NONS = null;
// Learner settings; the visible code reads the learning mode, max tag length,
// number of lead words and default general tag from it.
private Configuration myConfiguration;
// Tokenizer handed to the constructor; only stored in the visible code.
private ITokenizer myTokenizer;
// Data holder: created in the constructor, filled by learn() and returned to callers.
private DataHolder myDataHolder;
// Learner utility: source of the Constant and WordFormUtility instances used below.
private LearnerUtility myLearnerUtility;
// Class variables
// Map passed to the AnnotationNormalizer at construction time.
// NOTE(review): presumably modifier -> "already checked" flag; confirm against AnnotationNormalizer.
Map<String, Boolean> checkedModifiers;
// Modules: built once in the constructor and invoked, in a fixed order, by learn().
KnowledgeBase knowledgeBase;
Initializer initializer;
HeuristicNounLearnerUseMorphology heuristicNounLearnerUseMorphology;
FiniteSetsLoader finiteSetsLoader;
HeuristicNounLearnerUseSuffix heuristicNounLearnerUseSuffix;
PatternBasedAnnotator patternBasedAnnotator;
IgnorePatternAnnotator ignorePatternAnnotator;
CoreBootstrappingLearner coreBootstrappingLearner;
AdditionalBootstrappingLearner additionalBootstrappingLearner;
UnknownWordBootstrappingLearner unknownWordBootstrappingLearner;
AdjectiveVerifier adjectiveVerifier;
ModifierTagSeparator modifierTagSeparator;
NMBResolver nMBResolver;
AndOrTagSetter andOrTagSetter;
AdjectiveSubjectBootstrappingLearner adjectiveSubjectBootstrappingLearner;
POSBasedAnnotator posBasedAnnotator;
PhraseClauseAnnotator phraseClauseAnnotator;
DittoAnnotator dittoAnnotator;
PronounCharactersAnnotator pronounCharactersAnnotator;
IgnoredFinalizer ignoredFinalizer;
CommonSubstructureAnnotator commonSubstructureAnnotator;
CommaAsAndAnnotator commaAsAndAnnotator;
NullSentenceTagger nullSentenceTagger;
AnnotationNormalizer annotationNormalizer;
/**
 * Creates a learner: stores the collaborators, creates the data holder and
 * instantiates every module that {@link #learn} later runs.
 *
 * @param configuration
 *            learner settings (learning mode, max tag length, number of
 *            lead words, default general tag)
 * @param tokenizer
 *            tokenizer to keep for this learner
 * @param learnerUtility
 *            utility object providing the constants and word-form helpers
 *            shared by the modules
 */
public Learner(Configuration configuration, ITokenizer tokenizer,
        LearnerUtility learnerUtility) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("Learner");

    this.myConfiguration = configuration;
    this.myTokenizer = tokenizer;

    // Utilities
    this.myLearnerUtility = learnerUtility;

    // Data holder
    this.myDataHolder = new DataHolder(myConfiguration,
            myLearnerUtility.getConstant(), myLearnerUtility.getWordFormUtility());

    // Class variables
    this.checkedModifiers = new HashMap<String, Boolean>();

    myLogger.info("Created Learner");
    myLogger.info("\tLearning Mode: " + myConfiguration.getLearningMode());
    // Fixed typo in the message ("Lengthr").
    myLogger.info("\tMax Tag Length: " + myConfiguration.getMaxTagLength());
    myLogger.info("\n");

    // Modules, in the order learn() uses them
    this.knowledgeBase = new KnowledgeBase();
    this.initializer = new Initializer(this.myLearnerUtility,
            this.myConfiguration.getNumLeadWords());
    this.heuristicNounLearnerUseMorphology = new HeuristicNounLearnerUseMorphology(this.myLearnerUtility);
    this.finiteSetsLoader = new FiniteSetsLoader(this.myLearnerUtility);
    this.heuristicNounLearnerUseSuffix = new HeuristicNounLearnerUseSuffix(this.myLearnerUtility);
    this.patternBasedAnnotator = new PatternBasedAnnotator();
    this.ignorePatternAnnotator = new IgnorePatternAnnotator();
    this.coreBootstrappingLearner = new CoreBootstrappingLearner(this.myLearnerUtility, this.myConfiguration);
    this.additionalBootstrappingLearner = new AdditionalBootstrappingLearner(this.myLearnerUtility, this.myConfiguration);
    this.unknownWordBootstrappingLearner = new UnknownWordBootstrappingLearner(
            this.myLearnerUtility);
    this.adjectiveVerifier = new AdjectiveVerifier(this.myLearnerUtility);
    this.modifierTagSeparator = new ModifierTagSeparator(this.myLearnerUtility);
    this.nMBResolver = new NMBResolver();
    this.andOrTagSetter = new AndOrTagSetter(this.myLearnerUtility);
    this.adjectiveSubjectBootstrappingLearner = new AdjectiveSubjectBootstrappingLearner(this.myLearnerUtility, this.myConfiguration.getLearningMode(), this.myConfiguration.getMaxTagLength());
    this.posBasedAnnotator = new POSBasedAnnotator(this.myLearnerUtility);
    this.phraseClauseAnnotator = new PhraseClauseAnnotator(this.myLearnerUtility);
    this.dittoAnnotator = new DittoAnnotator(this.myLearnerUtility);
    this.pronounCharactersAnnotator = new PronounCharactersAnnotator(this.myLearnerUtility);
    this.ignoredFinalizer = new IgnoredFinalizer();
    this.nullSentenceTagger = new NullSentenceTagger(this.myLearnerUtility, this.myConfiguration.getDefaultGeneralTag());
    this.commonSubstructureAnnotator = new CommonSubstructureAnnotator();
    this.commaAsAndAnnotator = new CommaAsAndAnnotator(this.myLearnerUtility);
    // The normalizer shares this learner's checkedModifiers map.
    this.annotationNormalizer
            = new AnnotationNormalizer(this.getConfiguration().getLearningMode(),
                    this.checkedModifiers, this.getLearnerUtility());
}
/**
 * Runs the complete learning pipeline over the given treatments and returns
 * the populated data holder.
 * <p>
 * The modules created in the constructor are executed in a fixed order on
 * the learner's own data holder; the order of the statements below is
 * significant and must not be changed casually.
 *
 * @param treatments
 *            treatments loaded into the initializer
 * @param glossary
 *            not referenced by this method body
 * @param markupMode
 *            not referenced by this method body
 * @return the learner's data holder after all modules have run
 */
public DataHolder learn(List<Treatment> treatments, IGlossary glossary,
String markupMode) {
PropertyConfigurator.configure("conf/log4j.properties");
Logger myLogger = Logger.getLogger("Learn");
myLogger.trace("Enter Learn");
myLogger.trace(String.format("Learning Mode: %s",
this.myConfiguration.getLearningMode()));
// Import the knowledge base named "kb" into the data holder
this.knowledgeBase.importKnowledgeBase(this.myDataHolder, "kb", this.myLearnerUtility.getConstant());
// Load the treatments, then run the initializer and the heuristic / finite-set loaders
this.initializer.loadTreatments(treatments);
this.initializer.run(myDataHolder);
this.heuristicNounLearnerUseMorphology.run(this.myDataHolder);
this.finiteSetsLoader.run(this.myDataHolder);
this.heuristicNounLearnerUseSuffix.run(myDataHolder);
// Set the certaintyU and certaintyL value of every entry in WordPOS collection to be 0
this.resetCounts(myDataHolder);
this.patternBasedAnnotator.run(myDataHolder);
this.ignorePatternAnnotator.run(myDataHolder);
// The core bootstrapping learner is run twice: first with status "start", then "normal"
this.coreBootstrappingLearner.setStatus("start");
this.coreBootstrappingLearner.run(myDataHolder);
this.coreBootstrappingLearner.setStatus("normal");
this.coreBootstrappingLearner.run(myDataHolder);
this.additionalBootstrappingLearner.run(myDataHolder);
myLogger.info("Unknownword bootstrappings:");
this.unknownWordBootstrappingLearner.run(myDataHolder);
myLogger.info("Adjectives Verification:");
this.adjectiveVerifier.run(myDataHolder);
// For those sentences whose tag has a space between words, separate modifier and update the tag
this.modifierTagSeparator.run(myDataHolder);
// deal with words that plays N, and B roles
this.nMBResolver.run(myDataHolder);
// set and/or tags
this.andOrTagSetter.run(myDataHolder);
// This is the module field, not the private method of the same name
this.adjectiveSubjectBootstrappingLearner.run(myDataHolder);
// set tags of sentences with "andor" tag to null
this.resetAndOrTags(myDataHolder);
this.getLearnerUtility().tagAllSentences(myDataHolder, "singletag",
"sentence");
this.posBasedAnnotator.run(myDataHolder);
this.phraseClauseAnnotator.run(myDataHolder);
this.dittoAnnotator.run(myDataHolder);
this.pronounCharactersAnnotator.run(myDataHolder);
this.ignoredFinalizer.run(myDataHolder);
// The POS-based annotator is run a second time after the annotators above
this.posBasedAnnotator.run(myDataHolder);
// tag remaining sentences with null tags
this.nullSentenceTagger.run(myDataHolder);
if (StringUtils.equals(this.myConfiguration.getLearningMode(), "adj")) {
// Modify the sentences which are tagged with commons substructure
this.commonSubstructureAnnotator.run(myDataHolder);
}
this.commaAsAndAnnotator.run(myDataHolder);
this.annotationNormalizer.run(myDataHolder);
this.prepareTables4Parser(myDataHolder);
// Hand the result to DataHolder.writeToFile with the arguments "dataholder" and ""
myDataHolder.writeToFile("dataholder", "");
myLogger.info("Learning done!");
return myDataHolder;
}
/**
 * Dispatches on the learning mode: in "adj" mode runs adjective-subject
 * bootstrapping, otherwise runs the and/or handling once.
 * <p>
 * NOTE(review): both branches operate on the learner's own data holder, not
 * on {@code dataholderHandler}; the parameter is currently ignored.
 *
 * @param dataholderHandler
 *            currently unused
 * @param learningMode
 *            learning mode; "adj" selects adjective-subject bootstrapping
 */
private void adjectiveSubjectBootstrappingLearner(DataHolder dataholderHandler,
        String learningMode) {
    if (!StringUtils.equals(learningMode, "adj")) {
        // Previously wrapped in a do/while whose counter was never
        // updated, so the body executed exactly once.
        this.handleAndOr(myDataHolder);
        return;
    }
    adjectiveSubjectBootstrapping(myDataHolder);
}
/**
 * Seeds the word-POS holder from a glossary: words in the "structure"
 * category are added with POS "p", all words outside that category with
 * POS "b". Does nothing when no glossary is supplied.
 *
 * @param glossary
 *            glossary to read from; may be {@code null}
 */
public void addGlossary(IGlossary glossary) {
    if (glossary == null) {
        return;
    }

    // Was "struture": a misspelled category name would make the structure
    // lookup miss and push every glossary word into the "b" group.
    String category = "structure";

    Set<String> pWords = glossary.getWords(category);

    Set<String> categories = new HashSet<String>();
    categories.add(category);
    Set<String> bWords = glossary.getWordsNotInCategories(categories);

    this.getDataHolder().addWords2WordPOSHolder(pWords, "p");
    this.getDataHolder().addWords2WordPOSHolder(bWords, "b");
}
// private void addPredefinedWords() {
// this.addStopWords();
// this.addCharacters();
// this.addNumbers();
// this.addClusterStrings();
// this.addProperNouns();
// }
/**
 * Gives access to the data holder this learner works on.
 *
 * @return the learner's data holder
 */
public DataHolder getDataHolder() {
    return myDataHolder;
}
/**
 * Learns nouns and descriptors with the heuristic rules and records them in
 * the data holder.
 * <p>
 * Nouns from {@link #learnHeuristicsNouns()} come as {@code word[pos]}
 * entries (several may be joined with '|'); each one is stored in the
 * "wordpos" table and, for "p"/"s" entries, the matching singular-plural
 * pair is registered. Nouns and descriptors from
 * {@link #characterHeuristics()} are stored through {@link #addNouns} and
 * {@link #addDescriptors}.
 */
public void addHeuristicsNouns() {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.addHeuristicsNouns");
    myLogger.trace("Enter addHeuristicsNouns");

    Set<String> nouns = this.learnHeuristicsNouns();
    myLogger.debug("Nouns learned from heuristics:");
    myLogger.debug("\t" + nouns.toString());
    myLogger.debug("Total: " + nouns.size());

    List<Set<String>> results = this.characterHeuristics();
    Set<String> rnouns = results.get(0);
    Set<String> descriptors = results.get(1);
    addDescriptors(descriptors);
    addNouns(rnouns);
    myLogger.debug("Total: " + nouns.size());

    // Both the exclusion list and the word[pos] pattern are loop-invariant;
    // build them once instead of once per noun.
    String excludedWords = "NUM|"
            + this.myLearnerUtility.getConstant().NUMBER + "|" + this.myLearnerUtility.getConstant().CLUSTERSTRING
            + "|" + this.myLearnerUtility.getConstant().CHARACTER + "|"
            + this.myLearnerUtility.getConstant().PROPERNOUN;
    Pattern nounWithPos = Pattern.compile("(\\w+)\\[([spn])\\]");

    myLogger.info("Learn singular-plural pair");
    for (String e : nouns) {
        myLogger.trace("Check Word: " + e);

        if (!e.matches("^.*\\w.*$")
                || StringUtility.isMatchedWords(e, excludedWords)) {
            continue;
        }
        myLogger.trace("Pass");

        // same word may have two different pos tags
        for (String nounAndPOS : e.split("\\|")) {
            Matcher m = nounWithPos.matcher(nounAndPOS);
            if (!m.lookingAt()) {
                continue;
            }
            String word = m.group(1);
            String pos = m.group(2);
            this.myDataHolder.updateDataHolder(word, pos, "*", "wordpos", 0);

            if (pos.equals("p")) {
                // plural noun: look up its singular form
                String singular = this.myLearnerUtility
                        .getWordFormUtility().getSingular(word);
                if (singular != null && !singular.equals("")) {
                    this.myDataHolder.addSingularPluralPair(singular, word);
                }
            } else if (pos.equals("s")) {
                // singular noun: register every candidate plural form
                List<String> pluralList = this.myLearnerUtility
                        .getWordFormUtility().getPlural(word);
                for (String plural : pluralList) {
                    if (plural != null && !plural.equals("")) {
                        this.myDataHolder.addSingularPluralPair(word, plural);
                    }
                }
            }
        }
    }

    myLogger.trace("Quit addHeuristicsNouns");
}
/**
 * Stores every descriptor that is not a forbidden word in the "wordpos"
 * table with POS "b".
 *
 * @param descriptors
 *            descriptors to store
 */
public void addDescriptors(Set<String> descriptors) {
    for (String candidate : descriptors) {
        boolean forbidden = StringUtility.isMatchedWords(candidate,
                this.myLearnerUtility.getConstant().FORBIDDEN);
        if (forbidden) {
            continue;
        }
        this.myDataHolder.updateDataHolder(candidate, "b", "", "wordpos", 1);
    }
}
/**
 * Stores every noun that is not a forbidden word in the "wordpos" table
 * with POS "n".
 *
 * @param rnouns
 *            nouns to store
 */
public void addNouns(Set<String> rnouns) {
    for (String candidate : rnouns) {
        boolean forbidden = StringUtility.isMatchedWords(candidate,
                this.myLearnerUtility.getConstant().FORBIDDEN);
        if (forbidden) {
            continue;
        }
        this.myDataHolder.updateDataHolder(candidate, "n", "", "wordpos", 1);
    }
}
/**
 * Learns nouns from the original sentences of the data holder.
 * <p>
 * Steps:
 * <ol>
 * <li>collect, per sentence, the noun preceding "present"/"absent" and all
 * word tokens;</li>
 * <li>re-tag "...a[s]" nouns through {@link #getHeuristicsNounsHelper};</li>
 * <li>group the words by root;</li>
 * <li>mark words with a noun ending as singular and, when "word+s" or
 * "word+es" is in the same group, record the plural;</li>
 * <li>for groups without any verb ending, try every pair of words as a
 * singular-plural pair.</li>
 * </ol>
 * Every singular-plural pair found is also registered in the data holder.
 *
 * @return nouns learned by heuristics, each suffixed with "[s]" or "[p]"
 */
public Set<String> learnHeuristicsNouns() {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger
            .getLogger("learn.addHeuristicsNouns.learnHeuristicsNouns");

    // Set of words
    Set<String> words = new HashSet<String>();
    // Set of nouns
    Set<String> nouns = new HashSet<String>();

    // Collect the stripped original sentences (ArrayList: indexed/iterated access)
    int sentenceCount = this.myDataHolder.getSentenceHolder().size();
    List<String> sentences = new ArrayList<String>(sentenceCount);
    for (int i = 0; i < sentenceCount; i++) {
        String originalSentence = this.myDataHolder.getSentenceHolder()
                .get(i).getOriginalSentence();
        myLogger.trace("Original Sentence: " + originalSentence);
        sentences.add(StringUtility.strip(originalSentence));
    }

    // Per sentence: present/absent noun plus all word tokens
    for (String rawSentence : sentences) {
        String sentence = rawSentence.toLowerCase();

        String noun = this.getPresentAbsentNouns(sentence);
        if (!noun.equals("")) {
            nouns.add(noun);
        }

        List<String> tokens = this.myLearnerUtility.tokenizeText(sentence,
                "all");
        for (String token : tokens) {
            if (StringUtility.isWord(token)) {
                words.add(token);
                myLogger.trace("Add a word into words: " + token);
            }
        }
    }

    // solve the problem: septa and septum are both s
    // The replacements are collected first and applied after the loop:
    // removing from / adding to the set while its iterator is live would
    // raise ConcurrentModificationException.
    List<String> outdatedNouns = new ArrayList<String>();
    List<String> correctedNouns = new ArrayList<String>();
    for (String oldNoun : nouns) {
        String newNoun = this.getHeuristicsNounsHelper(oldNoun, nouns);
        if (!newNoun.equals(oldNoun)) {
            outdatedNouns.add(oldNoun);
            correctedNouns.add(newNoun);
        }
    }
    nouns.removeAll(outdatedNouns);
    nouns.addAll(correctedNouns);

    // Group all words by their root
    Map<String, Set<String>> wordMap = new HashMap<String, Set<String>>();
    for (String word : words) {
        String root = myLearnerUtility.getWordFormUtility().getRoot(word);
        Set<String> sameRoot = wordMap.get(root);
        if (sameRoot == null) {
            sameRoot = new HashSet<String>();
            wordMap.put(root, sameRoot);
        }
        sameRoot.add(word);
    }

    // print out the wordMap
    myLogger.trace("WordMap:");
    for (Map.Entry<String, Set<String>> entry : wordMap.entrySet()) {
        myLogger.trace(entry.toString());
    }

    // find nouns: words with a noun ending, plus their -s / -es plurals
    myLogger.info("Learn singular-plural pair");
    Pattern nounEnding = Pattern.compile("^.*" + Constant.NENDINGS);
    for (Map.Entry<String, Set<String>> entry : wordMap.entrySet()) {
        Set<String> wordSet = entry.getValue();
        for (String word : wordSet) {
            if (!nounEnding.matcher(word).matches()) {
                continue;
            }
            nouns.add(word + "[s]");
            if (wordSet.contains(word + "s")) {
                nouns.add(word + "s" + "[p]");
                this.myDataHolder.addSingularPluralPair(word, word + "s");
            }
            if (wordSet.contains(word + "es")) {
                nouns.add(word + "es" + "[p]");
                this.myDataHolder.addSingularPluralPair(word, word + "es");
            }
        }
    }

    // Groups with at least two words and no verb ending: test every pair
    Pattern verbEnding = Pattern.compile("^.*" + Constant.VENDINGS);
    for (Map.Entry<String, Set<String>> entry : wordMap.entrySet()) {
        Set<String> wordSet = entry.getValue();

        // check if there is a word with Vending
        boolean hasVending = false;
        for (String candidate : wordSet) {
            if (verbEnding.matcher(candidate).matches()) {
                hasVending = true;
                break;
            }
        }

        // at least two words without verb endings
        if (hasVending || wordSet.size() <= 1) {
            continue;
        }

        List<String> wordList = new ArrayList<String>(wordSet);
        for (int i = 0; i < wordList.size(); i++) {
            for (int j = i + 1; j < wordList.size(); j++) {
                List<String> pair = myLearnerUtility
                        .getWordFormUtility().getSingularPluralPair(
                                wordList.get(i), wordList.get(j));
                if (pair.size() == 2) {
                    String singular = pair.get(0);
                    String plural = pair.get(1);
                    nouns.add(singular + "[s]");
                    nouns.add(plural + "[p]");
                    this.myDataHolder.addSingularPluralPair(singular,
                            plural);
                }
            }
        }
    }

    // print out nouns
    myLogger.debug("Nouns: " + nouns);
    return nouns;
}
/**
 * Helper of addHeuristicsNouns: solves the problem that "septa" and
 * "septum" are both tagged singular (septum is the singular, septa the
 * plural), i.e. turns {@code septa[s]} into {@code septa[p]}.
 * <p>
 * A noun ending in "a[s]" is re-tagged as plural when the noun without its
 * "[s]" marker is contained in {@code words}; any other input is returned
 * unchanged.
 *
 * @param oldNoun
 *            noun with its POS marker, e.g. "septa[s]"
 * @param words
 *            set to look the bare noun up in
 * @return the re-tagged noun, or {@code oldNoun} itself
 */
public String getHeuristicsNounsHelper(String oldNoun, Set<String> words) {
    if (!oldNoun.matches("^.*a\\[s\\]$")) {
        return oldNoun;
    }
    String bareNoun = oldNoun.replaceAll("\\[s\\]", "");
    return words.contains(bareNoun) ? bareNoun + "[p]" : oldNoun;
}
/**
 * Any word preceding "present"/"absent" is taken to be a noun, unless it is
 * one of "and|or|to", a stop word, or a frequency adverb / "-ly" word.
 *
 * @param text
 *            the content to learn from
 * @return the learned noun suffixed with "[p]" (plural-looking) or "[s]",
 *         or the empty string when nothing was learned
 */
public String getPresentAbsentNouns(String text) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger
            .getLogger("learn.addHeuristicsNouns.learnHeuristicsNouns.getPresentAbsentNouns");

    String pachecked = "and|or|to";

    Matcher matcher = Pattern.compile("^.*?(\\w+?)\\s+(present|absent).*$")
            .matcher(text);
    if (!matcher.lookingAt()) {
        return "";
    }

    String word = matcher.group(1);

    // Words that can precede present/absent without being nouns
    boolean notANoun = word.matches("\\b(" + pachecked + ")\\b")
            || word.matches("\\b(" + this.myLearnerUtility.getConstant().STOP + ")\\b")
            || word.matches("\\b(always|often|seldom|sometimes|[a-z]+ly)\\b");
    if (notANoun) {
        return "";
    }

    myLogger.trace("present/absent " + word);

    boolean looksPlural = word.matches("^.*" + Constant.PENDINGS)
            || word.matches("^.*[^s]s$")
            || word.matches("teeth");
    boolean plural = looksPlural && !word.matches(Constant.SENDINGS);

    return plural ? word + "[p]" : word + "[s]";
}
/**
 * Discovers nouns and descriptors by applying a set of rules to every
 * sentence in the data holder.
 * <p>
 * As a side effect the stored sentence text is rewritten without
 * {@code <i>} tags and '#' characters, and the learned organ nouns,
 * acronyms, proper nouns and taxon names are written to the heuristic noun
 * table.
 *
 * @return a list whose first element is the set of nouns (organ nouns
 *         filtered against the descriptors, plus acronyms, proper nouns and
 *         taxon names) and whose second element is the set of descriptors
 */
public List<Set<String>> characterHeuristics() {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger
            .getLogger("learn.addHeuristicsNouns.characterHeuristics");

    Set<String> taxonNames = new HashSet<String>();
    Set<String> nouns = new HashSet<String>();
    Set<String> anouns = new HashSet<String>();
    Set<String> pnouns = new HashSet<String>();
    Set<String> descriptors = new HashSet<String>();
    Map<String, Boolean> descriptorMap = new HashMap<String, Boolean>();

    int sent_num = this.myDataHolder.getSentenceHolder().size();
    for (int i = 0; i < sent_num; i++) {
        SentenceStructure sent = this.myDataHolder.getSentenceHolder().get(
                i);
        String source = sent.getSource();
        String sentence = sent.getSentence();
        String originalSentence = sent.getOriginalSentence();

        myLogger.trace("Source: " + source);
        myLogger.trace("Sentence: " + sentence);
        myLogger.trace("Original Sentence: " + originalSentence);

        originalSentence = StringUtility.trimString(originalSentence);

        // noun rule 0: taxon names
        // Accumulate across sentences; assigning here used to discard the
        // taxon names of every sentence but the last one.
        taxonNames.addAll(this.getTaxonNameNouns(originalSentence));

        // strip the <i> / </i> tags now that the taxon names are extracted
        sentence = sentence.replaceAll("<\\s*/?\\s*i\\s*>", "");
        originalSentence = originalSentence.replaceAll("<\\s*/?\\s*i\\s*>",
                "");
        // Update getSentenceHolder()
        this.myDataHolder.getSentenceHolder().get(i).setSentence(sentence);

        // noun rule 0.5: Meckle#s cartilage
        nouns.addAll(this.getNounsMecklesCartilage(originalSentence));
        sentence = sentence.replaceAll("#", "");
        // Update getSentenceHolder()
        this.myDataHolder.getSentenceHolder().get(i).setSentence(sentence);

        // noun rule 2: end of sentence nouns
        // (a|an|the|some|any|this|that|those|these) noun$
        nouns.addAll(this.getNounsRule2(originalSentence));

        // noun rule 3: proper nouns and acronyms
        for (String token : this.getNounsRule3Helper(originalSentence)) {
            if (token.matches("^.*[A-Z].+$")
                    && (!token.matches("^.*-\\w+ed$"))) {
                String lowered = token.toLowerCase();
                if (token.matches("^[A-Z0-9]+$")) {
                    // all capitals/digits: acronym
                    anouns.add(lowered);
                } else {
                    pnouns.add(lowered);
                }
                nouns.add(lowered);
            }
        }

        // noun rule 1: sources with 1 _ are character statements, 2 _ are
        // descriptions
        nouns.addAll(getNounsRule1(source, originalSentence, descriptorMap));

        // noun rule 4: non-stop/prep followed by a number: epibranchial 4
        nouns.addAll(this.getNounsRule4(originalSentence));

        // remove puncts for descriptor rules
        originalSentence = StringUtility.removePunctuation(
                originalSentence, "-");

        // Descriptor rule 1: single term descriptions are descriptors
        descriptors.addAll(this.getDescriptorsRule1(source,
                originalSentence, nouns));

        // Descriptor rule 2: (is|are) red: isDescriptor
        descriptors.addAll(this.getDescriptorsRule2(originalSentence,
                descriptorMap));
    }

    nouns = this.filterOutDescriptors(nouns, descriptors);
    anouns = this.filterOutDescriptors(anouns, descriptors);
    pnouns = this.filterOutDescriptors(pnouns, descriptors);

    this.getDataHolder().add2HeuristicNounTable(nouns, "organ");
    this.getDataHolder().add2HeuristicNounTable(anouns, "acronyms");
    this.getDataHolder().add2HeuristicNounTable(pnouns, "propernouns");
    this.getDataHolder().add2HeuristicNounTable(taxonNames, "taxonnames");

    nouns.addAll(anouns);
    nouns.addAll(pnouns);
    nouns.addAll(taxonNames);

    List<Set<String>> results = new LinkedList<Set<String>>();
    results.add(nouns);
    results.add(descriptors);
    return results;
}
/**
 * Filters descriptors out of a set of nouns and returns the remaining
 * nouns, lower-cased. A noun is also dropped when it starts with a
 * preposition or stop word.
 *
 * @param rNouns
 *            set of nouns
 * @param rDescriptors
 *            set of descriptors
 * @return a new set holding the lower-cased nouns that are not descriptors
 */
public Set<String> filterOutDescriptors(Set<String> rNouns,
        Set<String> rDescriptors) {
    Set<String> filtedNouns = new HashSet<String>();

    // The pattern does not depend on the noun: compile it once rather than
    // once per element.
    Pattern prepositionOrStop = Pattern.compile("\\b(" + this.myLearnerUtility.getConstant().PREPOSITION + "|"
            + this.myLearnerUtility.getConstant().STOP + ")\\b", Pattern.CASE_INSENSITIVE);

    for (String rawNoun : rNouns) {
        String noun = rawNoun.toLowerCase();
        boolean startsWithFunctionWord = prepositionOrStop.matcher(noun).lookingAt();
        if (!startsWithFunctionWord && !rDescriptors.contains(noun)) {
            filtedNouns.add(noun);
        }
    }
    return filtedNouns;
}
/**
 * Nouns rule 0: get taxon names enclosed in {@code <i></i>}.
 * <p>
 * Every {@code <i>...</i>} span contributes its trimmed content as a whole
 * and each of its whitespace-separated parts. Empty spans are skipped and
 * do not stop the scan.
 *
 * @param oSent
 *            original sentence, possibly containing {@code <i>} markup;
 *            {@code null} yields an empty set
 * @return the taxon names found; empty when there are none
 */
public Set<String> getTaxonNameNouns(String oSent) {
    Set<String> taxonNames = new HashSet<String>();
    if (oSent == null) {
        return taxonNames;
    }

    // Compiled once per call; find() walks over successive <i>...</i> spans.
    Matcher italics = Pattern.compile(
            "<\\s*i\\s*>\\s*([^<]*)\\s*<\\s*/\\s*i\\s*>").matcher(oSent);
    while (italics.find()) {
        // [^<]* is greedy and swallows trailing blanks, so trim explicitly
        String taxonName = italics.group(1).trim();
        if (taxonName.length() == 0) {
            continue;
        }
        taxonNames.add(taxonName);
        for (String part : taxonName.split("\\s+")) {
            taxonNames.add(part);
        }
    }
    return taxonNames;
}
/**
 * Nouns rule 0.5: possessives written with '#', e.g. "Meckle#s cartilage".
 * <p>
 * For a word of the form {@code xxx#s} three lower-cased variants are
 * returned: the word itself, the word without '#', and that word without
 * its final "s".
 *
 * @param oSent
 *            original sentence
 * @return the variants found, or an empty set when the sentence has no such
 *         word
 */
public Set<String> getNounsMecklesCartilage(String oSent) {
    Set<String> variants = new HashSet<String>();

    Matcher possessive = Pattern.compile("^.*\\b(\\w+#s)\\b.*$").matcher(oSent);
    if (!possessive.lookingAt()) {
        return variants;
    }

    String withHash = possessive.group(1).toLowerCase();
    String withoutHash = withHash.replaceAll("#", "");
    String stem = withoutHash.replaceAll("s$", "");

    variants.add(withHash);
    variants.add(withoutHash);
    variants.add(stem);
    return variants;
}
/**
 * Noun rule 1: a single-word sentence is a noun when its source does not
 * match the pattern {@code .xml_<non-blank>_} and the word is not a
 * descriptor.
 *
 * @param source
 *            source identifier of the sentence
 * @param originalSentence
 *            original sentence text
 * @param descriptorMap
 *            cache of descriptor decisions, passed on to isDescriptor
 * @return a set holding the lower-cased sentence, or an empty set
 */
public Set<String> getNounsRule1(String source, String originalSentence,
        Map<String, Boolean> descriptorMap) {
    Set<String> nouns = new HashSet<String>();

    if (source.matches("^.*\\.xml_\\S+_.*$")) {
        return nouns;
    }
    if (originalSentence.matches("^.*\\s.*$")) {
        // more than one word
        return nouns;
    }
    if (this.isDescriptor(originalSentence, descriptorMap)) {
        return nouns;
    }

    nouns.add(originalSentence.toLowerCase());
    return nouns;
}
/**
 * Noun rule 2: the word after a determiner or ordinal (a, an, the, some,
 * any, this, second ... tenth) is a noun when it is followed by the end of
 * the sentence, an opening bracket or a preposition.
 * <p>
 * A candidate followed by a word is skipped when it is "length", "width",
 * "presence" or ends in "tion".
 *
 * @param oSent
 *            original sentence
 * @return the lower-cased nouns found
 */
public Set<String> getNounsRule2(String oSent) {
    String regex = "(.*?)\\b(a|an|the|some|any|this|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth) +(\\w+)\\s*($|\\(|\\[|\\{|\\b"
            + this.myLearnerUtility.getConstant().PREPOSITION + "\\b)(.*)";
    Set<String> nouns = new HashSet<String>();
    if (oSent == null) {
        return nouns;
    }

    Pattern determinerNoun = Pattern.compile(regex);
    String remaining = oSent;
    while (remaining != null) {
        Matcher m = determinerNoun.matcher(remaining);
        if (!m.lookingAt()) {
            break;
        }
        String candidate = m.group(3);
        String follower = m.group(4);
        // continue scanning behind the current match
        remaining = m.group(5);

        boolean followedByWord = follower.matches("^.*\\w.*$");
        if (followedByWord
                && candidate.matches("^.*\\b(length|width|presence|\\w+tion)\\b.*$")) {
            continue;
        }
        nouns.add(candidate.toLowerCase());
    }
    return nouns;
}
/**
 * Helper for noun rule 3: collects candidate proper nouns and acronyms.
 * <p>
 * The sentence is split at brackets; in every segment all tokens except the
 * first one are kept when they contain an upper-case letter followed by at
 * least one more character and do not end in "-...ed".
 *
 * @param sentence
 *            sentence to scan
 * @return candidate tokens in their original case
 */
public Set<String> getNounsRule3Helper(String sentence) {
    Set<String> nouns = new HashSet<String>();

    for (String segment : sentence.split("[()\\[\\]\\{\\}]")) {
        String cleaned = StringUtility.removePunctuation(segment, "-");
        String[] tokens = cleaned.split("\\s+");

        // #ignore the first word in character statements--this is normally
        // capitalized
        for (int j = 1; j < tokens.length; j++) {
            String token = tokens[j];
            boolean hasCapital = token.matches("^.*[A-Z].+$");
            if (hasCapital && !token.matches("^.*-\\w+ed$")) {
                nouns.add(token);
            }
        }
    }
    return nouns;
}
/**
 * Noun rule 4: a word that is neither a stop word nor a preposition and is
 * followed by a number is a noun, e.g. "epibranchial 4".
 *
 * @param oSent
 *            original sentence
 * @return a set of lower-cased nouns
 */
public Set<String> getNounsRule4(String oSent) {
    Set<String> nouns = new HashSet<String>();
    if (oSent == null) {
        return nouns;
    }

    Pattern wordThenNumber = Pattern.compile("(.*?)\\s(\\w+)\\s+\\d+(.*)");
    String remaining = oSent;
    while (remaining != null) {
        Matcher m = wordThenNumber.matcher(remaining);
        if (!m.lookingAt()) {
            break;
        }
        String candidate = m.group(2);
        // continue scanning behind the current match
        remaining = m.group(3);

        String regex2 = "\\b(" + this.myLearnerUtility.getConstant().PREPOSITION + "|"
                + this.myLearnerUtility.getConstant().STOP + ")\\b";
        if (candidate.matches(regex2)) {
            continue;
        }
        nouns.add(candidate.toLowerCase());
    }
    return nouns;
}
/**
 * Descriptor rule 1: a single-term sentence whose source matches the
 * pattern {@code .xml_<non-blank>_} is a descriptor, unless the term is
 * already known as a noun.
 * <p>
 * The noun lookup is done on the lower-cased term as well as on the term as
 * given: the descriptor is stored lower-cased, so comparing only the
 * original spelling misfiled capitalised nouns ("Fin" vs. "fin") as
 * descriptors.
 *
 * @param source
 *            source identifier of the sentence
 * @param sentence
 *            sentence text
 * @param nouns
 *            nouns learned so far
 * @return a set holding the lower-cased term, or an empty set
 */
public Set<String> getDescriptorsRule1(String source, String sentence,
        Set<String> nouns) {
    Set<String> descriptors = new HashSet<String>();

    boolean descriptionSource = source.matches("^.*\\.xml_\\S+_.*$");
    // single word
    boolean singleTerm = !sentence.matches("^.*\\s.*$");

    if (descriptionSource && singleTerm) {
        String term = sentence.toLowerCase();
        boolean knownNoun = nouns.contains(sentence) || nouns.contains(term);
        if (!knownNoun) {
            descriptors.add(term);
        }
    }
    return descriptors;
}
/**
 * Descriptor rule 2: "(is|are) red". Every whitespace-separated token of
 * the sentence that isDescriptor accepts is a descriptor.
 *
 * @param sentence
 *            sentence to split into tokens
 * @param descriptorMap
 *            cache of descriptor decisions, passed on to isDescriptor
 * @return the lower-cased tokens recognised as descriptors
 */
public Set<String> getDescriptorsRule2(String sentence,
        Map<String, Boolean> descriptorMap) {
    Set<String> descriptors = new HashSet<String>();
    for (String rawToken : sentence.split("\\s+")) {
        String token = rawToken.toLowerCase();
        if (isDescriptor(token, descriptorMap)) {
            descriptors.add(token);
        }
    }
    return descriptors;
}
/**
 * Check if the term is a descriptor, i.e. whether some original sentence in
 * the data holder matches it according to isMatched.
 * <p>
 * The term is lower-cased once and that form is used for the cache lookup,
 * for matching and for storing the result. Previously the cache was read
 * with the term as given but written with the lower-cased term, so a
 * negative result for "Red" could overwrite the entry for "red".
 *
 * @param term
 *            the term to check
 * @param descriptorMap
 *            descriptors have already learned; keys are lower-cased terms
 * @return a boolean value indicating whether the term is a descriptor. This
 *         result will be stored in the descriptorMap for future use
 */
public boolean isDescriptor(String term, Map<String, Boolean> descriptorMap) {
    String key = term.toLowerCase();

    Boolean cached = descriptorMap.get(key);
    if (cached != null) {
        return cached.booleanValue();
    }

    int sentenceCount = this.myDataHolder.getSentenceHolder().size();
    for (int i = 0; i < sentenceCount; i++) {
        String originalSentence = this.myDataHolder.getSentenceHolder()
                .get(i).getOriginalSentence();
        // isMatched records a positive result in descriptorMap itself
        if (isMatched(originalSentence, key, descriptorMap)) {
            return true;
        }
    }

    descriptorMap.put(key, false);
    return false;
}
/**
 * Check if the term matches the sentence, i.e. whether the sentence
 * contains " is ", " are ", " was ", " were ", " be " or " being " directly
 * followed by the term.
 * <p>
 * The term is matched literally. It used to be spliced into the regular
 * expression unescaped, so a term such as "(" raised
 * PatternSyntaxException and "." matched any character.
 *
 * @param sentence
 *            sentence to search in
 * @param term
 *            term to look for
 * @param descriptorMap
 *            receives {@code lower-cased term -> true} on a match
 * @return a boolean value indicating whether the term matches the sentence;
 *         {@code false} when sentence or term is {@code null}
 */
public boolean isMatched(String sentence, String term,
        Map<String, Boolean> descriptorMap) {
    if (sentence == null || term == null) {
        return false;
    }

    String regex = "^.*" + " (is|are|was|were|be|being) "
            + Pattern.quote(term) + ".*$";
    if (!sentence.matches(regex)) {
        return false;
    }

    descriptorMap.put(term.toLowerCase(), true);
    return true;
}
/**
public void addStopWords() {
PropertyConfigurator.configure("conf/log4j.properties");
Logger myLogger = Logger.getLogger("learn.addStopWords");
myLogger.trace("Add stop words");
List<String> stops = new ArrayList<String>();
stops.addAll(Arrays.asList(this.myLearnerUtility.getConstant().STOP.split("\\|")));
stops.addAll(Arrays.asList(new String[] { "NUM", "(", "[", "{", ")",
"]", "}", "d+" }));
myLogger.trace("Stop Words: " + stops);
for (int i = 0; i < stops.size(); i++) {
String word = stops.get(i);
if (word.matches("\\b(" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b")) {
continue;
}
this.myDataHolder.updateDataHolder(word, "b", "*", "wordpos", 0);
myLogger.trace(String.format(
"(\"%s\", \"b\", \"*\", \"wordpos\", 0) added\n", word));
// this.getWordPOSHolder().put(new WordPOSKey(word, "b"), new
// WordPOSValue("*", 0, 0, null, null));
// System.out.println("Add Stop Word: " + word+"\n");
}
myLogger.trace("Quite\n");
}
public void addCharacters() {
PropertyConfigurator.configure("conf/log4j.properties");
Logger myLogger = Logger.getLogger("learn.addCharacters");
myLogger.trace("Add characters");
List<String> chars = new ArrayList<String>();
chars.addAll(Arrays.asList(this.myLearnerUtility.getConstant().CHARACTER.split("\\|")));
//
// System.out.println(chars);
// System.out.println(this.myLearnerUtility.getConstant().CHARACTER);
for (int i = 0; i < chars.size(); i++) {
String word = chars.get(i);
// String reg="\\b("+this.myLearnerUtility.getConstant().FORBIDDEN+")\\b";
// boolean f = word.matches(reg);
if (word.matches("\\b(" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b")) {
continue;
}
this.myDataHolder.updateDataHolder(word, "b", "*", "wordpos", 0);
// this.getWordPOSHolder().put(new WordPOSKey(word, "b"), new
// WordPOSValue("", 0, 0, null, null));
// System.out.println("addCharacter word: " + word);
}
}
public void addNumbers() {
PropertyConfigurator.configure("conf/log4j.properties");
Logger myLogger = Logger.getLogger("learn.addNumbers");
myLogger.trace("Add numbers");
List<String> nums = new ArrayList<String>();
nums.addAll(Arrays.asList(this.myLearnerUtility.getConstant().NUMBER.split("\\|")));
// System.out.println(nums);
// System.out.println(this.myLearnerUtility.getConstant().NUMBER);
for (int i = 0; i < nums.size(); i++) {
String word = nums.get(i);
// String reg="\\b("+this.myLearnerUtility.getConstant().FORBIDDEN+")\\b";
// boolean f = word.matches(reg);
if (word.matches("\\b(" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b")) {
continue;
}
this.myDataHolder.updateDataHolder(word, "b", "*", "wordpos", 0);
// this.getWordPOSHolder().put(new WordPOSKey(word, "b"), new
// WordPOSValue("*", 0, 0, null, null));
// System.out.println("add Number: " + word);
}
this.myDataHolder.updateDataHolder("NUM", "b", "*", "wordpos", 0);
// this.getWordPOSHolder().put(new WordPOSKey("NUM", "b"), new
// WordPOSValue("*",0, 0, null, null));
}
public void addClusterStrings() {
PropertyConfigurator.configure("conf/log4j.properties");
Logger myLogger = Logger.getLogger("learn.addClusterstrings");
myLogger.trace("Add clusterstrings");
List<String> cltstrs = new ArrayList<String>();
cltstrs.addAll(Arrays.asList(this.myLearnerUtility.getConstant().CLUSTERSTRING.split("\\|")));
// System.out.println(cltstrs);
// System.out.println(this.myLearnerUtility.getConstant().CLUSTERSTRING);
for (int i = 0; i < cltstrs.size(); i++) {
String word = cltstrs.get(i);
if (word.matches("\\b(" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b")) {
continue;
}
this.myDataHolder.updateDataHolder(word, "b", "*", "wordpos", 0);
// this.getWordPOSHolder().put(new WordPOSKey(word, "b"), new
// WordPOSValue("*", 1, 1, null, null));
// System.out.println("addClusterString: " + word);
}
}
public void addProperNouns() {
PropertyConfigurator.configure("conf/log4j.properties");
Logger myLogger = Logger.getLogger("learn.addProperNouns");
myLogger.trace("Add proper nouns");
List<String> ppnouns = new ArrayList<String>();
ppnouns.addAll(Arrays.asList(Constant.PROPERNOUN.split("\\|")));
for (int i = 0; i < ppnouns.size(); i++) {
String word = ppnouns.get(i);
if (word.matches("\\b(" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b")) {
continue;
}
this.myDataHolder.updateDataHolder(word, "b", "*", "wordpos", 0);
// this.getWordPOSHolder().put(new WordPOSKey(word, "z"), new
// WordPOSValue("*", 0, 0, null, null));
// System.out.println("Add ProperNoun: " + word);
}
}
**/
// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// suffix: -fid (adj), -form (adj), -ish (adj), -less (adj), -like (adj),
// -merous (adj), -most (adj), -shaped (adj), -ous (adj),
// -ly (adv), -er (adj), -est (adj)
// for each unknown word in the unknownwords table:
// separate root and suffix
// if the root is a word in WN or in the unknownwords table
// make the unknown word a "b" boundary
/**
 * Walks the unknown-word table and runs both suffix helpers on every word
 * whose tag is still "unknown".
 * <p>
 * {@link #posBySuffixCase1Helper(String)} splits the word into root and
 * suffix and marks it as a boundary word ("b") when
 * {@link #containSuffix(String, String, String)} accepts the split;
 * {@link #posBySuffixCase2Helper(String)} handles words that start with "."
 * or "_". The return values of both helpers are ignored here.
 * <p>
 * Suffixes of interest: -fid, -form, -ish, -less, -like, -merous, -most,
 * -shaped, -ous (all adjectival).
 */
public void posBySuffix() {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.posBySuffix");
    myLogger.trace("Enter posBySuffix");

    for (Map.Entry<String, String> entry : this.myDataHolder
            .getUnknownWordHolder().entrySet()) {
        String candidate = entry.getKey();
        String currentTag = entry.getValue();
        if (!currentTag.equals("unknown")) {
            continue;
        }
        posBySuffixCase1Helper(candidate);
        posBySuffixCase2Helper(candidate);
    }

    myLogger.trace("Quite posBySuffix");
}
/**
 * Zeroes the certaintyU and certaintyL values of every entry in the WordPOS
 * collection of the given data holder.
 *
 * @param dh
 *            the data holder whose WordPOS entries are reset in place
 * @return the number of entries visited (every visited entry is reset)
 */
public int resetCounts(DataHolder dh) {
    int touched = 0;
    for (Iterator<Entry<WordPOSKey, WordPOSValue>> it = dh
            .getWordPOSHolderIterator(); it.hasNext(); touched++) {
        Entry<WordPOSKey, WordPOSValue> current = it.next();
        current.getValue().setCertiantyU(0);
        current.getValue().setCertiantyL(0);
    }
    return touched;
}
/**
 * Case 1 of {@link #posBySuffix()}: if the word is made of lower-case letters
 * and underscores and ends in one of the {@code Constant.SUFFIX} suffixes,
 * split it into base and suffix and ask
 * {@link #containSuffix(String, String, String)} whether the split is
 * credible. When it is, the word is recorded as a boundary word ("b") in the
 * data holder.
 *
 * @param unknownWord
 *            the word to examine
 * @return {@code true} if the word was recorded as a boundary word
 */
public boolean posBySuffixCase1Helper(String unknownWord) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.posBySuffix");

    String pattern1 = "^[a-z_]+(" + Constant.SUFFIX + ")$";
    myLogger.debug("Pattern1: " + pattern1);
    if (!unknownWord.matches(pattern1)) {
        return false;
    }

    // Reluctant first group: the base is the shortest prefix that leaves a
    // complete suffix behind it.
    Matcher splitter = Pattern.compile("(.*?)(" + Constant.SUFFIX + ")$")
            .matcher(unknownWord);
    if (!unknownWord.matches("^[a-zA-Z0-9_-]+$") || !splitter.matches()) {
        return false;
    }

    myLogger.debug("posBySuffix - check word: " + unknownWord);
    String base = splitter.group(1);
    String suffix = splitter.group(2);
    if (!this.containSuffix(unknownWord, base, suffix)) {
        myLogger.debug("Not Pass\n");
        return false;
    }

    myLogger.debug("Pass\n");
    this.myDataHolder.updateDataHolder(unknownWord, "b", "*", "wordpos", 0);
    myLogger.debug("posBySuffix - set word: " + unknownWord);
    return true;
}
/**
 * Case 2 of {@link #posBySuffix()}: a word that consists of a leading "." or
 * "_" followed only by lower-case letters (e.g. "_nerved") is put straight
 * into the WordPOS holder as a boundary word ("b").
 *
 * @param unknownWord
 *            the word to examine
 * @return {@code true} if the word matched and was stored
 */
public boolean posBySuffixCase2Helper(String unknownWord) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.posBySuffix");

    String pattern2 = "^[._.][a-z]+"; // , _nerved
    myLogger.debug("Pattern2: " + pattern2);

    boolean leadingMarker = unknownWord.matches(pattern2);
    if (leadingMarker) {
        this.myDataHolder.getWordPOSHolder().put(
                new WordPOSKey(unknownWord, "b"),
                new WordPOSValue("*", 0, 0, null, null));
        myLogger.debug("posbysuffix set " + unknownWord
                + " a boundary word\n");
    }
    return leadingMarker;
}
/**
 * Decides whether {@code suffix} really acts as a suffix of {@code word},
 * i.e. whether splitting {@code word} into {@code base} + {@code suffix} is
 * credible.
 * <p>
 * Underscores are removed from the base before any lookup, so "cup_shaped"
 * is checked with the base "cup". The rules are:
 * <ul>
 * <li>empty base (after removing underscores): accepted;</li>
 * <li>suffix "ly": accepted if WordNet knows the word as an adverb, or the
 * base is in the unknown-word table;</li>
 * <li>suffix "er"/"est": accepted only if the word is in WordNet, the word
 * itself is not an adjective there, and the base is an adjective;</li>
 * <li>any other suffix: accepted if WordNet lists the word solely as an
 * adjective, or the base is in WordNet, or the base is in the unknown-word
 * table.</li>
 * </ul>
 *
 * @param word
 *            the complete word
 * @param base
 *            the part of the word in front of the suffix
 * @param suffix
 *            the candidate suffix
 * @return {@code true} if the split is accepted, {@code false} otherwise
 */
public boolean containSuffix(String word, String base, String suffix) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.posBySuffix.containSuffix");
    myLogger.trace("Enter containSuffix");

    WordNetPOSKnowledgeBase myWN = this.myLearnerUtility
            .getWordNetPOSKnowledgeBase();

    // Strings are immutable: the result of replaceAll must be assigned,
    // otherwise "cup_" (from cup_shaped) is looked up instead of "cup".
    base = base.replaceAll("_", "");

    // An empty base (also one that consisted only of underscores) is
    // accepted without consulting WordNet.
    if (base.length() == 0) {
        myLogger.trace("case 0");
        return true;
    }

    boolean wordInWN = myWN.contains(word);
    myLogger.trace(wordInWN ? "case 1.1" : "case 1.2");

    boolean baseInWN = myWN.contains(base);
    myLogger.trace(baseInWN ? "case 2.1" : "case 2.2");

    // -ly: WordNet adverb, or the base is itself an unknown word
    if (suffix.equals("ly")) {
        myLogger.trace("case 3.1");
        if (wordInWN && myWN.isAdverb(word)) {
            return true;
        }
        return this.myDataHolder.getUnknownWordHolder().containsKey(base);
    }

    // -er / -est: comparative or superlative built on an adjective base
    if (suffix.equals("er") || suffix.equals("est")) {
        myLogger.trace("case 3.2");
        if (!wordInWN) {
            return false;
        }
        boolean wordIsNotAdjective = !myWN.isAdjective(word);
        boolean baseIsAdjective = myWN.isAdjective(base);
        return wordIsNotAdjective && baseIsAdjective;
    }

    // every other suffix, e.g. scalelike
    myLogger.trace("case 3.3");
    if (myWN.isSoleAdjective(word)) {
        return true;
    }
    if (baseInWN) {
        return true;
    }
    return this.myDataHolder.getUnknownWordHolder().containsKey(base);
}
/**
 * Applies {@link #markupByPatternHelper(SentenceStructure)} to every sentence
 * in the sentence holder and logs the index of each sentence that was
 * updated. The number of sentences is read once, before the loop starts.
 */
public void markupByPattern() {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.markupByPattern");
    myLogger.trace("Enter markupByPattern");

    final int total = this.myDataHolder.getSentenceHolder().size();
    int index = 0;
    while (index < total) {
        if (markupByPatternHelper(this.myDataHolder.getSentenceHolder()
                .get(index))) {
            myLogger.debug("Updated Sentence #" + index);
        }
        index++;
    }

    myLogger.trace("Quite markupByPattern");
}
/**
 * Tags a sentence from the way its original text begins. The rules are tried
 * in order and the first one that matches wins:
 * <ol>
 * <li>"x=", "2n=", "x ", "2n ", "2 n" &rarr; tag "chromosome"</li>
 * <li>"fl" &rarr; tag "flowerTime"</li>
 * <li>"fr" &rarr; tag "fruitTime"</li>
 * </ol>
 * A matching sentence also gets an empty modifier.
 *
 * @param sentence
 *            the sentence to inspect and, on a match, update
 * @return {@code true} if a rule matched and the sentence was updated
 */
public boolean markupByPatternHelper(SentenceStructure sentence) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("markupByPattern");

    // { pattern on the original sentence, tag to assign }; the position in
    // this table (1-based) is the case number written to the log.
    final String[][] rules = {
            { "^x=.*", "chromosome" },
            { "^2n=.*", "chromosome" },
            { "^x .*", "chromosome" },
            { "^2n .*", "chromosome" },
            { "^2 n.*", "chromosome" },
            { "^fl.*", "flowerTime" },
            { "^fr.*", "fruitTime" } };

    for (int ruleIndex = 0; ruleIndex < rules.length; ruleIndex++) {
        if (sentence.getOriginalSentence().matches(rules[ruleIndex][0])) {
            myLogger.trace("Case " + (ruleIndex + 1));
            sentence.setTag(rules[ruleIndex][1]);
            sentence.setModifier("");
            return true;
        }
    }
    return false;
}
// private String IGNOREPTN ="(IGNOREPTN)"; //disabled
/**
 * Applies {@link #markupIgnoreHelper(SentenceStructure)} to every sentence in
 * the sentence holder and logs the index of each sentence that was updated.
 */
public void markupIgnore() {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.markupIgnore");
    myLogger.trace("Enter markupIgnore");

    int index = 0;
    while (index < this.myDataHolder.getSentenceHolder().size()) {
        if (markupIgnoreHelper(this.myDataHolder.getSentenceHolder().get(
                index))) {
            myLogger.debug("Updated Sentence #" + index);
        }
        index++;
    }

    myLogger.trace("Quite markupIgnore");
}
/**
 * Tags the sentence as "ignore" (with an empty modifier) when its original
 * text, optionally preceded by one space, starts with
 * {@code Constant.IGNORE_PATTERN}.
 *
 * @param sentence
 *            the sentence to inspect and, on a match, update
 * @return {@code true} if the sentence was tagged as "ignore"
 */
public boolean markupIgnoreHelper(SentenceStructure sentence) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("markupIgnore");

    String original = sentence.getOriginalSentence();
    String ignoreRegex = "(^|^ )" + Constant.IGNORE_PATTERN + ".*$";
    if (!original.matches(ignoreRegex)) {
        return false;
    }

    sentence.setTag("ignore");
    sentence.setModifier("");
    myLogger.trace("Set Tag to \"ignore\", Modifier to \"\"");
    return true;
}
/**
 * Helper of discover(): reports whether the sentence already carries a tag.
 *
 * @param sentence
 *            the sentence to check
 * @return {@code true} if the tag of the sentence is not {@code null},
 *         {@code false} otherwise
 */
public boolean isMarked(SentenceStructure sentence) {
    return sentence.getTag() != null;
}
/**
 * Collects the indices of the sentences that satisfy all of the following:
 * <ul>
 * <li>the sentence is tagged exactly when {@code hasTag} is {@code true}
 * (tagged means its tag is not {@code null});</li>
 * <li>its status equals {@code status} (null-safe comparison);</li>
 * <li>its text starts with a match of {@code pattern}, compared
 * case-insensitively.</li>
 * </ul>
 *
 * @param pattern
 *            regular expression that must match at the start of the sentence
 * @param status
 *            required sentence status
 * @param hasTag
 *            {@code true} to select tagged sentences, {@code false} to select
 *            untagged ones
 * @return the set of indices (positions in the sentence holder) of the
 *         matching sentences
 */
public Set<Integer> matchPattern(String pattern, String status,
        boolean hasTag) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.discover.matchPattern");
    myLogger.trace("Enter matchPattern");
    myLogger.trace("Pattern: " + pattern);
    myLogger.trace("Status: " + status);
    myLogger.trace("HasTag: " + hasTag);

    Set<Integer> matchedIDs = new HashSet<Integer>();
    for (int index = 0; index < this.myDataHolder.getSentenceHolder()
            .size(); index++) {
        SentenceStructure candidate = this.myDataHolder
                .getSentenceHolder().get(index);
        String text = candidate.getSentence();
        String candidateStatus = candidate.getStatus();
        String candidateTag = candidate.getTag();

        // hasTag XOR (tag == null)  <=>  hasTag == (tag != null)
        boolean tagStateWanted = (hasTag == (candidateTag != null));
        if (!tagStateWanted || !StringUtils.equals(status, candidateStatus)) {
            continue;
        }

        Matcher prefixMatcher = Pattern.compile(pattern,
                Pattern.CASE_INSENSITIVE).matcher(text);
        if (!prefixMatcher.lookingAt()) {
            continue;
        }

        myLogger.debug("Push Sentence #" + index);
        myLogger.debug("Sentence: " + text);
        myLogger.debug("Status: " + candidateStatus);
        myLogger.debug("Tag: " + candidateTag);
        myLogger.debug("\n");
        matchedIDs.add(index);
    }

    myLogger.trace("Return IDs: " + matchedIDs);
    myLogger.trace("Quite matchPattern");
    myLogger.trace("\n");
    return matchedIDs;
}
/**
 * Runs term learning on every still-untagged sentence among the given IDs.
 * For each such sentence, {@code learnTerms} yields a tag and a count of new
 * discoveries; the sentence is tagged with that tag and the count is added
 * to the running total.
 *
 * @param matched
 *            the IDs (indices in the sentence holder) of the sentences to
 *            learn from
 * @return the sum of the counts reported by {@code learnTerms}; positive if
 *         anything new was learnt
 */
public int ruleBasedLearn(Set<Integer> matched) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.discover.ruleBasedLearn");
    myLogger.trace("Enter ruleBasedLearn");
    myLogger.trace("Matched IDs: " + matched);

    int learnedTotal = 0;
    for (Integer boxedID : matched) {
        int sentID = boxedID.intValue();
        SentenceStructure sentence = this.myDataHolder.getSentenceHolder()
                .get(sentID);
        if (isMarked(sentence)) {
            continue; // already tagged, nothing to learn here
        }

        StringAndInt learned = this.myLearnerUtility.learnTerms(
                this.myDataHolder, sentID);
        String learnedTag = learned.getString();
        int learnedCount = learned.getInt();
        this.myLearnerUtility.tagSentence(this.myDataHolder,
                this.myConfiguration.getMaxTagLength(), sentID, learnedTag);
        learnedTotal += learnedCount;
    }

    myLogger.trace("Return: " + learnedTotal);
    myLogger.trace("Quit ruleBaseLearn");
    myLogger.trace("\n");
    return learnedTotal;
}
/**
 * Repeats three markup passes until a complete round reports no update:
 * {@link #wrapupMarkup()}, {@link #oneLeadWordMarkup(Set)} on the current
 * tag set, and {@code doItMarkup}. The values returned by the three passes
 * are summed per round; the loop ends as soon as that sum is not positive.
 */
public void additionalBootstrapping() {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.additionalBootStrapping");
    myLogger.trace("[additionalBootStrapping]Start");

    int roundTotal;
    do {
        myLogger.trace(String.format("Enter one do-while loop iteration"));

        // pass 1: wrap-up markup
        int wrapupCount = wrapupMarkup();
        myLogger.trace(String.format("wrapupMarkup() returned %d",
                wrapupCount));

        // pass 2: one-lead-word markup on the tags known so far
        Set<String> currentTags = myDataHolder.getCurrentTags();
        myLogger.trace(currentTags.toString());
        int leadWordCount = oneLeadWordMarkup(currentTags);
        myLogger.trace(String.format("oneLeadWordMarkup() returned %d",
                leadWordCount));

        // pass 3: do-it markup
        int doItCount = this.myLearnerUtility.doItMarkup(this.myDataHolder,
                this.myConfiguration.getMaxTagLength());
        myLogger.trace(String.format("doItMarkup() returned %d", doItCount));

        roundTotal = wrapupCount + leadWordCount + doItCount;
        myLogger.trace(String.format("Quite this iteration with flag = %d",
                roundTotal));
    } while (roundTotal > 0);

    myLogger.trace("[additionalBootStrapping]End");
}
/**
 * Looks for untagged sentences whose lead is a single word (contains no
 * space) that is already used as a tag. Each such sentence is tagged with
 * its lead, and the lead is recorded in the word-POS collection as a noun
 * ("n").
 *
 * @param tags
 *            the set of all tags of the tagged sentences in the sentence
 *            collection
 * @return the number of updates made, i.e. the sum of the values returned by
 *         {@code updateDataHolder}
 */
public int oneLeadWordMarkup(Set<String> tags) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger
            .getLogger("learn.additionalBootStrapping.oneLeadWordMarkup");
    int sign = 0;
    myLogger.trace(String.format("Enter (%s)", tags));

    for (SentenceStructure sentence : this.myDataHolder.getSentenceHolder()) {
        int ID = sentence.getID();
        String tag = sentence.getTag();
        String lead = sentence.getLead();

        boolean untagged = (tag == null);
        if (!untagged) {
            continue;
        }
        boolean multiWordLead = StringUtility.createMatcher(lead, ".* .*")
                .find();
        if (multiWordLead || !tags.contains(lead)) {
            continue;
        }

        this.myLearnerUtility.tagSentence(this.myDataHolder,
                this.myConfiguration.getMaxTagLength(), ID, lead);
        myLogger.trace(String.format(
                "updateDataHolder(%s, n, -, wordpos, 1)", lead));
        sign += myDataHolder.updateDataHolder(lead, "n", "-", "wordpos", 1);
    }

    myLogger.trace("Return: " + sign);
    // Return the accumulated count (previously the literal 0 was returned,
    // which hid these updates from the caller's termination check).
    return sign;
}
/**
 * For the remaining sentences that do not have a tag yet, look for lead
 * word co-occurrence and use the most frequently co-occurring phrases as
 * tags, e.g. "plication induplicate" (n times) and "plication reduplicate"
 * (m times) => "plication" is the tag and a noun; "stigmatic scar basal"
 * (n times) and "stigmatic scar apical" (m times) => "stigmatic scar" is the
 * tag and "scar" is a noun. Open question from the original author: what
 * about "externally like A; externally like B", "functionally staminate
 * florets", "functionally staminate xyz"?
 * <p>
 * Outline: every untagged sentence with a multi-word lead is taken in the
 * order produced by {@code SentenceLeadLengthComparator}; its lead without
 * the last word is the "shared head". If more than one other untagged
 * sentence has a lead of the form "shared head + one word" and the shared
 * head looks like a noun phrase, the sentences are tagged and the words
 * around the head are recorded as nouns/boundary words.
 *
 * @return the sum of the values returned by the {@code updateDataHolder}
 *         calls made during this pass
 */
public int wrapupMarkup() {
PropertyConfigurator.configure("conf/log4j.properties");
Logger myLogger = Logger
.getLogger("learn.additionalBootStrapping.wrapupMarkup");
myLogger.trace("Enter");
int sign = 0;
// IDs of sentences that need not be used as a starting point again
Set<Integer> checkedIDs = new HashSet<Integer>();
// Step 1: collect the untagged sentences whose lead contains a space
// (i.e. has at least two words).
List<SentenceStructure> sentenceList = new LinkedList<SentenceStructure>();
for (int id1 = 0; id1 < this.myDataHolder.getSentenceHolder().size(); id1++) {
SentenceStructure sentence = this.myDataHolder.getSentenceHolder()
.get(id1);
String tag = sentence.getTag();
String lead = sentence.getLead();
if ((tag == null)
&& (StringUtility.createMatcher(lead, ".* .*").find())) {
sentenceList.add(sentence);
}
}
// Step 2: order the candidates by lead length.
// NOTE(review): the meaning of the 'false' argument (presumably the sort
// direction) is defined in SentenceLeadLengthComparator -- confirm there.
SentenceLeadLengthComparator myComparator = new SentenceLeadLengthComparator(
false);
Collections.sort(sentenceList, myComparator);
// Step 3: for every candidate, look for other untagged sentences that share
// its lead minus the last word.
Iterator<SentenceStructure> iter1 = sentenceList.iterator();
while (iter1.hasNext()) {
SentenceStructure sentence = iter1.next();
int ID1 = sentence.getID();
String lead = sentence.getLead();
// if this sentence has been checked, pass
if (checkedIDs.contains(ID1)) {
continue;
}
// words: all words of this lead; sharedHead: all but the last word;
// match: the shared head joined by single spaces.
List<String> words = new ArrayList<String>();
words.addAll(Arrays.asList(lead.split("\\s+")));
List<String> sharedHead = new ArrayList<String>();
sharedHead.addAll(words.subList(0, words.size() - 1));
String match = StringUtility.joinList(" ", sharedHead);
// Untagged sentences whose lead is "<match> <one token>" and differs from
// this sentence's lead.
Set<SentenceStructure> sentenceSet = new HashSet<SentenceStructure>();
for (int index = 0; index < this.myDataHolder.getSentenceHolder()
.size(); index++) {
SentenceStructure thisSentence = this.myDataHolder
.getSentenceHolder().get(index);
String thisLead = thisSentence.getLead();
String tag = thisSentence.getTag();
// 'match' is inserted into the regex unescaped
String pTemp = "^" + match + " [\\S]+$";
myLogger.trace(thisLead);
myLogger.trace(pTemp);
// if ((tag==null) && StringUtility.isMatchedNullSafe(pTemp,
// thisLead)) {
if ((tag == null)
&& StringUtility.isMatchedNullSafe(thisLead, pTemp)) {
if (!StringUtils.equals(thisLead, lead)) {
sentenceSet.add(thisSentence);
}
}
}
// Only act when at least two other sentences share the head.
if (sentenceSet.size() > 1) {
// POS pattern of the shared head, and the WordNet POS of its last word
String ptn = this.myLearnerUtility.getPOSptn(this.myDataHolder, sharedHead);
String wnPOS = this.myLearnerUtility.getWordFormUtility()
.checkWN(sharedHead.get(sharedHead.size() - 1), "pos");
myLogger.trace("ptn: " + ptn);
myLogger.trace("wnPOS: " + wnPOS);
// The head counts as nominal if its POS pattern ends in n, s or p, or if
// it ends in '?' and the WordNet POS string contains "n".
if ((StringUtility.createMatcher(ptn, "[nsp]$").find())
|| ((StringUtility.createMatcher(ptn, "\\?$").find()) && (StringUtility
.createMatcher(wnPOS, "n").find()))) {
Iterator<SentenceStructure> iter2 = sentenceSet.iterator();
while (iter2.hasNext()) {
SentenceStructure thisSentence = iter2.next();
int ID = thisSentence.getID();
String thisLead = thisSentence.getLead();
List<String> words2 = new ArrayList<String>();
words2.addAll(Arrays.asList(thisLead.split("\\s+")));
// case1: the other lead has a word right after the shared head;
// case2: the POS pattern of that word contains p, s or n.
boolean case1 = false;
boolean case2 = false;
case1 = words2.size() > sharedHead.size();
if (case1) {
List<String> checkWord = new ArrayList<String>();
checkWord.add(words2.get(sharedHead.size()));
case2 = StringUtility.createMatcher(
this.myLearnerUtility.getPOSptn(this.myDataHolder, checkWord), "[psn]").find();
}
// Case 1: the word following the shared head is noun-like, so the tag of
// the other sentence is "shared head + that word".
if (case1 && case2) {
myLogger.trace("Case 1");
// nb: the word after the extended tag, or "" if there is none
String nb = words2.size() >= sharedHead.size() + 2 ? words2
.get(sharedHead.size() + 1) : "";
words2 = StringUtility.stringArraySplice(words2, 0,
sharedHead.size() + 1);
String nmatch = StringUtility.joinList(" ", words2);
// tag the other sentence with the extended head, this sentence with the
// shared head
this.myLearnerUtility.tagSentence(this.myDataHolder, this.myConfiguration.getMaxTagLength(),ID, nmatch);
myLogger.trace(String.format("tag (%d, %s)", ID,
nmatch));
this.myLearnerUtility.tagSentence(this.myDataHolder, this.myConfiguration.getMaxTagLength(),ID1, match);
myLogger.trace(String.format("tag (%d, %s)", ID1,
match));
// last word of the extended tag -> noun ("n")
String updatedWord = words2.get(words2.size() - 1);
int update1 = this.myDataHolder.updateDataHolder(
updatedWord, "n", "-", "wordpos", 1);
sign += update1;
myLogger.trace(String.format("update (%s)",
updatedWord));
// word after the extended tag, if any -> boundary ("b")
if (!StringUtils.equals(nb, "")) {
int update2 = this.myDataHolder
.updateDataHolder(nb, "b", "",
"wordpos", 1);
sign += update2;
myLogger.trace(String.format("update (%s)", nb));
}
// last word of this sentence's own lead -> boundary ("b")
updatedWord = words.get(words.size() - 1);
int update3 = this.myDataHolder.updateDataHolder(
words.get(words.size() - 1), "b", "",
"wordpos", 1);
sign += update3;
myLogger.trace(String.format("update (%s)",
updatedWord));
}
// Case 2: both sentences are tagged with the shared head itself.
else {
myLogger.trace("Case 2");
// b: the word right after the shared head in the other lead, or ""
String b = words2.size() >= sharedHead.size() + 1 ? words2
.get(sharedHead.size()) : "";
this.myLearnerUtility.tagSentence(this.myDataHolder, this.myConfiguration.getMaxTagLength(),ID, match);
this.myLearnerUtility.tagSentence(this.myDataHolder, this.myConfiguration.getMaxTagLength(),ID1, match);
// if (sharedHead.get(sharedHead.size() -
// 1).equals("tissue")) {
// System.out.println();
// }
// last word of the shared head -> noun ("n")
int update1 = this.myDataHolder.updateDataHolder(
sharedHead.get(sharedHead.size() - 1), "n",
"-", "wordpos", 1);
sign += update1;
// word following the shared head in the other lead -> boundary ("b")
if (!StringUtils.equals(b, "")) {
int update2 = this.myDataHolder
.updateDataHolder(b, "b", "",
"wordpos", 1);
sign += update2;
}
// last word of this sentence's own lead -> boundary ("b")
int update3 = this.myDataHolder.updateDataHolder(
words.get(words.size() - 1), "b", "",
"wordpos", 1);
sign += update3;
}
checkedIDs.add(ID);
}
} else {
// The shared head does not look nominal: mark the sharing sentences as
// checked without tagging anything.
Iterator<SentenceStructure> iter2 = sentenceSet.iterator();
while (iter2.hasNext()) {
SentenceStructure thisSentence = iter2.next();
int ID = thisSentence.getID();
checkedIDs.add(ID);
}
}
} else {
// Fewer than two sharing sentences: only this sentence is marked checked.
checkedIDs.add(ID1);
}
}
myLogger.trace("Return " + sign);
return sign;
}
/**
 * Checks whether {@code lead} begins with the words of {@code head}, compared
 * element by element with a null-safe string comparison.
 *
 * @param head
 *            the candidate prefix
 * @param lead
 *            the word list that should start with {@code head}
 * @return {@code true} if every element of {@code head} equals the element
 *         of {@code lead} at the same position; {@code false} if either list
 *         is {@code null}, if {@code head} is longer than {@code lead}, or
 *         if any element differs
 */
public boolean hasHead(List<String> head, List<String> lead) {
    if (head == null || lead == null) {
        return false;
    }
    if (head.size() > lead.size()) {
        return false;
    }

    // Walk both lists in parallel; head is the shorter (or equal) one.
    Iterator<String> headWords = head.iterator();
    Iterator<String> leadWords = lead.iterator();
    while (headWords.hasNext()) {
        if (!StringUtils.equals(headWords.next(), leadWords.next())) {
            return false;
        }
    }
    return true;
}
/**
 * Runs the three unknown-word bootstrapping stages in order: preprocessing,
 * main, postprocessing. Start and end are written to the trace log.
 */
public void unknownWordBootstrapping() {
    PropertyConfigurator.configure("conf/log4j.properties");
    final Logger myLogger = Logger
            .getLogger("learn.unknownWordBootstrapping");

    myLogger.trace("[unknownWordBootstrapping]Start");

    this.unknownWordBootstrappingPreprocessing();
    this.unknownWordBootstrappingMain();
    this.unknownWordBootstrappingPostprocessing();

    myLogger.trace("[unknownWordBootstrapping]End");
}
/**
 * Preprocessing stage of unknown-word bootstrapping: tags all sentences of
 * this learner's data holder in "singletag" mode on the "sentence" field.
 */
public void unknownWordBootstrappingPreprocessing() {
    final String mode = "singletag";
    final String type = "sentence";
    this.myLearnerUtility.tagAllSentences(this.myDataHolder, mode, type);
}
/**
 * Main stage of unknown-word bootstrapping. Currently a placeholder: the
 * learning call inside the loop is commented out, so the loop body runs once
 * and the method has no effect.
 */
public void unknownWordBootstrappingMain() {
    String plMiddle = "(ee)";
    int newInt = 0;
    while (true) {
        // this.unknownWordBootstrappingGetUnknownWord(plMiddle);
        if (newInt <= 0) {
            break;
        }
    }
}
/**
 * Postprocessing stage of unknown-word bootstrapping for underscore-joined
 * words such as "pistillate_zone".
 * <p>
 * Step 1: every still-unknown word that contains an underscore, consists only
 * of letters, digits, "_" and "-", and does <em>not</em> end in
 * "_&lt;known noun&gt;" is recorded as a boundary word ("b"). Those of them
 * that contain no forbidden word are additionally collected.
 * <p>
 * Step 2: every sentence that is not tagged "ignore" and contains one of the
 * collected words (preceded by start-of-text or a space, followed by a
 * space) is re-annotated with those words as boundary words.
 */
public void unknownWordBootstrappingPostprocessing() {
    // pistillate_zone
    // get all nouns (plural "p" and singular "s") from the wordPOS holder
    Set<String> POSTags = new HashSet<String>();
    POSTags.add("p");
    POSTags.add("s");
    Set<String> nouns = this.getDataHolder().getWordsFromWordPOSByPOSs(
            POSTags);

    // candidate words: unknown words containing an underscore
    Set<String> words = this.getDataHolder().getWordsFromUnknownWord(
            "^.*_.*$", true, "^unknown$", true);

    // The patterns do not depend on the word, so compile them once.
    Pattern wellFormed = Pattern.compile("^[a-zA-Z0-9_-]+$");
    Pattern nounSuffix = Pattern.compile("_("
            + StringUtils.join(nouns, "|") + ")$", Pattern.CASE_INSENSITIVE);
    // With no known nouns the alternation is empty and must exclude nothing.
    boolean haveNouns = (nouns != null) && !nouns.isEmpty();
    String forbiddenRegex = "\\b("
            + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b";

    // get boundaries
    Set<String> boundaries = new HashSet<String>();
    for (String word : words) {
        // The suffix pattern is unanchored at the front, so it has to be
        // searched with find(); matches() would require the whole word to be
        // "_noun" and the exclusion would practically never apply.
        boolean endsWithNoun = haveNouns && nounSuffix.matcher(word).find();
        if (wellFormed.matcher(word).matches() && !endsWithNoun) {
            if (!StringUtility.createMatcher(word, forbiddenRegex).find()) {
                boundaries.add(word);
            }
            this.getDataHolder().updateDataHolder(word, "b", "", "wordpos",
                    1);
        }
    }

    // nothing collected: no sentence needs re-annotation
    if (boundaries.isEmpty()) {
        return;
    }

    String boundaryRegex = "(^| )(" + StringUtils.join(boundaries, "|")
            + ") ";
    Iterator<SentenceStructure> iter = this.getDataHolder()
            .getSentenceHolderIterator();
    while (iter.hasNext()) {
        SentenceStructure sentenceItem = iter.next();
        String tag = sentenceItem.getTag();
        String sentence = sentenceItem.getSentence();
        int sentenceID = sentenceItem.getID();

        // A null tag is never equal to "ignore", so one null-safe test
        // covers both "untagged" and "tagged with something else".
        if (StringUtils.equals(tag, "ignore")) {
            continue;
        }
        if (!StringUtility.createMatcher(sentence, boundaryRegex).find()) {
            continue;
        }

        KnownTagCollection tags = new KnownTagCollection(null, null, null,
                boundaries, null, null);
        sentence = this.myLearnerUtility.annotateSentence(sentence, tags,
                this.myDataHolder.getBMSWords());
        SentenceStructure updatedSentence = this.getDataHolder()
                .getSentence(sentenceID);
        updatedSentence.setSentence(sentence);
    }
}
/**
 * Repeats adjective-subject markup until a complete round yields no update.
 * <p>
 * Each round tags all sentences, runs {@link #adjectiveSubjects(DataHolder)},
 * {@code discoverNewModifiers} and {@code handleAndOr}, untags the sentences
 * and finally runs {@code doItMarkup}. The values returned by the first
 * three steps are summed per round; the loop ends when that sum is not
 * positive. Afterwards every sentence still tagged "andor" is reset to
 * untagged and one more tagging plus adjective-subject pass is made.
 *
 * @param dataholderHandler
 *            the data holder to work on
 */
public void adjectiveSubjectBootstrapping(DataHolder dataholderHandler) {
    int flag;
    do {
        // Start every round from zero. Without this reset a single
        // productive round keeps the counter positive forever and the loop
        // never terminates.
        flag = 0;

        // tag all sentences
        this.myLearnerUtility.tagAllSentences(dataholderHandler, "singletag", "sentence");

        // adjective subject markup: may discover new modifier, new boundary, and new nouns
        flag += this.adjectiveSubjects(dataholderHandler);

        // work on tag='andor' clauses, move to the main bootstrapping
        flag += discoverNewModifiers(dataholderHandler);
        flag += this.handleAndOr(dataholderHandler);

        dataholderHandler.untagSentences();

        // NOTE(review): as before, the result of doItMarkup is not counted
        // and the call uses this.myDataHolder rather than the handler passed
        // in -- confirm both are intended.
        this.myLearnerUtility.doItMarkup(this.myDataHolder, this.myConfiguration.getMaxTagLength());
    } while (flag > 0);

    // reset unsolvable andor to NULL
    for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) {
        if (StringUtils.equals(sentenceItem.getTag(), "andor")) {
            sentenceItem.setTag(null);
        }
    }

    // cases released from andor[m&mn] may be marked by adjectiveSubjects
    this.myLearnerUtility.tagAllSentences(dataholderHandler, "singletag", "sentence");
    this.adjectiveSubjects(dataholderHandler);
}
/**
 * Works on annotated sentences. In all non-ignored sentences, finds the
 * modifiers (&lt;M&gt;) that are directly followed by a boundary word
 * (&lt;B&gt;) and flags them as type modifiers in the modifier holder
 * (part 1). Then processes the "type modifier + unknown word" patterns
 * (part 2), which may tag sentences and discover new modifiers, nouns or
 * boundary words. Works on sentences, not on leads.
 *
 * @param dataholderHandler
 *            the data holder to read from and update
 * @return the number of updates reported by part 2
 */
public int adjectiveSubjects(DataHolder dataholderHandler) {
    // Part 1: collect evidence for the usage of "modifier boundary"
    Set<String> typeModifiers = adjectiveSubjectsPart1(dataholderHandler,
            new HashSet<String>());

    // flag the collected words that are already known modifiers
    Iterator<String> modifierIter = typeModifiers.iterator();
    while (modifierIter.hasNext()) {
        String candidate = modifierIter.next();
        if (!dataholderHandler.getModifierHolder().containsKey(candidate)) {
            continue;
        }
        dataholderHandler.getModifierHolder().get(candidate)
                .setIsTypeModifier(true);
    }

    // Part 2: process "typemodifier unknown" patterns
    return adjectiveSubjectsPart2(dataholderHandler, typeModifiers);
}
/**
 * Part 1 of {@link #adjectiveSubjects(DataHolder)}: collects every modifier
 * that is directly followed by a boundary word, i.e. every occurrence of
 * {@code <M>modifier</M> <B>boundary</B> } in a sentence that is not tagged
 * "ignore". The boundary text must not contain "," or ".". Markup nested
 * inside the modifier is stripped before the modifier is stored.
 *
 * @param dataholderHandler
 *            the data holder whose sentences are scanned
 * @param typeModifiers
 *            the set to add the modifiers to; it is modified in place
 * @return the same {@code typeModifiers} set, with the newly found modifiers
 *         added
 */
public Set<String> adjectiveSubjectsPart1(DataHolder dataholderHandler, Set<String> typeModifiers) {
    Pattern modifierBeforeBoundary = Pattern
            .compile(".*?<M>(\\S+)</M> <B>[^,.]+</B> (.*)");

    for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) {
        // a null tag is never equal to "ignore"
        if (StringUtils.equals(sentenceItem.getTag(), "ignore")) {
            continue;
        }

        String remaining = "" + sentenceItem.getSentence();
        Matcher m = modifierBeforeBoundary.matcher(remaining);
        while (m.find()) {
            String modifier = m.group(1).replaceAll("<\\S+?>", "");
            typeModifiers.add(modifier);

            // The trailing (.*) consumes the rest of the line, so a further
            // find() on the same matcher cannot see another pair on that
            // line. Restart the search on everything after the boundary
            // word. The remainder is a strict suffix, so this terminates.
            remaining = remaining.substring(m.start(2));
            m = modifierBeforeBoundary.matcher(remaining);
        }
    }
    return typeModifiers;
}
public int adjectiveSubjectsPart2(DataHolder dataholderHandler,
Set<String> typeModifiers) {
String pos = null;
int flag = 0;
for (SentenceStructure sentenceItem : dataholderHandler
.getSentenceHolder()) {
int sentenceID = sentenceItem.getID();
String sentence = sentenceItem.getSentence();
String tag = sentenceItem.getTag();
String pattern = "<M>\\S*(" + StringUtils.join(typeModifiers, "|")
+ ")\\S*</M> .*";
int count = 0;
if (((tag == null) || StringUtils.equals(tag, "") || StringUtils
.equals(tag, "unknown"))
&& adjectiveSubjectsPart2Helper1(sentence, typeModifiers)) {
if (sentence != null) {
String sentenceCopy = sentence + "";
String regex = "(.*?)((?:(\\S+)\\s*(?:and|or|nor|and / or|or / and)\\s*)*(?:<M>\\S+</M>\\s*)+) (\\S+)\\s*(.*)";
Pattern p = Pattern.compile(regex);
Matcher m = p.matcher(sentenceCopy);
while (m.find()) {
int knownPOS = 0;
String start = m.group(1);
String modifier = m.group(2);
String newModifier = m.group(3);
String word = m.group(4);
sentenceCopy = m.group(5);
// case 1
if (!this.myLearnerUtility.getConstant().forbiddenWords
.contains(word)) {
count++;
continue;
}
// case 2
if (StringUtility.isMatchedNullSafe(
newModifier.toUpperCase(), "<N>")
|| StringUtility.isMatchedNullSafe(
start.toUpperCase(), "<N>")) {
count++;
continue;
}
// case 3
boolean c3 = this.myLearnerUtility.getConstant().prepositionWords.contains(word);
if (count == 0
&& ((StringUtility.isMatchedNullSafe(word, "[;,]") || c3)
|| (StringUtility.isMatchedNullSafe(word, "[.;,]")
&& !StringUtility.isMatchedNullSafe(sentence, "\\w")))) {
// case 3.1
// start with a <[BM]>, followed by a <[BM]>
if ((StringUtility.isMatchedNullSafe(word,
"\\b(with|without|of)\\b"))
&& ((StringUtility.isMatchedNullSafe(modifier,
"^(<M>)?<B>(<M>)?\\w+(</M)?</B>(</M>)? (?:and|or|nor|and / or|or / and)?\\s*(<[BM]>)+\\w+(</[BM]>)+\\s*$"))
|| (StringUtility.isMatchedNullSafe(modifier, "^(<[BM]>)+\\w+(</[BM]>)+$")))) {
dataholderHandler.tagSentenceWithMT(sentenceID,
sentenceCopy, "", "ditto",
"adjectivesubject[ditto]");
count++;
continue;
}
// case 3.2
// modifier={<M>outer</M> <M><B>pistillate</B></M>} word= <B>,</B> sentence= <N>corollas</N>....
// make the last modifier b
else {
if (modifier != null) {
Pattern p2 = Pattern
.compile("^(.*) (\\S+)$");
Matcher m2 = p2.matcher(modifier);
if (m2.find()) {
modifier = m2.group(1);
String b = m2.group(2);
String bCopy = "" + b;
b = b.replaceAll("<\\S+?>", "");
dataholderHandler.updateDataHolder(b,"b", "", "wordpos", 1);
tag = dataholderHandler.getParentSentenceTag(sentenceID);
List<String> modifierAndTag =
dataholderHandler.getMTFromParentTag(tag);
String modifier2 = modifierAndTag.get(0);
tag = modifierAndTag.get(1);
modifier = modifier.replaceAll(
"<\\S+?>", "");
if (StringUtility.isMatchedNullSafe(modifier2, "\\w")) {
modifier = modifier + " " + modifier2;
}
dataholderHandler.tagSentenceWithMT(
sentenceID, sentence, modifier,
tag, "adjectivesubject[M-B,]");
count++;
continue;
}
}
}
}
// case 4
// get new modifier from modifiers like
// "mid and/or <m>distal</m>"
if (!StringUtility.isMatchedNullSafe(newModifier,"<")
&& StringUtility.isMatchedNullSafe(newModifier, "\\w")
&& StringUtility.isMatchedNullSafe(start,",(?:</B>)?\\s*$")) {
flag += dataholderHandler.updateDataHolder(newModifier, "m", "", "modifiers", 1);
// print "find a modifier [E0]: $newm\n" if $debug;
}
// case 5
// pos = "N"/"B"
if (word != null) {
Pattern p5 = Pattern.compile("([A-Z])>(<([A-Z])>)?(.*?)<");
Matcher m5 = p5.matcher(word);
if (m5.find()) {
String g1 = m5.group(1);
String g2 = m5.group(2);
String g3 = m5.group(3);
String g4 = m5.group(4);
String t1 = g1;
String t2 = g3;
word = g4;
pos = t1 + t2;
// if <N><B>, decide on one tag
if (pos.length() > 1) {
if (StringUtility.isMatchedNullSafe(sentence, "^\\s*<B>[,;:]<\\/B>\\s*<N>")
||StringUtility.isMatchedNullSafe(sentence, "^\\s*<B>\\.<\\/B>\\s*$")){
pos = "B";
}
else {
pos = "N";
}
}
knownPOS = 1;
}
else {
List<POSInfo> POSs = dataholderHandler.checkPOSInfo(word);
pos = POSs.get(0).getPOS();
}
}
pos = StringUtils.equals(pos, "?") ? this.myLearnerUtility.getWordFormUtility().getNumber(word) : pos;
// part 6
// markup sentid, update pos for word, new modifier
if (StringUtils.equals(pos, "p") || StringUtils.equals(pos, "N")) {
if (knownPOS != 0) {
flag += dataholderHandler.updateDataHolder(word, "p", "-", "wordpos", 1);
// /print "update [$word] pos: p\n" if (!$knownpos) && $debug;
}
if (count == 0
&& (StringUtility.isMatchedNullSafe(start, "^\\S+\\s?(?:and |or |and \\/ or |or \\/ and )?$")
||start.length() == 0)) {
modifier = start + modifier;
modifier = modifier.replaceAll("<\\S+?>", "");
word = word.replaceAll("<\\S+?>", "");
dataholderHandler.tagSentenceWithMT(sentenceID, sentence, modifier, tag, "adjectivesubject[M-N]");
// new modifier
start = start.replaceAll("\\s*(and |or |and \\/ or |or \\/ and )\\s*", "");
start = start.replaceAll("<\\S+?>", "");
while (StringUtility.isMatchedNullSafe(start, "^("+this.myLearnerUtility.getConstant().STOP+")\\b")) {
start = start.replaceAll("^("+this.myLearnerUtility.getConstant().STOP+")\\b\\s*", "");
}
if (start.length() > 0) {
flag += dataholderHandler.updateDataHolder(start, "m", "", "modifiers", 1);
//print "find a modifier [E]: $start\n" if $debug;
}
}
}
// not p
else {
if (knownPOS != 0) {
// update pos for word, markup sentid (get tag
// from context), new modifier
flag += dataholderHandler.updateDataHolder(word, "b", "", "wordpos", 1);
// print "update [$word] pos: b\n" if $debug;
}
if (count == 0
&& (StringUtility.isMatchedNullSafe(start, "^\\S+\\s?(?:and |or |and \\/ or |or \\/ and )?$")
||start.length() == 0)) {
while (StringUtility.isMatchedNullSafe(start, "^("+this.myLearnerUtility.getConstant().STOP+"|"+this.myLearnerUtility.getConstant().FORBIDDEN+"|\\w+ly)\\b")) {
start = start.replaceAll("^("+this.myLearnerUtility.getConstant().STOP+"|"+this.myLearnerUtility.getConstant().FORBIDDEN+"|\\w+ly)\\b\\s*", "");
}
modifier = start + modifier;
modifier = modifier.replaceAll("<\\S+?>", "");
tag = dataholderHandler.getParentSentenceTag(sentenceID);
List<String> modifierAndTag = dataholderHandler.getMTFromParentTag(tag);
String newM = modifierAndTag.get(0);
tag = modifierAndTag.get(1);
if (StringUtility.isMatchedNullSafe(newM, "\\w")) {
modifier = modifier + " " + newM;
}
dataholderHandler.tagSentenceWithMT(sentenceID, sentence, modifier, tag, "adjectivesubject[M-B]");
// new modifier
start = start.replaceAll("\\s*(and |or |and \\/ or |or \\/ and )\\s*", "");
start = start.replaceAll("<\\S+?>", "");
if (start.length() > 0) {
if (!StringUtility.isMatchedNullSafe(start, "ly\\s*$")
&& !StringUtility.isMatchedNullSafe(start, "\\b(" + this.myLearnerUtility.getConstant().STOP + "|" + this.myLearnerUtility.getConstant().FORBIDDEN + ")\\b")) {
flag += dataholderHandler.updateDataHolder(word, "m", "", "modifiers", 1);
// print "find a modifier [F]: $start\n" if $debug;
}
}
}
}
count++;
}
}
}
}
return flag;
}
/**
 * Checks a sentence against a pattern built from the given type modifiers.
 * <p>
 * The pattern looks for an {@code <M>...</M>} token whose content contains
 * one of the modifiers, followed by a space and any remaining text. The
 * modifiers are joined with {@code |} and inserted into the regular
 * expression as-is (they are not quoted).
 *
 * @param sentence
 *            the tagged sentence to test
 * @param typeModifiers
 *            the modifiers forming the alternation
 * @return the result of {@code StringUtility.isMatchedNullSafe} for the
 *         sentence and the assembled pattern
 */
public boolean adjectiveSubjectsPart2Helper1(String sentence,
        Set<String> typeModifiers) {
    String alternatives = StringUtils.join(typeModifiers, "|");
    StringBuilder regex = new StringBuilder("<M>\\S*(");
    regex.append(alternatives);
    regex.append(")\\S*</M> .*");
    return StringUtility.isMatchedNullSafe(sentence, regex.toString());
}
/**
 * Discover new modifiers using the and/or pattern.
 * <p>
 * For a "modifier and/or unknown boundary" pattern or an
 * "unknown and/or modifier boundary" pattern, the "unknown" word is made a
 * modifier. When the pattern starts the sentence (optionally after one
 * leading word), the sentence is additionally re-tagged with the modifier
 * and tag derived from its parent sentence tag. Sentences whose tag matches
 * "ignore" are skipped.
 *
 * @param dataholderHandler
 *            the data holder whose sentences are scanned and updated
 * @return the sum of the values returned by the data-holder update calls
 */
public int discoverNewModifiers(DataHolder dataholderHandler) {
    // Compiled once per call instead of once per sentence.
    Pattern taggedWord = Pattern.compile("(.*?>)(\\w+)<\\/");
    Pattern leadingKnownFirst = Pattern.compile("^(?:\\w+\\s)?<M>(\\S+)<\\/M> (and|or|nor|and \\/ or|or \\/ and) ((?:<[^M]>)*[^<]+(?:<\\/[^M]>)*) <B>[^,;:\\.]");
    Pattern anywhereKnownFirst = Pattern.compile("<M>(\\S+)<\\/M> (and|or|nor|and \\/ or|or \\/ and) (\\w+) <B>[^,;:\\.]");
    Pattern leadingUnknownFirst = Pattern.compile("^(?:\\w+\\s)?((?:<[^M]>)*[^<]+(?:<\\/[^M]>)*) (and|or|nor|and \\/ or|or \\/ and) <M>(\\S+)<\\/M> <B>[^:;,\\.]");
    Pattern anywhereUnknownFirst = Pattern.compile("(\\w+) (and|or|nor|and \\/ or|or \\/ and) <M>(\\S+)<\\/M> <B>[^,:;\\.]");

    int sign = 0;

    // "modifier and/or unknown boundary" pattern
    for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) {
        String sentenceTag = sentenceItem.getTag();
        String sentence = sentenceItem.getSentence();
        int sentenceID = sentenceItem.getID();

        boolean notIgnored = sentenceTag == null
                || !StringUtility.isMatchedNullSafe(sentenceTag, "ignore");
        if (sentence == null
                || !notIgnored
                || !StringUtility.isMatchedNullSafe(sentence, "<M>[^\\s]+</M> (or|and|and / or|or / and) .*")) {
            continue;
        }

        // if "<m>xxx</m> (and|or) yyy (<b>|\d)" pattern appears at the
        // beginning or is right after the 1st word of the sentence,
        // mark up the sentence, add yyy as a modifier
        Matcher leading = leadingKnownFirst.matcher(sentence);
        if (leading.find()) {
            String unknown = leading.group(3);
            if (!StringUtility.isMatchedNullSafe(unknown, "\\b(" + this.myLearnerUtility.getConstant().STOP + ")\\b")) {
                String modifier = leading.group(1) + " " + leading.group(2) + " " + unknown;
                modifier = modifier.replaceAll("<\\S+?>", "");
                sign += this.applyAndOrModifierDiscovery(dataholderHandler,
                        sentenceID, sentence, modifier, unknown, taggedWord);
            }
        }
        // if the pattern appears in the middle of the sentence, only add yyy as modifier
        else {
            Matcher anywhere = anywhereKnownFirst.matcher(sentence);
            if (anywhere.find()) {
                sign += dataholderHandler.updateDataHolder(anywhere.group(3), "m", "", "modifiers", 1);
            }
        }
    }

    // "unknown and/or modifier boundary" pattern
    for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) {
        String sentence = sentenceItem.getSentence();
        String sentenceTag = sentenceItem.getTag();

        boolean notIgnored = sentenceTag == null
                || !StringUtility.isMatchedNullSafe(sentenceTag, "ignore");
        // The character classes here are [^\s] (non-whitespace), mirroring the
        // filter of the first pass. The previous [^\w] demanded a modifier made
        // only of non-word characters, which kept this pass from ever running
        // on ordinary words.
        if (sentence == null
                || !notIgnored
                || !StringUtility.isMatchedNullSafe(sentence, "[^\\s]+ (and|or|nor|and / or|or / and) <M>[^\\s]+</M> .*")) {
            continue;
        }
        int sentenceID = sentenceItem.getID();

        // if "xxx (and|or|nor) <m>yyy</m> (<b>|\d)" pattern appears at the
        // beginning or is right after the 1st word of the sentence,
        // mark up the sentence, add xxx as a modifier
        Matcher leading = leadingUnknownFirst.matcher(sentence);
        if (leading.find()) {
            String unknown = leading.group(1);
            String modifier = unknown + " " + leading.group(2) + " " + leading.group(3);
            modifier = modifier.replaceAll("<\\S+?>", "");
            sign += this.applyAndOrModifierDiscovery(dataholderHandler,
                    sentenceID, sentence, modifier, unknown, taggedWord);
        }
        // if the pattern appears in the middle of the sentence, only add xxx as modifier
        else {
            Matcher anywhere = anywhereUnknownFirst.matcher(sentence);
            if (anywhere.find()) {
                sign += dataholderHandler.updateDataHolder(anywhere.group(1), "m", "", "modifiers", 1);
            }
        }
    }

    return sign;
}

/**
 * Records a newly discovered modifier and re-tags the sentence it was found in.
 *
 * @param dataholderHandler
 *            the data holder to update
 * @param sentenceID
 *            ID of the sentence being tagged
 * @param sentence
 *            the sentence text passed on to the tagging call
 * @param modifier
 *            the tag-free "xxx and/or yyy" phrase used as the sentence modifier
 * @param unknownWord
 *            the previously unknown word, possibly still wrapped in a POS tag
 * @param taggedWord
 *            pattern splitting a tagged word into its opening tag (group 1)
 *            and the bare word (group 2)
 * @return the value returned by the data-holder update call
 */
private int applyAndOrModifierDiscovery(DataHolder dataholderHandler,
        int sentenceID, String sentence, String modifier,
        String unknownWord, Pattern taggedWord) {
    String newModifier = unknownWord;
    String posTag = "";
    Matcher tagged = taggedWord.matcher(unknownWord);
    if (tagged.find()) { // N or B
        newModifier = tagged.group(2);
        posTag = tagged.group(1);
    }

    int sign;
    if (StringUtility.isMatchedNullSafe(posTag, "<N>")) {
        // update N to M
        sign = dataholderHandler.changePOS(newModifier, "s", "m", "", 1);
    } else {
        // B
        sign = dataholderHandler.updateDataHolder(newModifier, "m", "", "modifiers", 1);
    }

    String tag = dataholderHandler.getParentSentenceTag(sentenceID);
    List<String> modifierAndTag = dataholderHandler.getMTFromParentTag(tag);
    String parentModifier = modifierAndTag.get(0);
    tag = modifierAndTag.get(1);
    if (StringUtility.isMatchedNullSafe(parentModifier, "\\w")) {
        modifier = modifier + " " + parentModifier;
    }
    dataholderHandler.tagSentenceWithMT(sentenceID, sentence, modifier, tag, "discovernewmodifiers");
    return sign;
}
/**
 * Runs {@link #andOrTag} over every sentence returned by the data holder
 * for the tag pattern {@code ^andor$}.
 *
 * @param dataholderHandler
 *            the data holder supplying the sentences
 * @return the sum of the values returned by the individual
 *         {@code andOrTag} calls
 */
public int handleAndOr(DataHolder dataholderHandler) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.handleAndOr");
    myLogger.info("to match pattern " + Constant.ANDORPTN);

    int sign = 0;
    Iterator<SentenceStructure> andOrSentences = dataholderHandler
            .getSentencesByTagPattern("^andor$").iterator();
    while (andOrSentences.hasNext()) {
        SentenceStructure current = andOrSentences.next();
        sign += this.andOrTag(dataholderHandler, current.getID(),
                current.getSentence(), Constant.SEGANDORPTN,
                Constant.ANDORPTN);
    }
    return sign;
}
/**
 * Tries to determine modifier and tag for one "andor" sentence.
 * <p>
 * The sentence is split on single spaces and turned into a pattern string
 * by the learner utility (every {@code t} in it is then replaced by
 * {@code m}). If {@link #andOrTagCase1Helper} produces a result, the
 * sentence is tagged and word POS entries are updated from that result;
 * otherwise, if the pattern matches {@code ^b+&b+[,:;.]}, the sentence is
 * tagged "ditto".
 *
 * @param dataholderHandler
 *            the data holder to update
 * @param sentenceID
 *            ID of the sentence
 * @param sentence
 *            the sentence text
 * @param sPattern
 *            segment pattern; accepted for interface compatibility but not
 *            referenced by this method
 * @param wPattern
 *            whole-sentence pattern handed to {@code andOrTagCase1Helper}
 * @return the sum of the values returned by the data-holder update calls,
 *         or -1 if no sentence pattern could be obtained
 */
public int andOrTag(DataHolder dataholderHandler, int sentenceID,
        String sentence, String sPattern, String wPattern) {
    PropertyConfigurator.configure("conf/log4j.properties");
    Logger myLogger = Logger.getLogger("learn.andOrTag");
    myLogger.trace("Enter");

    int sign = 0;

    Set<String> token = new HashSet<String>();
    token.addAll(Arrays.asList("and or nor".split(" ")));
    token.add("\\");
    token.add("and / or");

    int limit = 80;
    List<String> words = new ArrayList<String>(Arrays.asList(sentence.split(" ")));

    String pattern = this.getLearnerUtility().getSentencePtn(
            dataholderHandler, token, limit, words);
    // The null check must precede any use of the pattern; it used to come
    // after replaceAll() and could therefore never be reached.
    if (pattern == null) {
        return -1;
    }
    pattern = pattern.replaceAll("t", "m");
    myLogger.info(String.format("Andor pattern %s for %s", pattern,
            words.toString()));

    List<List<String>> res = this.andOrTagCase1Helper(pattern, wPattern, words, token);
    if (res != null) {
        // res.get(0) .. res.get(3) hold the pattern/segment lists; only the
        // tagging and update lists are needed here.
        List<String> tagAndModifier1 = res.get(4);
        List<String> tagAndModifier2 = res.get(5);
        List<String> update1 = res.get(6);
        List<String> update2 = res.get(7);

        if (tagAndModifier1.size() > 0) {
            String tag = tagAndModifier1.get(1);
            dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "",
                    tag, "andor[n&n]");
            myLogger.trace("tagSentenceWithMT(" + sentenceID + ", "
                    + sentence + ", , " + tag + ", andor[n&n]");
        } else {
            myLogger.debug(String.format(
                    "Andor can not determine a tag or modifier for %d: %s",
                    sentenceID, sentence));
        }

        if (tagAndModifier2.size() > 0) {
            String modifier = tagAndModifier2.get(0);
            String tag = tagAndModifier2.get(1);
            dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
                    modifier, tag, "andor[m&mn]");
            myLogger.trace("tagSentenceWithMT(" + sentenceID + ", "
                    + sentence + ", " + modifier + ", " + tag
                    + ", andor[m&mn]");
        } else {
            myLogger.debug(String.format(
                    "Andor can not determine a tag or modifier for %d: %s",
                    sentenceID, sentence));
        }

        // the first entry of update1 is a new boundary word
        if (update1.size() > 0) {
            String newBoundaryWord = update1.get(0);
            sign = sign
                    + dataholderHandler.updateDataHolder(newBoundaryWord,
                            "b", "", "wordpos", 1);
        }

        // every entry of update2 is recorded with POS "p"
        for (String tempWord : update2) {
            sign = sign
                    + dataholderHandler.updateDataHolder(tempWord, "p",
                            "-", "wordpos", 1);
        }
    } else if (StringUtility.createMatcher(pattern, "^b+&b+[,:;.]").find()) {
        myLogger.trace("Case 2");
        dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "",
                "ditto", "andor");
    } else {
        myLogger.trace("Case 3");
        myLogger.trace("[andortag]Andor can not determine a tag or modifier for "
                + sentenceID + ": " + sentence);
    }

    myLogger.trace("Return " + sign + "\n");
    return sign;
}
/**
 * Analyses a sentence pattern against the whole-sentence and/or pattern and
 * collects, without touching the data holder, what the caller should tag
 * and update.
 * <p>
 * Capture groups 1-5 of {@code wPattern} are read as: second-last modifier,
 * second-last structure, last modifier, last structure and end segment.
 * The start/end offsets of these groups within {@code pattern} are used
 * directly as indices into {@code words}.
 *
 * @param pattern
 *            the pattern string of the sentence
 * @param wPattern
 *            regular expression applied to {@code pattern}
 * @param words
 *            the words of the sentence
 * @param token
 *            the conjunction tokens
 * @return a list of eight lists, in this order: modifier patterns, modifier
 *         segments, structure patterns, structure segments, tagAndModifier1
 *         (modifier, tag for the "n&n" case), tagAndModifier2 (modifier,
 *         tag for the "m&mn" case), update1 (new boundary word) and update2
 *         (words to record as "p"); or {@code null} if {@code wPattern}
 *         does not match or an early segment is rejected
 */
public List<List<String>> andOrTagCase1Helper(String pattern,
String wPattern, List<String> words, Set<String> token) {
PropertyConfigurator.configure("conf/log4j.properties");
Logger myLogger = Logger.getLogger("learn.andOrTag");
List<String> mPatterns = new ArrayList<String>();
List<String> sPatterns = new ArrayList<String>();
List<String> mSegments = new ArrayList<String>();
List<String> sSegments = new ArrayList<String>();
List<String> update1 = new ArrayList<String>();
List<String> update2 = new ArrayList<String>();
List<String> tagAndModifier1 = new ArrayList<String>();
List<String> tagAndModifier2 = new ArrayList<String>();
// NOTE(review): the tokens are joined with a blank here, but with "|" for
// regex11 further down - confirm which form is intended.
String strToken = "(" + StringUtils.join(token, " ") + ")";
Matcher m1 = StringUtility.createMatcher(pattern, wPattern);
if (m1.find()) {
myLogger.trace("Case 1");
// empty block, apparently left over from debugging
if (pattern.equals("n&qqnbq")) {
// System.out.println();
}
// offsets of the five groups; -1 for a group that did not participate
int start1 = m1.start(1);
int end1 = m1.end(1);
int start2 = m1.start(2);
int end2 = m1.end(2);
int start3 = m1.start(3);
int end3 = m1.end(3);
int start4 = m1.start(4);
int end4 = m1.end(4);
int start5 = m1.start(5);
int end5 = m1.end(5);
// System.out.println(pattern);
// System.out.println(start1);
// System.out.println();
// everything in front of group 1, as pattern and as words
String earlyGroupsPattern = start1 == -1 ? "" : pattern.substring(
0, start1);
// NOTE(review): the leading "s*" has no backslash, so it matches literal
// 's' characters instead of whitespace - presumably "\\s*" was meant.
String[] patterns = earlyGroupsPattern.split("s*<B>,<\\/B>\\s*");
String earlyGroupsWords = start1 == -1 ? "" : StringUtils.join(
words.subList(0, start1), " ");
// NOTE(review): same here for the trailing "s*".
String[] segments = earlyGroupsWords.split("\\s*<B>,<\\/B>s*");
String secondLastModifierPattern = m1.group(1);
String secondLastModifierWords = secondLastModifierPattern == null ? ""
: StringUtils.join(words.subList(start1, end1), " ");
String sencondLastStructurePattern = m1.group(2);
String secondLastStructureWords = sencondLastStructurePattern == null ? ""
: StringUtils.join(words.subList(start2, end2), " ");
String lastModifierPattern = m1.group(3);
String lastModifierWords = lastModifierPattern == null ? ""
: StringUtils.join(words.subList(start3, end3), " ");
String lastStructurePattern = m1.group(4);
String lastStructureWords = lastStructurePattern == null ? ""
: StringUtils.join(words.subList(start4, end4), " ");
String endSegmentPattern = m1.group(5);
String endSegmentWords = endSegmentPattern == null ? ""
: StringUtils.join(words.subList(start5, end5), " ");
// the end segment starts at this word index
int bIndex = start5;
// matching pattern with original text
// (skipped when there is nothing in front of group 1)
if (!(patterns.length == 1 && StringUtils.equals(patterns[0], ""))) {
for (int i = 0; i < patterns.length; i++) {
// NOTE(review): this compiles the literal text "sPattern", not a
// variable (the method has no such parameter). The literal has no
// capture groups, so group(1) below would throw if it ever matched;
// in practice every non-empty early pattern ends in the
// "wrong segment" branch. Presumably the caller's segment pattern
// was meant - confirm.
Pattern p = Pattern.compile("sPattern");
Matcher m10 = p.matcher(patterns[i]);
if (m10.find()) {
String g1 = m10.group(1);
mPatterns.add(g1);
String g2 = m10.group(2);
sPatterns.add(g2);
List<String> w = new ArrayList<String>(
Arrays.asList(segments[i].split(" ")));
String m = StringUtils.join(w.subList(0, m10.end(1)),
" ");
// give up when the modifier part contains one of these words
if (StringUtility.isMatchedNullSafe(m,
"\\b(although|but|when|if|where)\\b")) {
return null;
}
mSegments.add(m);
sSegments.add(StringUtils.join(
w.subList(m10.end(1), w.size()), " "));
} else {
myLogger.info("wrong segment: " + patterns[i] + "=>"
+ segments[i] + "\n");
return null;
}
}
}
// Append the last two modifier/structure groups. Patterns are added
// when the group participated, segments only when their words are
// non-empty, so the pattern and segment lists can differ in length.
if (secondLastModifierPattern != null)
mPatterns.add(secondLastModifierPattern);
if (!StringUtils.equals(secondLastModifierWords, ""))
mSegments.add(secondLastModifierWords);
if (sencondLastStructurePattern != null)
sPatterns.add(sencondLastStructurePattern);
if (!StringUtils.equals(secondLastStructureWords, ""))
sSegments.add(secondLastStructureWords);
if (lastModifierPattern != null)
mPatterns.add(lastModifierPattern);
if (!StringUtils.equals(lastModifierWords, ""))
mSegments.add(lastModifierWords);
if (lastStructurePattern != null)
sPatterns.add(lastStructurePattern);
if (!StringUtils.equals(lastStructureWords, ""))
sSegments.add(lastStructureWords);
// find the modifier and the tag for sentenceID
// case 1.1
if (this.countStructures(sPatterns) > 1) {
// compound subject involving multiple structures: mn,mn,&mn =>
// use all but boundary as the tag, modifier="";
String tag = StringUtils.join(words.subList(0, bIndex), " ");
String modifier = "";
tag = tag.replaceAll("<\\S+?>", "");
if (tag != null) {
String regex11 = "\\b(" + StringUtils.join(token, "|")
+ ")\\b";
Matcher m11 = StringUtility.createMatcher(tag, regex11);
if (m11.find()) {
// replace every comma by the first conjunction found, then
// normalize whitespace
String conj = m11.group(1);
tag = tag.replaceAll(",", " " + conj + " ");
tag = tag.replaceAll("\\s+", " ");
tag = tag.replaceAll("(" + conj + " )+", "$1");
tag = tag.replaceAll("^\\s+", "");
tag = tag.replaceAll("\\s+$", "");
// dataholderHandler.tagSentenceWithMT(sentenceID,
// sentence, "", tag, "andor[n&n]");
tagAndModifier1.add("");
tagAndModifier1.add(tag);
}
// else {
// myLogger.debug(String.format("Andor can not determine a tag or modifier for %d: %s",
// sentenceID, sentence));
// }
}
// case 1.2
// NOTE(review): this "else if" pairs with "if (tag != null)" above, not
// with the countStructures(...) > 1 test. tag is the result of
// replaceAll() and so never null, and the count is already > 1 inside
// this branch, so cases 1.2 and 1.3 look unreachable; likewise the
// q / "p" bookkeeping below only runs for case 1.1. Possibly a
// misplaced brace - confirm the intended nesting.
else if (this.countStructures(sPatterns) == 1) {
// m&mn => connect all modifiers as the modifier, and the n
// as the tag
int i = 0;
for (i = 0; i < sPatterns.size(); i++) {
if (StringUtility.isMatchedNullSafe(sPatterns.get(i),
"\\w")) {
break;
}
}
tag = sSegments.get(i);
tag = tag.replaceAll("<\\S+?>", "");
modifier = StringUtils.join(mSegments, " ");
modifier = modifier.replaceAll("<\\S+?>", "");
tag = StringUtility.trimString(tag);
modifier = StringUtility.trimString(modifier);
String myStop = this.myLearnerUtility.getConstant().STOP;
// NOTE(review): token is a Set, so %s renders its toString() form
// (e.g. "[and, or]") into the regex - presumably an alternation of
// the tokens was intended.
myStop = myStop.replaceAll(
String.format("\\b%s\\b", token), "");
myStop = myStop.replaceAll("\\s+$", "");
if (StringUtility.isMatchedNullSafe(modifier, "\\b"
+ strToken + "\\b")
&& StringUtility.isEntireMatchedNullSafe(modifier,
"\\b(" + myStop + "|to)\\b")) {
// case 1.2.1
// move all but the last word of the tag into the modifier
List<String> wordsTemp = new ArrayList<String>();
wordsTemp.addAll(Arrays.asList(tag.split("\\s+")));
modifier = modifier
+ " "
+ StringUtils.join(wordsTemp.subList(0,
wordsTemp.size() - 1), " ");
tag = wordsTemp.get(wordsTemp.size() - 1);
// dataholderHandler.tagSentenceWithMT(sentenceID,
// sentence, modifier, tag, "andor[m&mn]");
tagAndModifier2.add(modifier);
tagAndModifier2.add(tag);
}
// else {
// myLogger.debug(String.format("Andor can not determine a tag or modifier for %d: %s",
// sentenceID, sentence));
// }
}
// case 1.3
else {
myLogger.debug("Andor can not determine a tag or modifier");
}
// position of the first "q" in the end segment pattern, -1 if none
int q = -1;
if (endSegmentPattern != null) {
Matcher m13 = StringUtility.createMatcher(
endSegmentPattern, "q");
if (m13.find()) {
q = m13.start();
}
}
if (q >= 0) {
// the word at that position is reported as a new boundary word
String newBoundaryWord = endSegmentWords.split(" ")[q];
if (StringUtility.isMatchedNullSafe(newBoundaryWord, "\\w")) {
update1.add(newBoundaryWord);
// sign = sign +
// dataholderHandler.updateDataHolder(newBoundaryWord,
// "b", "", "wordpos", 1);
}
}
// structure patterns and segments: $nptn =
// "((?:[np],?)*&?[np])"; #grouped #must present, no q allowed
// mark all ps "p"
for (int i = 0; i < sPatterns.size(); i++) {
// split the pattern into single characters so that they line up
// with the words of the corresponding segment
String sPatternI = sPatterns.get(i);
sPatternI = sPatternI.replaceAll("(.)", "$1 ");
sPatternI = StringUtility.trimString(sPatternI);
String[] ps = sPatternI.split(" ");
String[] ts = sSegments.get(i).split("\\s+");
for (int j = 0; j < ps.length; j++) {
if (StringUtils.equals(ps[j], "p")) {
ts[j] = StringUtility.trimString(ts[j]);
update2.add(ts[j]);
// sign = sign
// + dataholderHandler.updateDataHolder(ts[j],
// "p", "-", "wordpos", 1);
}
}
}
}
// bundle everything for the caller; the order is part of the contract
List<List<String>> res = new ArrayList<List<String>>();
res.add(mPatterns);
res.add(mSegments);
res.add(sPatterns);
res.add(sSegments);
res.add(tagAndModifier1);
res.add(tagAndModifier2);
res.add(update1);
res.add(update2);
return res;
} else {
return null;
}
}
/**
 * Counts the entries of the list for which
 * {@code StringUtility.isMatchedNullSafe(entry, "\\w")} holds.
 *
 * @param patterns
 *            the pattern strings to inspect
 * @return the number of matching entries
 */
public int countStructures(List<String> patterns) {
    int total = 0;
    Iterator<String> cursor = patterns.iterator();
    while (cursor.hasNext()) {
        String candidate = cursor.next();
        total += StringUtility.isMatchedNullSafe(candidate, "\\w") ? 1 : 0;
    }
    return total;
}
/**
 * Asks the data holder to update the tag of the sentences selected by the
 * pattern {@code ^andor$}, passing {@code null} as the new tag.
 *
 * @param dataholderHandler
 *            the data holder to update
 */
public void resetAndOrTags(DataHolder dataholderHandler) {
    final String andOrTagPattern = "^andor$";
    dataholderHandler.updateSentenceTag(andOrTagPattern, null);
}
/**
 * Runs {@link #dittoHelper} on every sentence of the data holder that has
 * no tag yet.
 *
 * @param dataholderHandler
 *            the data holder supplying and receiving the sentences
 */
public void ditto(DataHolder dataholderHandler) {
    // one or more consecutive N/O-tagged words
    final String nPhrasePattern = "(?:<[A-Z]*[NO]+[A-Z]*>[^<]+?<\\/[A-Z]*[NO]+[A-Z]*>\\s*)+";
    // one or more consecutive M-tagged words
    final String mPhrasePattern = "(?:<[A-Z]*M[A-Z]*>[^<]+?<\\/[A-Z]*M[A-Z]*>\\s*)+";

    for (SentenceStructure candidate : dataholderHandler.getSentenceHolder()) {
        if (candidate.getTag() != null) {
            continue; // already tagged
        }
        int sentenceID = candidate.getID();
        String sentence = candidate.getSentence();
        this.dittoHelper(dataholderHandler, sentenceID, sentence,
                nPhrasePattern, mPhrasePattern);
    }
}
/**
 * Tags a single sentence as "ditto" when one of three conditions holds.
 *
 * @param dataholderHandler
 *            the data holder to update
 * @param sentenceID
 *            ID of the sentence
 * @param sentence
 *            the tagged sentence text
 * @param nPhrasePattern
 *            regular expression for a noun phrase
 * @param mPhrasePattern
 *            regular expression for a modifier phrase; not used by this
 *            method
 * @return 1 if the sentence contains no {@code <N>} or {@code <O>} tag,
 *         21 if the text before the first noun phrase contains a
 *         preposition, 22 if that text ends with a comma boundary, and 0
 *         if the sentence was left untouched
 */
public int dittoHelper(DataHolder dataholderHandler, int sentenceID,
        String sentence, String nPhrasePattern, String mPhrasePattern) {
    // merge directly adjacent tags so that each word carries a single tag
    String flattened = ("" + sentence).replaceAll("></?", "");
    Matcher nounPhrase = StringUtility.createMatcher(flattened,
            "(.*?)" + nPhrasePattern);

    if (!StringUtility.isMatchedNullSafe(sentence, "<[NO]>")) {
        dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "",
                "ditto", "ditto-no-N");
        return 1;
    }

    if (!nounPhrase.find()) {
        return 0;
    }

    // text in front of the first noun phrase
    String head = nounPhrase.group(1);
    String prepositionRegex = String.format("\\b(%s)\\b",
            this.myLearnerUtility.getConstant().PREPOSITION);

    if (StringUtility.isMatchedNullSafe(head, prepositionRegex)) {
        dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "",
                "ditto", "ditto-proposition");
        return 21;
    }
    if (StringUtility.isMatchedNullSafe(head, ",<\\/B>\\s*$")) {
        dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "",
                "ditto", "ditto-,-N");
        return 22;
    }
    return 0;
}
/**
 * For every sentence without a tag, asks {@link #phraseClauseHelper} for a
 * modifier/tag pair and, if one is returned, tags the sentence with it.
 *
 * @param dataholderHandler
 *            the data holder supplying and receiving the sentences
 */
public void phraseClause(DataHolder dataholderHandler) {
    for (SentenceStructure candidate : dataholderHandler.getSentenceHolder()) {
        if (candidate.getTag() != null) {
            continue; // already tagged
        }
        int sentenceID = candidate.getID();
        String sentence = candidate.getSentence();

        List<String> modifierAndTag = this.phraseClauseHelper(sentence);
        // the helper returns exactly two entries only when it found a pair
        if (modifierAndTag == null || modifierAndTag.size() != 2) {
            continue;
        }
        dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
                modifierAndTag.get(0), modifierAndTag.get(1),
                "phraseclause");
    }
}
/**
 * Extracts modifier and tag from a sentence that ends with optional
 * M-tagged words, one or more N/O-tagged words and a punctuation boundary
 * ({@code <B>[,:.;]</B>}).
 * <p>
 * Nothing is extracted when the leading text contains a preposition or a
 * closing {@code </N>}, or when the modifier part contains a preposition.
 *
 * @param sentence
 *            the tagged sentence
 * @return {@code null} if the sentence is {@code null}; an empty list if
 *         nothing could be extracted; otherwise a list holding the
 *         modifier followed by the tag
 */
public List<String> phraseClauseHelper(String sentence) {
    if (sentence == null) {
        return null;
    }

    List<String> res = new ArrayList<String>(2);
    final String pattern = "^(.*?)((?:<[A-Z]*M[A-Z]*>[^<]*?<\\/[A-Z]*M[A-Z]*>\\s*)*)((?:<[A-Z]*[NO]+[A-Z]*>[^<]*?<\\/[A-Z]*[NO]+[A-Z]*>\\s*)+)<B>[,:\\.;]<\\/B>\\s*$";

    // merge directly adjacent tags before matching
    String flattened = sentence.replaceAll("></?", "");
    Matcher whole = StringUtility.createMatcher(flattened, pattern);
    if (!whole.find()) {
        return res;
    }

    String head = whole.group(1);
    String modifier = whole.group(2);
    String tag = whole.group(3);

    String prepositionPattern = String.format("\\b(%s)\\b",
            this.myLearnerUtility.getConstant().PREPOSITION);
    boolean rejected = StringUtility.isMatchedNullSafe(head, prepositionPattern)
            || StringUtility.isMatchedNullSafe(head, "<\\/N>")
            || StringUtility.isMatchedNullSafe(modifier, prepositionPattern);
    if (rejected || tag == null) {
        return res;
    }

    // the last <N> word becomes the tag, everything before it joins the modifier
    Matcher lastNoun = StringUtility.createMatcher(tag,
            "(.*?)<N>([^<]+)<\\/N>\\s*$");
    if (lastNoun.find()) {
        modifier = modifier + lastNoun.group(1);
        tag = lastNoun.group(2);
    }

    // strip tags first, then surrounding whitespace
    tag = tag.replaceAll("<\\S+?>", "");
    modifier = modifier.replaceAll("<\\S+?>", "");
    tag = tag.replaceAll("(^\\s*|\\s*$)", "");
    modifier = modifier.replaceAll("(^\\s*|\\s*$)", "");

    res.add(modifier);
    res.add(tag);
    return res;
}
/**
 * Re-tags sentences whose subject is a character word, a preposition or a
 * pronoun, and finally corrects tags for which a noun was missed.
 * <p>
 * Four passes are made over all sentences of the data holder:
 * <ol>
 * <li>character subjects, decided by
 * {@link #pronounCharacterSubjectHelper};</li>
 * <li>tags starting with a preposition: modifier and tag are cleared;</li>
 * <li>tags or modifiers containing a pronoun: the pronoun is removed and
 * the tag is resolved through the parent sentence tag;</li>
 * <li>missed nouns, decided by
 * {@link #pronounCharacterSubjectHelper4}.</li>
 * </ol>
 *
 * @param dataholderHandler
 *            the data holder supplying and receiving the sentences
 */
public void pronounCharacterSubject(DataHolder dataholderHandler) {
    // character subject cases
    for (SentenceStructure sentenceItem : dataholderHandler
            .getSentenceHolder()) {
        int sentenceID = sentenceItem.getID();
        String lead = sentenceItem.getLead();
        String sentence = sentenceItem.getSentence();
        String modifier = sentenceItem.getModifier();
        String tag = sentenceItem.getTag();

        List<String> mt = pronounCharacterSubjectHelper(lead, sentence,
                modifier, tag);
        if (mt != null) {
            // Tag with what the helper decided. Passing the sentence's old
            // modifier and tag here would discard the helper's result.
            dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
                    mt.get(0), mt.get(1),
                    "pronouncharactersubject[character subject]");
        }
    }

    // preposition cases
    String prepositionPattern = String
            .format("^(%s)", this.myLearnerUtility.getConstant().PREPOSITION);
    for (SentenceStructure sentenceItem : dataholderHandler
            .getSentenceHolder()) {
        int sentenceID = sentenceItem.getID();
        String tag = sentenceItem.getTag();
        String sentence = sentenceItem.getSentence();

        // Same "not ignored, or untagged" test as in
        // pronounCharacterSubjectHelper. Requiring the tag to BE "ignore" (or
        // null) could never hold together with the preposition match below.
        boolean notIgnored = !StringUtils.equals(tag, "ignore");
        boolean untagged = (tag == null);
        boolean startsWithPreposition = StringUtility.isMatchedNullSafe(tag,
                prepositionPattern + " ");

        if ((notIgnored || untagged) && startsWithPreposition) {
            dataholderHandler.tagSentenceWithMT(sentenceID, sentence, "",
                    "", "pronouncharactersubject[proposition subject]");
        }
    }

    // pronoun cases
    String pronoun = this.myLearnerUtility.getConstant().PRONOUN;
    // a pronoun delimited by blanks or by the start/end of the text;
    // "$" is the end anchor here, not a literal dollar sign
    String pronounAsWord = String.format("(^| )%s( |$)",
            String.format("(%s)", pronoun));
    String pronounAnywhere = "\\b(" + pronoun + ")\\b";
    for (SentenceStructure sentenceItem : dataholderHandler
            .getSentenceHolder()) {
        int sentenceID = sentenceItem.getID();
        String modifier = sentenceItem.getModifier();
        String tag = sentenceItem.getTag();
        String sentence = sentenceItem.getSentence();

        boolean tagHasPronoun = StringUtility.isMatchedNullSafe(tag,
                pronounAsWord);
        boolean modifierHasPronoun = StringUtility.isMatchedNullSafe(
                modifier, pronounAsWord);
        if (!tagHasPronoun && !modifierHasPronoun) {
            continue;
        }

        // Only one of the two needs to have matched, so the other one may
        // still be null.
        modifier = (modifier == null) ? "" : modifier;
        tag = (tag == null) ? "" : tag;

        modifier = modifier.replaceAll(pronounAnywhere, "");
        tag = tag.replaceAll(pronounAnywhere, "");
        modifier = modifier.replaceAll("\\s+", " ");
        tag = tag.replaceAll("\\s+", " ");

        // nothing usable left in the tag: fall back to the parent's tag
        if (!StringUtility.isMatchedNullSafe(tag, "\\w")
                || StringUtility.isMatchedNullSafe(tag, "ditto")) {
            tag = dataholderHandler.getParentSentenceTag(sentenceID);
        }

        modifier = modifier.replaceAll("(^\\s*|\\s*$)", "");
        if (tag != null) {
            tag = tag.replaceAll("(^\\s*|\\s*$)", "");
        }

        List<String> mt = dataholderHandler.getMTFromParentTag(tag);
        String parentModifier = mt.get(0);
        tag = mt.get(1);
        // NOTE(review): the sentence is only re-tagged when the parent tag
        // carries a modifier - confirm that skipping the other sentences is
        // intended.
        if (StringUtility.isMatchedNullSafe(parentModifier, "\\w")) {
            // keep the two modifiers apart instead of gluing them together
            modifier = (modifier.length() == 0) ? parentModifier
                    : modifier + " " + parentModifier;
            dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
                    modifier, tag,
                    "pronouncharactersubject[pronoun subject]");
        }
    }

    // correct to missed N
    for (SentenceStructure sentenceItem : dataholderHandler
            .getSentenceHolder()) {
        int sentenceID = sentenceItem.getID();
        String lead = sentenceItem.getLead();
        String modifier = sentenceItem.getModifier();
        String tag = sentenceItem.getTag();
        String sentence = sentenceItem.getSentence();

        List<String> mt = this.pronounCharacterSubjectHelper4(lead,
                sentence, modifier, tag);
        if (mt != null) {
            modifier = mt.get(0);
            tag = mt.get(1);
            dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
                    modifier, tag,
                    "pronouncharactersubject[correct to missed N]");
        }
    }
}
/**
 * "Correct to missed N" step: proposes a new modifier and tag for a
 * sentence whose first {@code <N>} word is directly followed by the
 * current tag.
 * <p>
 * A proposal is made only if all of the following hold:
 * <ul>
 * <li>the tag is present and is not "ignore";</li>
 * <li>the tag contains no " and ", " nor " or " or ";</li>
 * <li>the sentence contains no "[";</li>
 * <li>the sentence matches {@code ^[^N]*<N>} followed by the tag;</li>
 * <li>after merging adjacent tags, the sentence has the form
 * {@code [word] <N>noun</N> <M|B>word</M|B> ...tag...} and the M/B word
 * does not contain "of".</li>
 * </ul>
 * The tag is matched literally (it is quoted before being put into the
 * regular expressions).
 *
 * @param lead
 *            the sentence lead; not used by this method
 * @param sentence
 *            the tagged sentence
 * @param modifier
 *            the current modifier; not read, only replaced in the result
 * @param tag
 *            the current tag
 * @return a list holding the new modifier followed by the new tag, or
 *         {@code null} if no correction applies
 */
public List<String> pronounCharacterSubjectHelper4(String lead,
        String sentence, String modifier, String tag) {
    // Without a sentence or a tag there is nothing to compare. A null tag
    // must not be concatenated into the patterns below as the text "null".
    if (sentence == null || tag == null) {
        return null;
    }
    // Sentences marked "ignore" are skipped, in line with
    // pronounCharacterSubjectHelper, which also tests for "not ignore".
    if (StringUtils.equals(tag, "ignore")) {
        return null;
    }
    if (StringUtility.isMatchedNullSafe(tag, " (and|nor|or) ")
            || StringUtility.isMatchedNullSafe(sentence, "\\[")) {
        return null;
    }

    // quote the tag so that characters special to regular expressions
    // cannot break or alter the patterns
    String literalTag = Pattern.quote(tag);

    Matcher firstNoun = Pattern.compile("^[^N]*<N>" + literalTag).matcher(
            sentence);
    if (!firstNoun.find()) {
        return null;
    }

    // merge directly adjacent tags before matching
    String flattened = sentence.replaceAll("></?", "");
    Matcher m2 = Pattern.compile(
            "^(\\S*) ?<N>([^<]+)<\\/N> <[MB]+>(\\S+)<\\/[MB]+> \\S*\\b"
                    + literalTag + "\\b\\S*").matcher(flattened);
    if (!m2.find()) {
        return null;
    }

    String g3 = m2.group(3);
    if (StringUtility.isMatchedNullSafe(g3, "\\bof\\b")) {
        return null;
    }

    String newModifier = m2.group(1).replaceAll("<\\S+?>", "");
    String newTag = m2.group(2).replaceAll("<\\S+?>", "");
    newModifier = newModifier.replaceAll("(^\\s*|\\s*$)", "");
    newTag = newTag.replaceAll("(^\\s*|\\s*$)", "");

    List<String> mt = new ArrayList<String>();
    mt.add(newModifier);
    mt.add(newTag);
    return mt;
}
/**
 * Decides modifier and tag for a sentence whose lead or tag contains a
 * character word (one of the alternatives in the CHARACTER constant).
 * <p>
 * The helper applies when the lead contains a character word and the tag is
 * not "ignore", or when the tag itself contains a character word. In that
 * case the sentence (with adjacent tags merged) is examined:
 * <ul>
 * <li>case 1.1 - "character of ... noun": the last noun becomes the tag
 * and the text between "of" and that noun the modifier, unless that text
 * contains a preposition or the noun group contains a stop word or digit,
 * in which case the tag becomes "ditto";</li>
 * <li>case 1.2 - text in front of the character word: its last word
 * becomes the tag if the sentence marks it as N or O, otherwise the tag
 * becomes "ditto";</li>
 * <li>case 1.3 - the character word occurs elsewhere: "ditto".</li>
 * </ul>
 * If none of the cases matches, or the sentence is {@code null}, modifier
 * and tag are returned unchanged.
 *
 * @param lead
 *            the sentence lead
 * @param sentence
 *            the tagged sentence
 * @param modifier
 *            the current modifier
 * @param tag
 *            the current tag
 * @return a list holding modifier followed by tag, or {@code null} if the
 *         helper does not apply to this sentence
 */
public List<String> pronounCharacterSubjectHelper(String lead,
        String sentence, String modifier, String tag) {
    // optional opening or closing tag around a word
    String t = "(?:<\\/?[A-Z]+>)?";

    boolean b1 = !StringUtils.equals(tag, "ignore");
    boolean b2 = (tag == null);
    boolean b3 = StringUtility.isMatchedNullSafe(lead, "(^| )("
            + this.myLearnerUtility.getConstant().CHARACTER + ")( |$)");
    boolean b4 = StringUtility.isMatchedNullSafe(tag, "(^| )("
            + this.myLearnerUtility.getConstant().CHARACTER + ")( |$)");

    if (!(((b1 || b2) && b3) || b4)) {
        return null;
    }

    // The null check has to come before the sentence is touched; a null
    // sentence leaves modifier and tag as they are.
    if (sentence != null) {
        // merge directly adjacent tags before matching
        sentence = sentence.replaceAll("></?", "");

        String pattern1 = String
                .format("^.*?%s\\b(%s)\\b%s %s(?:of)%s (.*?)(<[NO]>([^<]*?)<\\/[NO]> ?)+ ",
                        t, this.myLearnerUtility.getConstant().CHARACTER, t, t, t);
        Matcher m1 = StringUtility.createMatcher(sentence, pattern1);

        String pattern2 = String
                .format("^(.*?)((?:<\\/?[BM]+>\\w+?<\\/?[BM]+>\\s*)*)%s\\b(%s)\\b%s",
                        t, this.myLearnerUtility.getConstant().CHARACTER, t);
        Matcher m2 = StringUtility.createMatcher(sentence, pattern2);

        // case 1.1
        if (m1.find()) {
            // group 4 is the last noun; the modifier is everything from the
            // start of group 2 up to that noun
            tag = m1.group(4);
            modifier = sentence.substring(m1.start(2), m1.start(4));
            String s2 = m1.group(2);
            String s3 = m1.group(3);

            if ((!StringUtility.isMatchedNullSafe(s2,
                    String.format("\\b(%s)\\b", this.myLearnerUtility.getConstant().PREPOSITION)))
                    && (!StringUtility.isMatchedNullSafe(s3, String
                            .format("\\b(%s|\\d)\\b", this.myLearnerUtility.getConstant().STOP)))) {
                modifier = modifier.replaceAll("<\\S+?>", "");
                modifier = modifier.replaceAll("(^\\s*|\\s*$)", "");
                tag = tag.replaceAll("<\\S+?>", "");
                tag = tag.replaceAll("(^\\s*|\\s*$)", "");
            } else {
                modifier = "";
                tag = "ditto";
            }
        }
        // case 1.2
        else if (m2.find()) {
            // text in front of the character word
            String text = m2.group(1);
            if ((!StringUtility.isMatchedNullSafe(text, "\\b("
                    + this.myLearnerUtility.getConstant().STOP + "|\\d+)\\b"))
                    && (StringUtility.isMatchedNullSafe(text, "\\w"))
                    && (!StringUtility
                            .isMatchedNullSafe(text, "[,:;.]"))) {
                text = text.replaceAll("<\\S+?>", "");
                text = text.replaceAll("(^\\s*|\\s*$)", "");
                text = text.replaceAll("\\p{Punct}", "");
                String[] textArray = text.split("\\s+");
                if (textArray.length >= 1) {
                    // the last word is the tag candidate
                    tag = textArray[textArray.length - 1];
                    String pattern = "<[NO]>" + tag + "</[NO]>";
                    if (StringUtility.isMatchedNullSafe(sentence,
                            pattern)) {
                        // 1.2.1.1
                        text = text.replaceAll(tag, "");
                        modifier = text;
                    } else {
                        // 1.2.1.2
                        modifier = "";
                        tag = "ditto";
                    }
                }
            } else {
                // 1.2.2
                modifier = "";
                tag = "ditto";
            }
        }
        // case 1.3
        else if (StringUtility.isMatchedNullSafe(sentence, "\\b("
                + this.myLearnerUtility.getConstant().CHARACTER + ")\\b")) {
            modifier = "";
            tag = "ditto";
        }
    }

    List<String> mt = new ArrayList<String>(2);
    mt.add(modifier);
    mt.add(tag);
    return mt;
}
/**
 * Tags clauses in which a comma is used for "and". Seen in TreatiseH, as in
 * "adductor , diductor scars clearly differentiated ;", which is the same
 * as "adductor and diductor scars clearly differentiated ;".
 * <p>
 * Three shapes are recognised on the tagged sentence, tried in this order:
 * <ol>
 * <li>[CA1] the sentence starts with "m n , m n ..." phrases;</li>
 * <li>[CA2] such a run of phrases is followed by a boundary punctuation mark
 * at the very end of the sentence, and the text before the run contains
 * neither a preposition nor an {@code <N>} tag;</li>
 * <li>[CA3] the sentence starts with "m , m ... m n" and contains no
 * preposition; the last word becomes the tag, the rest the modifier.</li>
 * </ol>
 * In every case commas are replaced by "and", markup is removed, and nothing
 * is tagged when the resulting text ends in " and". Clauses dealt with here do
 * not contain "and/or"; andortag() deals with clauses that do.
 *
 * @param dataholderHandler
 *            holder whose sentences are scanned and re-tagged in place
 */
public void commaAnd(DataHolder dataholderHandler) {
    // one or more words tagged as noun (N) or O
    String nPhrasePattern = "(?:<[A-Z]*[NO]+[A-Z]*>[^<]+?<\\/[A-Z]*[NO]+[A-Z]*>\\s*)+";
    // exactly one word tagged as modifier (M)
    String mPhrasePattern = "(?:<[A-Z]*M[A-Z]*>[^<]+?<\\/[A-Z]*M[A-Z]*>\\s*)";
    // one boundary-tagged punctuation mark
    String bPattern = "(?:<[A-Z]*B[A-Z]*>[,:.;<]<\\/[A-Z]*B[A-Z]*>)";
    String commaPattern = "<B>,</B>";
    String phrasePattern = mPhrasePattern + "\\s*" + nPhrasePattern;
    String pattern = phrasePattern + "\\s+" + commaPattern + "\\s+(?:"
            + phrasePattern + "| |" + commaPattern + ")+";
    String pattern1 = "^(" + pattern + ")";
    // The phrase run must be closed by a boundary mark at the END of the
    // sentence. The previous "\\$" matched a literal dollar sign instead of
    // acting as the end anchor, so this case could never apply.
    String pattern2 = "(.*?)(" + pattern + ")\\s*" + bPattern + "$";
    String pattern3 = "^((?:" + mPhrasePattern + "\\s+)+" + commaPattern
            + "\\s+(?:" + mPhrasePattern + "|\\s*|" + commaPattern + ")+"
            + mPhrasePattern + "+\\s*" + nPhrasePattern + ")";
    String prepositionPattern = "\\b("
            + this.myLearnerUtility.getConstant().PREPOSITION + ")\\b";

    for (SentenceStructure sentenceItem : dataholderHandler
            .getSentenceHolder()) {
        int sentenceID = sentenceItem.getID();
        String sentence = sentenceItem.getSentence();
        // merge directly adjacent tags, e.g. "<M><B>" becomes "<MB>"
        String sentenceCopy = "" + sentence;
        sentenceCopy = sentenceCopy.replaceAll("></?", "");

        Matcher m1 = StringUtility.createMatcher(sentenceCopy, pattern1);
        Matcher m2 = StringUtility.createMatcher(sentenceCopy, pattern2);
        Matcher m3 = StringUtility.createMatcher(sentenceCopy, pattern3);

        // case 1
        if (m1.find()) {
            String tag = m1.group(1);
            tag = tag.replaceAll(",", "and");
            tag = tag.replaceAll("</?\\S+?>", "");
            tag = StringUtility.trimString(tag);
            // case 1.1
            if (!StringUtility.isMatchedNullSafe(tag, " and$")) {
                dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
                        "", tag, "commaand[CA1]");
            }
        }
        // case 2
        else if (m2.find()) {
            String g1 = m2.group(1);
            String tag = m2.group(2);
            if (!StringUtility.isMatchedNullSafe(g1, prepositionPattern)
                    && !StringUtility.isMatchedNullSafe(g1, "<N>")) {
                tag = tag.replaceAll(",", "and");
                tag = tag.replaceAll("</?\\S+?>", "");
                tag = StringUtility.trimString(tag);
                // case 2.1.1
                if (!StringUtility.isMatchedNullSafe(tag, " and$")) {
                    dataholderHandler.tagSentenceWithMT(sentenceID,
                            sentence, "", tag, "commaand[CA2]");
                }
            }
        }
        // case 3
        else if (m3.find()) {
            String tag = m3.group(1);
            // case 3.1
            if (!StringUtility.isMatchedNullSafe(tag, prepositionPattern)) {
                tag = tag.replaceAll(",", "and");
                tag = tag.replaceAll("</?\\S+?>", "");
                tag = StringUtility.trimString(tag);
                // case 3.1.1: last word is the tag, everything before it
                // is the modifier
                if (!StringUtility.isMatchedNullSafe(tag, " and$")) {
                    List<String> tagWordsList = new ArrayList<String>(
                            Arrays.asList(tag.split("\\s+")));
                    tag = tagWordsList.get(tagWordsList.size() - 1);
                    String modifier = StringUtils.join(tagWordsList
                            .subList(0, tagWordsList.size() - 1), " ");
                    dataholderHandler.tagSentenceWithMT(sentenceID,
                            sentence, modifier, tag, "commaand[CA3]");
                }
            }
        }
    }
}
/**
 * Cleans up the modifier and tag of every sentence in four passes:
 * <ol>
 * <li>simple modifiers (non-empty, no and/or/nor/plus/to) are reduced by
 * {@link #finalizeModifier};</li>
 * <li>"A to B organ" modifiers keep the trailing word sequence after " to "
 * that yields the highest sentence count for the tag;</li>
 * <li>modifiers containing and/or/nor/plus/to are reduced by
 * {@link #finalizeCompoundModifier};</li>
 * <li>tags containing and/or/nor/plus/to are reduced by
 * {@link #finalizeCompoundTag}.</li>
 * </ol>
 * Passes 1, 3 and 4 re-tag a sentence only when the value actually changed;
 * pass 2 always re-tags.
 *
 * @param dataholderHandler
 *            holder whose sentences are read and re-tagged
 */
public void normalizeModifiers(DataHolder dataholderHandler) {
    final String conjunctionPattern = " (and|or|nor|plus|to) ";

    // shortest modifier first; a null modifier counts as length 0
    Comparator<SentenceStructure> modifierLengthComparator = new Comparator<SentenceStructure>() {
        @Override
        public int compare(SentenceStructure s1, SentenceStructure s2) {
            int l1 = StringUtils.length(s1.getModifier());
            int l2 = StringUtils.length(s2.getModifier());
            if (l1 == l2) {
                return 0;
            }
            return l1 < l2 ? -1 : 1;
        }
    };
    // shortest tag first; a null tag counts as length 0
    Comparator<SentenceStructure> tagLengthComparator = new Comparator<SentenceStructure>() {
        @Override
        public int compare(SentenceStructure s1, SentenceStructure s2) {
            int l1 = StringUtils.length(s1.getTag());
            int l2 = StringUtils.length(s2.getTag());
            if (l1 == l2) {
                return 0;
            }
            return l1 < l2 ? -1 : 1;
        }
    };

    // Part 1
    // non- and/or/to/plus cases, longest modifier first
    List<SentenceStructure> simpleModifierSentences = new ArrayList<SentenceStructure>();
    for (SentenceStructure sentenceItem : dataholderHandler
            .getSentenceHolder()) {
        String modifier = sentenceItem.getModifier();
        // isNotEmpty also rejects null, which finalizeModifier cannot handle
        if (StringUtils.isNotEmpty(modifier)
                && !StringUtility.isMatchedNullSafe(modifier,
                        conjunctionPattern)) {
            simpleModifierSentences.add(sentenceItem);
        }
    }
    Collections.sort(simpleModifierSentences, modifierLengthComparator);
    Collections.reverse(simpleModifierSentences);
    for (SentenceStructure sentenceItem : simpleModifierSentences) {
        int sentenceID = sentenceItem.getID();
        String sentence = sentenceItem.getSentence();
        String tag = sentenceItem.getTag();
        String original = sentenceItem.getModifier();
        String modifier = finalizeModifier(dataholderHandler, original, tag, sentence);
        modifier = modifier.replaceAll("\\s*\\[.*?\\]\\s*", " ");
        modifier = StringUtility.trimString(modifier);
        if (!StringUtils.equals(original, modifier)) {
            dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
                    modifier, tag, "normalizemodifiers");
        }
    }

    // Part 2
    // deal with to: characterA to characterB organ (small to median shells),
    // shortest modifier first
    List<SentenceStructure> toModifierSentences = new ArrayList<SentenceStructure>();
    for (SentenceStructure sentenceItem : dataholderHandler
            .getSentenceHolder()) {
        if (StringUtility.isMatchedNullSafe(sentenceItem.getModifier(), " to ")) {
            toModifierSentences.add(sentenceItem);
        }
    }
    Collections.sort(toModifierSentences, modifierLengthComparator);
    for (SentenceStructure sentenceItem : toModifierSentences) {
        int sentenceID = sentenceItem.getID();
        String sentence = sentenceItem.getSentence();
        String tag = sentenceItem.getTag();
        // drop every "... to " prefix, keeping what follows the last " to "
        String modifier = sentenceItem.getModifier().replaceAll(".*? to ", "");
        List<String> mWords = new ArrayList<String>(Arrays.asList(modifier
                .split("\\s+")));
        Collections.reverse(mWords);
        // grow the candidate from the last word backwards and keep the
        // candidate for which getSentenceCount reports the highest count
        String m = "";
        int count = dataholderHandler.getSentenceCount(true, m, true, tag);
        String modi = m;
        for (String word : mWords) {
            m = word + " " + m;
            m = m.replaceAll("\\s+$", "");
            int c = dataholderHandler.getSentenceCount(true, m, true, tag);
            if (c > count) {
                count = c;
                modi = m;
            }
        }
        dataholderHandler.tagSentenceWithMT(sentenceID, sentence, modi,
                tag, "normalizemodifiers");
    }

    // Part 3
    // modifier with and/or/plus, longest modifier first.
    // Only modifiers that DO contain a conjunction are compound; the test
    // used to be negated and selected every other sentence instead.
    List<SentenceStructure> compoundModifierSentences = new ArrayList<SentenceStructure>();
    for (SentenceStructure sentenceItem : dataholderHandler
            .getSentenceHolder()) {
        if (StringUtility.isMatchedNullSafe(sentenceItem.getModifier(),
                conjunctionPattern)) {
            compoundModifierSentences.add(sentenceItem);
        }
    }
    Collections.sort(compoundModifierSentences, modifierLengthComparator);
    Collections.reverse(compoundModifierSentences);
    for (SentenceStructure sentenceItem : compoundModifierSentences) {
        int sentenceID = sentenceItem.getID();
        String sentence = sentenceItem.getSentence();
        String tag = sentenceItem.getTag();
        String original = sentenceItem.getModifier();
        String modifier = this.finalizeCompoundModifier(dataholderHandler,
                original, tag, sentence);
        modifier = modifier.replaceAll("\\s*\\[.*?\\]\\s*", " ");
        modifier = StringUtility.trimString(modifier);
        if (!StringUtils.equals(original, modifier)) {
            dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
                    modifier, tag, "normalizemodifiers");
        }
    }

    // Part 4
    // tag with and/or/plus, longest tag first.
    // The selection has to look at the TAG (words may be joined by "_" or
    // " "); it used to test the modifier, negated, which sent plain tags
    // through finalizeCompoundTag and hit null tags.
    List<SentenceStructure> compoundTagSentences = new ArrayList<SentenceStructure>();
    for (SentenceStructure sentenceItem : dataholderHandler
            .getSentenceHolder()) {
        if (StringUtility.isMatchedNullSafe(sentenceItem.getTag(),
                "[_ ](and|or|nor|plus|to)[ _]")) {
            compoundTagSentences.add(sentenceItem);
        }
    }
    Collections.sort(compoundTagSentences, tagLengthComparator);
    Collections.reverse(compoundTagSentences);
    for (SentenceStructure sentenceItem : compoundTagSentences) {
        int sentenceID = sentenceItem.getID();
        String sentence = sentenceItem.getSentence();
        String original = sentenceItem.getTag();
        String modifier = sentenceItem.getModifier();
        String tag = this.finalizeCompoundTag(original, sentence);
        tag = tag.replaceAll("\\s*\\[.*?\\]\\s*", " ");
        tag = StringUtility.trimString(tag);
        if (!StringUtils.equals(original, tag)) {
            dataholderHandler.tagSentenceWithMT(sentenceID, sentence,
                    modifier, tag, "normalizemodifiers");
        }
    }
}
/**
 * Reduces a modifier that contains and/or/nor/plus to the words that are
 * known modifiers or occur as nouns ({@code <N>word}) in the sentence.
 * <p>
 * Steps: parenthesised text and punctuation are removed; the trailing words
 * that are nouns in the sentence are split off as {@code n}; the remainder is
 * split at the conjunctions; each part keeps only its accepted words. If any
 * part ends up empty or contains a character/stop/number/cluster word, the
 * compound is rejected and the result falls back to {@code m + " " + n},
 * where {@code m} is the word in front of the trailing nouns if some sentence
 * already carries it (together with {@code n}) as modifier for this tag.
 *
 * @param dataholderHandler
 *            holder whose sentences are searched for an existing modifier/tag pair
 * @param modifier
 *            modifier to reduce; returned unchanged if it contains "["
 * @param tag
 *            tag of the sentence the modifier belongs to
 * @param sentence
 *            tagged sentence text
 * @return the reduced modifier, trimmed; may be empty
 */
public String finalizeCompoundModifier(DataHolder dataholderHandler, String modifier, String tag,
        String sentence) {
    // case 1
    if (StringUtility.isMatchedNullSafe(modifier, "\\[")) {
        return modifier;
    }

    modifier = modifier.replaceAll("\\(.*?\\)", " ");
    modifier = modifier.replaceAll("\\(.*", "");
    // Punctuation becomes a space. Deleting it outright (as before) also
    // deleted the spaces that every later split and match depends on.
    modifier = modifier.replaceAll("\\W", " ");
    modifier = modifier.replaceAll("\\s+", " ");
    modifier = StringUtility.trimString(modifier);

    String result = "";
    String m = "";
    String n = "";

    // walk backwards: collect the trailing nouns, then look at the first
    // word that is not a noun and stop
    List<String> lastPart = new ArrayList<String>(Arrays.asList(modifier.split("\\s+")));
    Collections.reverse(lastPart);
    for (String l : lastPart) {
        if (StringUtility.isMatchedNullSafe(sentence, "<N>" + l)) {
            n = StringUtility.trimString(l + " " + n);
            continue;
        }
        String tm = StringUtility.isMatchedNullSafe(n, "\\w") ? l + " " + n : l;
        for (SentenceStructure sentenceItem : dataholderHandler
                .getSentenceHolder()) {
            if (StringUtils.equals(sentenceItem.getModifier(), tm)
                    && StringUtils.equals(sentenceItem.getTag(), tag)) {
                // one hit is enough; adding l once per matching sentence
                // would repeat the word in the result
                m = l;
                break;
            }
        }
        break;
    }
    m = StringUtility.trimString(m);
    n = StringUtility.trimString(n);

    // Cut the trailing nouns off the end only. An empty n must not be
    // used: "\\s*" alone would delete every space in the modifier.
    if (StringUtility.isMatchedNullSafe(n, "\\w")) {
        modifier = modifier.replaceAll("\\s*" + Pattern.quote(n) + "$", "");
    }

    // components: parts[i] is preceded by conj[i]; conj[0] is empty
    final String splitRegex = "(^.*?) (and|or|nor|plus) (.*)";
    List<String> parts = new ArrayList<String>();
    List<String> conj = new ArrayList<String>();
    conj.add("");
    Matcher m1 = StringUtility.createMatcher(modifier, splitRegex);
    while (m1.find()) {
        parts.add(m1.group(1));
        // the conjunction belongs in conj, not parts; otherwise
        // conj.get(i) below runs out of range
        conj.add(m1.group(2));
        modifier = m1.group(3);
        m1 = StringUtility.createMatcher(modifier, splitRegex);
    }
    parts.add(modifier);

    String rejectRegex = "\\b(" + this.myLearnerUtility.getConstant().CHARACTER + "|" + this.myLearnerUtility.getConstant().STOP
            + "|" + this.myLearnerUtility.getConstant().NUMBER + "|" + this.myLearnerUtility.getConstant().CLUSTERSTRING
            + ")\\b";

    // at least one m in a part
    for (int i = 0; i < parts.size(); i++) {
        String[] words = parts.get(i).split("\\s+");
        String r = "";
        for (String word : words) {
            if ((this.checkedModifiers.containsKey(word) && this.checkedModifiers.get(word))
                    || StringUtility.isMatchedNullSafe(sentence, "<N>" + word)) {
                r = r + " " + word;
            }
        }
        r = StringUtility.trimString(r);
        result = result + " " + conj.get(i) + " " + r;
        if (!StringUtility.isMatchedNullSafe(r, "\\w")
                || StringUtility.isMatchedNullSafe(r, rejectRegex)) {
            // one unusable part invalidates the whole compound
            result = "";
            break;
        }
    }

    result = StringUtility.isMatchedNullSafe(result, "\\w") ? result
            + " " + n : m + " " + n;
    result = StringUtility.trimString(result);
    return result;
}
/**
 * Reduces a compound tag of the shape [bm]+n+&[bm]+n+ to its usable words.
 * The tag is split at and/or/nor/plus (joined by "_" or " "); in every part
 * only words that are known modifiers or occur as {@code <N>word} in the
 * sentence are kept, character/stop/number/cluster words are deleted, and the
 * surviving parts are re-joined with the conjunction that preceded them.
 *
 * @param tag
 *            tag to reduce
 * @param sentence
 *            tagged sentence text
 * @return the reduced tag with single spaces, trimmed; may be empty
 */
public String finalizeCompoundTag(String tag, String sentence) {
    final String splitRegex = "(^.*?)[_ ](and|or|nor|plus)[_ ](.*)";

    // avoid unmatched ( in regexp
    String remaining = tag.replaceAll("\\(.*?\\)", " ")
            .replaceAll("\\(.*", "")
            .replaceAll("\\s+", " ");

    // components: pieces.get(i) is preceded by joiners.get(i)
    List<String> pieces = new ArrayList<String>();
    List<String> joiners = new ArrayList<String>();
    joiners.add("");
    for (Matcher splitter = StringUtility.createMatcher(remaining, splitRegex);
            splitter.find();
            splitter = StringUtility.createMatcher(remaining, splitRegex)) {
        pieces.add(splitter.group(1));
        joiners.add(splitter.group(2));
        remaining = splitter.group(3);
    }
    pieces.add(remaining);

    String dropRegex = "\\b(" + this.myLearnerUtility.getConstant().CHARACTER + "|" + this.myLearnerUtility.getConstant().STOP
            + "|" + this.myLearnerUtility.getConstant().NUMBER + "|" + this.myLearnerUtility.getConstant().CLUSTERSTRING
            + ")\\b";

    // at least one m in a part
    StringBuilder assembled = new StringBuilder();
    int index = 0;
    for (String piece : pieces) {
        StringBuilder kept = new StringBuilder();
        for (String word : piece.split("\\s+")) {
            String escapedWord = StringUtility.escapePerlRegex(word);
            boolean knownModifier = this.checkedModifiers.containsKey(word)
                    && this.checkedModifiers.get(word);
            if (knownModifier
                    || StringUtility.isMatchedNullSafe(sentence, "<N>" + escapedWord)) {
                kept.append(" ").append(word);
            }
        }
        String cleaned = StringUtility.trimString(kept.toString().replaceAll(dropRegex, ""));
        if (StringUtility.isMatchedNullSafe(cleaned, "\\w")) {
            assembled.append(" ").append(joiners.get(index)).append(" ").append(cleaned);
        }
        index++;
    }

    return StringUtility.trimString(assembled.toString().replaceAll("\\s+", " "));
}
/**
 * Reduces a simple modifier to its trailing run of modifier words.
 * Bracketed text is removed, then the words are checked from last to first
 * with {@link #isModifier}; collection stops at the first word that is not a
 * modifier.
 *
 * @param dataholderHandler
 *            holder passed through to {@link #isModifier}
 * @param modifier
 *            modifier to reduce
 * @param tag
 *            tag of the sentence the modifier belongs to
 * @param sentence
 *            tagged sentence text (not used by this method)
 * @return the accepted words in their original order, separated by single
 *         spaces; empty if the modifier has no word characters or its last
 *         word is not a modifier
 */
public String finalizeModifier(DataHolder dataholderHandler, String modifier, String tag, String sentence) {
    String fModifier = "";
    modifier = modifier.replaceAll("\\[.*?\\]", "");
    modifier = StringUtility.trimString(modifier);
    if (StringUtility.isMatchedNullSafe(modifier, "\\w")) {
        List<String> mWords = new ArrayList<String>(Arrays.asList(modifier.split("\\s+")));
        Collections.reverse(mWords);
        for (String mWord : mWords) {
            if (!this.isModifier(dataholderHandler, mWord, modifier, tag)) {
                break;
            }
            fModifier = mWord + " " + fModifier;
        }
        // Strip only the trailing separator. Without the "$" anchor every
        // space was removed and the words were glued into one token.
        fModifier = fModifier.replaceAll("\\s+$", "");
    }
    return fModifier;
}
/**
 * Decides whether a word can act as a modifier and caches the verdict in
 * {@code checkedModifiers}.
 * <p>
 * Order of checks: cached verdict; word has POS "s", "p" or "n" (modifier);
 * word has POS "b" but is not in the modifier holder (not a modifier); word
 * is in the modifier holder but has no POS "b" (modifier); otherwise the
 * number of original sentences containing the word is compared with
 * {@link #getMCount}.
 *
 * @param dataholderHandler
 *            holder providing word/POS entries, modifiers and sentences
 * @param word
 *            word to check
 * @param modifier
 *            full modifier the word comes from (not used by this method)
 * @param tag
 *            tag of the sentence (not used by this method)
 * @return {@code true} if the word is accepted as a modifier
 */
public boolean isModifier(DataHolder dataholderHandler, String word, String modifier, String tag) {
    if (this.checkedModifiers.containsKey(word)) {
        return this.checkedModifiers.get(word);
    }

    // a word with POS "s", "p" or "n" is accepted
    Set<String> nouns = new HashSet<String>(Arrays.asList("s p n"
            .split(" ")));
    List<Entry<WordPOSKey, WordPOSValue>> entries = dataholderHandler
            .getWordPOSEntriesByWordPOS(word, nouns);
    if (entries.size() > 0) {
        this.checkedModifiers.put(word, true);
        return true;
    }

    // a word that is a "b" and not an "m" is rejected
    Set<String> bPOS = new HashSet<String>();
    bPOS.add("b");
    List<Entry<WordPOSKey, WordPOSValue>> boundaries = dataholderHandler
            .getWordPOSEntriesByWordPOS(word, bPOS);
    boolean isBoundary = (boundaries.size() > 0);
    boolean isKnownModifier = dataholderHandler.getModifierHolder().containsKey(word);
    if (isBoundary && !isKnownModifier) {
        // the word is a boundary word, but not a modifier
        this.checkedModifiers.put(word, false);
        return false;
    }
    if (!isBoundary && isKnownModifier) {
        this.checkedModifiers.put(word, true);
        return true;
    }

    // the word has been used as both "b" and "m", or as neither, and is not a "s"
    int mCount = this.getMCount(dataholderHandler, word);
    String wCopy = word;
    if (StringUtility.isMatchedNullSafe(word, "_")) {
        // "_" in a word stands for " - " in the original sentence
        wCopy = wCopy.replaceAll("_", " - ");
    }
    // The word is matched literally: quoting keeps characters such as
    // "(" or "." from being read as regex syntax.
    String pattern = "(^| )" + Pattern.quote(wCopy) + " ";
    int tCount = 0;
    for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) {
        String oSentence = sentenceItem.getOriginalSentence();
        if (StringUtility.isMatchedNullSafe(oSentence, pattern)) {
            tCount++;
        }
    }

    boolean accepted = !(tCount == 0 || tCount > 0.25 * mCount);
    this.checkedModifiers.put(word, accepted);
    return accepted;
}
/**
 * Counts the tagged sentences in which the word directly precedes a noun
 * tag, i.e. the sentence contains the word (after "&gt;" or a space),
 * optionally followed by "&lt;/B&gt;&lt;/M&gt;", then " &lt;N".
 *
 * @param dataholderHandler
 *            holder whose sentences are scanned
 * @param word
 *            word to look for; matched literally
 * @return number of sentences that match
 */
public int getMCount(DataHolder dataholderHandler, String word) {
    // quote the word so that regex metacharacters in it are taken literally
    String pattern = "(>| )" + Pattern.quote(word) + "(</B></M>)? <N";
    int count = 0;
    for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) {
        if (StringUtility.isMatchedNullSafe(sentenceItem.getSentence(), pattern)) {
            count++;
        }
    }
    return count;
}
/**
 * Final pass over all sentences: normalises tag and modifier with
 * {@link #normalizeItem} (sentences tagged "ignore" or without a tag are left
 * as they are), removes the {@code <N>}, {@code <B>} and {@code <M>} markup
 * from the sentence text, stores the cleaned text and re-tags the sentence.
 * A tag without any word character is stored as {@code null}.
 *
 * @param dataholderHandler
 *            holder whose sentences are updated in place
 */
public void normalizeTags(DataHolder dataholderHandler) {
    for (SentenceStructure sentenceItem : dataholderHandler.getSentenceHolder()) {
        int sentenceID = sentenceItem.getID();
        String modifier = sentenceItem.getModifier();
        String tag = sentenceItem.getTag();

        // Normalise everything EXCEPT "ignore". The test used to be
        // un-negated, so only the literal tag "ignore" was normalised and
        // every real tag was skipped.
        if (tag != null && !StringUtils.equals(tag, "ignore")) {
            tag = this.normalizeItem(tag);
            if (modifier != null) {
                modifier = this.normalizeItem(modifier);
            }
        }

        String sentence = sentenceItem.getSentence();
        sentence = sentence.replaceAll("</?[NBM]>", "");
        dataholderHandler.getSentence(sentenceID).setSentence(sentence);

        String finalTag = StringUtility.isMatchedNullSafe(tag, "\\w") ? tag : null;
        dataholderHandler.tagSentenceWithMT(sentenceID, sentence, modifier, finalTag, "normalizetags");
    }
}
/**
 * Normalises a tag or modifier: "NUM" tokens are removed and every word is
 * replaced by its singular form. Text inside square brackets is singularised
 * as well and stays bracketed; segments without a word character are dropped.
 *
 * @param tag
 *            tag or modifier to normalise; may be {@code null}
 * @return the normalised text, or {@code null} if the input was {@code null}.
 *         Input without any word character is returned trimmed and with
 *         "NUM" removed, but otherwise unchanged.
 */
public String normalizeItem(String tag) {
    if (tag == null) {
        return null;
    }
    tag = tag.replaceAll("\\s*NUM\\s*", " ");
    tag = StringUtility.trimString(tag);
    if (!StringUtility.isMatchedNullSafe(tag, "\\w")) {
        return tag;
    }

    // mark bracket content with "*" so it can still be recognised after
    // the text has been split at the brackets
    tag = tag.replaceAll("\\[", "[*");
    tag = tag.replaceAll("\\]", "*]");

    StringBuilder tagSB = new StringBuilder();
    for (String segment : tag.split("[\\]\\[]")) {
        boolean bracketed = StringUtility.isMatchedNullSafe(segment, "\\*");
        if (bracketed) {
            // case 1: content of a [...] group
            segment = segment.replaceAll("\\*", "");
        } else if (!StringUtility.isMatchedNullSafe(segment, "\\w")) {
            // neither bracket content nor plain words
            continue;
        }
        // case 1 and case 2: singularise word by word
        String[] tagWords = segment.split("\\s+");
        StringBuilder outSB = new StringBuilder();
        for (int i = 0; i < tagWords.length; i++) {
            if (i > 0) {
                outSB.append(' ');
            }
            outSB.append(this.myLearnerUtility.getWordFormUtility().getSingular(tagWords[i]));
        }
        String out = bracketed ? "[" + outSB + "]" : outSB.toString();
        if (StringUtility.isMatchedNullSafe(out, "\\w")) {
            tagSB.append(out);
            tagSB.append(' ');
        }
    }
    // drop the trailing separator; guarded because no segment may survive
    if (tagSB.length() > 0) {
        tagSB.deleteCharAt(tagSB.length() - 1);
    }
    return tagSB.toString().replaceAll("\\s+", " ");
}
/**
 * Prepares the WordPOS collection for the Parser by setting the saved flag
 * of selected entries to "red". An entry is flagged when
 * <ol>
 * <li>its word is a pronoun, character, number, cluster-string or stop word,
 * or contains no lower-case letter, or is not a key of the unknown-word
 * holder; or</li>
 * <li>its word ends in "ly" and the word without that ending is a key of the
 * unknown-word holder.</li>
 * </ol>
 *
 * @param dataholderHandler
 *            holder providing the WordPOS entries and the unknown-word holder
 */
public void prepareTables4Parser(DataHolder dataholderHandler) {
    // NOTE(review): the original added pronounWords a second time here;
    // possibly another word list was meant - confirm.
    Set<String> excluded = new HashSet<String>();
    excluded.addAll(this.myLearnerUtility.getConstant().pronounWords);
    excluded.addAll(this.myLearnerUtility.getConstant().characterWords);
    excluded.addAll(this.myLearnerUtility.getConstant().numberWords);
    excluded.addAll(this.myLearnerUtility.getConstant().clusterStringWords);
    excluded.addAll(this.myLearnerUtility.getConstant().stopWords);

    Set<String> unknownWords = dataholderHandler.getUnknownWordHolder().keySet();

    // first pass: flag everything that is not a plain, listed word
    for (Iterator<Entry<WordPOSKey, WordPOSValue>> it = dataholderHandler.getWordPOSHolderIterator();
            it.hasNext();) {
        Entry<WordPOSKey, WordPOSValue> current = it.next();
        String word = current.getKey().getWord();
        boolean keep = !excluded.contains(word)
                && StringUtility.isMatchedNullSafe(word, "[a-z]")
                && unknownWords.contains(word);
        if (!keep) {
            current.getValue().setSavedFlag("red");
        }
    }

    // second pass: -ly words whose stem is a key of the unknown-word holder
    for (Iterator<Entry<WordPOSKey, WordPOSValue>> it = dataholderHandler.getWordPOSHolderIterator();
            it.hasNext();) {
        Entry<WordPOSKey, WordPOSValue> current = it.next();
        String lyWord = current.getKey().getWord();
        if (!StringUtility.isMatchedNullSafe(lyWord, "ly$")) {
            continue;
        }
        String stem = lyWord.replaceAll("ly$", "");
        if (unknownWords.contains(stem)) {
            current.getValue().setSavedFlag("red");
        }
    }
}
// Fields carried over from the Perl implementation; the original note marks
// some of them as unused there.
// directory of /descriptions folder
private String desDir = "";
// directory of /characters folder
private String chrDir = "";
// prefix for all tables generated by this program
private String prefix = "";
// default general tag
// knowledge base
private String knlgBase = "phenoscape";
private int DECISIONID = 0;
// word->(p|s)
private Map<String, String> numberRecords = new HashMap<String, String>();
// word->singular
private Map<String, String> singularRecords = new HashMap<String, String>();
// word->POSs
private Map<String, String> POSRecords = new HashMap<String, String>();
// private Map<String, String> POSRecordsRECORDS = new HashMap<String,
// String>();
// record the index of sentences that ends a description
private String NEWDESCRIPTION = "";
private Hashtable<String, String> PLURALS = new Hashtable<String, String>();
private String TAGS = "";
// Modifier segment (one capturing group): any run of m/b/q symbols, each
// optionally followed by "," or "&", ending in m, b, or a q that is directly
// followed by p, o or n.
private String mptn = "((?:[mbq][,&]*)*(?:m|b|q(?=[pon])))";
// Noun segment (one capturing group, must be present, no q allowed): any run
// of n/o/p symbols, each optionally followed by "," or "&", ending in n, o or p.
private String nptn = "((?:[nop][,&]*)*[nop])";
// Boundary segment (one capturing group): punctuation followed by "$", or
// optional commas and a b/m, or - directly after p, o or n - optional commas
// and a q (when following a p, a b could be a q).
// NOTE(review): "\\$" is the regex \$ and matches a literal dollar sign, and
// "\\\\." inside the character class adds a literal backslash to it. If the
// end-of-input anchor was intended this should be "$" - confirm against the
// Perl original.
private String bptn = "([,;:\\\\.]*\\$|,*[bm]|(?<=[pon]),*q)";
// one modifier segment followed by one noun segment
private String SEGANDORPTN = "(?:" + mptn + nptn + ")";
// from the start: segments separated by "," or "&", then the boundary segment
private String ANDORPTN = "^(?:" + SEGANDORPTN + "[,&]+)*" + SEGANDORPTN
+ bptn;
// utility method
/**
 * @return the learner utility held by this instance
 */
public LearnerUtility getLearnerUtility() {
    return myLearnerUtility;
}
/**
 * @return the tokenizer held by this instance
 */
public ITokenizer getTokenizer() {
    return myTokenizer;
}
/**
 * @return the configuration held by this instance
 */
public Configuration getConfiguration() {
    return myConfiguration;
}
/**
 * Command-line entry point. It performs no learning; it only evaluates a
 * single JUnit assertion.
 *
 * @param args
 *            command-line arguments (not used)
 */
public static void main(String[] args) {
    // NOTE(review): compares expected 1 with actual 12, so this assertion
    // always fails with the message "tagAllSentenceHelper". It looks like
    // leftover debugging code and ties this class to JUnit at runtime -
    // confirm whether main can be removed or given a real purpose.
    assertEquals("tagAllSentenceHelper", 1, 12);
}
}