HeuristicNounLearnerUseMorphology.java example

Explorer

charaparser-unsupervised-master
- src
  - main
    - java
      - semanticMarkup
        core
        Treatment.java
        io
        input
        lib
        db
        ParentTagProvider.java
        know
        IGlossary.java
        IPOSKnowledgeBase.java
        Stemmer.java
        lib
        InMemoryGlossary.java
        WordNetPOSKnowledgeBase.java
        knowledge
        KnowledgeBase.java
        ling
        Token.java
        learn
        Configuration.java
        ITerminologyLearner.java
        Learner.java
        UnsupervisedClauseMarkup.java
        auxiliary
        AjectiveReplacementForNoun.java
        FileLoader.java
        GetNounsAfterPtnReturnValue.java
        KnownTagCollection.java
        POS.java
        POSInfo.java
        SentenceLeadLengthComparator.java
        StringAndInt.java
        StringPair.java
        dataholder
        DataHolder.java
        DiscountedKey.java
        IsAValue.java
        ModifierTableValue.java
        SentenceStructure.java
        SingularPluralPair.java
        WordPOSKey.java
        WordPOSValue.java
        knowledge
        AdditionalBootstrapping.java
        AdditionalBootstrappingLearner.java
        AdjectiveSubjectBootstrappingLearner.java
        AdjectiveVerifier.java
        AndOrTagSetter.java
        AnnotationNormalizer.java
        CommaAsAndAnnotator.java
        CommonSubstructureAnnotator.java
        Constant.java
        CoreBootstrappingLearner.java
        DittoAnnotator.java
        FiniteSetsLoader.java
        HeuristicNounLearnerUseMorphology.java
        HeuristicNounLearnerUseSuffix.java
        IModule.java
        IgnorePatternAnnotator.java
        IgnoredFinalizer.java
        Initializer.java
        MarkupByPatternLearner.java
        ModifierTagSeparator.java
        NMBResolver.java
        NullSentenceTagger.java
        POSBasedAnnotator.java
        PatternBasedAnnotator.java
        PhraseClauseAnnotator.java
        PronounCharactersAnnotator.java
        UnknownWordBootstrappingLearner.java
        utility
        LearnerUtility.java
        StringUtility.java
        WordFormUtility.java
        pos
        POS.java
        transform
        ITokenizer.java
        lib
        OpenNLPSentencesTokenizer.java
        OpenNLPTokenizer.java
  - test
    - java
      - semanticMarkup
        ling
        learn
        DataHolderTest.java
        LearnerTest.java
        LearnerUtilityTest.java
        SentenceLeadLengthComparatorTest.java
        StringUtilityTest.java
        UnsupervisedClauseMarkupTest.java
        WordFormUtilityTest.java
        knowledge
        AndOrTagSetterTest.java
        CommonSubstructureAnnotatorTest.java
        CoreBootstrappingLearnerTest.java
        InitializerTest.java
        POSBasedAnnotatorTest.java
        UnknownWordBootstrappingTest.java

package semanticMarkup.ling.learn.knowledge;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.dataholder.SentenceStructure;
import semanticMarkup.ling.learn.utility.LearnerUtility;
import semanticMarkup.ling.learn.utility.StringUtility;

/**
 * Learns nouns based on some heuristics.
 * 
 * @author Dongye
 * 
 */
public class HeuristicNounLearnerUseMorphology implements IModule {
	private LearnerUtility myLearnerUtility;

	public HeuristicNounLearnerUseMorphology(LearnerUtility learnerUtility) {
		this.myLearnerUtility = learnerUtility;
	}

	@Override
	public void run(DataHolder dataholderHandler) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger.getLogger("learn.addHeuristicsNouns");

		myLogger.trace("Enter addHeuristicsNouns");

		Set<String> nouns = this.learnHeuristicsNouns(dataholderHandler);
		myLogger.debug("Nouns learned from heuristics:");
		myLogger.debug("\t" + nouns.toString());
		myLogger.debug("Total: " + nouns.size());

		List<Set<String>> results = this.characterHeuristics(dataholderHandler);
		Set<String> rnouns = results.get(0);
		Set<String> descriptors = results.get(1);
		addDescriptors(dataholderHandler, descriptors);
		addNouns(dataholderHandler, rnouns);

		// dataholderHandler.printHolder(DataHolder.SINGULAR_PLURAL);

		myLogger.debug("Total: " + nouns.size());
		Iterator<String> iter = nouns.iterator();
		myLogger.info("Learn singular-plural pair");
		while (iter.hasNext()) {
			String e = iter.next();
			myLogger.trace("Check Word: " + e);

			if ((e.matches("^.*\\w.*$"))
					&& (!StringUtility.isMatchedWords(e, "NUM|"
							+ this.myLearnerUtility.getConstant().NUMBER + "|"
							+ this.myLearnerUtility.getConstant().CLUSTERSTRING
							+ "|"
							+ this.myLearnerUtility.getConstant().CHARACTER
							+ "|"
							+ this.myLearnerUtility.getConstant().PROPERNOUN))) {
				myLogger.trace("Pass");

				// same word may have two different pos tags
				String[] nounArray = e.split("\\|");
				for (int i = 0; i < nounArray.length; i++) {
					String nounAndPOS = nounArray[i];
					Pattern p = Pattern.compile("(\\w+)\\[([spn])\\]");
					Matcher m = p.matcher(nounAndPOS);
					if (m.lookingAt()) {
						String word = m.group(1);
						String pos = m.group(2);
						dataholderHandler.updateDataHolder(word, pos, "*",
								"wordpos", 0);

						if (pos.equals("p")) {
							String plural = word;
							String singular = this.myLearnerUtility
									.getWordFormUtility().getSingular(plural);
							if (singular != null) {
								if (!singular.equals("")) {
									dataholderHandler.addSingularPluralPair(
											singular, plural);
								}
							}
						}

						if (pos.equals("s")) {
							String singular = word;
							List<String> pluralList = this.myLearnerUtility
									.getWordFormUtility().getPlural(singular);
							Iterator<String> pluralIter = pluralList.iterator();
							while (pluralIter.hasNext()) {
								String plural = pluralIter.next();
								if (plural != null) {
									if (!plural.equals("")) {
										dataholderHandler
												.addSingularPluralPair(
														singular, plural);
									}
								}
							}
						}
					}
				}
			}
		}

		myLogger.trace("Quite addHeuristicsNouns");
	}

	/**
	 * 
	 * @param descriptors
	 */
	public void addDescriptors(DataHolder dataholderHandler,
			Set<String> descriptors) {
		Iterator<String> iter = descriptors.iterator();
		while (iter.hasNext()) {
			String descriptor = iter.next();

			if (!StringUtility.isMatchedWords(descriptor,
					this.myLearnerUtility.getConstant().FORBIDDEN)) {
				dataholderHandler.updateDataHolder(descriptor, "b", "",
						"wordpos", 1);
			}
		}

	}

	/**
	 * 
	 * @param rnouns
	 */
	public void addNouns(DataHolder dataholderHandler, Set<String> rnouns) {
		Iterator<String> iter = rnouns.iterator();
		while (iter.hasNext()) {
			String noun = iter.next();
			if (!StringUtility.isMatchedWords(noun,
					this.myLearnerUtility.getConstant().FORBIDDEN)) {
				dataholderHandler.updateDataHolder(noun, "n", "", "wordpos", 1);
			}
		}
	}

	/**
	 * 
	 * @return nouns learned by heuristics
	 */
	public Set<String> learnHeuristicsNouns(DataHolder dataholderHandler) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger
				.getLogger("learn.addHeuristicsNouns.learnHeuristicsNouns");

		// Set of words
		Set<String> words = new HashSet<String>();

		// Set of nouns
		Set<String> nouns = new HashSet<String>();

		List<String> sentences = new LinkedList<String>();
		for (int i = 0; i < dataholderHandler.getSentenceHolder().size(); i++) {
			String originalSentence = dataholderHandler.getSentenceHolder()
					.get(i).getOriginalSentence();
			myLogger.trace("Original Sentence: " + originalSentence);
			sentences.add(StringUtility.strip(originalSentence));
		}

		// Now we have original sentences in sentences
		// Method addWords
		for (int i = 0; i < sentences.size(); i++) {
			String sentence = sentences.get(i);
			sentence = sentence.toLowerCase();
			String noun = this.getPresentAbsentNouns(sentence);
			if (!noun.equals("")) {
				nouns.add(noun);
			}

			// add words
			List<String> tokens = this.myLearnerUtility.tokenizeText(sentence,
					"all");
			for (String token : tokens) {
				if (StringUtility.isWord(token)) {
					words.add(token);
					myLogger.trace("Add a word into words: " + token);
				}
			}
		}

		// solve the problem: septa and septum are both s
		Iterator<String> nounsIterator = nouns.iterator();
		while (nounsIterator.hasNext()) {
			String oldNoun = nounsIterator.next();
			String newNoun = this.getHeuristicsNounsHelper(oldNoun, nouns);
			if (!newNoun.equals(oldNoun)) {
				nouns.remove(oldNoun);
				nouns.add(newNoun);
			}
		}

		// sort all words
		Map<String, Set<String>> wordMap = new HashMap<String, Set<String>>();
		Iterator<String> wordsIterator = words.iterator();
		while (wordsIterator.hasNext()) {
			String word = wordsIterator.next();
			String root = myLearnerUtility.getWordFormUtility().getRoot(word);
			if (wordMap.containsKey(root)) {
				Set<String> wordList = wordMap.get(root);
				wordList.add(word);
				// List<String> wordList2 = wordMap.get(root);
				// System.out.println(wordList2);
			} else {
				Set<String> wordList = new HashSet<String>();
				wordList.add(word);
				wordMap.put(root, wordList);
			}
		}

		// print out the wordMap
		myLogger.trace("WordMap:");
		Iterator<Map.Entry<String, Set<String>>> wordMapIter = wordMap
				.entrySet().iterator();
		while (wordMapIter.hasNext()) {
			Map.Entry<String, Set<String>> e = wordMapIter.next();
			myLogger.trace(e.toString());
		}

		// find nouns
		myLogger.info("Learn singular-plural pair");
		Iterator<Map.Entry<String, Set<String>>> iter = wordMap.entrySet()
				.iterator();
		while (iter.hasNext()) {
			Map.Entry<String, Set<String>> e = iter.next();
			Set<String> wordSet = e.getValue();
			Iterator<String> wordIterator = wordSet.iterator();
			while (wordIterator.hasNext()) {
				String word = wordIterator.next();

				// getnouns
				if (word.matches("^.*" + Constant.NENDINGS)) {
					nouns.add(word + "[s]");
					if (wordSet.contains(word + "s")) {
						nouns.add(word + "s" + "[p]");
						dataholderHandler.addSingularPluralPair(word, word
								+ "s");
					}
					if (wordSet.contains(word + "es")) {
						nouns.add(word + "es" + "[p]");
						dataholderHandler.addSingularPluralPair(word, word
								+ "es");
					}
				}
			}
		}

		// Iterator<LinkedList> wordMapIterator = wordMap.i
		Iterator<Map.Entry<String, Set<String>>> wordMapIterator = wordMap
				.entrySet().iterator();
		while (wordMapIterator.hasNext()) {
			Map.Entry<String, Set<String>> wordMapEntry = wordMapIterator
					.next();
			Set<String> wordSet = wordMapEntry.getValue();

			// check if there is a word with Vending
			boolean hasVending = false;
			// for (int i1 = 0; i1 < wordList.size(); i1++) {
			Iterator<String> wordIterator = wordSet.iterator();
			while (wordIterator.hasNext()) {
				String tempWord = wordIterator.next();
				if (tempWord.matches("^.*" + Constant.VENDINGS)) {
					hasVending = true;
					break;
				}
			}

			// at least two words without verb endings
			if ((!hasVending) && (wordSet.size() > 1)) {
				List<String> wordList = new LinkedList<String>(wordSet);
				for (int i = 0; i < wordList.size(); i++) {
					for (int j = i + 1; j < wordList.size(); j++) {
						String word1 = wordList.get(i);
						String word2 = wordList.get(j);
						List<String> pair = myLearnerUtility
								.getWordFormUtility().getSingularPluralPair(
										word1, word2);
						if (pair.size() == 2) {
							String singular = pair.get(0);
							String plural = pair.get(1);
							nouns.add(singular + "[s]");
							nouns.add(plural + "[p]");
							dataholderHandler.addSingularPluralPair(singular,
									plural);
						}
					}
				}
			}
		}

		// print out nouns
		myLogger.debug("Nouns: " + nouns);

		return nouns;
	}

	// ---------------addHeuristicsNouns Help Function----
	// #solve the problem: septa and septum are both s
	// septum - Singular
	// septa -Plural
	// septa[s] => septa[p]
	public String getHeuristicsNounsHelper(String oldNoun, Set<String> words) {
		String newNoun = oldNoun;

		if (oldNoun.matches("^.*a\\[s\\]$")) {
			String noun = oldNoun.replaceAll("\\[s\\]", "");
			if (words.contains(noun)) {
				newNoun = noun + "[p]";
			}
		}

		return newNoun;
	}

	/**
	 * any word preceeding "present"/"absent" would be a n
	 * 
	 * @param text
	 *            the content to learn from
	 * @return nouns learned
	 */
	public String getPresentAbsentNouns(String text) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger
				.getLogger("learn.addHeuristicsNouns.learnHeuristicsNouns.getPresentAbsentNouns");

		String pachecked = "and|or|to";

		// if (text.matches("(\\w+?)\\s+(present|absent)")) {
		// System.out.println(text);
		// }

		Matcher matcher = Pattern.compile("^.*?(\\w+?)\\s+(present|absent).*$")
				.matcher(text);
		if (matcher.lookingAt()) {
			String word = matcher.group(1);
			if ((!word.matches("\\b(" + pachecked + ")\\b"))
					&& (!word
							.matches("\\b("
									+ this.myLearnerUtility.getConstant().STOP
									+ ")\\b"))
					&& (!word
							.matches("\\b(always|often|seldom|sometimes|[a-z]+ly)\\b"))) {

				myLogger.trace("present/absent " + word);

				if (((word.matches("^.*" + Constant.PENDINGS))
						|| (word.matches("^.*[^s]s$")) || (word
							.matches("teeth")))
						&& (!word.matches(Constant.SENDINGS))) {
					return word + "[p]";
				} else {
					return word + "[s]";
				}
			}
		}

		return "";
	}

	/**
	 * Discover nouns and descriptors according to a set of rules
	 * 
	 * @return a linked list, whose first element is a set of nouns, and second
	 *         element is a set of descriptors
	 */
	public List<Set<String>> characterHeuristics(DataHolder dataholderHandler) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger
				.getLogger("learn.addHeuristicsNouns.characterHeuristics");

		Set<String> taxonNames = new HashSet<String>();
		Set<String> nouns = new HashSet<String>();
		Set<String> anouns = new HashSet<String>();
		Set<String> pnouns = new HashSet<String>();
		Set<String> descriptors = new HashSet<String>();
		Map<String, Boolean> descriptorMap = new HashMap<String, Boolean>();

		int sent_num = dataholderHandler.getSentenceHolder().size();
		for (int i = 0; i < sent_num; i++) {

			// taxon rule
			SentenceStructure sent = dataholderHandler.getSentenceHolder().get(
					i);
			String source = sent.getSource();
			String sentence = sent.getSentence();
			String originalSentence = sent.getOriginalSentence();

			myLogger.trace("Source: " + source);
			myLogger.trace("Sentence: " + sentence);
			myLogger.trace("Original Sentence: " + originalSentence);

			originalSentence = StringUtility.trimString(originalSentence);

			// noun rule 0: taxon names
			taxonNames = this.getTaxonNameNouns(originalSentence);

			// $sentence =~ s#<\s*/?\s*i\s*>##g;
			// $originalsent =~ s#<\s*/?\s*i\s*>##g;

			sentence = sentence.replaceAll("<\\s*/?\\s*i\\s*>", "");
			originalSentence = originalSentence.replaceAll("<\\s*/?\\s*i\\s*>",
					"");
			// Update getSentenceHolder()
			dataholderHandler.getSentenceHolder().get(i).setSentence(sentence);

			// noun rule 0.5: Meckle#s cartilage

			Set<String> nouns0 = this
					.getNounsMecklesCartilage(originalSentence);
			nouns.addAll(nouns0);
			sentence = sentence.replaceAll("#", "");
			// Update getSentenceHolder()
			dataholderHandler.getSentenceHolder().get(i).setSentence(sentence);

			// noun rule 2: end of sentence nouns
			// (a|an|the|some|any|this|that|those|these) noun$
			Set<String> nouns2 = this.getNounsRule2(originalSentence);
			nouns.addAll(nouns2);

			// noun rule 3: proper nouns and acronyms
			String copy = originalSentence;
			Set<String> nouns_temp = this.getNounsRule3Helper(copy);
			Iterator<String> iter = nouns_temp.iterator();
			while (iter.hasNext()) {
				String token = iter.next();
				if (token.matches("^.*[A-Z].+$")
						&& (!token.matches("^.*-\\w+ed$"))) {
					if (token.matches("^[A-Z0-9]+$")) {
						token = token.toLowerCase();
						anouns.add(token);
					} else {
						token = token.toLowerCase();
						pnouns.add(token);
					}
					nouns.add(token);
				}
			}

			// noun rule 1: sources with 1 _ are character statements, 2 _ are
			// descriptions
			Set<String> nouns1 = getNounsRule1(dataholderHandler, source,
					originalSentence, descriptorMap);
			nouns.addAll(nouns1);

			// noun rule 4: non-stop/prep followed by a number: epibranchial 4
			// descriptor heuristics
			Set<String> nouns4 = this.getNounsRule4(originalSentence);
			nouns.addAll(nouns4);

			// remove puncts for descriptor rules
			originalSentence = StringUtility.removePunctuation(
					originalSentence, "-");
			// System.out.println("oSent:");
			// System.out.println(originalSentence);

			// Descriptor rule 1: single term descriptions are descriptors
			descriptors.addAll(this.getDescriptorsRule1(source,
					originalSentence, nouns));

			// Descriptor rule 2: (is|are) red: isDescriptor
			descriptors.addAll(this.getDescriptorsRule2(dataholderHandler,
					originalSentence, descriptorMap));
		}

		nouns = this.filterOutDescriptors(nouns, descriptors);
		anouns = this.filterOutDescriptors(anouns, descriptors);
		pnouns = this.filterOutDescriptors(pnouns, descriptors);

		dataholderHandler.add2HeuristicNounTable(nouns, "organ");
		dataholderHandler.add2HeuristicNounTable(anouns, "acronyms");
		dataholderHandler.add2HeuristicNounTable(pnouns, "propernouns");
		dataholderHandler.add2HeuristicNounTable(taxonNames, "taxonnames");

		nouns.addAll(anouns);
		nouns.addAll(pnouns);
		nouns.addAll(taxonNames);

		List<Set<String>> results = new LinkedList<Set<String>>();
		results.add(nouns);
		results.add(descriptors);

		return results;
	}

	/**
	 * filter out descriptors from nouns, and return remaining nouns
	 * 
	 * @param rNouns
	 *            set of nouns
	 * @param rDescriptors
	 *            set of descriptors
	 * @return set of nouns that are not descriptors
	 */
	public Set<String> filterOutDescriptors(Set<String> rNouns,
			Set<String> rDescriptors) {
		Set<String> filtedNouns = new HashSet<String>();

		Iterator<String> iter = rNouns.iterator();
		while (iter.hasNext()) {
			String noun = iter.next();
			noun = noun.toLowerCase();

			Pattern p = Pattern.compile(
					"\\b(" + this.myLearnerUtility.getConstant().PREPOSITION
							+ "|" + this.myLearnerUtility.getConstant().STOP
							+ ")\\b", Pattern.CASE_INSENSITIVE);
			Matcher m = p.matcher(noun);

			if ((!m.lookingAt()) && (!rDescriptors.contains(noun))) {
				filtedNouns.add(noun);
			}
		}
		return filtedNouns;
	}

	/**
	 * Nouns rule 0: get <i></i> enclosed taxon names
	 * 
	 * @param oSent
	 * @return
	 */
	public Set<String> getTaxonNameNouns(String oSent) {
		Set<String> taxonNames = new HashSet<String>();
		String regex = "(.*?)<\\s*i\\s*>\\s*([^<]*)\\s*<\\s*\\/\\s*i\\s*>(.*)";
		String copy = oSent;

		while (true) {
			Matcher matcher = Pattern.compile(regex).matcher(copy);
			if (matcher.lookingAt()) {
				String taxonName = matcher.group(2);
				if (taxonName.length() > 0) {
					taxonNames.add(taxonName);
					String[] taxonNameArray = taxonName.split("\\s+");
					for (int i = 0; i < taxonNameArray.length; i++) {
						taxonNames.add(taxonNameArray[i]);
					}
					copy = matcher.group(3);
				} else {
					break;
				}
			} else {
				break;
			}
		}

		return taxonNames;
	}

	/**
	 * Nouns rule 0.5: Meckle#s cartilage
	 * 
	 * @param oSent
	 * @return
	 */
	public Set<String> getNounsMecklesCartilage(String oSent) {
		Set<String> nouns = new HashSet<String>();
		String regex = "^.*\\b(\\w+#s)\\b.*$";
		Matcher m = Pattern.compile(regex).matcher(oSent);
		if (m.lookingAt()) {
			String noun = "";
			noun = m.group(1);

			noun = noun.toLowerCase();
			nouns.add(noun);

			noun = noun.replaceAll("#", "");
			nouns.add(noun);

			noun = noun.replaceAll("s$", "");
			nouns.add(noun);
		}

		return nouns;
	}

	/**
	 * 
	 * @param source
	 * @param originalSentence
	 * @param descriptorMap
	 * @return
	 */
	public Set<String> getNounsRule1(DataHolder dataholderHandler,
			String source, String originalSentence,
			Map<String, Boolean> descriptorMap) {
		Set<String> nouns = new HashSet<String>();

		if ((!(source.matches("^.*\\.xml_\\S+_.*$")))
				&& (!(originalSentence.matches("^.*\\s.*$")))) {
			if (!this.isDescriptor(dataholderHandler, originalSentence,
					descriptorMap)) {
				originalSentence = originalSentence.toLowerCase();
				nouns.add(originalSentence);
			}
		}

		return nouns;
	}

	/**
	 * 
	 * @param oSent
	 * @return
	 */
	public Set<String> getNounsRule2(String oSent) {
		String copy = oSent;
		String regex = "(.*?)\\b(a|an|the|some|any|this|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth) +(\\w+)\\s*($|\\(|\\[|\\{|\\b"
				+ this.myLearnerUtility.getConstant().PREPOSITION + "\\b)(.*)";
		Set<String> nouns = new HashSet<String>();

		while (true) {
			if (copy == null) {
				break;
			}
			Matcher m = Pattern.compile(regex).matcher(copy);
			if (m.lookingAt()) {
				String t = m.group(3);
				String prep = m.group(4);
				copy = m.group(5);

				if (prep.matches("^.*\\w.*$")
						&& t.matches("^.*\\b(length|width|presence|\\w+tion)\\b.*$")) {
					continue;
				}
				t = t.toLowerCase();
				nouns.add(t);
			} else {
				break;
			}
		}

		return nouns;
	}

	/**
	 * 
	 * @param sentence
	 * @return
	 */
	public Set<String> getNounsRule3Helper(String sentence) {
		Set<String> nouns = new HashSet<String>();

		String[] segs = sentence.split("[()\\[\\]\\{\\}]");
		for (int i1 = 0; i1 < segs.length; i1++) {
			String seg = segs[i1];
			seg = StringUtility.removePunctuation(seg, "-");
			String[] tokens = seg.split("\\s+");

			// #ignore the first word in character statements--this is normally
			// capitalized
			for (int j = 1; j < tokens.length; j++) {
				String token = tokens[j];
				if (token.matches("^.*[A-Z].+$")
						&& (!token.matches("^.*-\\w+ed$"))) {
					nouns.add(token);
				}
			}
		}

		return nouns;
	}

	/**
	 * noun rule 4: non-stop/prep followed by a number: epibranchial 4
	 * descriptor heuristics
	 * 
	 * @param oSent
	 * @return a set of nouns
	 */
	public Set<String> getNounsRule4(String oSent) {
		Set<String> nouns = new HashSet<String>();

		String copy = oSent;
		String regex = "(.*?)\\s(\\w+)\\s+\\d+(.*)";

		while (true) {
			if (copy == null) {
				break;
			}
			Matcher m = Pattern.compile(regex).matcher(copy);
			if (m.lookingAt()) {
				String t = m.group(2);
				copy = m.group(3);
				String regex2 = "\\b("
						+ this.myLearnerUtility.getConstant().PREPOSITION + "|"
						+ this.myLearnerUtility.getConstant().STOP + ")\\b";
				if (!t.matches(regex2)) {
					t = t.toLowerCase();
					nouns.add(t);
				}
			} else {
				break;
			}
		}

		return nouns;
	}

	/**
	 * 
	 * @param source
	 * @param sentence
	 * @param nouns
	 * @return
	 */
	public Set<String> getDescriptorsRule1(String source, String sentence,
			Set<String> nouns) {
		Set<String> descriptors = new HashSet<String>();
		// single word
		if (source.matches("^.*\\.xml_\\S+_.*$")
				&& (!sentence.matches("^.*\\s.*$"))) {
			Iterator<String> iter = nouns.iterator();
			boolean isExist = false;
			while (iter.hasNext()) {
				String noun = iter.next();
				if (noun.equals(sentence)) {
					isExist = true;
					break;
				}
			}
			if (isExist == false) {
				sentence = sentence.toLowerCase();
				descriptors.add(sentence);
			}
		}

		return descriptors;
	}

	/**
	 * (is|are) red: isDescriptor
	 * 
	 * @param oSent
	 * @return
	 */
	public Set<String> getDescriptorsRule2(DataHolder dataholderHandler,
			String sentence, Map<String, Boolean> descriptorMap) {
		Set<String> descriptors = new HashSet<String>();

		String[] tokens = sentence.split("\\s+");

		for (int i = 0; i < tokens.length; i++) {
			String token = tokens[i];
			token = token.toLowerCase();
			if (isDescriptor(dataholderHandler, token, descriptorMap)) {
				token = token.toLowerCase();
				descriptors.add(token);
			}
		}

		return descriptors;
	}

	/**
	 * Check if the term is a descriptor
	 * 
	 * @param term
	 * @param descriptorMap
	 *            descriptors have already learned
	 * @return a boolean value indicating whether the term is a descriptor. This
	 *         result will be stored in the descriptorMap for future use
	 */
	public boolean isDescriptor(DataHolder dataholderHandler, String term,
			Map<String, Boolean> descriptorMap) {
		if (descriptorMap.containsKey(term)) {
			if (descriptorMap.get(term).booleanValue()) {
				return true;
			} else {
				return false;
			}
		} else {
			for (int i = 0; i < dataholderHandler.getSentenceHolder().size(); i++) {
				String originalSentence = dataholderHandler.getSentenceHolder()
						.get(i).getOriginalSentence();
				if (isMatched(originalSentence, term, descriptorMap)) {
					return true;
				}
			}
			term = term.toLowerCase();
			descriptorMap.put(term, false);
			return false;
		}

	}

	/**
	 * Check if the term matches the sentence
	 * 
	 * @param sentence
	 * @param term
	 * @param descriptorMap
	 * @return a boolean value indicating whether the term matches the sentence
	 */
	public boolean isMatched(String sentence, String term,
			Map<String, Boolean> descriptorMap) {
		if (sentence.matches("^.*" + " (is|are|was|were|be|being) " + term
				+ ".*$")) {
			term = term.toLowerCase();
			descriptorMap.put(term, true);
			return true;
		} else {
			return false;
		}
	}

}