LearnerUtility.java example

Explorer
charaparser-unsupervised-master
- src
  - main
    - java
      - semanticMarkup
        core
        Treatment.java
        io
        input
        lib
        db
        ParentTagProvider.java
        know
        IGlossary.java
        IPOSKnowledgeBase.java
        Stemmer.java
        lib
        InMemoryGlossary.java
        WordNetPOSKnowledgeBase.java
        knowledge
        KnowledgeBase.java
        ling
        Token.java
        learn
        Configuration.java
        ITerminologyLearner.java
        Learner.java
        UnsupervisedClauseMarkup.java
        auxiliary
        AjectiveReplacementForNoun.java
        FileLoader.java
        GetNounsAfterPtnReturnValue.java
        KnownTagCollection.java
        POS.java
        POSInfo.java
        SentenceLeadLengthComparator.java
        StringAndInt.java
        StringPair.java
        dataholder
        DataHolder.java
        DiscountedKey.java
        IsAValue.java
        ModifierTableValue.java
        SentenceStructure.java
        SingularPluralPair.java
        WordPOSKey.java
        WordPOSValue.java
        knowledge
        AdditionalBootstrapping.java
        AdditionalBootstrappingLearner.java
        AdjectiveSubjectBootstrappingLearner.java
        AdjectiveVerifier.java
        AndOrTagSetter.java
        AnnotationNormalizer.java
        CommaAsAndAnnotator.java
        CommonSubstructureAnnotator.java
        Constant.java
        CoreBootstrappingLearner.java
        DittoAnnotator.java
        FiniteSetsLoader.java
        HeuristicNounLearnerUseMorphology.java
        HeuristicNounLearnerUseSuffix.java
        IModule.java
        IgnorePatternAnnotator.java
        IgnoredFinalizer.java
        Initializer.java
        MarkupByPatternLearner.java
        ModifierTagSeparator.java
        NMBResolver.java
        NullSentenceTagger.java
        POSBasedAnnotator.java
        PatternBasedAnnotator.java
        PhraseClauseAnnotator.java
        PronounCharactersAnnotator.java
        UnknownWordBootstrappingLearner.java
        utility
        LearnerUtility.java
        StringUtility.java
        WordFormUtility.java
        pos
        POS.java
        transform
        ITokenizer.java
        lib
        OpenNLPSentencesTokenizer.java
        OpenNLPTokenizer.java
  - test
    - java
      - semanticMarkup
        ling
        learn
        DataHolderTest.java
        LearnerTest.java
        LearnerUtilityTest.java
        SentenceLeadLengthComparatorTest.java
        StringUtilityTest.java
        UnsupervisedClauseMarkupTest.java
        WordFormUtilityTest.java
        knowledge
        AndOrTagSetterTest.java
        CommonSubstructureAnnotatorTest.java
        CoreBootstrappingLearnerTest.java
        InitializerTest.java
        POSBasedAnnotatorTest.java
        UnknownWordBootstrappingTest.java
package semanticMarkup.ling.learn.utility;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import semanticMarkup.know.lib.WordNetPOSKnowledgeBase;
import semanticMarkup.ling.Token;
import semanticMarkup.ling.learn.auxiliary.GetNounsAfterPtnReturnValue;
import semanticMarkup.ling.learn.auxiliary.KnownTagCollection;
import semanticMarkup.ling.learn.auxiliary.POSInfo;
import semanticMarkup.ling.learn.auxiliary.StringAndInt;
import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.dataholder.ModifierTableValue;
import semanticMarkup.ling.learn.dataholder.SentenceStructure;
import semanticMarkup.ling.learn.dataholder.WordPOSKey;
import semanticMarkup.ling.learn.dataholder.WordPOSValue;
import semanticMarkup.ling.learn.knowledge.Constant;
import semanticMarkup.ling.transform.ITokenizer;

public class LearnerUtility {

	// Sentence detector supplied to the constructor; used by segmentSentence().
	private ITokenizer mySentenceDetector;
	// Tokenizer supplied to the constructor; exposed through getTokenizer().
	private ITokenizer mytokenizer;
	// Word-form helper created in the constructor from the WordNet POS knowledge base.
	private WordFormUtility myWordFormUtility;
	// WordNet POS knowledge base supplied to the constructor.
	private WordNetPOSKnowledgeBase myWordNetPOS;
	// Constant instance created in the constructor; read for PREPOSITION and FORBIDDEN.
	private Constant myConstant;
	
	/**
	 * Creates a utility bound to the given tokenizers and WordNet knowledge
	 * base. A fresh Constant and a WordFormUtility (built from wordNetPOS) are
	 * created as well.
	 * 
	 * @param sentenceDetector
	 *            tokenizer used to split a text into sentences
	 * @param tokenizer
	 *            tokenizer stored and exposed through getTokenizer()
	 * @param wordNetPOS
	 *            knowledge base stored and handed to the WordFormUtility
	 */
	public LearnerUtility(ITokenizer sentenceDetector, ITokenizer tokenizer, WordNetPOSKnowledgeBase wordNetPOS) {
		myConstant = new Constant();
		mySentenceDetector = sentenceDetector;
		mytokenizer = tokenizer;
		myWordNetPOS = wordNetPOS;
		myWordFormUtility = new WordFormUtility(wordNetPOS);
	}
	
	/**
	 * @return the Constant instance created by the constructor
	 */
	public Constant getConstant() {
		return myConstant;
	}
	
	/**
	 * @return the tokenizer passed to the constructor
	 */
	public ITokenizer getTokenizer() {
		return mytokenizer;
	}
	
	/**
	 * @return the sentence detector passed to the constructor
	 */
	public ITokenizer getSentenceDetector() {
		return mySentenceDetector;
	}
	
	/**
	 * @return the WordFormUtility created by the constructor
	 */
	public WordFormUtility getWordFormUtility() {
		return myWordFormUtility;
	}
	
	/**
	 * @return the WordNet POS knowledge base passed to the constructor
	 */
	public WordNetPOSKnowledgeBase getWordNetPOSKnowledgeBase() {
		return myWordNetPOS;
	}
	
	// populate sentence utilities
	/**
	 * Classify a file by the number of underscores left in its name once
	 * everything up to and including the last ".xml_" has been stripped.
	 * 
	 * @param fileName
	 *            the file name to classify; must not be null
	 * @return 1 (character file) when no underscore remains, 2 (description
	 *         file) when exactly one remains, 0 otherwise
	 */
	public int getType(String fileName) {
		// drop the leading "<anything>.xml_" part, if present
		String tail = fileName.replaceAll(".*\\.xml_", "");

		// count the underscores that are left
		int underscores = 0;
		for (char ch : tail.toCharArray()) {
			if (ch == '_') {
				underscores++;
			}
		}

		switch (underscores) {
		case 0:
			// a character file
			return 1;
		case 1:
			// a description file
			return 2;
		default:
			return 0;
		}
	}
	
	


	/**
	 * Replace '.', '?', ';', ':' and '!' that occur inside brackets by special
	 * markers ("[DOT] ", "[QST] ", "[SQL] ", "[QLN] ", "[EXM] "), to avoid
	 * splitting within brackets during sentence segmentation. Round, square
	 * and curly brackets are all counted towards a single nesting depth; a
	 * mark is hidden whenever that depth is positive.
	 * 
	 * @param text
	 *            the text to process; may be null or empty
	 * @return the text with the marks inside brackets replaced; null or empty
	 *         input is returned unchanged
	 */
	public String hideMarksInBrackets(String text) {

		if (text == null || text.isEmpty()) {
			return text;
		}

		StringBuilder hidden = new StringBuilder(text.length() + 16);
		// net number of currently open brackets of any kind
		int depth = 0;

		for (int i = 0; i < text.length(); i++) {
			char c = text.charAt(i);
			switch (c) {
			case '(':
			case '[':
			case '{':
				depth++;
				hidden.append(c);
				break;
			case ')':
			case ']':
			case '}':
				depth--;
				hidden.append(c);
				break;
			default:
				if (depth > 0) {
					switch (c) {
					case '.':
						hidden.append("[DOT] ");
						break;
					case '?':
						hidden.append("[QST] ");
						break;
					case ';':
						hidden.append("[SQL] ");
						break;
					case ':':
						hidden.append("[QLN] ");
						break;
					case '!':
						hidden.append("[EXM] ");
						break;
					default:
						hidden.append(c);
					}
				} else {
					hidden.append(c);
				}
			}
		}
		return hidden.toString();

	}
	
	
	/**
	 * Count every token of the sentence into the given word-count map. The map
	 * is updated in place and also returned.
	 * 
	 * @param sentence
	 *            the sentence whose tokens are counted (tokenized in "all"
	 *            mode)
	 * @param words
	 *            a map from already known words to their counts
	 * @return the same map, now including the words of the sentence
	 */
	public Map<String, Integer> getAllWords(String sentence,
			Map<String, Integer> words) {
		for (String word : this.tokenizeText(sentence, "all")) {
			int updated = words.containsKey(word) ? words.get(word) + 1 : 1;
			words.put(word, updated);
		}

		return words;
	}
	
	/**
	 * Returns the first n words of the head segment of the sentence (the
	 * sentence is tokenized in "firstseg" mode).
	 * 
	 * @param sentence
	 *            the sentence; may be null or empty
	 * @param n
	 *            number of words to be returned
	 * @return the first n words. If there are fewer than n words, all of them
	 *         are returned. A null or empty sentence, or a non-positive n,
	 *         yields an empty list.
	 */
	public List<String> getFirstNWords(String sentence, int n) {
		List<String> nWords = new ArrayList<String>();

		// isEmpty() rather than == "": a reference comparison misses empty
		// strings that are not the interned literal
		if (sentence == null || sentence.isEmpty()) {
			return nWords;
		}

		List<String> tokens = this.tokenizeText(sentence, "firstseg");

		// clamp to [0, tokens.size()] so a negative n cannot break subList
		int count = Math.max(0, Math.min(n, tokens.size()));
		nWords.addAll(tokens.subList(0, count));

		return nWords;
	}
	

	/**
	 * Restore '.', '?', ';', ':' and '!' from the markers [DOT], [QST], [SQL],
	 * [QLN] and [EXM]. Whitespace between the square brackets and the marker
	 * name is tolerated; whitespace outside the brackets is left untouched.
	 * 
	 * @param text
	 *            the text containing markers; may be null or empty
	 * @return the restored string; null or empty input is returned unchanged
	 */
	public String restoreMarksInBrackets(String text) {

		if (text == null || text.isEmpty()) {
			return text;
		}

		// restore "." from "[DOT]"
		text = text.replaceAll("\\[\\s*DOT\\s*\\]", ".");
		// restore "?" from "[QST]"
		text = text.replaceAll("\\[\\s*QST\\s*\\]", "?");
		// restore ";" from "[SQL]"
		text = text.replaceAll("\\[\\s*SQL\\s*\\]", ";");
		// restore ":" from "[QLN]"
		text = text.replaceAll("\\[\\s*QLN\\s*\\]", ":");
		// restore "!" from "[EXM]"
		text = text.replaceAll("\\[\\s*EXM\\s*\\]", "!");

		return text;
	}

	/**
	 * Add a space before and after every occurrence of the regex in the string
	 * str. The string is split at the last occurrence and both remaining parts
	 * are processed recursively.
	 * 
	 * @param str
	 *            the string to process; may be null or empty
	 * @param regex
	 *            regular expression of the parts to be surrounded by spaces;
	 *            may be null or empty
	 * @return the string with spaces added; str is returned unchanged when str
	 *         or regex is null or empty, or when the regex does not occur
	 */
	public String addSpace(String str, String regex) {

		// isEmpty() rather than == "": a reference comparison only recognizes
		// the interned literal
		if (str == null || str.isEmpty() || regex == null || regex.isEmpty()) {
			return str;
		}

		Matcher matcher = Pattern.compile("(^.*)(" + regex + ")(.*$)").matcher(
				str);
		if (!matcher.lookingAt()) {
			return str;
		}

		// group(1) is greedy, so group(2) is the last occurrence in str
		return addSpace(matcher.group(1), regex) + " " + matcher.group(2)
				+ " " + addSpace(matcher.group(3), regex);
	}
	
	/**
	 * Split a sentence into tokens at runs of whitespace.
	 * 
	 * @param sentence
	 *            the sentence to tokenize
	 * @param mode
	 *            when "firstseg", only the head of the sentence (see
	 *            getSentenceHead) is tokenized; any other value tokenizes the
	 *            whole sentence
	 * @return a new mutable list of the tokens
	 */
	public List<String> tokenizeText(String sentence, String mode) {
		String target = sentence;
		if ("firstseg".equals(mode)) {
			target = getSentenceHead(sentence);
		}

		return new ArrayList<String>(Arrays.asList(target.split("\\s+")));
	}
	
	/**
	 * Get the head portion of the input sentence.
	 * <ul>
	 * <li>If the sentence contains a space followed by one of , : ; . [ ( the
	 * head is everything before that space (first such occurrence).</li>
	 * <li>Otherwise, if it contains a preposition word followed by a
	 * whitespace character, the head runs up to and including that
	 * preposition (first such occurrence), without the whitespace.</li>
	 * <li>Otherwise the whole sentence is the head.</li>
	 * </ul>
	 * The punctuation case takes priority even when a preposition occurs
	 * earlier in the sentence.
	 * 
	 * @param sentence
	 *            the input sentence
	 * @return the head portion; null or empty input is returned unchanged
	 */
	public String getSentenceHead(String sentence) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger
				.getLogger("learn.populateSentence.getFirstNWords.getHead");

		if (sentence == null || sentence.equals("")) {
			return sentence;
		}

		String pattern1 = " [,:;.\\[(]";
		String pattern2 = "\\b" + "(" + this.myConstant.PREPOSITION + ")" + "\\s";

		myLogger.trace("Pattern1: " + pattern1);
		myLogger.trace("Pattern2: " + pattern2);

		Pattern punctuationPattern = Pattern.compile(pattern1);
		Pattern prepositionPattern = Pattern.compile(pattern2);

		Matcher punctuation = punctuationPattern.matcher(sentence);
		Matcher preposition = prepositionPattern.matcher(sentence);

		boolean hasPunctuation = punctuation.find();
		boolean hasPreposition = preposition.find();

		String head;
		if (hasPunctuation) {
			// end() points just past the punctuation mark; step back over the
			// mark and the space in front of it
			head = sentence.substring(0, punctuation.end() - 2);
		} else if (hasPreposition) {
			// end() points just past the whitespace after the preposition;
			// step back over that whitespace only
			head = sentence.substring(0, preposition.end() - 1);
		} else {
			head = sentence;
		}

		myLogger.trace("Return: " + head);
		return head;
	}
	
	/**
	 * Segment a text into sentences using the sentence detector given to the
	 * constructor. To avoid segmenting at abbreviations, the dots of
	 * abbreviations are replaced by a special mark before segmentation and
	 * restored in each resulting sentence afterwards.
	 * 
	 * @param text
	 *            the text to segment
	 * @return the list of sentence tokens produced by the detector, with
	 *         abbreviation dots restored in their content
	 */
	public List<Token> segmentSentence(String text) {
		// hide abbreviations, then segment
		String hiddenText = this.hideAbbreviations(text);
		List<Token> sentences = this.mySentenceDetector.tokenize(hiddenText);

		// restore abbreviations in every sentence
		for (Token sentence : sentences) {
			sentence.setContent(this.restoreAbbreviations(sentence.getContent()));
		}

		return sentences;
	}
	
	/**
	 * Replace the dot (.) that follows an abbreviation in the text by the
	 * special mark [DOT]. Abbreviations are taken from the *_ABBR constants of
	 * Constant (presumably "|"-separated alternations without capturing
	 * groups; verify against Constant) and are matched case-insensitively.
	 * 
	 * @param text
	 *            the text to process
	 * @return the text after replacement
	 */
	public String hideAbbreviations(String text) {
		String[] abbreviationGroups = {
				Constant.PEOPLE_ABBR,
				Constant.ARMY_ABBR,
				Constant.INSTITUTES_ABBR,
				Constant.COMPANIES_ABBR,
				Constant.PLACES_ABBR,
				Constant.MONTHS_ABBR,
				Constant.MISC_ABBR,
				Constant.BOT1_ABBR,
				Constant.BOT2_ABBR,
				Constant.LATIN_ABBR };

		StringBuilder regex = new StringBuilder("(^.*)(");
		for (int i = 0; i < abbreviationGroups.length; i++) {
			if (i > 0) {
				regex.append("|");
			}
			regex.append(abbreviationGroups[i]);
		}
		regex.append(")(\\.)(.*$)");

		Pattern p = Pattern.compile(regex.toString(), Pattern.CASE_INSENSITIVE);

		// each pass rewrites one abbreviation dot; stop once none is left
		for (Matcher m = p.matcher(text); m.matches(); m = p.matcher(text)) {
			text = m.group(1) + m.group(2) + "[DOT]" + m.group(4);
		}

		return text;
	}
	
	/**
	 * Restore the dot (.) that follows an abbreviation in the text from the
	 * special mark [DOT]. Abbreviations are taken from the *_ABBR constants of
	 * Constant (presumably "|"-separated alternations without capturing
	 * groups; verify against Constant) and are matched case-insensitively.
	 * 
	 * @param text
	 *            the text to process
	 * @return the text after replacement
	 */
	public String restoreAbbreviations(String text) {
		String[] abbreviationGroups = {
				Constant.PEOPLE_ABBR,
				Constant.ARMY_ABBR,
				Constant.INSTITUTES_ABBR,
				Constant.COMPANIES_ABBR,
				Constant.PLACES_ABBR,
				Constant.MONTHS_ABBR,
				Constant.MISC_ABBR,
				Constant.BOT1_ABBR,
				Constant.BOT2_ABBR,
				Constant.LATIN_ABBR };

		StringBuilder regex = new StringBuilder("(^.*)(");
		for (int i = 0; i < abbreviationGroups.length; i++) {
			if (i > 0) {
				regex.append("|");
			}
			regex.append(abbreviationGroups[i]);
		}
		regex.append(")(\\[DOT\\])(.*$)");

		Pattern p = Pattern.compile(regex.toString(), Pattern.CASE_INSENSITIVE);

		// each pass restores one abbreviation dot; stop once none is left
		for (Matcher m = p.matcher(text); m.matches(); m = p.matcher(text)) {
			text = m.group(1) + m.group(2) + "." + m.group(4);
		}

		return text;
	}

	/**
	 * Convert a collection of words to a string of those words separated by
	 * "|". Each word is passed through addDoubleBackslash() first.
	 * 
	 * @param words
	 *            the words to join
	 * @return string of pattern. If the collection is null or empty, return an
	 *         empty string
	 */
	public String Iterable2Pattern(Iterable<String> words) {
		if (words == null) {
			return "";
		}

		StringBuilder pattern = new StringBuilder();
		boolean first = true;
		for (String word : words) {
			String escaped = this.addDoubleBackslash(word);
			if (!first) {
				pattern.append("|");
			}
			pattern.append(escaped);
			first = false;
		}

		return pattern.toString();
	}

	/**
	 * Convert a pattern with words separated by "|" to a set.
	 * 
	 * @param pattern
	 *            the pattern, e.g. "leaf|stem|root"
	 * @return a new set of the words. If the input is null or an empty string,
	 *         return an empty set
	 */
	public static Set<String> Pattern2Set(String pattern) {
		Set<String> set = new HashSet<String>();

		if (pattern == null || pattern.isEmpty()) {
			return set;
		}

		// String.split takes a regex: an unescaped "|" is an alternation of two
		// empty patterns and would split between every character
		set.addAll(Arrays.asList(pattern.split("\\|")));

		return set;
	}
	
	/**
	 * Tag words with all o n m b tags that are applicable to the words. Every
	 * sentence in the sentence holder is normalized with
	 * tagAllSentencesHelper(), annotated with annotateSentence() and written
	 * back to the same field it was read from.
	 * 
	 * @param dataholderHandler
	 *            the data holder providing the sentences and known tags
	 * @param mode
	 *            "singletag" or "multitags"; passed on to getKnownTags()
	 * @param type
	 *            "original" to tag the original sentence of each entry; any
	 *            other value (e.g. "sentence") tags the processed sentence
	 */
	public void tagAllSentences (DataHolder dataholderHandler, String mode, String type) {
		// the sentence variant is selected by type, not by mode
		boolean useOriginal = StringUtils.equals(type, "original");

		// snapshot (id, text) pairs before any sentence is modified
		List<StringAndInt> idAndSentenceList = new LinkedList<StringAndInt>();
		Iterator<SentenceStructure> sentenceIter = 
				dataholderHandler.getSentenceHolder().iterator();
		while (sentenceIter.hasNext()) {
			SentenceStructure sentence = sentenceIter.next();
			String text = useOriginal ? sentence.getOriginalSentence()
					: sentence.getSentence();
			idAndSentenceList.add(new StringAndInt(text, sentence.getID()));
		}

		KnownTagCollection myKnownTags = this.getKnownTags(dataholderHandler, mode);

		for (StringAndInt idAndSentence : idAndSentenceList) {
			String tagged = tagAllSentencesHelper(idAndSentence.getString());
			tagged = annotateSentence(tagged, myKnownTags, dataholderHandler.getBMSWords());

			SentenceStructure targetSentence = dataholderHandler.getSentence(idAndSentence.getInt());
			if (useOriginal) {
				targetSentence.setOriginalSentence(tagged);
			} else {
				targetSentence.setSentence(tagged);
			}
		}
	}
    
	/**
	 * Helper of the tagAllSentences method: normalizes a sentence before it is
	 * annotated. Existing tags of the form &lt;...&gt; are removed, the text
	 * is lower-cased, a hyphen in front of a letter becomes "_" (cup_shaped,
	 * 3_nerved), non-word characters get a space on both sides and whitespace
	 * is collapsed and trimmed.
	 * 
	 * @param text
	 *            the sentence to normalize
	 * @return text after processing
	 */
	public String tagAllSentencesHelper(String text) {
		// strip existing tags and lower-case
		String normalized = text.replaceAll("<\\S+?>", "").toLowerCase();

		// cup - shaped => cup_shaped
		normalized = StringUtility.replaceAllBackreference(normalized, "\\s*-\\s*([a-z])", "_$1");

		// add space around nonword char
		normalized = StringUtility.replaceAllBackreference(normalized, "(\\W)", " $1 ");

		// multiple spaces => 1 space, then trim both ends
		return normalized.replaceAll("\\s+", " ")
				.replaceAll("^\\s*", "")
				.replaceAll("\\s*$", "");
	}
	
	
	
	/**
	 * Annotate a sentence with the known tags: proper nouns as &lt;Z&gt;,
	 * organs as &lt;O&gt;, nouns as &lt;N&gt;, modifiers as &lt;M&gt; and
	 * boundary words and marks as &lt;B&gt;, in that order, followed by the
	 * clean-up done in annotateSentenceHelper2().
	 * 
	 * @param sentence
	 *            the sentence to annotate
	 * @param knownTags
	 *            the known word sets; any of its sets may be null, which is
	 *            treated as empty. The boundary word set is not modified.
	 * @param NONS
	 *            words that are removed from the noun and organ sets before
	 *            tagging
	 * @return the annotated sentence
	 */
	public String annotateSentence(String sentence,
			KnownTagCollection knownTags, Set<String> NONS) {
		// get known tags, treating missing sets as empty
		Set<String> boundaryMarks = knownTags.boundaryMarks == null
				? new HashSet<String>() : knownTags.boundaryMarks;
		Set<String> modifiers = knownTags.modifiers == null
				? new HashSet<String>() : knownTags.modifiers;
		Set<String> nouns = knownTags.nouns == null
				? new HashSet<String>() : knownTags.nouns;
		Set<String> organs = knownTags.organs == null
				? new HashSet<String>() : knownTags.organs;
		Set<String> properNouns = knownTags.properNouns == null
				? new HashSet<String>() : knownTags.properNouns;

		// preprocessing 1: _nerved => (?:\b\d+)_nerved, so that "3_nerved"
		// is matched. Built into a fresh set so the caller's set is left
		// untouched.
		Set<String> boundaryWords = new HashSet<String>();
		if (knownTags.boundaryWords != null) {
			for (String word : knownTags.boundaryWords) {
				if (word != null && word.startsWith("_")) {
					// "(?:" opens a non-capturing group; "(?\:" is not valid
					// Java regex syntax
					boundaryWords.add("(?:\\b\\d+)" + word);
				} else {
					boundaryWords.add(word);
				}
			}
		}

		nouns = StringUtility.setSubtraction(nouns, NONS);
		organs = StringUtility.setSubtraction(organs, NONS);

		// preprocessing 2: the tag letters themselves must never be tagged
		Set<String> tagSet = new HashSet<String>();
		tagSet.addAll(Arrays.asList("Z O N M B".split(" ")));
		properNouns = StringUtility.setSubtraction(properNouns, tagSet);
		organs = StringUtility.setSubtraction(organs, tagSet);
		nouns = StringUtility.setSubtraction(nouns, tagSet);
		modifiers = StringUtility.setSubtraction(modifiers, tagSet);
		boundaryWords = StringUtility.setSubtraction(boundaryWords, tagSet);
		boundaryMarks = StringUtility.setSubtraction(boundaryMarks, tagSet);

		// insert tags
		sentence = annotateSentenceHelper(sentence, properNouns, "Z", true);
		sentence = annotateSentenceHelper(sentence, organs, "O", true);
		sentence = annotateSentenceHelper(sentence, nouns, "N", true);
		sentence = annotateSentenceHelper(sentence, modifiers, "M", true);
		sentence = annotateSentenceHelper(sentence, boundaryWords, "B", true);
		sentence = annotateSentenceHelper(sentence, boundaryMarks, "B", false);

		sentence = annotateSentenceHelper2(sentence);

		return sentence;
	}
	
	
	/**
	 * Wrap every occurrence of one of the words in the sentence in
	 * &lt;tag&gt;...&lt;/tag&gt;.
	 * 
	 * @param sentence
	 *            the sentence to annotate
	 * @param words
	 *            the words to tag; when empty the sentence is returned
	 *            unchanged
	 * @param tag
	 *            the tag name to insert
	 * @param isWithBoundaryWord
	 *            when true the words are only matched between word boundaries
	 *            (\b); when false they are matched anywhere
	 * @return the annotated sentence
	 */
	public String annotateSentenceHelper(String sentence, Set<String> words,
			String tag, boolean isWithBoundaryWord) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger.getLogger("learn.annotateSentence");

		if (words.size() == 0) {
			return sentence;
		}

		if (isWithBoundaryWord) {
			String boundedRegex = String.format("\\b(%s)\\b",
					this.Iterable2Pattern(words));
			String boundedReplacement = String.format("<%s>$1</%s>", tag, tag);
			return StringUtility.replaceAllBackreference(sentence,
					boundedRegex, boundedReplacement);
		}

		String regex = String.format("(%s)", this.Iterable2Pattern(words));
		String replacement = String.format("<%s>$1</%s>", tag, tag);

		myLogger.trace("Sentence: "+sentence);
		myLogger.trace("Words: "+words);
		myLogger.trace("Regex: "+regex);
		myLogger.trace("Replacement: "+replacement);

		return StringUtility.replaceAllBackreference(sentence, regex,
				replacement);
	}
	
	/**
	 * Clean up an annotated sentence: remove tag pairs that enclose nothing
	 * but whitespace, and strip all tags around words matching the FORBIDDEN
	 * constant.
	 * 
	 * @param sentence
	 *            the annotated sentence
	 * @return the cleaned sentence
	 */
	public String annotateSentenceHelper2(String sentence){
		// NOTE(review): the pattern given here is the empty string, which
		// presumably matches any sentence - confirm against
		// StringUtility.createMatcher
		boolean runFirstPass = StringUtility.createMatcher(sentence, "").find();
		if (runFirstPass) {
			sentence = StringUtility.replaceAllBackreference(sentence, "<(\\w)>\\s*</$1>", "");
		}

		// repeatedly drop the first empty <X> </X> pair until none is left
		String emptyPair = "<(\\w)>\\s*</(\\1)>";
		for (Matcher m = StringUtility.createMatcher(sentence, emptyPair); m.find();
				m = StringUtility.createMatcher(sentence, emptyPair)) {
			sentence = m.replaceFirst("");
		}

		// forbidden words lose whatever tags surround them
		String forbiddenRegex = "(?:<[^<]+>)+("+this.myConstant.FORBIDDEN+")(?:</[^<]+>)+";
		return StringUtility.replaceAllBackreference(sentence, forbiddenRegex, "$1");
	}
	
	/**
	 * Collect all currently known tagged words from the data holder.
	 * <ul>
	 * <li>nouns: the singular/plural words; in "singletag" mode (compared
	 * case-insensitively) the organ tags are added as well</li>
	 * <li>organs: the organ tags, only in "multitags" mode</li>
	 * <li>modifiers: the modifier words; in "singletag" mode those that are
	 * also singular/plural words are left out</li>
	 * <li>boundary words, boundary marks and proper nouns</li>
	 * </ul>
	 * 
	 * @param dataholderHandler
	 *            the data holder to read from
	 * @param mode
	 *            can be either "singletag" or "multitags"
	 * @return the collection of known tags
	 */
	public KnownTagCollection getKnownTags(DataHolder dataholderHandler, String mode) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger.getLogger("learn.getKnownTags");
		myLogger.trace("Enter (mode: "+mode+")");

		// get nouns
		Set<String> psWordSet = this.getPSWords(dataholderHandler);
		Set<String> nounSet = new HashSet<String>();
		nounSet.addAll(psWordSet);
		// if the mode is "singletag", then get additional nouns from tags
		if (StringUtils.equalsIgnoreCase(mode, "singletag")) {
			nounSet.addAll(this.getOrgans(dataholderHandler));
		}
		Set<String> nouns = new HashSet<String>();
		nouns.addAll(nounSet);
		myLogger.trace("Get nouns: "+nouns.toString());

		// get organs
		Set<String> organs = new HashSet<String>();
		if (StringUtils.equals(mode, "multitags")) {
			Set<String> organSet = this.getOrgans(dataholderHandler);
			organs.addAll(organSet);
			myLogger.trace("Get organs: "+organs.toString());
		}

		// get modifiers
		Set<String> modifierSet = this.getModifiers(dataholderHandler);
		Set<String> modifiers = new HashSet<String>();
		if (StringUtils.equals(mode, "singletag")) {
			for (String modifier : modifierSet) {
				if (!psWordSet.contains(modifier)) {
					modifiers.add(modifier);
				}
			}
		} else {
			modifiers.addAll(modifierSet);
		}

		// get boundary words and marks
		List<Set<String>> boundaries = this.getBoundaries(dataholderHandler);
		Set<String> boundaryWords = boundaries.get(0);
		Set<String> boundaryMarks = boundaries.get(1);

		// get proper nouns
		Set<String> properNouns = this.getProperNouns(dataholderHandler);

		// put all known tags into one KnownTagCollection object
		return new KnownTagCollection(nouns, organs, modifiers, boundaryWords, boundaryMarks, properNouns);
	}
    
	/**
	 * A helper of method getKnownTags(). Collect every word from the word-POS
	 * collection whose POS is "s" or "p" and which consists only of letters,
	 * digits, "_" and "-".
	 * 
	 * @param dataholderHandler
	 *            the data holder to read from
	 * @return a set of nouns
	 */
	public Set<String> getPSWords(DataHolder dataholderHandler) {
		Set<String> psSet = new HashSet<String>(); // set of p and s

		for (Entry<WordPOSKey, WordPOSValue> entry : dataholderHandler
				.getWordPOSHolder().entrySet()) {
			String pos = entry.getKey().getPOS();
			boolean isSingularOrPlural = StringUtils.equals(pos, "s")
					|| StringUtils.equals(pos, "p");
			if (!isSingularOrPlural) {
				continue;
			}

			String word = entry.getKey().getWord();
			if (word != null
					&& StringUtility.createMatcher(word, "^[a-zA-Z0-9_-]+$").find()) {
				psSet.add(word);
			}
		}

		return psSet;
	}
	
	/**
	 * A helper of method getKnownTags(). Collect the tags of all sentences in
	 * the sentence collection, skipping null tags, the tag "ignore", tags
	 * containing a space or "[", and tags with characters other than letters,
	 * digits, "_" and "-".
	 * 
	 * @param dataholderHandler
	 *            the data holder to read from
	 * @return a set of o
	 */
	public Set<String> getOrgans(DataHolder dataholderHandler) {
		Set<String> oSet = new HashSet<String>(); // set of organs

		for (Iterator<SentenceStructure> iterSentence = dataholderHandler
				.getSentenceHolder().iterator(); iterSentence.hasNext();) {
			String tag = iterSentence.next().getTag();
			if (tag == null) {
				continue;
			}

			boolean usable = !StringUtils.equals(tag, "ignore")
					&& !StringUtility.createMatcher(tag, ".* .*").find()
					&& !StringUtility.createMatcher(tag, ".*\\[.*").find();
			if (usable
					&& StringUtility.createMatcher(tag, "^[a-zA-Z0-9_-]+$").find()) {
				oSet.add(tag);
			}
		}

		return oSet;
	}
	
	/**
	 * Get modifier words from the modifier collection. Only non-null keys made
	 * of letters, digits, "_" and "-" are returned.
	 * 
	 * @param dataholderHandler
	 *            the data holder to read from
	 * @return a set of modifier words
	 */
	public Set<String> getModifiers(DataHolder dataholderHandler) {
		Set<String> mSet = new HashSet<String>(); // set of modifiers

		for (Entry<String, ModifierTableValue> entry : dataholderHandler
				.getModifierHolder().entrySet()) {
			String word = entry.getKey();
			if (word == null) {
				continue;
			}
			if (StringUtility.createMatcher(word, "^[a-zA-Z0-9_-]+$").find()) {
				mSet.add(word);
			}
		}

		return mSet;
	}
	
	/**
	 * Get boundary words and marks from the word-POS entries whose POS is "b".
	 * 
	 * @param dataholderHandler
	 *            the data holder to read from
	 * @return a list of two elements. The first element is a set of boundary
	 *         words, and second element is a set of boundary marks.
	 */
	public List<Set<String>> getBoundaries (DataHolder dataholderHandler){
		Set<String> bWords = new HashSet<String>();
		Set<String> bMarks = new HashSet<String>();

		// a single regex meta character
		String markPattern = "^(-|\\\\|\\(|\\)|\\[|\\]|\\{|\\}|\\.|\\||\\+|\\*|\\?)$";

		for (Iterator<Entry<WordPOSKey, WordPOSValue>> iter = dataholderHandler
				.getWordPOSHolderIterator(); iter.hasNext();) {
			Entry<WordPOSKey, WordPOSValue> entry = iter.next();
			String word = entry.getKey().getWord();
			String pos = entry.getKey().getPOS();

			if (word == null || pos == null || !StringUtils.equals(pos, "b")) {
				continue;
			}

			if (StringUtility.isMatchedNullSafe(word, markPattern)) {
				bMarks.add(word);
				continue;
			}

			boolean nonWordToken = !StringUtility.isMatchedNullSafe(word, "\\w")
					&& !StringUtils.equals(word, "/");
			if (nonWordToken) {
				if (StringUtility.createMatcher(word, "^[a-zA-Z0-9_-]+$").find()) {
					bMarks.add(word);
				}
			} else if (StringUtility.isMatchedNullSafe(word, "^[a-zA-Z0-9_-]+$")) {
				bWords.add(word);
			}
		}

		List<Set<String>> result = new LinkedList<Set<String>>();
		result.add(bWords);
		result.add(bMarks);

		return result;
	}
    
	/**
	 * Get the proper nouns from the word-POS collection: every non-null word
	 * whose POS is "z" and which consists only of letters, digits, "_" and
	 * "-".
	 * 
	 * @param dataholderHandler
	 *            the data holder to read from
	 * @return a set of the proper nouns
	 */
	public Set<String> getProperNouns(DataHolder dataholderHandler) {
		Set<String> pNouns = new HashSet<String>();

		for (Entry<WordPOSKey, WordPOSValue> entry : dataholderHandler
				.getWordPOSHolder().entrySet()) {
			String word = entry.getKey().getWord();
			String POS = entry.getKey().getPOS();

			// skip null words, as the other word collectors in this class do
			if (word == null || !StringUtils.equals(POS, "z")) {
				continue;
			}

			if (StringUtility.createMatcher(word, "^[a-zA-Z0-9_-]+$").find()) {
				pNouns.add(word);
			}
		}

		return pNouns;
	}

//	/**
//	 * @param args
//	 */
//	public static void main(String[] args) {
////		[-\\\\\\(\\)\\[\\]\\{\\}\\.\\|\\+\\*\\?]
////		
////		stops.addAll(Arrays.asList(new String[] { "NUM", "(", "[", "{",
////				")", "]", "}", "d+" }));
//		testRunner("z", "abczabc");
//		testRunner("/", "abc/abc");
//		testRunner("-", "abc-abc");
//		testRunner("_", "abc_abc");
//		testRunner(addDoubleBackslash("\\"), "abc\\abc");
//		testRunner(addDoubleBackslash("("), "abc(abc");
//		testRunner(addDoubleBackslash(")"), "abc)abc");
//		testRunner(addDoubleBackslash("["), "abc[abc");
//		testRunner(addDoubleBackslash("]"), "abc]abc");
//		testRunner(addDoubleBackslash("{"), "abc{abc");
//		testRunner(addDoubleBackslash("}"), "abc}abc");
//		testRunner(addDoubleBackslash("."), "abc.abc");
//		testRunner(addDoubleBackslash("|"), "abc|abc");
//		testRunner(addDoubleBackslash("+"), "abc+abc");
//		testRunner(addDoubleBackslash("*"), "abc*abc");
//		testRunner(addDoubleBackslash("?"), "abc?abc");
//		testRunner(addDoubleBackslash("d+"), "01138");
////		testRunner("\\(", "abc(abc");
////		testRunner("\\(", "abc(abc");
////		testRunner("\\(", "abc(abc");
//		
//		String str = "(";
//		str = str.replaceAll("(\\()", "\\\\$1");
//		System.out.println(str);
//		
//		str = addDoubleBackslash(str);
//
//
//	}
	/**
	 * Escape a word for use in a regex alternation: when the whole word is a
	 * single one of \ ( ) [ ] { } . | + * ? or is exactly "d+", a backslash is
	 * put in front of it. Any other word is returned unchanged.
	 * 
	 * @param word
	 *            the word to escape; must not be null
	 * @return the escaped word
	 */
	private String addDoubleBackslash(String word) {
		Pattern wholeWordSpecial = Pattern
				.compile("^(\\\\|\\(|\\)|\\[|\\]|\\{|\\}|\\.|\\||\\+|\\*|\\?|d\\+)$");
		return wholeWordSpecial.matcher(word).replaceAll("\\\\$1");
	}
	
//	private static String addDoubleBackslash(String word) {
//		word = word.replaceAll("^(\\\\|\\(|\\)|\\[|\\]|\\{|\\}|\\.|\\||\\+|\\*|\\?|d\\+)$", "\\\\$1");
////		word = word.replaceAll("^(d\\+)$", "\\\\$1");
//		
//		return word;
//	}

	/**
	 * Debugging aid: checks whether the regular expression occurs anywhere in
	 * the given string. The outcome is printed to standard output and also
	 * returned.
	 * 
	 * @param regex
	 *            the regular expression to compile
	 * @param str
	 *            the text to search
	 * @return true if the expression is found in the text
	 */
	private static boolean testRunner(String regex, String str) {
		boolean found = Pattern.compile(regex).matcher(str).find();
		System.out.println(found);
		return found;
	}
	
	/**
	 * Encodes the first {@code limit} words as a pattern string, one symbol
	 * per word:
	 * <ul>
	 * <li>"&amp;" - the word is reported as an entire match of the token
	 * alternation by StringUtility.isEntireMatchedNullSafe</li>
	 * <li>"q" - the word is null, or none of the rules below applies</li>
	 * <li>the punctuation mark itself - the word contains one of , : ; .</li>
	 * <li>"t" - the word carries an &lt;M&gt; tag and is listed in the type
	 * modifier patterns of the data holder</li>
	 * <li>the lower-cased tag letter - the word carries any other
	 * single-letter &lt;X&gt; tag</li>
	 * <li>"p" - the word form utility reports the number "p" for the word</li>
	 * </ul>
	 * 
	 * @param dataholderHandler
	 *            source of the type modifier patterns
	 * @param token
	 *            words that are joined with "|" into the token alternation
	 * @param limit
	 *            maximum number of words to encode
	 * @param words
	 *            the words to encode
	 * @return the pattern string
	 */
	public String getSentencePtn(DataHolder dataholderHandler, Set<String> token, int limit, List<String> words) {
		Set<String> typeModifierPtns = dataholderHandler.getTypeModifierPattern();
		String tokenRegex = String.format("\\b(%s)\\b",StringUtils.join(token, "|"));
		StringBuilder pattern = new StringBuilder();

		int processed = 0;
		for (String word : words) {
			if (processed >= limit) {
				break;
			}
			processed++;

			if (StringUtility.isEntireMatchedNullSafe(word, tokenRegex)) {
				pattern.append("&");
				continue;
			}
			if (word == null) {
				pattern.append("q");
				continue;
			}

			Matcher punctuation = StringUtility.createMatcher(word, "([,:;\\.])");
			Matcher markup = StringUtility.createMatcher(word, "<(\\w)>");
			if (punctuation.find()) {
				pattern.append(punctuation.group(1));
			} else if (markup.find()) {
				String tag = markup.group(1);
				boolean isTypeModifier = StringUtils.equals(tag, "M")
						&& typeModifierPtns.contains(word);
				pattern.append(isTypeModifier ? "t" : tag.toLowerCase());
			} else if (StringUtils.equals(this.getWordFormUtility().getNumber(word), "p")) {
				pattern.append("p");
			} else {
				pattern.append("q");
			}
		}

		return pattern.toString();
	}

	/**
	 * Placeholder for the parent tag lookup; no lookup is implemented here.
	 * 
	 * @param sentenceID
	 *            the ID of the sentence (currently ignored)
	 * @return always null
	 */
	public String getParentSentenceTag(int sentenceID) {
		// TODO: stub, still to be implemented
		final String noParentTag = null;
		return noParentTag;
	}
	
	
	// doItMarkup
	/**
	 * Tries to tag every sentence whose tag is still null, empty or "unknown".
	 * <p>
	 * A candidate is skipped when doItMarkupCase1Helper accepts its sentence
	 * (and/or case) or when doItMarkupCase2Helper accepts its lead (lead
	 * contains a stop word). For the remaining candidates learnTerms is run
	 * and, if the proposed tag contains a word character, the sentence is
	 * tagged through tagSentence.
	 * 
	 * @param dataholderHandler
	 *            holder of the sentences to process
	 * @param maxLength
	 *            maximum tag length, handed on to tagSentence
	 * @return the sign returned by learnTerms for the last sentence that
	 *         reached it; 0 if no sentence did. NOTE(review): the value is
	 *         overwritten per sentence, not summed - confirm this is intended.
	 */
	public int doItMarkup(DataHolder dataholderHandler, int maxLength) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger
				.getLogger("learn.additionalBootStrapping.doItMarkup");
		myLogger.trace("Enter");

		int sign = 0;
		for (Iterator<SentenceStructure> it = dataholderHandler.getSentenceHolder().iterator(); it.hasNext();) {
			SentenceStructure candidate = it.next();
			if (!doItMarkupHelper(candidate.getTag())) {
				// already tagged
				continue;
			}

			int ID = candidate.getID();
			String lead = candidate.getLead();
			String sentence = candidate.getSentence();

			// case 1
			if (doItMarkupCase1Helper(sentence)) {
				myLogger.trace(String.format("sent #%d: case 1", ID));
				continue;
			}

			// case 2
			if (doItMarkupCase2Helper(lead)) {
				myLogger.trace(String.format("sent #%d: case 2", ID));
				continue;
			}

			StringAndInt learned = learnTerms(dataholderHandler, ID);
			String learnedTag = learned.getString();
			sign = learned.getInt();

			// case 3
			boolean tagHasWordChar = StringUtility.createMatcher(learnedTag, "\\w").find();
			if (tagHasWordChar) {
				myLogger.trace(String.format("sent #%d: case 3", ID));
				this.tagSentence(dataholderHandler, maxLength, ID, learnedTag);
			}
		}

		myLogger.trace("Return: " + sign);
		return sign;
	}

	/**
	 * Tells whether a sentence tag counts as "not yet tagged".
	 * 
	 * @param tag
	 *            the current tag of a sentence, may be null
	 * @return true if the tag is null, the empty string or "unknown"
	 */
	public boolean doItMarkupHelper(String tag) {
		if (tag == null) {
			return true;
		}
		return StringUtils.equals(tag, "") || StringUtils.equals(tag, "unknown");
	}

	/**
	 * Case 1 of doItMarkup: detects an and/or construct near the start of the
	 * sentence.
	 * 
	 * @param sentence
	 *            the sentence text
	 * @return true if, after at most 40 leading characters and a space, the
	 *         sentence contains "nor", "or", "and" or "/"
	 */
	public boolean doItMarkupCase1Helper(String sentence) {
		Matcher andOrMatcher = StringUtility.createMatcher(sentence,
				"^.{0,40} (nor|or|and|\\/)");
		return andOrMatcher.find();
	}

	/**
	 * Case 2 of doItMarkup: detects a stop word inside the lead.
	 * 
	 * @param lead
	 *            the lead words of a sentence
	 * @return true if the lead contains, as a whole word, one of the
	 *         alternatives of the STOP constant
	 */
	public boolean doItMarkupCase2Helper(String lead) {
		String stopWordRegex = "\\b(" + getConstant().STOP + ")\\b";
		Matcher stopWordMatcher = StringUtility.createMatcher(lead, stopWordRegex);
		return stopWordMatcher.find();
	}
	
	/**
	 * Sets the tag of one sentence, unless the tag is unusable.
	 * <p>
	 * The request is rejected when the tag contains no word character or when
	 * it starts with a stop word. A tag longer than maxLength is cut down to
	 * its first maxLength characters before it is stored.
	 * 
	 * @param dataholderHandler
	 *            holder the sentence is fetched from
	 * @param maxLength
	 *            maximum number of characters kept from the tag
	 * @param sentenceID
	 *            the ID of the sentence to tag
	 * @param tag
	 *            the tag to assign
	 * @return true if the sentence was tagged, false if the request was
	 *         rejected
	 */
	public boolean tagSentence(DataHolder dataholderHandler, int maxLength, int sentenceID, String tag) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger.getLogger("learn.tagSentence");
		myLogger.trace(String.format("Enter (%d, %s)", sentenceID, tag));

		// case 1: nothing word-like in the tag
		boolean hasWord = StringUtility.createMatcher(tag, "\\w+").find();
		if (!hasWord) {
			myLogger.trace("\t:tag is not a word. Return");
			return false;
		}

		// case 2: tag begins with a stop word
		String leadingStopRegex = "^(" + getConstant().STOP + ")\\b";
		if (StringUtility.createMatcher(tag, leadingStopRegex).find()) {
			myLogger.trace(String
					.format("\t:tag %s starts with a stop word, ignore tagging requrest",
							tag));
			return false;
		}

		// case 3: store the (possibly shortened) tag
		String finalTag = tag;
		if (finalTag.length() > maxLength) {
			finalTag = finalTag.substring(0, maxLength);
			myLogger.debug(String.format("\ttag: %s longer than %d)",
					finalTag, maxLength));
		}

		dataholderHandler.getSentence(sentenceID).setTag(finalTag);
		myLogger.debug(String.format(
				"\t:mark up sentence #%d with tag %s", sentenceID, finalTag));
		return true;
	}
	
	/**
	 * Runs the rule-based learner (doItCaseHandle) on one sentence.
	 * <p>
	 * Update wordpos table (on certainty) when a sentence is tagged for the
	 * first time. Note: 1) this update should not be done when a POS is looked
	 * up, because we may lookup a POS for the same example multiple times. 2)
	 * if the tag need to be adjusted (not by doit function), also need to
	 * adjust certainty counts.
	 * 
	 * @param dataholderHandler
	 *            holder the sentence is read from and that receives the
	 *            updates
	 * @param sentID
	 *            the ID of the sentence; used as index into the sentence
	 *            holder
	 * @return a pair of (tag, sign); never null. When the sentence or its lead
	 *         is null, doItCaseHandle yields no result and the pair ("", 0)
	 *         is returned instead.
	 */
	public StringAndInt learnTerms(DataHolder dataholderHandler, int sentID) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger
				.getLogger("learn.discover.ruleBasedLearn.doIt");

		myLogger.trace("Enter doIt");
		myLogger.trace("sentence ID: " + sentID);

		SentenceStructure sentEntry = dataholderHandler.getSentenceHolder()
				.get(sentID);
		String thisSentence = sentEntry.getSentence();
		String thisLead = sentEntry.getLead();

		StringAndInt returnValue = this.doItCaseHandle(dataholderHandler, thisSentence, thisLead);
		if (returnValue == null) {
			// doItCaseHandle returns null for a null sentence or lead;
			// previously this caused a NullPointerException in the trace
			// call below. Report "no tag, no updates" instead.
			returnValue = new StringAndInt("", 0);
		}

		myLogger.trace("Return Tag: " + returnValue.getString() + ", sign: "
				+ returnValue.getInt());
		myLogger.trace("Quit doIt");
		myLogger.trace("\n");

		return returnValue;
	}
	
	/**
	 * Derives a tag for a sentence from the POS pattern of its lead and
	 * records the POS decisions made on the way in the data holder.
	 * <p>
	 * The lead is split on whitespace and turned into a pattern by getPOSptn
	 * (one symbol per word). The first matching case below decides the tag:
	 * <ol>
	 * <li>pattern is a single p, n or s</li>
	 * <li>pattern contains "ps"</li>
	 * <li>pattern contains "p?"</li>
	 * <li>pattern contains [psn] followed by b</li>
	 * <li>pattern contains two or more consecutive [psn]</li>
	 * <li>pattern ends with b[?b][psn] or [?b]b[psn]</li>
	 * <li>pattern is exactly "s?"</li>
	 * <li>pattern is exactly "bs"</li>
	 * <li>pattern is exactly "bp"</li>
	 * <li>pattern starts with "?b"</li>
	 * </ol>
	 * If no case applies the tag stays empty. The returned sign is the sum of
	 * the values returned by the updateDataHolder / updateDataHolderNN calls
	 * made while handling the case.
	 * 
	 * @param dataholderHandler
	 *            holder that is queried for POS information and updated
	 * @param thisSentence
	 *            the sentence text
	 * @param thisLead
	 *            the lead words of the sentence
	 * @return a pair of (tag, sign), or null if the sentence or the lead is
	 *         null
	 */
	public StringAndInt doItCaseHandle(DataHolder dataholderHandler, String thisSentence, String thisLead) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger
				.getLogger("learn.discover.ruleBasedLearn.doIt.doItCaseHandle");

		myLogger.trace("Enter doItCaseHandle");
		myLogger.trace("Sentence: " + thisSentence);
		myLogger.trace("Lead: " + thisLead);

		if (thisSentence == null || thisLead == null) {
			return null;
		}

		int sign = 0;
		String tag = "";

		List<String> words = Arrays.asList(thisLead.split("\\s+"));
		String ptn = this.getPOSptn(dataholderHandler, words);
		myLogger.trace("ptn: " + ptn);

		// The pattern has one symbol per lead word, so a match position in
		// ptn is also the index of the corresponding word in words.
		Pattern p2 = Pattern.compile("ps");
		Matcher m2 = p2.matcher(ptn);

		Pattern p3 = Pattern.compile("p(\\?)");
		Matcher m3 = p3.matcher(ptn);

		Pattern p4 = Pattern.compile("[psn](b)");
		Matcher m4 = p4.matcher(ptn);

		Pattern p5 = Pattern.compile("([psn][psn]+)");
		Matcher m5 = p5.matcher(ptn);

		Pattern p6A = Pattern.compile("b[?b]([psn])$");
		Matcher m6A = p6A.matcher(ptn);

		Pattern p6B = Pattern.compile("[?b]b([psn])$");
		Matcher m6B = p6B.matcher(ptn);

		// evaluated eagerly because case 6 needs to know which variant hit
		boolean case6A = m6A.find();
		boolean case6B = m6B.find();

		Pattern p7 = Pattern.compile("^s(\\?)$");
		Matcher m7 = p7.matcher(ptn);

		Pattern p10 = Pattern.compile("^\\?(b)");
		Matcher m10 = p10.matcher(ptn);

		// Case 1: single word case
		if (ptn.matches("^[pns]$")) {
			myLogger.trace("Case 1");
			tag = words.get(0);
			sign = sign
					+ dataholderHandler.updateDataHolder(tag, ptn, "-",
							"wordpos", 1);
			myLogger.debug("Directly markup with tag: " + tag + "\n");
		}

		// Case 2: "ps"
		else if (m2.find()) {
			myLogger.trace("Case 2");
			myLogger.debug("Found [ps] pattern\n");
			int start = m2.start();
			int end = m2.end();
			String pWord = words.get(start);
			String sWord = words.get(end - 1);
			// tag = all words up to and including the p word
			List<String> tempWords = StringUtility.stringArraySplice(words, 0,
					start + 1);
			tag = StringUtility.joinList(" ", tempWords);

			myLogger.debug("\tdetermine the tag: " + tag);

			int returnedSign = 0;
			returnedSign = dataholderHandler.updateDataHolder(pWord, "p", "-",
					"wordpos", 1);
			sign += returnedSign;
			myLogger.trace(String.format(
					"updateDataHolder(%s, p, -, wordpos, 1), returned: %d",
					pWord, returnedSign));

			returnedSign = dataholderHandler.updateDataHolderNN(0,
					tempWords.size(), tempWords);
			sign += returnedSign;
			myLogger.trace(String.format(
					"updateDataHolderNN(0, %d, %s), returned: %d",
					tempWords.size(), tempWords.toString(), returnedSign));

			// the word after the p word is recorded as "b"
			returnedSign = dataholderHandler.updateDataHolder(sWord, "b", "",
					"wordpos", 1);
			sign += returnedSign;
			myLogger.trace(String.format(
					"updateDataHolder(%s, b, , wordpos, 1), returned: %d",
					sWord, returnedSign));
		}

		// Case 3: "p(\\?)"
		else if (m3.find()) {
			myLogger.trace("Case 3");
			myLogger.debug("Found [p?] pattern");

			// int start = m3.start(1);
			int end = m3.end(1);

			// the word behind the "?" symbol
			String secondMatchedWord = words.get(end - 1);

			// case 3.1
			if (StringUtils.equals(this.myWordFormUtility.getNumber(secondMatchedWord), "p")) {
				myLogger.trace("Case 3.1");
				tag = secondMatchedWord;
				sign = sign
						+ dataholderHandler.updateDataHolder(tag, "p", "-",
								"wordpos", 1);
				dataholderHandler
						.add2Holder(
								DataHolder.ISA,
								Arrays.asList(new String[] { tag,
										words.get(end - 2) }));
				myLogger.debug("\t:[p p] pattern: determine the tag: " + tag);
			}
			// case 3.2
			else {
				myLogger.trace("Case 3.2");

				List<String> wordsCopy = new ArrayList<String>(words);
				// $i is just end-1
				List<String> tempWords = StringUtility.stringArraySplice(words,
						0, end - 1);
				tag = StringUtility.joinList(" ", tempWords);

				myLogger.debug("\t:determine the tag: " + tag);
				myLogger.debug("\t:updates on POSs");

				int temp = 0;
				temp = dataholderHandler.updateDataHolder(
						wordsCopy.get(end - 1), "b", "", "wordpos", 1);
				sign += temp;
				myLogger.debug("\t:updateDataHolder1 returns " + temp);

				temp = dataholderHandler.updateDataHolder(
						wordsCopy.get(end - 2), "p", "-", "wordpos", 1);
				sign += temp;
				myLogger.debug("\t:updateDataHolder2 returns " + temp);

				temp = dataholderHandler.updateDataHolderNN(0,
						tempWords.size(), tempWords);
				sign += temp;
				myLogger.debug("\t:updateDataHolder returns " + temp);
			}
		}

		// case 4: "[psn](b)"
		else if (m4.find()) {
			myLogger.trace("Case 4");
			Pattern p41 = Pattern.compile("^sbp");
			Matcher m41 = p41.matcher(ptn);

			if (m41.find()) {
				myLogger.trace("\tCase 4.1");
				myLogger.debug("Found [sbp] pattern");
				List<String> wordsCopy = new ArrayList<String>(words);
				tag = StringUtility.joinList(" ",
						StringUtility.stringArraySplice(wordsCopy, 0, 3));
				myLogger.trace("\t:determine the tag: " + tag);
			} else {
				myLogger.trace("\tCase 4.2");
				myLogger.debug("Found [[psn](b)] pattern");

				int index = m4.start(1);

				// get tag, which is the words prior to the b word (exclusive)
				List<String> wordsTemp = StringUtility.stringArraySplice(words,
						0, index);
				tag = StringUtility.joinList(" ", wordsTemp);
				myLogger.trace("Tag: " + tag);

				// update the b word
				sign += dataholderHandler.updateDataHolder(words.get(index),
						"b", "", "wordpos", 1);
				myLogger.trace(String.format(
						"updateDataHolder (%s, b, , wordpos, 1)",
						words.get(index)));

				// update the word in front of the b word with its own symbol
				sign += dataholderHandler.updateDataHolder(
						words.get(index - 1), ptn.substring(index - 1, index),
						"-", "wordpos", 1);

				myLogger.trace(String.format(
						"updateDataHolder (%s, %s, -, wordpos, 1)",
						words.get(index - 1), ptn.substring(index - 1, index)));

				sign += dataholderHandler.updateDataHolderNN(0,
						wordsTemp.size(), wordsTemp);
				myLogger.trace(String.format("updateDataHolderNN (0, %d, %s)",
						wordsTemp.size(), wordsTemp.toString()));

				myLogger.debug("\t:determine the tag: " + tag);
				myLogger.debug("\t:updates on POSs");
			}
		}

		// case 5: "pp"
		else if (m5.find()) {
			myLogger.debug("Case 5: Found [[psn][psn]+] pattern");
			int start = m5.start(1);
			int end = m5.end(1);
			GetNounsAfterPtnReturnValue returnedValue = this.getNounsAfterPtn(dataholderHandler,
					thisSentence, end);
			List<String> moreNoun = new LinkedList<String>();
			List<String> morePtn = new LinkedList<String>();
			String bWord = "";

			moreNoun.addAll(returnedValue.getNouns());
			morePtn.addAll(returnedValue.getNounPtn());
			bWord = returnedValue.getBoundaryWord();
			List<POSInfo> t;

			if (StringUtility.createMatcher(ptn, "pp").find()) {
				myLogger.trace("Case 5.1");

				// Positions in morePtnStr are used as indices into moreNoun;
				// this assumes every morePtn entry is a single character.
				String morePtnStr = StringUtility.joinList("", morePtn);
				// Fixed: the pattern used to be "/^p*(s)". The stray leading
				// "/" could never match, which made case 5.1.1 unreachable.
				Pattern p511 = Pattern.compile("^p*(s)");
				Matcher m511 = p511.matcher(morePtnStr);
				Pattern p512 = Pattern.compile("^(p+)");
				Matcher m512 = p512.matcher(morePtnStr);

				if (m511.find()) {
					myLogger.trace("Case 5.1.1");
					// find last p word, and reset it to "b"
					int sAfterPIndex = m511.start(1);
					int lastPIndex = sAfterPIndex - 1;
					String sWord = moreNoun.get(sAfterPIndex);
					String lastPWord = lastPIndex >= 0 ? moreNoun
							.get(lastPIndex) : "";
					bWord = lastPWord;
					if (StringUtils.equals(lastPWord, "")) {
						tag = words.get(ptn.lastIndexOf("p"));
					} else {
						tag = lastPWord;
					}
					sign += dataholderHandler.updateDataHolder(sWord, "b",
							"", "wordpos", 1);
				} else if (m512.find()) {
					myLogger.trace("Case 5.1.2");
					tag = moreNoun.get(m512.end(1) - 1);
				} else {
					myLogger.trace("Case 5.1.3");
					int lastPIndex = ptn.lastIndexOf("p");
					tag = words.get(lastPIndex);
				}
				t = dataholderHandler.checkPOSInfo(tag);
			} else {
				myLogger.trace("Case 5.2");
				List<String> tempWords = new LinkedList<String>();
				tempWords
						.addAll(StringUtility.stringArraySplice(words, 0, end));
				tag = StringUtility.joinList(" ", tempWords);
				if (moreNoun.size() > 0) {
					tag = tag + " " + StringUtility.joinList(" ", moreNoun);
				}

				// look up the POS of the last word of the tag
				t = dataholderHandler.checkPOSInfo(
						tag.substring(tag.lastIndexOf(" ") + 1, tag.length()));
			}

			if (t.size() > 0) {
				String pos = t.get(0).getPOS();
				// String role = t.get(0).getRole();
				// int certiantyU = t.get(0).getCertaintyU();
				// int certiantyL = t.get(0).getCertaintyL();

				if (StringUtility.createMatcher(pos, "[psn]").find()) {
					// case 5.x
					myLogger.debug("Case 5.x: relax this condition");
					List<String> tWords = new LinkedList<String>();
					tWords.addAll(Arrays.asList(thisSentence.split(" ")));
					sign += dataholderHandler.updateDataHolder(bWord, "b",
							"", "wordpos", 1);
					ptn = ptn.substring(start, end);
					String tempPtn = ptn + StringUtility.joinList("", morePtn);
					for (int k = start; k < tempPtn.length(); k++) {
						// role "_" for all but the last symbol, "-" for the last
						if (k != tempPtn.length() - 1) {
							sign += dataholderHandler.updateDataHolder(
									tWords.get(k), tempPtn.substring(k, k + 1),
									"_", "wordpos", 1);
						} else {
							sign += dataholderHandler.updateDataHolder(
									tWords.get(k), tempPtn.substring(k, k + 1),
									"-", "wordpos", 1);
						}
					}
					if (tWords.size() > 1) {
						sign += dataholderHandler.updateDataHolderNN(0,
								tempPtn.length(), tWords);
					}
				}
			}
			myLogger.debug("\t:determine the tag: " + tag);

		}

		// case 6: "b[?b]([psn])$" or "[?b]b([psn])$"
		else if (case6A || case6B) {
			myLogger.debug("Case 6: Found [b?[psn]$] or [[?b]b([psn])$] pattern");
			int end = -1;
			// the index of noun
			if (case6A) {
				end = m6A.end(1) - 1;
			} else {
				end = m6B.end(1) - 1;
			}
			GetNounsAfterPtnReturnValue tempReturnValue = this
					.getNounsAfterPtn(dataholderHandler, thisSentence, end + 1);
			// List<String> moreNouns = tempReturnValue.getNouns();
			List<String> morePtn = tempReturnValue.getNounPtn();
			String bWord = tempReturnValue.getBoundaryWord();

			List<String> sentenceHeadWords = tokenizeText(thisSentence, "firstseg");
			// extend the tag over the nouns that follow the lead
			end += morePtn.size();
			List<String> tempWords = StringUtility.stringArraySplice(
					sentenceHeadWords, 0, end + 1);
			tag = StringUtility.joinList(" ", tempWords);
			myLogger.debug("\t:updates on POSs");
			if (StringUtility.createMatcher(bWord, "\\w").find()) {
				sign += dataholderHandler.updateDataHolder(bWord, "b", "",
						"wordpos", 1);
			}
			String allPtn = "" + ptn;
			allPtn = allPtn + StringUtility.joinList("", morePtn);
			// from the index of noun
			for (int i = 2; i < allPtn.length(); i++) {
				// case 6.1: not the last ptn
				if (i != allPtn.length() - 1) {
					myLogger.trace("Case 6.1");
					sign += dataholderHandler.updateDataHolder(
							sentenceHeadWords.get(i),
							allPtn.substring(i, i + 1), "_", "wordpos", 1);
				}
				// case 6.2: last ptn
				else {
					myLogger.trace("Case 6.2");
					sign += dataholderHandler.updateDataHolder(
							sentenceHeadWords.get(i),
							allPtn.substring(i, i + 1), "-", "wordpos", 1);
				}
			}
			myLogger.debug("\t:determine the tag: " + tag);
		}

		// case 7: "^s(\\?)$"
		else if (m7.find()) {
			myLogger.trace("Case 7");
			String singularWord = words.get(0);
			String questionedWord = words.get(1);
			String wnPOS = this.myWordFormUtility.checkWN(
					questionedWord, "pos");

			if (StringUtility.createMatcher(wnPOS, "p").find()) {
				myLogger.trace("Case 7.1");
				tag = singularWord + " " + questionedWord;
				myLogger.debug("\t:determine the tag: " + tag);
				myLogger.debug("\t:updates on POSs");
				// NOTE(review): the number is taken from singularWord, not
				// from questionedWord - confirm this is intended.
				String questionedPOS = this.myWordFormUtility.getNumber(singularWord);
				sign += dataholderHandler.updateDataHolder(questionedWord,
						questionedPOS, "-", "wordpos", 1);
			} else {
				myLogger.trace("Case 7.2");
				tag = words.get(0);
				myLogger.debug("\t:determine the tag: " + tag);
				myLogger.debug("\t:updates on POSs");
				sign += dataholderHandler.updateDataHolder(questionedWord,
						"b", "", "wordpos", 1);
				sign += dataholderHandler.updateDataHolder(singularWord,
						"s", "-", "wordpos", 1);
			}
		}

		// case 8: "^bs$"
		else if (StringUtility.createMatcher(ptn, "^bs$").find()) {
			myLogger.trace("Case 8");
			tag = StringUtility.joinList(" ", words);
			sign += dataholderHandler.updateDataHolder(words.get(0), "b",
					"", "wordpos", 1);
			sign += dataholderHandler.updateDataHolder(words.get(1), "s",
					"-", "wordpos", 1);
		}

		// case 9: ^bp$
		else if (StringUtility.createMatcher(ptn, "^bp$").find()) {
			myLogger.trace("Case 9");
			tag = StringUtility.joinList(" ", words);
			sign += dataholderHandler.updateDataHolder(words.get(0), "b",
					"", "wordpos", 1);
			sign += dataholderHandler.updateDataHolder(words.get(1), "p",
					"-", "wordpos", 1);
		}

		// case 10: "^\\?(b)"
		else if (m10.find()) {
			myLogger.trace("Case 10");
			myLogger.trace("Found [?(b)] pattern");

			int index = m10.start(1);

			sign += dataholderHandler.updateDataHolder(words.get(index), "b",
					"", "wordpos", 1);
			myLogger.trace(String.format(
					"updateDataHolder (%s, b, , wordpos, 1)", words.get(index)));

			List<String> wordsTemp = StringUtility.stringArraySplice(words, 0,
					index);
			tag = StringUtility.joinList(" ", wordsTemp);
			String word = words.get(index - 1); // the "?" word

			myLogger.trace("Tag: " + tag);
			myLogger.trace("Word: " + word);

			if (!isFollowedByNoun(dataholderHandler, thisSentence, thisLead)) {
				myLogger.trace("Case 10.1");
				String wnP1 = this.myWordFormUtility.checkWN(word, "pos");
				myLogger.trace("wnP1: " + wnP1);
				String wnP2 = "";

				// fall back to the number of the word only when WordNet
				// gave no POS at all
				if (!StringUtility.createMatcher(wnP1, "\\w").find()) {
					wnP2 = this.myWordFormUtility.getNumber(word);
				}
				myLogger.trace("wnP2: " + wnP2);

				// a WordNet POS containing "a" or "r" is discarded
				if (StringUtility.createMatcher(wnP1, "[ar]").find()) {
					wnP1 = "";
				}

				if ((StringUtility.createMatcher(wnP1, "[psn]").find())
						|| (StringUtility.createMatcher(wnP2, "[ps]").find())) {
					myLogger.trace("Case 10.1.1");
					myLogger.debug("\t:determine the tag: " + tag);
					myLogger.debug("\t:updates on POSs");
					sign += dataholderHandler.updateDataHolder(word, "n", "-",
							"wordpos", 1);
					myLogger.trace(String.format(
							"updateDataHolder(%s, n, -, wordpos, 1)", word));
					sign += dataholderHandler.updateDataHolderNN(0,
							wordsTemp.size() - 1, wordsTemp);
					myLogger.trace(String.format(
							"updateDataHolderNN(%d, %d, %s)", 0,
							wordsTemp.size() - 1, wordsTemp));

				} else {
					myLogger.trace("Case 10.1.2");
					myLogger.debug("\t:" + tag
							+ " is adv/adj or modifier. skip.");
					tag = "";
				}
			} else {
				myLogger.trace("Case 10.2");
				myLogger.debug(String.format(
						"\t:%s is adv/adj or modifier. skip.", tag));
				tag = "";
			}
		} else {
			myLogger.trace("\tCase 0");
			myLogger.trace(String.format("Pattern [%s] is not processed", ptn));
		}

		StringAndInt returnValue = new StringAndInt(tag, sign);

		myLogger.trace("Return: " + returnValue.toString());
		return returnValue;
	}

	/**
	 * Locates the first occurrence of a regular expression in a pattern
	 * string.
	 * 
	 * @param regex
	 *            the regular expression to look for
	 * @param ptn
	 *            the pattern string to search
	 * @return the start offset of the first match plus one, or -1 if there is
	 *         no match
	 */
	public int doItCase7Helper(String regex, String ptn) {
		Matcher matcher = StringUtility.createMatcher(ptn, regex);
		return matcher.find() ? matcher.start() + 1 : -1;
	}
	
	/**
	 * Builds the POS pattern of a word list: one symbol per word, in order, so
	 * the length of the pattern equals the number of words.
	 * <p>
	 * For each word the first entry returned by checkPOSInfo supplies the POS.
	 * The symbol is "?" when the word has no entry, when checkPOSInfo returns
	 * null, or when the certainty of the entry is at most 50%. The certainty
	 * is certaintyU / certaintyL, except that a certaintyU of 0 is treated as
	 * full certainty (1.0).
	 * 
	 * @param dataholderHandler
	 *            holder that is queried for the POS information
	 * @param words
	 *            the words to encode
	 * @return the POS pattern
	 */
	public String getPOSptn(DataHolder dataholderHandler, List<String> words) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger
				.getLogger("learn.discover.ruleBasedLearn.doIt.getPOSptn");

		myLogger.trace("Enter getPOSptn");
		myLogger.trace("Words: " + words.toString());

		StringBuilder ptn = new StringBuilder();
		for (int i = 0; i < words.size(); i++) {

			String word = words.get(i);
			myLogger.trace("\tCheck word: " + word);
			List<POSInfo> POSInfoList = dataholderHandler.checkPOSInfo(word);

			String POS;
			if (POSInfoList == null) {
				// The former guard "size() >= 0" was always true, so this
				// error branch could never run and a null list crashed.
				// Emit "?" so that the pattern keeps one symbol per word.
				myLogger.error("Error: checkPOSInfo gave invalid return value");
				POS = "?";
			} else if (POSInfoList.size() == 0) {
				myLogger.trace("\t\tThe word is not in WordPOS holder");
				POS = "?";
			} else {
				POSInfo p = POSInfoList.get(0);
				POS = p.getPOS();

				double certainty;
				if (p.getCertaintyU() == 0) {
					certainty = 1.0;
				} else {
					double certaintyU = (double) p.getCertaintyU();
					double certaintyL = (double) p.getCertaintyL();
					certainty = certaintyU / certaintyL;
				}

				myLogger.trace(String.format("\t\tCertaintyU: %d",
						p.getCertaintyU()));
				myLogger.trace(String.format("\t\tCertaintyL: %d",
						p.getCertaintyL()));
				myLogger.trace(String
						.format("\t\tCertainty: %f", certainty));
				if ((!StringUtils.equals(POS, "?")) && (certainty <= 0.5)) {
					myLogger.info("\t\tThis POS has a certainty less than 0.5. It is ignored.");
					POS = "?";
				}
			}

			ptn.append(POS);
			myLogger.trace("\t\tAdd pos: " + POS);
		}

		myLogger.trace("Return ptn: " + ptn);
		myLogger.trace("Quite getPOSptn");

		return ptn.toString();
	}
	
	/**
	 * Collects the run of nouns that starts at a given word position of a
	 * sentence, together with the word that ends the run.
	 * <p>
	 * The sentence is tokenized with tokenizeText(sentence, "firstseg") and
	 * the words from startWordIndex on are encoded by getPOSptn. If that
	 * pattern starts with a run of [psn] or with a run of "?", the words of
	 * the run are examined one by one; a "?" word is looked up through
	 * checkWN(word, "pos"). Words whose POS consists only of p, s and n are
	 * collected. The first word that fails this test becomes the boundary word
	 * and ends the scan; if none fails, the boundary word is the word right
	 * after the run (or "" if there is none).
	 * 
	 * @param dataholderHandler
	 *            holder that is queried for the POS information
	 * @param sentence
	 *            the sentence text
	 * @param startWordIndex
	 *            index of the first word to look at
	 * @return the collected nouns, their POS symbols and the boundary word
	 */
	public GetNounsAfterPtnReturnValue getNounsAfterPtn(DataHolder dataholderHandler, String sentence,
			int startWordIndex) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger.getLogger("learn.getNounsAfterPattern");
		myLogger.trace(String
				.format("enter (%s, %d)", sentence, startWordIndex));

		String boundaryWord = "";
		List<String> nouns = new ArrayList<String>();
		List<String> nounPtn = new ArrayList<String>();

		List<String> allWords = new ArrayList<String>(tokenizeText(sentence,
				"firstseg"));
		List<String> words = StringUtility.stringArraySplice(allWords,
				startWordIndex, allWords.size());
		myLogger.trace("words: " + words);
		String ptn = this.getPOSptn(dataholderHandler, words);
		myLogger.trace("ptn: " + ptn);

		if (ptn != null) {
			Matcher nounRun = StringUtility.createMatcher(ptn, "^([psn]+)");
			Matcher unknownRun = StringUtility.createMatcher(ptn, "^(\\?+)");

			// -1 means "neither run found"; a found run always ends at >= 1
			int end = nounRun.find() ? nounRun.end(1) : -1;
			if (unknownRun.find()) {
				end = unknownRun.end(1);
			}

			if (end != -1) {
				myLogger.trace("end: " + end);
				if (end < words.size()) {
					boundaryWord = words.get(end);
				}
				List<String> runWords = new ArrayList<String>(
						StringUtility.stringArraySplice(words, 0, end));
				for (int i = 0; i < runWords.size(); i++) {
					String candidate = runWords.get(i);
					String pos = ptn.substring(i, i + 1);
					if (StringUtils.equals(pos, "?")) {
						pos = this.myWordFormUtility.checkWN(candidate, "pos");
					}
					if (!StringUtility.createMatcher(pos, "^[psn]+$").find()) {
						boundaryWord = candidate;
						break;
					}
					nouns.add(candidate);
					nounPtn.add(pos);
				}
			}
		}

		GetNounsAfterPtnReturnValue returnValue = new GetNounsAfterPtnReturnValue(
				nouns, nounPtn, boundaryWord);
		myLogger.trace("return " + returnValue);
		return returnValue;
	}

	/**
	 * Check if a lead is followed by a noun without any preposition in between
	 * in the sentence.
	 * <p>
	 * The lead is removed from the start of the sentence as literal text. The
	 * rest is searched for the first whole-word occurrence of any word that
	 * the data holder lists with POS "p" or "s". The text in front of that
	 * noun must not contain any alternative of the PREPOSITION constant as a
	 * whole word.
	 * 
	 * @param dataholderHandler
	 *            holder that supplies the known nouns
	 * @param sentence
	 *            the sentence
	 * @param lead
	 *            the lead
	 * @return true if lead is followed by a N without any preposition in
	 *         between; false otherwise, including when the sentence or lead
	 *         is null, the sentence is empty or no nouns are known
	 */
	public boolean isFollowedByNoun(DataHolder dataholderHandler, String sentence, String lead) {
		PropertyConfigurator.configure("conf/log4j.properties");
		Logger myLogger = Logger
				.getLogger("learn.discover.ruleBasedLearn.doIt.isFollowedByNoun");
		myLogger.trace(String.format("(%s, %s)", sentence, lead));

		// null case
		if (sentence == null || lead == null) {
			myLogger.trace("Return false");
			return false;
		}

		if (StringUtils.equals(sentence, "")) {
			myLogger.trace("Return false");
			return false;
		}

		// remove lead from sentence
		// Fixed: the lead used to be spliced into a regex ("^" + lead), so a
		// lead containing regex metacharacters such as "(" or "+" either
		// threw a PatternSyntaxException or removed the wrong text. The lead
		// is plain text, so strip it as a literal prefix.
		if (sentence.startsWith(lead)) {
			sentence = sentence.substring(lead.length());
		}
		myLogger.trace("Sentence after remove lead: " + sentence);

		// List<String> nouns = this.myDataHolder.getWordByPOS("ps");
		Set<String> POSTags = new HashSet<String>();
		POSTags.add("p");
		POSTags.add("s");
		Set<String> nouns = dataholderHandler.getWordsFromWordPOSByPOSs(POSTags);

		if (nouns.size() == 0) {
			myLogger.trace("Return false");
			return false;
		}

		// String pattern1 = StringUtility.joinList("|", nouns);
		String pattern1 = StringUtils.join(nouns, "|");

		// group 1 (reluctant) captures the text in front of the first noun
		pattern1 = "(.*?)\\b(" + pattern1 + ")" + "\\b";
		myLogger.trace("Pattern: " + pattern1);

		Pattern p1 = Pattern.compile(pattern1);
		Matcher m1 = p1.matcher(sentence);

		String inBetweenPart = "";
		if (m1.find()) {
			inBetweenPart = m1.group(1);

			String pattern2 = "\\b(" + this.myConstant.PREPOSITION + ")\\b";
			Pattern p2 = Pattern.compile(pattern2);
			Matcher m2 = p2.matcher(inBetweenPart);
			if (!m2.find()) {
				myLogger.trace("Return true");
				return true;
			}
		}
		myLogger.trace("Return false");
		return false;
	}

}