Initializer.java example

Explorer

charaparser-unsupervised-master
- src
  - main
    - java
      - semanticMarkup
        core
        Treatment.java
        io
        input
        lib
        db
        ParentTagProvider.java
        know
        IGlossary.java
        IPOSKnowledgeBase.java
        Stemmer.java
        lib
        InMemoryGlossary.java
        WordNetPOSKnowledgeBase.java
        knowledge
        KnowledgeBase.java
        ling
        Token.java
        learn
        Configuration.java
        ITerminologyLearner.java
        Learner.java
        UnsupervisedClauseMarkup.java
        auxiliary
        AjectiveReplacementForNoun.java
        FileLoader.java
        GetNounsAfterPtnReturnValue.java
        KnownTagCollection.java
        POS.java
        POSInfo.java
        SentenceLeadLengthComparator.java
        StringAndInt.java
        StringPair.java
        dataholder
        DataHolder.java
        DiscountedKey.java
        IsAValue.java
        ModifierTableValue.java
        SentenceStructure.java
        SingularPluralPair.java
        WordPOSKey.java
        WordPOSValue.java
        knowledge
        AdditionalBootstrapping.java
        AdditionalBootstrappingLearner.java
        AdjectiveSubjectBootstrappingLearner.java
        AdjectiveVerifier.java
        AndOrTagSetter.java
        AnnotationNormalizer.java
        CommaAsAndAnnotator.java
        CommonSubstructureAnnotator.java
        Constant.java
        CoreBootstrappingLearner.java
        DittoAnnotator.java
        FiniteSetsLoader.java
        HeuristicNounLearnerUseMorphology.java
        HeuristicNounLearnerUseSuffix.java
        IModule.java
        IgnorePatternAnnotator.java
        IgnoredFinalizer.java
        Initializer.java
        MarkupByPatternLearner.java
        ModifierTagSeparator.java
        NMBResolver.java
        NullSentenceTagger.java
        POSBasedAnnotator.java
        PatternBasedAnnotator.java
        PhraseClauseAnnotator.java
        PronounCharactersAnnotator.java
        UnknownWordBootstrappingLearner.java
        utility
        LearnerUtility.java
        StringUtility.java
        WordFormUtility.java
        pos
        POS.java
        transform
        ITokenizer.java
        lib
        OpenNLPSentencesTokenizer.java
        OpenNLPTokenizer.java
  - test
    - java
      - semanticMarkup
        ling
        learn
        DataHolderTest.java
        LearnerTest.java
        LearnerUtilityTest.java
        SentenceLeadLengthComparatorTest.java
        StringUtilityTest.java
        UnsupervisedClauseMarkupTest.java
        WordFormUtilityTest.java
        knowledge
        AndOrTagSetterTest.java
        CommonSubstructureAnnotatorTest.java
        CoreBootstrappingLearnerTest.java
        InitializerTest.java
        POSBasedAnnotatorTest.java
        UnknownWordBootstrappingTest.java

package semanticMarkup.ling.learn.knowledge;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import semanticMarkup.core.Treatment;
import semanticMarkup.ling.Token;
import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.utility.LearnerUtility;
import semanticMarkup.ling.learn.utility.StringUtility;

/**
 * Learner module that loads the descriptions of a set of {@link Treatment}s
 * into a {@link DataHolder}: it normalises each description, splits it into
 * sentences, stores one sentence record per usable sentence and finally
 * registers every collected word in the unknown-words table.
 *
 * <p>
 * The regular expressions used here rely on {@code .} and on
 * {@code lookingAt()}/{@code matches()}, so text containing line terminators
 * is only partially handled; callers are presumably expected to pass
 * single-line descriptions (TODO confirm).
 */
public class Initializer implements IModule {

	/** A sentence is only kept when it contains at least one word character. */
	private static final Pattern SENTENCE_HAS_WORD = Pattern.compile("^.*\\w+.*$");

	/** Two digits separated by a {@code [DOT]} marker, e.g. {@code 1 [DOT] 5}. */
	private static final Pattern DOT_BETWEEN_DIGITS = Pattern
			.compile("(\\d)\\s*\\[\\s*DOT\\s*\\]\\s*(\\d)");

	/** Whitespace in front of ':', ';' or '.', e.g. {@code absent ;}. */
	private static final Pattern SPACE_BEFORE_PUNCT = Pattern
			.compile("(^.*?)\\s+([:;\\.].*$)");

	/** ':', ';' or '.' squeezed between two word characters, e.g. {@code absent;blade}. */
	private static final Pattern PUNCT_BETWEEN_WORDS = Pattern
			.compile("(^.*?\\w)([:;\\.])(\\w.*$)");

	/** A decimal number broken up by whitespace after the point, e.g. {@code 1 . 5}. */
	private static final Pattern SPLIT_DECIMAL = Pattern
			.compile("(^.*?\\d\\s*\\.)\\s+(\\d.*$)");

	/** A period after a length unit that is not followed by a capitalised word. */
	private static final Pattern UNIT_PERIOD = Pattern
			.compile("(^.*\\d\\s+(cm|mm|dm|m)\\s*)\\.(\\s+[^A-Z].*$)");

	/** One or more hyphens in front of a lower-case letter, e.g. {@code basi- and}. */
	private static final Pattern HYPHEN_BEFORE_LOWER = Pattern
			.compile("(^.*?)\\s*[-]+\\s*([a-z].*$)");

	/** The last capitalised word that is preceded by at least one character. */
	private static final Pattern CAPITALIZED_WORD = Pattern
			.compile("(.+)\\b([A-Z][a-z]*)\\b");

	/** A word containing at least one word character. */
	private static final Pattern WORD_HAS_WORD_CHAR = Pattern.compile("^.*\\w.*$");

	/** A word ending in "ous". */
	private static final Pattern ENDS_WITH_OUS = Pattern.compile("^.*ous$");

	private final LearnerUtility myLearnerUtility;
	private final List<Treatment> treatments;
	private final int numLeadWords;

	/**
	 * @param learnerUtility
	 *            helper used for sentence segmentation, bracket mark
	 *            hiding/restoring and word extraction
	 * @param num
	 *            number of leading words passed to
	 *            {@code getFirstNWords} when building a sentence lead
	 */
	public Initializer(LearnerUtility learnerUtility, int num) {
		this.myLearnerUtility = learnerUtility;
		this.numLeadWords = num;
		this.treatments = new LinkedList<Treatment>();
	}

	/**
	 * Populates the sentence records from the loaded treatments, then the
	 * unknown-words table from the words collected while doing so.
	 *
	 * @param myDataHolder
	 *            holder that receives the sentences and words
	 */
	@Override
	public void run(DataHolder myDataHolder) {
		this.populateSentence(this.treatments, myDataHolder);
		this.populateUnknownWordsTable(myDataHolder.allWords, myDataHolder);
	}

	/**
	 * Appends the given treatments to the ones processed by
	 * {@link #run(DataHolder)}.
	 *
	 * @param treatments
	 *            treatments to add
	 */
	public void loadTreatments(List<Treatment> treatments) {
		this.treatments.addAll(treatments);
	}

	/**
	 * Reads the description of every treatment, segments it into sentences
	 * and stores each sentence that contains a word character in the data
	 * holder. Treatments whose description is {@code null} are skipped. All
	 * words of the processed sentences are accumulated in
	 * {@code myDataHolder.allWords}.
	 *
	 * @param treatments
	 *            treatments to read
	 * @param myDataHolder
	 *            holder that receives the sentences and the word collection
	 * @return number of sentences added to the data holder
	 */
	public int populateSentence(List<Treatment> treatments, DataHolder myDataHolder) {
		PropertyConfigurator.configure( "conf/log4j.properties" );
		Logger myLogger = Logger.getLogger("learn.populateSentence");
		myLogger.info("Enter");
		myLogger.info("Reading sentences...");

		int sentenceCount = 0;

		for (Treatment tm : treatments) {
			String fileName = tm.getFileName();
			String text = tm.getDescription();
			int type = this.myLearnerUtility.getType(fileName);

			if (text == null) {
				continue;
			}

			// normalise the whole description before segmentation
			text = this.handleText(text);
			myLogger.debug("Text: " + text);

			List<Token> sentences = this.myLearnerUtility.segmentSentence(text);

			// sentences that passed the filter, and their unprocessed copies
			List<Token> validSentences = new ArrayList<Token>();
			List<String> sentCopy = new ArrayList<String>();

			// first pass: filter, process and collect words of every sentence
			int position = 0;
			for (Token sentence : sentences) {
				myLogger.debug("Sentence " + position + ": " + sentence.getContent());
				position++;

				// if(!/\w+/){next;}
				if (!SENTENCE_HAS_WORD.matcher(sentence.getContent()).matches()) {
					continue;
				}
				validSentences.add(sentence);

				// restore marks in brackets and keep a copy of that form
				sentence.setContent(this.myLearnerUtility
						.restoreMarksInBrackets(sentence.getContent()));
				sentCopy.add(sentence.getContent());

				// process the sentence
				sentence.setContent(this.handleSentence(sentence.getContent()));

				// store all words
				myDataHolder.allWords = this.myLearnerUtility.getAllWords(
						sentence.getContent(), myDataHolder.allWords);
			}

			// second pass: build and store one record per valid sentence
			for (int j = 0; j < validSentences.size(); j++) {
				String line = validSentences.get(j).getContent();
				String oline = sentCopy.get(j);

				// remove all ' to avoid escape problems
				// $line =~ s#'# #g;
				// (the result has to be assigned: strings are immutable)
				line = line.replaceAll("\'", " ");

				// Join digits separated by a [DOT] marker. Each pair is
				// rebuilt from its own captured digits.
				// NOTE(review): this only fires when the sentence starts
				// with such a pair, and the join does not reinsert a "." -
				// confirm both against the intended behaviour.
				Matcher dotMatcher = DOT_BETWEEN_DIGITS.matcher(oline);
				if (dotMatcher.lookingAt()) {
					oline = dotMatcher.replaceAll("$1$2");
				}

				// restore ".", "?", ";", ":", "."
				oline = this.myLearnerUtility.restoreMarksInBrackets(oline);
				oline = oline.replaceAll("\'", " ");

				List<String> nWords = this.myLearnerUtility.getFirstNWords(line,
						this.numLeadWords);

				// status is "start" when the first lead word is reported
				// as "p" by getNumber, "normal" otherwise (also when there
				// is no lead word at all)
				String status = "normal";
				if (!nWords.isEmpty()
						&& "p".equals(myLearnerUtility.getWordFormUtility()
								.getNumber(nWords.get(0)))) {
					status = "start";
				}

				String lead = this.buildLead(nWords);

				String source = fileName + "-" + Integer.toString(j);
				if (oline.length() >= 2000) { // EOL
					oline = line;
				}

				myDataHolder.addSentence(source, line, oline, lead,
						status, null, null, typeName(type));

				sentenceCount++;
			}
		}

		myLogger.info("Total sentences = " + sentenceCount);
		myLogger.info("Quite");

		return sentenceCount;
	}

	/**
	 * Joins the lead words with single spaces and strips surrounding
	 * whitespace.
	 */
	private String buildLead(List<String> words) {
		StringBuilder joined = new StringBuilder();
		for (String w : words) {
			joined.append(w).append(" ");
		}
		String lead = joined.toString().replaceAll("\\s$", "");
		lead = StringUtility.removeAll(lead, "\\s+$");
		lead = StringUtility.removeAll(lead, "^\\s*");
		return lead.replaceAll("\\s+", " ");
	}

	/**
	 * Maps the numeric type code to the stored type string; codes other
	 * than 1 and 2 yield {@code null}.
	 */
	private static String typeName(int type) {
		switch (type) {
		case 1:
			return "character";
		case 2:
			return "description";
		default:
			return null;
		}
	}

	/**
	 * A helper of method populateSentence that normalises a description
	 * before sentence segmentation.
	 *
	 * @param t
	 *            raw description text
	 * @return text after processing; {@code null} or the empty string are
	 *         returned unchanged
	 */
	public String handleText(String t) {
		if (t == null || t.isEmpty()) {
			return t;
		}

		String text = t;

		// drop single and double quotes
		text = text.replaceAll("[\"']", "");

		// plano - to
		text = text.replaceAll("\\s*-\\s*to\\s+", " to ");

		// "_shaped", "--shaped", ... => "-shaped"
		text = text.replaceAll("[-_]+shaped", "-shaped");

		// unhide <i> and </i>, these will be used by characterHeuristics to
		// collect taxon names. This has to happen before the entity removal
		// below, which would otherwise erase the escaped tags.
		// NOTE(review): the previous code replaced "<i>" by "<i>" (a no-op);
		// the escaped form is inferred from the "unhide" intent - confirm.
		text = text.replace("&lt;i&gt;", "<i>");
		text = text.replace("&lt;/i&gt;", "</i>");

		// remove 2a. (key marks); the dot is escaped so that ordinary
		// leading text such as "2cm " is no longer stripped
		text = text.replaceAll("^\\s*\\d+[a-z]\\.\\s*", "");

		// this is not used any more, see perl code - Dongye
		// store text at this point in original
		// String original = text;

		// remove HTML entities
		text = text.replaceAll("&[;#\\w\\d]+;", " ");

		// spell out a free-standing ampersand
		text = text.replaceAll(" & ", " and ");

		// replace '.', '?', ';', ':', '!' within brackets by some
		// special markers, to avoid split within brackets during
		// sentence segmentation
		text = this.myLearnerUtility.hideMarksInBrackets(text);

		text = text.replaceAll("_", "-"); // _ to -
		// NOTE(review): a replaceAll("", "") followed here; it had no effect
		// and was dropped. Its pattern may have lost a special character.

		Matcher m;

		// absent ; => absent;
		while ((m = SPACE_BEFORE_PUNCT.matcher(text)).lookingAt()) {
			text = m.group(1) + m.group(2);
		}

		// absent;blade => absent; blade
		while ((m = PUNCT_BETWEEN_WORDS.matcher(text)).lookingAt()) {
			text = m.group(1) + m.group(2) + " " + m.group(3);
		}

		// 1 . 5 => 1.5 (also undoes the space the previous step put into
		// decimal numbers)
		while ((m = SPLIT_DECIMAL.matcher(text)).lookingAt()) {
			text = m.group(1) + m.group(2);
		}

		// "3 cm. long" => "3 cm[DOT] long": mark the period after a length
		// unit when the next token does not start with a capital letter
		while ((m = UNIT_PERIOD.matcher(text)).lookingAt()) {
			text = m.group(1) + "[DOT]" + m.group(3);
		}

		return text;
	}

	/**
	 * remove bracketed text from sentence (keep those in originalsent). This
	 * step will not be able to remove nested brackets, such as (petioles
	 * (2-)4-8 cm). Nested brackets will be removed after threedsent step in
	 * POSTagger4StanfordParser.java
	 *
	 * <p>
	 * Afterwards hyphens in front of lower-case letters are rewritten, non
	 * word characters are spaced out, whitespace is collapsed and trimmed,
	 * capitalised words are recorded via {@link #recordProperNouns(String)}
	 * and the sentence is lower-cased.
	 *
	 * @param s
	 *            sentence to be handled
	 * @return sentence after being processed; {@code null} or the empty
	 *         string are returned unchanged
	 */
	public String handleSentence(String s) {
		if (s == null || s.isEmpty()) {
			return s;
		}

		String sentence = s;

		// remove (.a.)
		sentence = sentence.replaceAll("\\([^()]*?[a-zA-Z][^()]*?\\)", " ");

		// remove [.a.]
		sentence = sentence.replaceAll("\\[[^\\]\\[]*?[a-zA-Z][^\\]\\[]*?\\]",
				" ");

		// remove {.a.}
		sentence = sentence.replaceAll("\\{[^{}]*?[a-zA-Z][^{}]*?\\}", " ");

		// to fix basi- and hypobranchial
		Matcher hyphen;
		while ((hyphen = HYPHEN_BEFORE_LOWER.matcher(sentence)).lookingAt()) {
			sentence = hyphen.group(1) + "_ " + hyphen.group(2);
		}

		// add space around nonword char
		sentence = this.myLearnerUtility.addSpace(sentence, "\\W");

		// multiple spaces => 1 space
		sentence = sentence.replaceAll("\\s+", " ");

		// trim: remove leading and ending spaces
		sentence = sentence.replaceAll("^\\s*", "");
		sentence = sentence.replaceAll("\\s*$", "");

		// has to run before lower-casing, it relies on capital letters
		recordProperNouns(sentence);

		// all to lower case
		sentence = sentence.toLowerCase();

		return sentence;
	}

	/**
	 * Records, in lower case, every capitalised word of the sentence that is
	 * longer than one character and preceded by at least one character. A
	 * capital letter directly following an opening bracket is blanked out
	 * first, so such words are not recorded. Each recorded word is added to
	 * the {@code pronounWords} collection of the learner's constants,
	 * followed by a call to {@code updatePronoun()}.
	 *
	 * @param sentence
	 *            sentence to scan; {@code null} is ignored
	 */
	public void recordProperNouns(String sentence) {
		if (sentence == null) {
			return;
		}

		String remaining = sentence.replaceAll("[(\\[{]\\s*[A-Z]", " ");

		// The greedy prefix makes each match pick the LAST capitalised word;
		// the text is then cut back to that prefix and searched again.
		Matcher m = CAPITALIZED_WORD.matcher(remaining);
		while (m.find()) {
			String candidate = m.group(2).toLowerCase();
			// print "find a pn [$pn] in [$sent]\n\n" if $debug;
			remaining = m.group(1);
			if (candidate.length() > 1) {
				// add pattern into proper nouns
				this.myLearnerUtility.getConstant().pronounWords.add(candidate);
				this.myLearnerUtility.getConstant().updatePronoun();
			}

			m = CAPITALIZED_WORD.matcher(remaining);
		}

		// test case:
		//[recordpropernouns] enter (Pronounced dorsal process on Meckelian element)
		// [recordpropernouns] add to PROPERNOUNS: (meckelian)
	}

	/**
	 * Insert all words in WORDS into getUnknownWordHolder(). Insert those formed by
	 * non words characters into getWordPOSHolder()
	 *
	 * <p>
	 * A word that contains no word character, or that ends in "ous", is
	 * added as unknown with itself as value and additionally passed to
	 * {@code updateDataHolder(word, "b", "", "wordpos", 1)}; every other word
	 * is added with the value "unknown".
	 *
	 * @param WORDS
	 *            words to insert (only the keys are read)
	 * @param myDataHolder
	 *            holder that receives the words
	 * @return number of words processed
	 */
	public int populateUnknownWordsTable(Map<String, Integer> WORDS, DataHolder myDataHolder) {
		PropertyConfigurator.configure( "conf/log4j.properties" );
		Logger myLogger = Logger.getLogger("learn.pupluateUnknownWords");

		myLogger.trace("Enter");
		int count = 0;

		for (String word : WORDS.keySet()) {
			boolean hasWordChar = WORD_HAS_WORD_CHAR.matcher(word).matches();
			if (!hasWordChar || ENDS_WITH_OUS.matcher(word).matches()) {
				myDataHolder.addUnknown(word, word);
				myDataHolder.updateDataHolder(word, "b", "", "wordpos", 1);
			} else {
				myDataHolder.addUnknown(word, "unknown");
			}
			count++;
		}

		myLogger.info("Total words = " + count);

		myLogger.trace("Return: "+count);
		myLogger.trace("Quite\n");
		return count;
	}

	/**
	 * @return the learner utility this initializer was created with
	 */
	public LearnerUtility getLearnerUtility(){
		return this.myLearnerUtility;
	}

}