package semanticMarkup.ling.learn.knowledge;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import semanticMarkup.core.Treatment;
import semanticMarkup.ling.Token;
import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.utility.LearnerUtility;
import semanticMarkup.ling.learn.utility.StringUtility;

/**
 * Learning module that fills a {@link DataHolder} with the sentences of the
 * loaded treatments and with the words found in those sentences.
 *
 * <p>
 * {@link #run(DataHolder)} first segments and normalizes every treatment
 * description into sentences ({@link #populateSentence(List, DataHolder)}) and
 * then registers every collected word
 * ({@link #populateUnknownWordsTable(Map, DataHolder)}).
 */
public class Initializer implements IModule {

	/** Sentences must contain at least one word character to be kept. */
	private static final Pattern HAS_WORD_CHARS = Pattern.compile("^.*\\w+.*$");

	/** A "[DOT]" marker placed between two digits. */
	private static final Pattern DOT_BETWEEN_DIGITS = Pattern
			.compile("(\\d)\\s*\\[\\s*DOT\\s*\\]\\s*(\\d)");

	/** "absent ;" - whitespace in front of ':', ';' or '.'. */
	private static final Pattern SPACE_BEFORE_PUNCT = Pattern
			.compile("(^.*?)\\s+([:;\\.].*$)");

	/** "absent;blade" - ':', ';' or '.' squeezed between word characters. */
	private static final Pattern PUNCT_BETWEEN_WORDS = Pattern
			.compile("(^.*?\\w)([:;\\.])(\\w.*$)");

	/** "1 . 5" - a decimal number broken up by whitespace. */
	private static final Pattern SPLIT_DECIMAL = Pattern
			.compile("(^.*?\\d\\s*\\.)\\s+(\\d.*$)");

	/** "3 cm. long" - a period after a unit that is not followed by a capital. */
	private static final Pattern PERIOD_AFTER_UNIT = Pattern
			.compile("(^.*\\d\\s+(cm|mm|dm|m)\\s*)\\.(\\s+[^A-Z].*$)");

	/** "basi- and" - hyphen(s) followed by a lower-case letter. */
	private static final Pattern HYPHEN_BEFORE_LOWERCASE = Pattern
			.compile("(^.*?)\\s*[-]+\\s*([a-z].*$)");

	/** Last capitalized word of a string, preceded by at least one character. */
	private static final Pattern CAPITALIZED_WORD = Pattern
			.compile("(.+)\\b([A-Z][a-z]*)\\b");

	/** Words that contain at least one word character. */
	private static final Pattern CONTAINS_WORD_CHAR = Pattern.compile("^.*\\w.*$");

	/** Words ending in "ous". */
	private static final Pattern ENDS_WITH_OUS = Pattern.compile("^.*ous$");

	private final LearnerUtility myLearnerUtility;
	private final List<Treatment> treatments;
	private final int numLeadWords;

	/**
	 * @param learnerUtility
	 *            utility used for segmentation, bracket handling and word
	 *            extraction
	 * @param num
	 *            number of leading words taken from each sentence as its "lead"
	 */
	public Initializer(LearnerUtility learnerUtility, int num) {
		this.myLearnerUtility = learnerUtility;
		this.numLeadWords = num;
		this.treatments = new LinkedList<Treatment>();
	}

	/**
	 * Populates the sentences of all loaded treatments, then registers every
	 * word collected in {@code myDataHolder.allWords}.
	 */
	@Override
	public void run(DataHolder myDataHolder) {
		this.populateSentence(this.treatments, myDataHolder);
		this.populateUnknownWordsTable(myDataHolder.allWords, myDataHolder);
	}

	/**
	 * Appends the given treatments to the ones processed by
	 * {@link #run(DataHolder)}.
	 *
	 * @param treatments
	 *            treatments to add
	 */
	public void loadTreatments(List<Treatment> treatments) {
		this.treatments.addAll(treatments);
	}

	/**
	 * Segments the description of every treatment into sentences, normalizes
	 * them and stores them in the data holder. Words of all kept sentences are
	 * accumulated in {@code myDataHolder.allWords}.
	 *
	 * @param treatments
	 *            treatments whose descriptions are read
	 * @param myDataHolder
	 *            receiver of the sentences and words
	 * @return number of sentences added to the data holder
	 */
	public int populateSentence(List<Treatment> treatments, DataHolder myDataHolder) {
		PropertyConfigurator.configure( "conf/log4j.properties" );
		Logger myLogger = Logger.getLogger("learn.populateSentence");
		myLogger.info("Enter");
		myLogger.info("Reading sentences...");

		int sentenceCount = 0;

		for (Treatment tm : treatments) {
			String fileName = tm.getFileName();
			String text = tm.getDescription();
			int type = this.myLearnerUtility.getType(fileName);

			if (text == null) {
				continue;
			}

			// process this text
			text = this.handleText(text);
			myLogger.debug("Text: " + text);

			// do sentence segmentation
			List<Token> sentences = this.myLearnerUtility.segmentSentence(text);

			// Sentences that survived the filter, and their unprocessed copies
			// (same index in both lists).
			List<Token> validSentences = new ArrayList<Token>();
			List<String> sentCopy = new ArrayList<String>();

			// Phase 1: normalize every sentence and collect its words.
			for (int j = 0; j < sentences.size(); j++) {
				Token sentence = sentences.get(j);
				myLogger.debug("Sentence " + j + ": " + sentence.getContent());

				// skip sentences without any word character
				if (!HAS_WORD_CHARS.matcher(sentence.getContent()).matches()) {
					continue;
				}
				validSentences.add(sentence);

				// restore marks in brackets
				sentence.setContent(this.myLearnerUtility
						.restoreMarksInBrackets(sentence.getContent()));
				// keep a copy of the sentence before it is processed
				sentCopy.add(sentence.getContent());
				// process the sentence
				sentence.setContent(this.handleSentence(sentence.getContent()));
				// store all words
				myDataHolder.allWords = this.myLearnerUtility.getAllWords(
						sentence.getContent(), myDataHolder.allWords);
			}

			// Phase 2: store every kept sentence.
			String typeStr = typeName(type);
			for (int j = 0; j < validSentences.size(); j++) {
				String line = validSentences.get(j).getContent();
				String oline = sentCopy.get(j);

				// remove all ' to avoid escape problems ($line =~ s#'# #g;).
				// Strings are immutable, so the result has to be assigned.
				line = line.replace('\'', ' ');

				// NOTE(review): lookingAt() only matches at the very start of
				// oline, and the digits of that first match are then used as
				// the replacement for every occurrence. This looks unintended,
				// but the intended replacement cannot be told from this file,
				// so the behavior is kept as is.
				Matcher dotMatcher = DOT_BETWEEN_DIGITS.matcher(oline);
				if (dotMatcher.lookingAt()) {
					String digits = dotMatcher.group(1) + dotMatcher.group(2);
					oline = dotMatcher.replaceAll(digits);
				}

				// restore ".", "?", ";", ":", "!"
				oline = this.myLearnerUtility.restoreMarksInBrackets(oline);
				oline = oline.replace('\'', ' ');

				List<String> nWords = this.myLearnerUtility.getFirstNWords(line,
						this.numLeadWords);
				String lead = buildLead(nWords);

				// "start" when the first word is reported as number "p",
				// "normal" otherwise (including sentences left without words).
				String status = "normal";
				if (!nWords.isEmpty()
						&& "p".equals(myLearnerUtility.getWordFormUtility()
								.getNumber(nWords.get(0)))) {
					status = "start";
				}

				String source = fileName + "-" + Integer.toString(j);

				// overly long original sentences are replaced by the
				// processed sentence
				if (oline.length() >= 2000) { // EOL
					oline = line;
				}

				myDataHolder.addSentence(source, line, oline, lead, status, null,
						null, typeStr);
				sentenceCount++;
			}
		}

		myLogger.info("Total sentences = " + sentenceCount);
		myLogger.info("Quite");

		return sentenceCount;
	}

	/** Joins the lead words with single spaces and trims the result. */
	private static String buildLead(List<String> words) {
		StringBuilder joined = new StringBuilder();
		for (String w : words) {
			joined.append(w).append(" ");
		}
		String lead = joined.toString().replaceAll("\\s$", "");
		lead = StringUtility.removeAll(lead, "\\s+$");
		lead = StringUtility.removeAll(lead, "^\\s*");
		return lead.replaceAll("\\s+", " ");
	}

	/** Maps the numeric file type to its name; null for any other value. */
	private static String typeName(int type) {
		switch (type) {
		case 1:
			return "character";
		case 2:
			return "description";
		default:
			return null;
		}
	}

	/**
	 * A helper of method populateSentence that normalizes a description before
	 * sentence segmentation.
	 *
	 * @param t
	 *            text to process
	 * @return text after processing; {@code t} itself when it is null or empty
	 */
	public String handleText(String t) {
		if (t == null || t.isEmpty()) {
			return t;
		}

		String text = t;

		// plano - to
		text = text.replaceAll("\\s*-\\s*to\\s+", " to ");

		// unhide <i> and </i>: entity-escaped italic tags are turned back into
		// tags before HTML entities are removed below; these will be used by
		// characterHeuristics to collect taxon names
		text = text.replaceAll("&lt;i&gt;", "<i>");
		text = text.replaceAll("&lt;/i&gt;", "</i>");

		// remove 2a. (key marks)
		// NOTE(review): the '.' is unescaped and matches any character.
		text = text.replaceAll("^\\s*\\d+[a-z].\\s*", "");

		// remove HTML entities
		text = text.replaceAll("&[;#\\w\\d]+;", " ");

		// replace '.', '?', ';', ':', '!' within brackets by some
		// special markers, to avoid split within brackets during
		// sentence segmentation
		text = this.myLearnerUtility.hideMarksInBrackets(text);

		text = text.replaceAll("_", "-"); // _ to -
		// NOTE(review): as written this call changes nothing; the pattern may
		// have lost a character - confirm against the original source.
		text = text.replaceAll("", "");

		// absent ; => absent;
		while (true) {
			Matcher m = SPACE_BEFORE_PUNCT.matcher(text);
			if (!m.lookingAt()) {
				break;
			}
			text = m.group(1) + m.group(2);
		}

		// absent;blade => absent; blade
		while (true) {
			Matcher m = PUNCT_BETWEEN_WORDS.matcher(text);
			if (!m.lookingAt()) {
				break;
			}
			text = m.group(1) + m.group(2) + " " + m.group(3);
		}

		// 1 . 5 => 1.5
		while (true) {
			Matcher m = SPLIT_DECIMAL.matcher(text);
			if (!m.lookingAt()) {
				break;
			}
			text = m.group(1) + m.group(2);
		}

		// "diam ." and "ca ." need no special handling:
		// they are already covered by "absent ; => absent;" above.

		// 3 cm. long => 3 cm[DOT] long
		while (true) {
			Matcher m = PERIOD_AFTER_UNIT.matcher(text);
			if (!m.lookingAt()) {
				break;
			}
			text = m.group(1) + "[DOT]" + m.group(3);
		}

		return text;
	}

	/**
	 * Remove bracketed text from sentence (keep those in originalsent). This
	 * step will not be able to remove nested brackets, such as (petioles
	 * (2-)4-8 cm). Nested brackets will be removed after threedsent step in
	 * POSTagger4StanfordParser.java
	 *
	 * <p>
	 * Capitalized words of the cleaned sentence are passed to
	 * {@link #recordProperNouns(String)} before the sentence is lower-cased.
	 *
	 * @param s
	 *            sentence to be handled
	 * @return sentence after being processed; {@code s} itself when it is null
	 *         or empty
	 */
	public String handleSentence(String s) {
		if (s == null || s.isEmpty()) {
			return s;
		}

		String sentence = s;

		// remove (.a.)
		sentence = sentence.replaceAll("\\([^()]*?[a-zA-Z][^()]*?\\)", " ");
		// remove [.a.]
		sentence = sentence.replaceAll("\\[[^\\]\\[]*?[a-zA-Z][^\\]\\[]*?\\]", " ");
		// remove {.a.}
		sentence = sentence.replaceAll("\\{[^{}]*?[a-zA-Z][^{}]*?\\}", " ");

		// to fix basi- and hypobranchial
		while (true) {
			Matcher m = HYPHEN_BEFORE_LOWERCASE.matcher(sentence);
			if (!m.lookingAt()) {
				break;
			}
			sentence = m.group(1) + "_ " + m.group(2);
		}

		// add space around nonword char
		sentence = this.myLearnerUtility.addSpace(sentence, "\\W");

		// multiple spaces => 1 space
		sentence = sentence.replaceAll("\\s+", " ");

		// trim: remove leading and ending spaces
		sentence = sentence.replaceAll("^\\s*", "");
		sentence = sentence.replaceAll("\\s*$", "");

		// must happen before lower-casing, it relies on capital letters
		recordProperNouns(sentence);

		// all to lower case
		sentence = sentence.toLowerCase();

		return sentence;
	}

	/**
	 * Scans the sentence from its end for capitalized words and adds the
	 * lower-cased form of each one longer than one character to
	 * {@code pronounWords} of the learner's constants. A capitalized word at
	 * the very beginning of the sentence, or directly after an opening
	 * bracket, is not recorded.
	 *
	 * @param sentence
	 *            sentence to scan; null is ignored
	 */
	public void recordProperNouns(String sentence) {
		if (sentence == null) {
			return;
		}

		// drop capitals that directly follow an opening bracket
		String remaining = sentence.replaceAll("[(\\[{]\\s*[A-Z]", " ");

		Matcher m = CAPITALIZED_WORD.matcher(remaining);
		while (m.find()) {
			String properNoun = m.group(2).toLowerCase();
			// continue with the text in front of the word just found
			remaining = m.group(1);

			if (properNoun.length() > 1) {
				this.myLearnerUtility.getConstant().pronounWords.add(properNoun);
				this.myLearnerUtility.getConstant().updatePronoun();
			}

			m = CAPITALIZED_WORD.matcher(remaining);
		}

		// test case:
		// [recordpropernouns] enter (Pronounced dorsal process on Meckelian element)
		// [recordpropernouns] add to PROPERNOUNS: (meckelian)
	}

	/**
	 * Insert all words in WORDS into getUnknownWordHolder(). Insert those formed
	 * by non words characters into getWordPOSHolder()
	 *
	 * <p>
	 * Words without any word character, and words ending in "ous", are added
	 * as unknown with themselves as flag and recorded in "wordpos" with POS
	 * "b"; every other word is added with the flag "unknown".
	 *
	 * @param WORDS
	 *            words to register (only the keys are read)
	 * @param myDataHolder
	 *            receiver of the words
	 * @return number of words processed
	 */
	public int populateUnknownWordsTable(Map<String, Integer> WORDS, DataHolder myDataHolder) {
		PropertyConfigurator.configure( "conf/log4j.properties" );
		Logger myLogger = Logger.getLogger("learn.pupluateUnknownWords");
		myLogger.trace("Enter");

		int count = 0;

		for (String word : WORDS.keySet()) {
			boolean hasWordChar = CONTAINS_WORD_CHAR.matcher(word).matches();
			if (!hasWordChar || ENDS_WITH_OUS.matcher(word).matches()) {
				myDataHolder.addUnknown(word, word);
				myDataHolder.updateDataHolder(word, "b", "", "wordpos", 1);
			} else {
				myDataHolder.addUnknown(word, "unknown");
			}
			count++;
		}

		myLogger.info("Total words = " + count);
		myLogger.trace("Return: "+count);
		myLogger.trace("Quite\n");

		return count;
	}

	/**
	 * @return the learner utility this module was created with
	 */
	public LearnerUtility getLearnerUtility(){
		return this.myLearnerUtility;
	}
}