/**
*
*/
package fna.parsing;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import outputter.knowledge.TermOutputerUtilities;
import fna.parsing.ApplicationUtilities;
import fna.parsing.ParsingException;
import fna.parsing.TaxonIndexer;
/**
* @author Hong Cui
* convert a sentence to a format so the phrases matching terms in a knowledge source such as an ontology are marked.
* A phrase is represented by words connected using "_"
*
*/
public class PhraseMarker {
private Pattern phrasepattern; //"dorsal_fin|leaf_blade"
private String phrasestr;
public ArrayList<String> phrases;
public Hashtable<String, String> newphrases = new Hashtable<String, String>(); //plural => singular
//private static final String BIN_FILE = "PO_phrases.bin";
/**
* @param termsource: filepath to the serialized arraylist, holding the phrases
*/
@SuppressWarnings("unchecked")
public PhraseMarker(/*String termsourcepath*/) {
try {
File file = new File(ApplicationUtilities.getProperty("ontology.dir"),ApplicationUtilities.getProperty("ontology.uberon")+"_"+ApplicationUtilities.getProperty("uberonphrases.bin"));
ObjectInputStream in = new ObjectInputStream(new FileInputStream(
file));
// Deserialize the object
phrases = new ArrayList();
phrases.addAll((HashSet<String>) in.readObject()); //phrases are words connected with " "
in.close();
Collections.sort(phrases, new PhraseComparable()); //longest phrases first
phrasestr = "";
for(String phrase: phrases){
//hyomandibula-opercle joint
phrase = phrase.replaceAll("\\([^)]*\\)", "").trim();
if(phrase.length()>0 && phrase.indexOf(" ")>0){ //can't allow single-word phrase
phrase = phrase.replaceAll("-", "_");//hyomandibula_opercle joint
phrase = phraseForms(phrase);//added plural forms
phrasestr += phrase+"|";
}
}
//phrasestr="";
phrasestr = phrasestr.replaceFirst("\\|$", ""); //space separated words in phrases
System.out.println(phrasestr);
this.phrasepattern =Pattern.compile("(.*?\\b)("+phrasestr+")(\\b.*)", Pattern.CASE_INSENSITIVE);
//serialize the updated phrases
phrases.addAll(newphrases.keySet()); //add plurals
ObjectOutput out = new ObjectOutputStream(
new FileOutputStream(new File(ApplicationUtilities.getProperty("ontology.dir"),ApplicationUtilities.getProperty("ontology.uberon")+"_"+ApplicationUtilities.getProperty("uberonphrases.update.bin")))); //avoid increase the size of the original
out.writeObject(phrases);
out.close();
//serialize the plural-singular mapping
file = new File(ApplicationUtilities.getProperty("ontology.dir"),ApplicationUtilities.getProperty("ontology.uberon")+"_"+ApplicationUtilities.getProperty("uberonphrases.p2s.bin"));
out = new ObjectOutputStream(
new FileOutputStream(file));
out.writeObject(newphrases);
out.close();
} catch (Exception e) {
//LOGGER.error("Load the updated TaxonIndexer failed.", e);
//StringWriter sw = new StringWriter();PrintWriter pw = new PrintWriter(sw);e.printStackTrace(pw);LOGGER.error(ApplicationUtilities.getProperty("CharaParser.version")+System.getProperty("line.separator")+sw.toString());
e.printStackTrace();
}
}
/**
* endochondral element => endochondral element|endochondral elements
* @param phrase: typically in singular form
* @return alternative reg exp with original and plural forms
*/
private String phraseForms(String phrase) {
if(phrase.indexOf(" ")>0){
String result = phrase;
String noun = phrase.substring(phrase.lastIndexOf(" ")).trim();
String modifier = phrase.substring(0, phrase.lastIndexOf(" ")).trim();
String pnoun = noun.matches("\\d+")? noun : outputter.knowledge.TermOutputerUtilities.toPlural(noun);
if(pnoun.compareTo(noun)!=0){
result += "|"+modifier+" "+pnoun;
this.newphrases.put(modifier+" "+pnoun, phrase); //plural=>singluar
}
return result;
}
return phrase;
}
/**
* @param sentence : leaf blade rounded
* @return leaf_blade rounded
*/
public String markPhrases(String sentence){
Matcher m = phrasepattern.matcher(sentence);
//System.out.println(this.phrasestr);
while(m.matches()){
//System.out.println(phrasepattern);
sentence = m.group(1)+m.group(2).replaceAll("\\s+", "_")+m.group(3);
m = phrasepattern.matcher(sentence);
}
return sentence;
}
/**
* @param args
*/
public static void main(String[] args) {
long time1 = System.currentTimeMillis();
String termpath = "C:/Users/updates/CharaParserTest/Ontologies/ext_terms.bin";
PhraseMarker pm = new PhraseMarker(/*termpath*/);
long time2 = System.currentTimeMillis();
System.out.println("read term list took "+(time2-time1)+" ms");
//System.out.println(pm.markPhrases("hyomandibula-opercle joint and anal fin absent . female gonad present."));
System.out.println(pm.markPhrases("Medioventral endochondral elements of the shoulder girdle"));
long time3 = System.currentTimeMillis();
System.out.println("mark the sentence took "+(time3-time2)+" ms");
}
}