/**
*
*/
package outputter.search;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import outputter.Utilities;
import outputter.XML2EQ;
import outputter.data.FormalConcept;
import outputter.data.Quality;
import outputter.data.SimpleEntity;
import outputter.knowledge.Dictionary;
import outputter.knowledge.Wordforms;
/**
* Accepts entity phrases (entity, entity locator, quality modifier) and
* quality phrases and searches for them in the ontologies. This class
* generates various forms of a search phrase to try to hit a target in the
* ontologies.
*
* Open questions: apply stemming? rank candidate matches?
*
* @author hong cui
*/
public class TermSearcher {
private static final Logger LOGGER = Logger.getLogger(TermSearcher.class);
/** The strong entity id cache. */
public static Hashtable<String, ArrayList<FormalConcept>> entityIDCache = new Hashtable<String, ArrayList<FormalConcept>>(); // term=>
// {id,
// label}
/** The strong quality id cache. */
public static Hashtable<String, ArrayList<FormalConcept>> qualityIDCache = new Hashtable<String, ArrayList<FormalConcept>>();
/** The candidate entity id cache. */
public static Hashtable<String, ArrayList<FormalConcept>> entityCandidateIDCache = new Hashtable<String, ArrayList<FormalConcept>>(); // term=>
// {id,
// label}
/** The candidate quality id cache. */
public static Hashtable<String, ArrayList<FormalConcept>> qualityCandidateIDCache = new Hashtable<String, ArrayList<FormalConcept>>();
/** The entity id cache. */
// public static Hashtable<String, ArrayList<FormalConcept>>
// regexpEntityIDCache = new Hashtable<String, ArrayList<FormalConcept>>();
// //term=> {id, label}
/** The quality id cache. */
// public static Hashtable<String, ArrayList<FormalConcept>>
// regexpQualityIDCache = new Hashtable<String, ArrayList<FormalConcept>>();
private static Pattern p = Pattern.compile("^(" + Dictionary.spatialtermptn
+ ")(\\b.*)");
ArrayList<Hashtable<String, String>> candidatematches = new ArrayList<Hashtable<String, String>>();
public static ArrayList<String> nomatchCache = new ArrayList<String>();
//for these terms, both strong and weak matches will be returned
//To add terms to this list: "short|term2";
public static String looseTerms = "short";
/**
* Search term in the whole ontology (of a particular type) Result from each
* ontology is either a match to original class, or via exact, narrow, or
* related synonyms. The result could also be null when no match is found.
*
* Return all strong matches based on the original phrase. Other matches are
* saved in candidate matches Strong match = a match to a class lable or an
* exact synonym
*
* @param query
* a term or a regular expression
* @param phrasetype
* the type: 'entity' or 'quality'
* @return null when no match, otherwise, an arrayList of matched
* FormalConcepts
*/
public ArrayList<FormalConcept> searchTerm(String phrase, String phrasetype) {
phrase = phrase.trim();
String cleanphrase = phrase.replaceAll("[()?:]", "");
if (phrase.length() == 0)
return null;
// search cache
ArrayList<FormalConcept> result = this.searchCache(phrase, phrasetype);
if (result == null && this.nomatchCache.contains(phrase))
return null;
else if (result != null && result.size()>0)
return result;
String query = formatExpand(phrase); // expand with syn-ring
String querycopy = query;
result = null;
// 0. special cases
if (phrasetype.compareTo("entity") == 0) {
// 'process' => 'anatomical projection' UBERON:0004529
if (query.matches("\\W*process\\W*")
|| query.matches("\\W*process(\\|process)+\\W*")) {
SimpleEntity se = Dictionary.anatomicalprojection;
se.setSearchString(query);
se.setString(cleanphrase);
if (result == null)
result = new ArrayList<FormalConcept>();
result.add(se);
TermSearcher.cacheIt(query, result, "entity");
return result;
}
}
// 1. search the original phrase/reg exp
ArrayList<Hashtable<String, String>> results = new ArrayList<Hashtable<String, String>>();
ArrayList<FormalConcept> strongmatch = getStrongMatch(cleanphrase, query,
phrasetype, results, 1f);
if (strongmatch != null && strongmatch.size()>0)
return strongmatch;
// /if landed here, all matches based on the original phrase are weak
// matches.
candidatematches.addAll(results);
results = new ArrayList<Hashtable<String, String>>();
/*
* TODO Changed by Zilong: deal with terms like "unossified" Transform
* the result from an adjective word (binary form) to
* "noun+present/absent"
*/
/*
* quality = quality.toLowerCase().trim();
* if(dictionary.verbalizednouns.containsKey(quality)){ EQ.put("entity",
* entity+" "+dictionary.verbalizednouns.get(quality).split(",")[0]);
* EQ.put("quality",
* dictionary.verbalizednouns.get(quality).split(",")[1]); }
*/
/* end handling the "unossified" like term */
// TODO let ontoutil.searchOntologies handle variations in hyphens as
// this case can be mixed with any other cases
// 3. phrase with hyphens, replace hyphens with spaces
if (query.indexOf("-") > 0) { // caudal-fin
// caudal-fin|caudal fin
String[] tokens = query.split("[$^():?*+.| ]+");
for (String token : tokens) {
if (token.contains("-")) {
//token = token.replaceAll("\\\\b", "");
String tcopy = token;
token = "(:?" + token + "|" + token.replaceAll("-", " ")
+ ")";
query = query.replaceAll("\\b" + tcopy + "\\b", token);
}
}
// phrase = phrase.replaceAll("-", " ");
strongmatch = getStrongMatch(cleanphrase, query, phrasetype, results, 1f);
if (strongmatch != null && strongmatch.size()>0)
return strongmatch;
// TODO: latero-sensory => sensory
candidatematches.addAll(results);
results = new ArrayList<Hashtable<String, String>>();
query = querycopy;
}
// 4. phrase with /, assuming one / in the phrase.
if (query.indexOf("/") > 0) { // xyz bone/tendon
String[] tokens = query.split("[$^():?*+.| ]+"); // ^ can't be the
// first in []
for (String token : tokens) {
if (token.contains("/")) {
String tcopy = token;
token = "(?:" + token.replaceAll("/", "|") + ")"; // (?:bone|tendon)
query = query.replaceAll("\\b" + tcopy + "\\b", token);
}
}
strongmatch = getStrongMatch(cleanphrase, query, phrasetype, results, 1f);
if (strongmatch != null && strongmatch.size()>0)
return strongmatch;
// if landed here, all matches based on this reform are weak
// matches.
candidatematches.addAll(results);
results = new ArrayList<Hashtable<String, String>>();
query = querycopy;
/*
* String replacement =
* phrase.substring(phrase.indexOf("/")).replaceFirst("^/", "");
* //tendon String firstpart = phrase.substring(0,
* phrase.indexOf("/")); //xyz bone
*
* strongmatch = getStrongMatch(firstpart, phrasetype, results, 1f);
* if(strongmatch != null) return strongmatch;
*
* //if landed here, all matches based on this reform are weak
* matches. candidatematches.addAll(results); results = new
* ArrayList<Hashtable<String, String>>();
*
* while(firstpart.contains(" ")){ phrase =
* firstpart.replaceFirst("\\s\\S+$", replacement);//replace the
* last word in firstpart with "replacement": now term = xyz tendon
* strongmatch = getStrongMatch(phrase, phrasetype, results, 0.8f);
* if(strongmatch != null) return strongmatch;
*
* //if landed here, all matches based on this spatial reform are
* weak matches. candidatematches.addAll(results); results = new
* ArrayList<Hashtable<String, String>>(); firstpart =
* firstpart.substring(0, firstpart.lastIndexOf(" ")).trim(); }
*/
}
// convert to relational adjectives by appending ed|-shaped|-like|less
// etc.
if (phrasetype.compareTo("quality") == 0) {
// TODO: Handle cases like divergent from => ending with a
// preposition
String[] tokens = query.split("[$^():?*+.| ]+");
for (String token : tokens) {
if(token.length()==0) continue;
token = token.replaceAll("\\\\b", "");
LinkedHashSet<String> phraseforms = Wordforms
.toAdjective(token);
String regexp = "";
for (String form : phraseforms) {
if (form.trim().length() > 0)
regexp += form + "|";
}
regexp = regexp.replaceFirst("\\|+$", "");
if (regexp.contains("|"))
regexp = "(?:" + regexp + ")";
query = query.replaceAll("\\b" + token + "\\b", regexp);
}
strongmatch = getStrongMatch(cleanphrase, query, phrasetype, results, 0.8f);
if (strongmatch != null && strongmatch.size()>0)
return strongmatch;
candidatematches.addAll(results);
results = new ArrayList<Hashtable<String, String>>();
query = querycopy;
/*
* LinkedHashSet<String> phraseforms =
* Wordforms.toAdjective(phrase); //Uses wordforms class to get all
* the adjectives of this quality for(String form:phraseforms) {
* //to match predefined quality mapping
* if(Dictionary.qualitymapping.get(form)!=null)
* form=Dictionary.qualitymapping.get(form); strongmatch =
* getStrongMatch(form, phrasetype, results, 0.8f); if(strongmatch
* != null) return strongmatch; candidatematches.addAll(results);
* results = new ArrayList<Hashtable<String, String>>(); }
*/
}
// TODO: lastly, rank candidate matches and select the most likely one
/*
* if(this.candidatematches.size()>0) {
*
* return candidateMataches(phrase, this.candidatematches,
* phrasetype,.5f); } else
*/
// keep weaker matches
if (candidatematches.size() == 0)
TermSearcher.cacheIt(phrase, null, phrasetype);
cacheCandidateMataches(cleanphrase, query, phrasetype, .5f);
return getCandidateMatches(query, phrasetype);
//return null;
}
private void cacheCandidateMataches(String term, String query, String type,
float confscore) {
ArrayList<FormalConcept> concepts = null;
for (Hashtable<String, String> aresult : candidatematches) {
if (aresult.get("matchtype").contains("related")) {
ArrayList<Hashtable<String, String>> resultlist = split(aresult);
for (Hashtable<String, String> result : resultlist) {
if (type.compareTo("entity") == 0) {
SimpleEntity entity = new SimpleEntity();
entity.setSearchString(result.get("term"));
entity.setString(term);
entity.setLabel(result.get("label"));
entity.setId(result.get("id"));
entity.setClassIRI(result.get("iri"));
entity.setConfidenceScore(confscore);
// cacheIt(aresult.get("term"), entity, type);
// return entity;
if (concepts == null)
concepts = new ArrayList<FormalConcept>();
concepts.add(entity);
} else {
Quality quality = new Quality();
quality.setSearchString(result.get("term"));
quality.setString(term);
quality.setLabel(result.get("label").split(";")[0]);
quality.setId(result.get("id").split(";")[0]);
quality.setClassIRI(result.get("iri").split(";")[0]);
quality.setConfidenceScore(confscore);
// cacheIt(aresult.get("term"), quality, type);
// return quality;
if (concepts == null)
concepts = new ArrayList<FormalConcept>();
concepts.add(quality);
}
}
}
}
if (concepts != null)
cacheCandidates(query, concepts, type);
}
/**
*
* @param query: ordinary string or regular expressions like (?:a b|c)
* @return
*/
private static String formatExpand(String query) {
// format
query = query.replaceAll("_", " "); // abc_1
query = query.replaceAll("(?<=\\w)- (?=\\w)", "-"); // dorsal- fin
// word = word.replaceAll("\\[.*?\\]", "");//remove [usually]
// word = word.replaceAll("[()]", ""); // turn dorsal-(fin) to
// dorsal-fin: fix it early, not here because search word may be regular
// expressions
query = query.replaceAll("-to\\b", " to"); // turn dorsal-to to 'dorsal to'
query = query.replaceAll("(?<=\\w)shaped", "-shaped");
if (query.compareTo("elongate") == 0)
query = "elongated";
if (query.compareTo("directed") == 0)
query = "direction";
String querycp = query;
// syn-ring expand:
String[] tokens = query.split("[$^():?*+.| ]+");
Set<String> tokenset = new HashSet<String>(Arrays.asList(tokens));
for (String token : tokenset) {
if(token.length()>0){
token = token.replaceAll("\\\\b", "");
String tcopy = token;
token = Utilities.getSynRing4Phrase(token);
query = query.replaceAll("\\b" + tcopy + "\\b", token);
}
}
return querycp+"|"+query;
}
public static String adjectiveOrganSearch(String term) {
Hashtable<String, String> result = XML2EQ.ontoutil
.searchAdjectiveOrgan(term, "entity");
if (result != null) {
// return the first match
// TODO
Enumeration<String> en = result.keys();
while (en.hasMoreElements()) {
String id = en.nextElement();
return id + "#" + result.get(id);
}
}
return null;
}
/**
* fill in results, return strong match ( original and exac synonym matches)
* if there is any matches via related, broad, narrow synonyms are not
* considered strong
*
* @param query
* @param type
* @param results
* @return null if no match, otherwise, an arraylist of FormalConcepts
* @throws Exception
*/
private ArrayList<FormalConcept> getStrongMatch(String term, String query, String type,
ArrayList<Hashtable<String, String>> results, float confscore) {
ArrayList<FormalConcept> concepts = null;
XML2EQ.ontoutil.searchOntologies(query, type, results);
if (results != null && results.size() > 0) {
// loop through results to find the closest match
// return original or exact match
for (Hashtable<String, String> aresult : results) {
if (aresult.get("matchtype").contains("original")
|| aresult.get("matchtype").contains("exact")) {
ArrayList<Hashtable<String, String>> resultlist = split(aresult);
for (Hashtable<String, String> result : resultlist) {
if (type.compareTo("entity") == 0) {
SimpleEntity entity = new SimpleEntity();
entity.setSearchString(result.get("term"));
entity.setString(term);
entity.setLabel(result.get("label"));
entity.setId(result.get("id"));
entity.setClassIRI(result.get("iri"));
entity.setConfidenceScore(confscore);
// cacheIt(term, entity, type);
if (concepts == null)
concepts = new ArrayList<FormalConcept>();
concepts.add(entity);
// return entity;
} else {
Quality quality = new Quality();
quality.setSearchString(result.get("term"));
quality.setString(term);
quality.setLabel(result.get("label"));
quality.setId(result.get("id"));
quality.setClassIRI(result.get("iri"));
quality.setConfidenceScore(confscore);
// cacheIt(term, quality, type);
if (concepts == null)
concepts = new ArrayList<FormalConcept>();
concepts.add(quality);
// return quality;
}
}
}
}
cacheIt(query, concepts, type);
}
return concepts;
}
/**
*
* @param multiplevalues
* @return
*/
private ArrayList<Hashtable<String, String>> split(
Hashtable<String, String> multiplevalues) {
// multiplevalues: keys: term, label, id, iri
ArrayList<Hashtable<String, String>> splited = new ArrayList<Hashtable<String, String>>();
String[] terms = multiplevalues.get("term").split(";");
//String term = multiplevalues.get("term");
String[] labels = multiplevalues.get("label").split(";");
String[] ids = multiplevalues.get("id").split(";");
String[] iris = multiplevalues.get("iri").split(";");
if (labels.length == 1) {
splited.add(multiplevalues);
} else {
for (int i = 0; i < labels.length; i++) {
Hashtable<String, String> one = new Hashtable<String, String>();
one.put("term", terms[0]);
one.put("label", labels[i]);
one.put("id", ids[i]);
one.put("iri", iris[i]);
splited.add(one);
}
}
return splited;
}
/**
* Searches for all combination of spatial and headnoun in owl entity and
* returns matching entities
*
* @param spatial
* @param headnoun
* @return
*/
/*
* public static ArrayList<FormalConcept> entityVariationTermSearch(String
* spatial, String headnoun) { ArrayList<FormalConcept> matches = new
* ArrayList<FormalConcept>();
*
* for(String spatialterm:spatial.split("\\|")) for(String
* nounterm:headnoun.split("\\|")) { FormalConcept term = new
* TermSearcher().searchTerm(spatialterm+" "+nounterm, "entity");
* if(term!=null) matches.add(term); } return matches;
*
* }
*/
/**
* search pattern with wildcard "*". search for "pevlic *", it will return
* any class with label or syn of "pelvic somthing".
*
* @param term
* @param type
* @param results
* @return all matches
* @throws Exception
*/
/*
* public static ArrayList<FormalConcept> regexpSearchTerm(String phrase,
* String phrasetype){ ArrayList<FormalConcept> result = null;
* if(phrasetype.compareTo("entity")==0){ result =
* TermSearcher.regexpEntityIDCache.get(phrase); }
* if(phrasetype.compareTo("quality")==0){ result =
* TermSearcher.regexpQualityIDCache.get(phrase); } if(result !=null )
* return result;
*
* if(phrasetype.compareTo("entity")==0){ //'process' => 'anatomical
* projection' UBERON:0004529
* if(phrase.matches("\\W*process(\\|process)+\\W*")){ SimpleEntity se = new
* SimpleEntity();
* se.setClassIRI("http://purl.obolibrary.org/obo/UBERON_0004529");
* se.setConfidenceScore(1f); se.setId("UBERON:0004529");
* se.setLabel("anatomical projection"); se.setString(phrase); result= new
* ArrayList<FormalConcept>(); result.add(se);
* TermSearcher.regexpEntityIDCache.put(phrase, result); return result; } }
*
*
* ArrayList<Hashtable<String, String>> searchresult = new
* ArrayList<Hashtable<String, String>> ();
* XML2EQ.ontoutil.searchOntologies(phrase, phrasetype, searchresult);
* if(searchresult !=null && searchresult.size() > 0){ result = new
* ArrayList<FormalConcept>(); for(Hashtable<String, String> item:
* searchresult){ if(phrasetype.compareTo("entity")==0){ String str =
* item.get("term"); String[] labels = item.get("label").split(";");
* String[] ids = item.get("id").split(";"); String[] iris =
* item.get("iri").split(";"); for(int i = 0; i < labels.length; i++){
* SimpleEntity entity = new SimpleEntity(); entity.setString(str);
* entity.setLabel(labels[i]); entity.setId(ids[i]);
* entity.setClassIRI(iris[i]); entity.setConfidenceScore((float)0.5);
* result.add(entity); } }else{ String str = item.get("term"); String[]
* labels = item.get("label").split(";"); String[] ids =
* item.get("id").split(";"); String[] iris = item.get("iri").split(";");
* for(int i = 0; i < labels.length; i++){ Quality quality = new Quality();
* quality.setString(str); quality.setLabel(labels[i]);
* quality.setId(ids[i]); quality.setClassIRI(iris[i]);
* quality.setConfidenceScore((float)0.5); result.add(quality); } } }
* if(phrasetype.compareTo("entity")==0)
* TermSearcher.regexpEntityIDCache.put(phrase, result);
* if(phrasetype.compareTo("quality")==0)
* TermSearcher.regexpQualityIDCache.put(phrase, result); return result; }
* return result; }
*/
/**
* search in cache
*
* @param term
* @param type
* @return
*/
private static ArrayList<FormalConcept> searchCache(String term, String type) {
ArrayList<FormalConcept> result = null;
if (type.compareTo("entity") == 0) {
result = entityIDCache.get(term);
}
if (type.compareTo("quality") == 0) {
result = qualityIDCache.get(term);
}
if(result!=null && result.size()>0) return result;
return null;
}
private static void cacheIt(String term, ArrayList<FormalConcept> aresult,
String type) {
if (aresult == null){
//TermSearcher.nomatchCache.add(term);
return;
}
else if (type.compareTo("entity") == 0)
TermSearcher.entityIDCache.put(term, aresult);
else if (type.compareTo("quality") == 0)
TermSearcher.qualityIDCache.put(term, aresult);
}
private static void cacheCandidates(String term,
ArrayList<FormalConcept> aresult, String type) {
if (aresult == null)
TermSearcher.nomatchCache.add(term);
else if (type.compareTo("entity") == 0)
TermSearcher.entityCandidateIDCache.put(term, aresult);
else if (type.compareTo("quality") == 0)
TermSearcher.qualityCandidateIDCache.put(term, aresult);
}
public ArrayList<FormalConcept> getCandidateMatches(String term, String type) {
if (type.compareTo("entity") == 0)
return TermSearcher.entityCandidateIDCache.get(term);
else if (type.compareTo("quality") == 0)
return TermSearcher.qualityCandidateIDCache.get(term);
return null;
}
/*
* public ArrayList<FormalConcept> getQualityCandidateMatches() {
* ArrayList<FormalConcept> fcs = new ArrayList<FormalConcept>();
* for(Hashtable<String, String> aresult: this.candidatematches){
* ArrayList<Hashtable<String, String>> resultlist = split(aresult);
* for(Hashtable<String, String> result: resultlist){
* if(result.get("querytype").compareTo("entity")==0){ SimpleEntity se = new
* SimpleEntity(); se.setConfidenceScore(0.5f); //candidate match is less
* reliable, will replace hard-coded score with a calculated score
* se.setString(result.get("term")); se.setLabel(result.get("label"));
* se.setId(result.get("id")); se.setClassIRI(result.get("iri"));
* fcs.add(se); } if(result.get("querytype").compareTo("quality")==0){
* Quality q = new Quality(); q.setConfidenceScore(0.5f); //candidate match
* is less reliable, will replace hard-coded score with a calculated score
* q.setString(result.get("term")); q.setLabel(result.get("label"));
* q.setId(result.get("id")); q.setClassIRI(result.get("iri")); fcs.add(q);
* } } } return fcs; }
*/
/**
* @param args
*/
public static void main(String[] args) {
TermSearcher ts = new TermSearcher();
// FormalConcept result = ts.searchTerm("ornament", "quality");
// ArrayList<FormalConcept> result
// =TermSearcher.regexpSearchTerm("epichordal\\b.*", "entity");
// if(result!=null){
// System.out.println(result.toString());
// }else{
// ArrayList<FormalConcept> fcs = ts.getCandidateMatches();
// for(FormalConcept fc: fcs){
// System.out.println(fc.toString());
// }
ArrayList<FormalConcept> quality = ts.searchTerm("(?:manual phalanx 2)",
"entity");
if(quality!=null){
for (FormalConcept fc : quality)
System.out.println(fc.getLabel());
}
}
}