package outputter.search;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Set;
import org.apache.log4j.Logger;
import org.jdom.Element;
import org.jdom.xpath.XPath;
import outputter.Utilities;
import outputter.data.EntityProposals;
import outputter.data.FormalConcept;
import outputter.data.SimpleEntity;
/**
*
* @author Hong Cui
* strategy for phrases that omitted the head noun, for example epibranchial, which
* may match Epibranchial bone, Epibranchial element, or Epibranchial cartilage
*
*/
public class EntitySearcher5 extends EntitySearcher {
private static final Logger LOGGER = Logger.getLogger(EntitySearcher5.class);
private static Hashtable<String, ArrayList<EntityProposals>> cache = new Hashtable<String, ArrayList<EntityProposals>>();
private static ArrayList<String> nomatchcache = new ArrayList<String>();
public EntitySearcher5() {
}
@Override
public ArrayList<EntityProposals> searchEntity(Element root, String structid,
String entityphrase, String elocatorphrase,
String originalentityphrase, String prep) {
LOGGER.debug("EntitySearcher5: search '"+entityphrase+"[orig="+originalentityphrase+"]'");
//search cache
if(EntitySearcher5.nomatchcache.contains(entityphrase+"+"+elocatorphrase)) return null;
if(EntitySearcher5.cache.get(entityphrase+"+"+elocatorphrase)!=null) return EntitySearcher5.cache.get(entityphrase+"+"+elocatorphrase);
//TODO take care of elocatorphrase
//bone, cartilage, element
//Epibranchial 1: (0) present and ossified E: Epibranchial 1 bone, Q: present
//Epibranchial 1: (1) present and cartilaginous E: Epibranchial 1 cartilage, Q: present
//Epibranchial 1: (2) absent E: Epibranchial 1 cartilage, Q: absent E: Epibranchial 1 bone, Q: absent
//The curator should use both the cartilage and bone terms to annotate state 2 because the author clearly differentiates between the two.
//search with regular expression "epibranchial .*" to find possible missing headnouns
String aentityphrase = entityphrase.replaceFirst("^\\(\\?:", "").replaceFirst("\\)$", "");
//if(entityphrase.indexOf(" ")<0 && entityphrase.compareTo(originalentityphrase)==0){
if(aentityphrase.indexOf(" ")<0){
Hashtable<String, String> headnouns = new Hashtable<String, String>();
//ArrayList<FormalConcept> regexpresults = TermSearcher.regexpSearchTerm(entityphrase+" .*", "entity");
ArrayList<FormalConcept> regexpresults = new TermSearcher().searchTerm(aentityphrase+" .*", "entity");
String nouns = null;
if(regexpresults!=null){
LOGGER.debug("...search entity '"+aentityphrase+" .*' found match");
for(FormalConcept regexpresult: regexpresults){
//regexpresult.setSearchString(originalentityphrase+"["+regexpresult.getSearchString()+"]"); //record originalentityphrase for grouping entity proposals later
headnouns.put(regexpresult.getLabel().replace(aentityphrase, ""), regexpresult.getId()+"#"+regexpresult.getClassIRI()); //don't trim headnoun
}
if(regexpresults.size()<10){
//search headnouns in the context: coronoid .* => coronoid process of ulna
//headnouns may have leading or trailing spaces, perserve them: hindlimb intermedium; intermedium (fore)
nouns= searchContext(root, structid, headnouns); //bone, cartilaginous
}
}else{
LOGGER.debug("...search entity '"+aentityphrase+" .*' found no match");
}
if(nouns != null){
LOGGER.debug("...found candidate headnouns '"+nouns+"', forming proposals...");
EntityProposals ep = new EntityProposals();
ArrayList<EntityProposals> entities = null;
//ep.setPhrase(entityphrase+" .*");
ep.setPhrase(originalentityphrase);
String[] choices = nouns.split(",");
float score = 1.0f/regexpresults.size();
boolean found = false;
for(String noun: choices){
String[] idiri = headnouns.get(noun).split("#");
SimpleEntity sentity = new SimpleEntity();
sentity.setSearchString(aentityphrase+" .*");
sentity.setString(aentityphrase);
sentity.setLabel(noun.startsWith(" ")? aentityphrase+noun: noun+aentityphrase);
sentity.setId(idiri[0]);
sentity.setConfidenceScore(score);
sentity.setClassIRI(idiri[1]);
ep.add(sentity);
LOGGER.debug(".....add a proposal:"+sentity);
found = true;
}
//entities.add(ep);
if(found){
if(entities==null) entities = new ArrayList<EntityProposals>();
Utilities.addEntityProposals(entities, ep);
//logging
LOGGER.debug("EntitySearcher5 completed search for '"+aentityphrase+"[orig="+originalentityphrase+"]' and returns:");
for(EntityProposals aep: entities){
LOGGER.debug("..: "+aep.toString());
}
if(entities==null) EntitySearcher5.nomatchcache.add(entityphrase+"+"+elocatorphrase);
else EntitySearcher5.cache.put(entityphrase+"+"+elocatorphrase, entities);
return entities;
}
}else{
LOGGER.debug("...candidate headnouns is null, search failed");
}
/*else{
//text::Caudal fin
//text::heterocercal (heterocercal tail is a subclass of caudal fin, search "heterocercal *")
//return all matches as candidates
if(regexpresults!=null){
EntityProposals entities = new EntityProposals();
for(FormalConcept regexpresult: regexpresults){
Entity e = (Entity) regexpresult;
entities.add(e);
}
return entities;
}
}*/
//caching
}
EntitySearcher5.nomatchcache.add(entityphrase+"+"+elocatorphrase);
LOGGER.debug("...search for entity '"+entityphrase+"' found no match");
LOGGER.debug("EntitySearcher5 calls EntitySearcher6");
return new EntitySearcher6().searchEntity(root, structid, entityphrase, elocatorphrase, originalentityphrase, prep);
}
/**
* look into text context for statements containing structid
* to determin the target the context is most close to. for example
* //bone, cartilage, element
//Epibranchial 1: (0) present and ossified E: Epibranchial 1 bone, Q: present
//Epibranchial 1: (1) present and cartilaginous E: Epibranchial 1 cartilage, Q: present
//Epibranchial 1: (2) absent E: Epibranchial 1 cartilage, Q: absent E: Epibranchial 1 bone, Q: absent
//The curator should use both the cartilage and bone terms to annotate state 2 because the author clearly differentiates between the two.
* //could perform a content similarity measure between the definitions associated with the targets in ontology and the text of the statement
* @param root
* @param structid
* @param target
* @return
*/
private static String searchContext(Element root, String structid, Hashtable<String, String> targets){
try{
Element structure = (Element) XPath.selectSingleNode(root, ".//statement/structure[@id='"+structid+"']");
Element statement = structure.getParentElement();
String text = statement.getChildText("text");
//disambiguate between bone and cartilage
if(targets.get(" bone") != null && targets.get(" cartilage")!=null){
int bonecount = text.replaceAll("(ossifi|bone)", "#").replaceAll("[^#]", "").length();
int cartcount = text.replaceAll("cartil", "#").replaceAll("[^#]", "").length();
if(bonecount > cartcount) return "bone";
if(bonecount < cartcount) return "cartilage";
if(bonecount == cartcount) return "element";
}
//filter other cases: prefer phrases one-word longer than the original phrase
String result = "";
Enumeration<String> keys = targets.keys();
while(keys.hasMoreElements()){
String noun = keys.nextElement();
if(noun.indexOf(" of ")>=0 || noun.indexOf(" and ")>=0) continue; //coronoids 'proccess of ulna': coronoids can't possibility be used to represent a complex concept that require the use of "of"
if(noun.trim().indexOf(" ")<= 0){
if(!related(noun.trim(), structid, root)){ //text: coronoids with tooth, then 'coronoid tooth' should be filtered
result += noun+","; //don't trim noun
}
}
}
if(result.trim().length()>0){
return result.replaceFirst(",$", "");
}
}catch(Exception e){
LOGGER.error("", e);
}
return null;
}
/**
*
* @param noun: the headnoun candidate
* @param structid: the structure that in need of the headnoun
* @param root: the root of the xml file
* @return whether the headnoun and the structure is connected via a <relation> chain in xml
*/
@SuppressWarnings("unchecked")
private static boolean related(String noun, String structid, Element root) {
try{
XPath xpath = XPath.newInstance("//relation[@from='"+structid+"']");
List<Element> relations = xpath.selectNodes(root);
for(Element relation: relations){
String toid = relation.getAttributeValue("to");
Element related = (Element) XPath.selectSingleNode(root, "//structure[@id='"+toid+"']");
if(related.getAttributeValue("name").compareTo(noun)==0){
return true;
}else{
return related(noun, toid, root);
}
}
}catch(Exception e){
LOGGER.error("", e);
}
return false;
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
}
}