/**
*
*/
package outputter.search;
import java.util.ArrayList;
import java.util.Hashtable;
import org.apache.log4j.Logger;
import org.jdom.Element;
import org.jdom.xpath.XPath;
import outputter.Utilities;
import outputter.XML2EQ;
import outputter.data.CompositeEntity;
import outputter.data.EntityProposals;
import outputter.data.FormalConcept;
import outputter.data.FormalRelation;
import outputter.data.REntity;
import outputter.data.SimpleEntity;
import outputter.knowledge.Dictionary;
/**
* @author Hong Cui
*
* use wild cards
*/
public class EntitySearcher6 extends EntitySearcher {
private static final Logger LOGGER = Logger.getLogger(EntitySearcher6.class);
boolean debug = true;
private static Hashtable<String, ArrayList<EntityProposals>> cache = new Hashtable<String, ArrayList<EntityProposals>>();
private static ArrayList<String> nomatchcache = new ArrayList<String>();
/**
*
*/
public EntitySearcher6() {
}
/* (non-Javadoc)
* @see outputter.EntitySearcher#searchEntity(org.jdom.Element, java.lang.String, java.lang.String, java.lang.String, java.lang.String, java.lang.String, int)
*/
/**
* test cases:patterns.xml_s413c5e3e-7941-44e3-8be6-b17e6193752e.xml (manual)
*/
@Override
public ArrayList<EntityProposals> searchEntity(Element root, String structid,
String entityphrase, String elocatorphrase,
String originalentityphrase, String prep) {
LOGGER.debug("EntitySearcher6: search '"+entityphrase+"[orig="+originalentityphrase+"]'");
//search cache
if(EntitySearcher6.nomatchcache.contains(entityphrase+"+"+elocatorphrase)) return null;
if(EntitySearcher6.cache.get(entityphrase+"+"+elocatorphrase)!=null) return EntitySearcher6.cache.get(entityphrase+"+"+elocatorphrase);
//still not find a match, remove the last term in the entityphrase, when what is then left is not just a spatial term
//"humeral deltopectoral crest apex" => "humeral deltopectoral crest"
//TODO "some part" of humerus; "some quality"
//the last token could be a number (index)
//Changed by Zilong:
//enhanced entity format condition to exclude the spatial terms: in order to solve the problem that
//"rostral tubule" will match "anterior side" because rostral is synonymous with anterior
String[] entitylocators = null;
if(elocatorphrase.length()>0) entitylocators = elocatorphrase.split("\\s*,\\s*");
ArrayList<SimpleEntity> entityls = new ArrayList<SimpleEntity>();
//entityl.setString(elocatorphrase);
if(entitylocators!=null) {
ArrayList<FormalConcept> result = new TermSearcher().searchTerm(elocatorphrase, "entity"); //should it call EntitySearcherOriginal? decided not to.
if(result!=null){
//entityl = result;
LOGGER.debug("search for locator '"+elocatorphrase+"' found match: ");
for(FormalConcept fc: result){
entityls.add((SimpleEntity)fc);
LOGGER.debug(".."+fc.toString());
}
}else{ //entity locator not matched
LOGGER.debug("search for locator '"+elocatorphrase+"' found no match");
}
}
String aentityphrase = entityphrase.replaceFirst("^\\(\\?:", "").replaceFirst("\\)$", "");
String[] tokens = aentityphrase.split("\\s+"); //(?:(?:humerus|humeral) (?:shaft))
if(tokens.length>=2){ //to prevent "rostral tubule" from entering the subsequent process
String shortened = aentityphrase.substring(0, aentityphrase.lastIndexOf(" ")).trim();
if(!shortened.matches(".*?\\b("+Dictionary.spatialtermptn+")$")){
//SimpleEntity sentity = (SimpleEntity) new TermSearcher().searchTerm(shortened, "entity");
//search shortened and other strings with the same starting words
ArrayList<FormalConcept> shortentities = new TermSearcher().searchTerm(shortened, "entity");
if(shortentities!=null){
LOGGER.debug("search for entity '"+shortened+"' found match, forming proposals...");
//construct anatomicalentity
SimpleEntity anatomicalentity = Dictionary.anatomicalentity;
anatomicalentity.setString(aentityphrase.substring(aentityphrase.lastIndexOf(" ")).trim());
//construct relation
FormalRelation rel = Dictionary.partof;
rel.setConfidenceScore((float)1.0);
EntityProposals ep = new EntityProposals();
ep.setPhrase(originalentityphrase);
boolean found = false;
for(FormalConcept sentityfc: shortentities){
//if sentity part_of entityl holds, then sentity's conf score = 1 and return the result
SimpleEntity sentity = (SimpleEntity)sentityfc;
if(sentity.isOntologized() && entityls!=null /*&& entityls.isOntologized()*/){
for(FormalConcept fc: entityls){
SimpleEntity entityl = (SimpleEntity)fc;
if(XML2EQ.elk.isSubclassOfWithPart(entityl.getClassIRI(), sentity.getClassIRI())){
LOGGER.debug("'"+entityl.getLabel() +"' and '"+sentity.getLabel() + "' are related, increase confidence score");
found = true;
sentity.setConfidenceScore(1f);
CompositeEntity centity = new CompositeEntity();
centity.addEntity(anatomicalentity);//anatomical entity ...
centity.addParentEntity(new REntity(rel, sentity)); // part of sentity ...
centity.addParentEntity(new REntity(rel, entityl)); //part of entityl
//ep.setPhrase(sentity.getString());
ep.add(centity);//add one proposal with anatomical entity
LOGGER.debug("add a proposal with anatomical entity:"+centity.toString());
/*centity = new CompositeEntity();
centity.addEntity(sentity);
centity.addParentEntity(new REntity(rel, entityl)); //part of entityl
centity.setString(originalentityphrase);
ep.add(centity); //add the other proposal without anatomical entity
LOGGER.debug("add the other proposal without anatomical entity:"+centity.toString());*/
}
}
}
}
if(found){
ArrayList<EntityProposals> entities = new ArrayList<EntityProposals>();
//entities.add(ep);
Utilities.addEntityProposals(entities, ep);
LOGGER.debug("EntitySearcher6 returns:");
for(EntityProposals aep: entities){
LOGGER.debug("..EntityProposals: "+aep.toString());
}
//caching
if(entities==null) EntitySearcher6.nomatchcache.add(entityphrase+"+"+elocatorphrase);
else EntitySearcher6.cache.put(entityphrase+"+"+elocatorphrase, entities);
return entities;
}
//else, record results that meet certain criteria
LOGGER.debug("entity and entity locator (if exists) are not related");
ArrayList<EntityProposals> entities = null;
found = false;
for(FormalConcept sentityfc: shortentities){
SimpleEntity sentity = (SimpleEntity)sentityfc;
//consider only the matches that are one word longer and don't have of/and in the labels
String label = sentity.getLabel();
String added = label.replaceFirst(shortened, "").trim();
if(label.indexOf(" of ")>=0 || label.indexOf(" and ")>=0 || added.indexOf(" ")>0) continue;
if(sentity.getId().compareTo(Dictionary.mcorganism)==0){
//too general "body scale", try to search for "scale"
//TODO: multi-cellular organism is too general a syn for body. "body" could mean something more restricted depending on the context.
//TODO: change labels to ids
}
//entity
//if(entityl.getString().length()>0){
if(elocatorphrase.length()>0){
for(FormalConcept fc: entityls){
found = true;
SimpleEntity entityl = (SimpleEntity)fc;
//relation & entity locator
CompositeEntity centity = new CompositeEntity();
centity.addEntity(anatomicalentity);
centity.addParentEntity(new REntity(rel, sentity));
centity.addParentEntity(new REntity(rel, entityl));
centity.setString(originalentityphrase);
//ep.setPhrase(sentity.getString());
//ep.setPhrase(originalentityphrase);
ep.add(centity); //add one
LOGGER.debug("add a proposal with anatomical entity:"+centity.toString());
/*centity = new CompositeEntity();
centity.addEntity(sentity);
centity.addParentEntity(new REntity(rel, entityl));
centity.setString(originalentityphrase);
ep.add(centity); //add the other
LOGGER.debug("add a proposal without anatomical entity:"+centity.toString());*/
}
}else{
//EntityProposals entities = new EntityProposals();
found = true;
CompositeEntity centity = new CompositeEntity();
centity.addEntity(anatomicalentity);
centity.addParentEntity(new REntity(rel, sentity));
centity.setString(originalentityphrase);
//ep.setPhrase(sentity.getString());
ep.setPhrase(originalentityphrase);
ep.add(centity); //add one
LOGGER.debug("add a proposal with anatomical entity:"+centity.toString());
/*ep.add(sentity); //add the other
LOGGER.debug("add a proposal without anatomical entity:"+sentity.toString());*/
}
}
//entities.add(ep);
if(found){
if(entities==null) entities = new ArrayList<EntityProposals>();
Utilities.addEntityProposals(entities, ep);
LOGGER.debug("EntitySearcher6 returns:");
for(EntityProposals aep: entities){
LOGGER.debug("..EntityProposals: "+aep.toString());
}
//caching
if(entities==null) EntitySearcher6.nomatchcache.add(entityphrase+"+"+elocatorphrase);
else EntitySearcher6.cache.put(entityphrase+"+"+elocatorphrase, entities);
return entities;
}
}
//if failed, try wildcard
//ArrayList<FormalConcept> sentities = TermSearcher.regexpSearchTerm(shortened+"\\b.*", "entity"); //candidate matches for the same entity
ArrayList<FormalConcept> sentities = new TermSearcher().searchTerm(shortened+"\\b.*", "entity"); //candidate matches for the same entity
if(sentities!=null){
LOGGER.debug("search for entity '"+shortened+"\\b.*' found match, forming proposals...");
//construct anatomicalentity
/*SimpleEntity anatomicalentity = new SimpleEntity();
anatomicalentity.setClassIRI("http://purl.obolibrary.org/obo/UBERON_0001062");
anatomicalentity.setConfidenceScore(0.8f);
anatomicalentity.setId("UBERON:0001062");
anatomicalentity.setLabel("anatomical entity");
anatomicalentity.setString(aentityphrase.substring(aentityphrase.lastIndexOf(" ")).trim());
anatomicalentity.setXMLid(structid);*/
//construct relation
FormalRelation rel = Dictionary.partof;
rel.setConfidenceScore((float)1.0);
EntityProposals ep = new EntityProposals();
ep.setPhrase(originalentityphrase);
boolean found = false;
for(FormalConcept sentityfc: sentities){
//if sentity part_of entityl holds, then sentity's conf score = 1 and return the result
SimpleEntity sentity = (SimpleEntity)sentityfc;
if(sentity.isOntologized() && entityls!=null /*&& entityls.isOntologized()*/){
for(FormalConcept fc: entityls){
SimpleEntity entityl = (SimpleEntity)fc;
if(XML2EQ.elk.isSubclassOfWithPart(entityl.getClassIRI(), sentity.getClassIRI())){
LOGGER.debug("'"+entityl.getLabel() +"' and '"+sentity.getLabel() + "' are related, increase confidence score");
found = true;
sentity.setConfidenceScore(1f);
/*CompositeEntity centity = new CompositeEntity();
centity.addEntity(anatomicalentity);//anatomical entity ...
centity.addParentEntity(new REntity(rel, sentity)); // part of sentity ...
centity.addParentEntity(new REntity(rel, entityl)); //part of entityl
//ep.setPhrase(sentity.getString());
ep.add(centity);//add one proposal with anatomical entity
LOGGER.debug("add a proposal with anatomical entity:"+centity.toString());*/
CompositeEntity centity = new CompositeEntity();
centity.addEntity(sentity);
centity.addParentEntity(new REntity(rel, entityl)); //part of entityl
centity.setString(originalentityphrase);
ep.add(centity); //add the other proposal without anatomical entity
LOGGER.debug("add the other proposal without anatomical entity:"+centity.toString());
}
}
}
}
if(found){
ArrayList<EntityProposals> entities = new ArrayList<EntityProposals>();
//entities.add(ep);
Utilities.addEntityProposals(entities, ep);
LOGGER.debug("EntitySearcher6 returns:");
for(EntityProposals aep: entities){
LOGGER.debug("..EntityProposals: "+aep.toString());
}
//caching
if(entities==null) EntitySearcher6.nomatchcache.add(entityphrase+"+"+elocatorphrase);
else EntitySearcher6.cache.put(entityphrase+"+"+elocatorphrase, entities);
return entities;
}
//else, record results that meet certain criteria
LOGGER.debug("entity and entity locator (if exists) are not related");
ArrayList<EntityProposals> entities = null;
found = false;
for(FormalConcept sentityfc: sentities){
SimpleEntity sentity = (SimpleEntity)sentityfc;
//consider only the matches that are one word longer and don't have of/and in the labels
String label = sentity.getLabel();
String added = label.replaceFirst(shortened, "").trim();
if(label.indexOf(" of ")>=0 || label.indexOf(" and ")>=0 || added.indexOf(" ")>0) continue;
if(sentity.getId().compareTo(Dictionary.mcorganism)==0){
//too general "body scale", try to search for "scale"
//TODO: multi-cellular organism is too general a syn for body. "body" could mean something more restricted depending on the context.
//TODO: change labels to ids
}
//entity
//if(entityl.getString().length()>0){
if(elocatorphrase.length()>0){
for(FormalConcept fc: entityls){
SimpleEntity entityl = (SimpleEntity)fc;
//relation & entity locator
/*CompositeEntity centity = new CompositeEntity();
centity.addEntity(anatomicalentity);
centity.addParentEntity(new REntity(rel, sentity));
centity.addParentEntity(new REntity(rel, entityl));
centity.setString(originalentityphrase);
//ep.setPhrase(sentity.getString());
//ep.setPhrase(originalentityphrase);
ep.add(centity); //add one
LOGGER.debug("add a proposal with anatomical entity:"+centity.toString());*/
CompositeEntity centity = new CompositeEntity();
centity.addEntity(sentity);
centity.addParentEntity(new REntity(rel, entityl));
centity.setString(originalentityphrase);
ep.add(centity); //add the other
LOGGER.debug("add a proposal without anatomical entity:"+centity.toString());
found = true;
}
}else{
//EntityProposals entities = new EntityProposals();
/*CompositeEntity centity = new CompositeEntity();
centity.addEntity(anatomicalentity);
centity.addParentEntity(new REntity(rel, sentity));
centity.setString(originalentityphrase);
//ep.setPhrase(sentity.getString());
ep.setPhrase(originalentityphrase);
ep.add(centity); //add one
LOGGER.debug("add a proposal with anatomical entity:"+centity.toString());*/
ep.add(sentity); //add the other
LOGGER.debug("add a proposal without anatomical entity:"+sentity.toString());
found = true;
}
}
//entities.add(ep);
if(found){
if(entities==null) entities = new ArrayList<EntityProposals>();
Utilities.addEntityProposals(entities, ep);
LOGGER.debug("EntitySearcher6 returns:");
for(EntityProposals aep: entities){
LOGGER.debug("..EntityProposals: "+aep.toString());
}
//caching
if(entities==null) EntitySearcher6.nomatchcache.add(entityphrase+"+"+elocatorphrase);
else EntitySearcher6.cache.put(entityphrase+"+"+elocatorphrase, entities);
return entities;
}
}
}
}
EntitySearcher6.nomatchcache.add(entityphrase+"+"+elocatorphrase);
LOGGER.debug("EntitySearch6 found no match");
return null;
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
}
}