/**
*
*/
package outputter.search;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import outputter.Utilities;
import outputter.data.CompositeEntity;
import outputter.data.Entity;
import outputter.data.EntityProposals;
import outputter.data.FormalConcept;
import outputter.data.REntity;
import outputter.data.SimpleEntity;
import outputter.knowledge.Dictionary;
import outputter.knowledge.TermOutputerUtilities;
/**
* @author Hong Cui
* Searches different variations of the entity (E) / entity locator (EL) compositions using all the elements.
*
* For example:
* input: e:postaxial process, el:modifier fibula
* generate variations like:
* 1. (postaxial|syn_ring) (process|crest|syn_ring) of modifier (fibula|fibular|adj)
* 2. modifier (fibula|fibular|adj) (postaxial|syn_ring) (process|crest|syn_ring)
* 3. (postaxial|syn_ring) modifier (fibula|fibular|adj) (process|crest|syn_ring)
* 4. modifier (postaxial|syn_ring) (fibula|fibular|adj) (process|crest|syn_ring)
*
*/
public class EntitySearcher1 extends EntitySearcher {
/** log4j logger of this class */
private static final Logger LOGGER = Logger.getLogger(EntitySearcher1.class);
/** when true, permutation/split diagnostics are printed to System.err */
private static boolean debug_permutation = false;
/** results of earlier searches, keyed by entityphrase+"+"+elocatorphrase */
private static final Hashtable<String, ArrayList<EntityProposals>> cache = new Hashtable<String, ArrayList<EntityProposals>>();
/**
 * keys (same format as in cache) of earlier searches that found nothing.
 * Kept as a hash set because it is probed at the start of every search;
 * only contains() and add() are used on it.
 */
private static final Set<String> nomatchcache = new HashSet<String>();
/** confidence scores not above this value are treated as partial matches; it is also the factor applied to the scores of partial matches */
private static final float partial = 0.8f;
//boolean debug = true;
/**
 * Creates a searcher. Nothing is initialized per instance here;
 * the result caches of this class are static.
 */
public EntitySearcher1() {
    super();
}
//TODO patterns s0fd16381: maxillae, anterior end of
//entityphrase could be reg exp such as (?:A of B| B A) of (?: C D | D of C) or a simple string
/**
 * Searches for entity proposals matching an entity phrase and its entity locator phrase.
 *
 * The search runs in stages:
 * <ol>
 * <li>answer from the static caches when the same entityphrase/elocatorphrase pair was searched before;</li>
 * <li>split the phrases into components, permute them into pre-composed variations and look each
 *     variation up with TermSearcher; any hit is cached and returned right away;</li>
 * <li>otherwise try post-composition: EntityEntityLocatorStrategy when the entity phrase does not start
 *     with a spatial term, SpatialModifiedEntityStrategy when it does;</li>
 * <li>run EntitySearcher4 and merge its proposals;</li>
 * <li>when nothing was found, or no proposal scores above {@code partial}, add the best partial
 *     matches collected in stage 3 with lowered scores.</li>
 * </ol>
 *
 * @param root passed through to the strategies and to EntitySearcher4
 * @param structid passed through to the strategies and to EntitySearcher4
 * @param entityphrase entity phrase to search; together with elocatorphrase it forms the cache key
 * @param elocatorphrase entity locator phrase, may be empty
 * @param originalentityphrase set as the phrase of proposals built from pre-composed matches; also passed through
 * @param prep passed through to the strategies and to EntitySearcher4
 * @return the proposals found (this same list is cached and returned again for later identical searches), or null when nothing matched
 * @see EntitySearcher#searchEntity(Element, String, String, String, String, String)
 */
@Override
public ArrayList<EntityProposals> searchEntity(Element root, String structid,
        String entityphrase, String elocatorphrase,
        String originalentityphrase, String prep) {
    LOGGER.debug("EntitySearcher1: search '"+entityphrase+"[orig="+originalentityphrase+"]'");
    //search cache
    final String cachekey = entityphrase+"+"+elocatorphrase;
    if(EntitySearcher1.nomatchcache.contains(cachekey)) return null;
    ArrayList<EntityProposals> cached = EntitySearcher1.cache.get(cachekey);
    if(cached!=null) return cached;

    ArrayList<EntityProposals> entities = null;
    EntityProposals ep = new EntityProposals(); //search results

    //save phrases as components: each component is an entity or an entity locator
    EntityComponents ecs = new EntityComponents(entityphrase, elocatorphrase);
    ArrayList<EntityComponent> components = ecs.getComponents();

    //construct pre-composed variations: selected permutations without repetition
    ArrayList<String> variations = new ArrayList<String>();
    permutation(components, variations);
    LOGGER.debug("...created variations");
    for(String variation : variations)
        LOGGER.debug("....."+variation);

    //search variations for pre-composed terms one by one, collect all the results
    boolean found = false;
    for(String variation: variations){
        LOGGER.debug("...search variation '"+variation+"'");
        ArrayList<FormalConcept> entityfcs = new TermSearcher().searchTerm(variation, "entity");
        if(entityfcs!=null){
            for(FormalConcept entity:entityfcs){
                if(entity!=null){
                    found = true;
                    ep.setPhrase(originalentityphrase);
                    ep.add((Entity)entity); //all variations are alternative entities (i.e. proposals) for the phrase
                }
            }
            LOGGER.debug("...found match: "+ep.toString());
        }
    }
    if(found){
        entities = new ArrayList<EntityProposals>();
        Utilities.addEntityProposals(entities, ep);
        EntitySearcher1.cache.put(cachekey, entities);
        return entities;
    }

    //failed to find pre-composed terms, try to post-compose using part_of
    //(the attachment of spatial terms to parent entity is different from attachment of a child entity to parent entity)
    //TODO: need more work: what's entityphrase and elocatorphrase?
    ArrayList<EntityProposals> bestpartialresults = null;
    Pattern p = Pattern.compile("^("+Dictionary.spatialtermptn+")\\b\\s*\\b("+Dictionary.allSpatialHeadNouns()+")?\\b");
    boolean startwithspatial = p.matcher(entityphrase).find();

    if(!startwithspatial){
        LOGGER.debug(System.getProperty("line.separator")+"EntitySearcher1 calls EntityEntityLocatorStrategy");
        if(components.size()==1){
            //one component only: split each " of " permutation into entity and entity locator at every possible position
            ArrayList<String> perms = components.get(0).getPermutations(); //perms are not reg exps
            for(String perm : perms){
                if(perm.indexOf(" of ")<0) continue; //there must be another variation with " of " that is equivalent to this variation
                if(debug_permutation) System.err.println("variation to split: "+perm);
                String[] parts = perm.split("\\s+of\\s+");
                if(parts.length>1){
                    for(int l = 0; l < parts.length-1; l++){ //l: index of the last part belonging to the entity
                        String aentityphrase = Utilities.join(parts, 0, l, " of ");
                        String aelocatorphrase = Utilities.join(parts, l+1, parts.length-1, " of ");
                        System.out.println("..EEL search: entity '"+aentityphrase+"' and locator '"+aelocatorphrase+"'");
                        LOGGER.debug("..EEL search: entity '"+aentityphrase+"' and locator '"+aelocatorphrase+"'");
                        EntityEntityLocatorStrategy eels = new EntityEntityLocatorStrategy(root, structid, aentityphrase, aelocatorphrase, originalentityphrase, prep);
                        eels.handle();
                        ArrayList<EntityProposals> entity = eels.getEntities(); //a list of different entities: both sexes => female and male
                        if(entity != null){
                            entities = appendProposals(entities, entity);
                        }else{
                            bestpartialresults = mergePartialMatches(bestpartialresults, eels.getEntityResult(), eels.getEntityLocatorResult(), "EEL");
                        }
                    }
                }
            }
        }else{
            //multiple components: use the first n as entity, the remaining as entity locator
            //form simple strings, not reg exps, for entity and locator
            for(int n = 1; n < components.size(); n++){
                String aentityphrase = joinFirstAlternatives(components, 0, n);
                String aelocatorphrase = joinFirstAlternatives(components, n, components.size()).trim();
                LOGGER.debug("ES1->EEL...entity:'"+aentityphrase+"' entitylocator:'"+aelocatorphrase+"'");
                //NOTE(review): this tests the caller's elocatorphrase, not aelocatorphrase; kept as is because the
                //intent (possibly a guard against endless recursion through the strategy) cannot be confirmed from this file
                if(elocatorphrase.length()>0){
                    EntityEntityLocatorStrategy eels = new EntityEntityLocatorStrategy(root, structid, aentityphrase, aelocatorphrase, originalentityphrase, prep);
                    eels.handle();
                    ArrayList<EntityProposals> entity = eels.getEntities(); //a list of different entities: both sexes => female and male
                    if(entity != null){
                        entities = appendProposals(entities, entity);
                    }else{
                        bestpartialresults = mergePartialMatches(bestpartialresults, eels.getEntityResult(), eels.getEntityLocatorResult(), "EEL");
                    }
                }
            }
        }
    }

    //deal with spatial expressions
    if(startwithspatial){
        LOGGER.debug(System.getProperty("line.separator")+"EntitySearcher1 calls SpatialModifiedEntityStrategy");
        //TODO: need more work: what's entityphrase and elocatorphrase?
        SpatialModifiedEntityStrategy smes = new SpatialModifiedEntityStrategy(root, structid, entityphrase, elocatorphrase, originalentityphrase, prep);
        smes.handle();
        ArrayList<EntityProposals> entity = smes.getEntities();
        if(entity != null){
            entities = appendProposals(entities, entity);
        }else{
            bestpartialresults = mergePartialMatches(bestpartialresults, smes.getEntityResult(), smes.getEntityLocatorResult(), "SME");
        }
    }

    LOGGER.debug(System.getProperty("line.separator")+"EntitySearcher1 calls EntitySearcher4");
    ArrayList<EntityProposals> entity = new EntitySearcher4().searchEntity(root, structid, entityphrase, elocatorphrase, originalentityphrase, prep);
    //proximal tarsal element:
    //SpaticalModifiedEntity: phrase=proximal region entity=proximal region score=1.0 and (part_of some phrase=tarsal\b.* entity=tarsal bone score=0.5)
    //EntitySearcher5: phrase=proximal tarsal\b.* entity=proximal tarsal bone score=0.5
    //TODO: save both or select one?
    if(entity!=null){
        entities = appendProposals(entities, entity);
    }else{
        LOGGER.debug("ES4.. found no match");
    }

    //fall back on the partial matches when there is nothing better
    if(entities == null || isOriginatedFromPartialResults(entities)){
        if(bestpartialresults!=null){
            LOGGER.debug("..no better match, use bestpartialresults:");
            System.out.println("..no better match, use bestpartialresults:");
            bestpartialresults = removeRedundancy(bestpartialresults);
            bestpartialresults = lowerscore(bestpartialresults);
            if(entities==null) entities = new ArrayList<EntityProposals>();
            entities.addAll(bestpartialresults);
            for(EntityProposals aep: entities){
                LOGGER.debug("..:"+aep.toString());
                System.out.println("..:"+aep.toString());
            }
        }
    }

    //logging
    if(entities!=null){
        LOGGER.debug(System.getProperty("line.separator")+"EntitySearcher1 completed search for '"+entityphrase+"[orig="+originalentityphrase+"]' and returns:");
        for(EntityProposals aep: entities){
            LOGGER.debug("..:"+aep.toString());
        }
    }

    //caching
    if(entities==null) EntitySearcher1.nomatchcache.add(cachekey);
    else EntitySearcher1.cache.put(cachekey, entities);
    return entities;
}

/**
 * Adds each of the given proposals to the result list through Utilities.addEntityProposals,
 * creating the result list first when it does not exist yet.
 * @param entities result list so far, may be null
 * @param additions proposals to add, not null
 * @return the (possibly newly created) result list
 */
private static ArrayList<EntityProposals> appendProposals(ArrayList<EntityProposals> entities, ArrayList<EntityProposals> additions) {
    if(entities==null) entities = new ArrayList<EntityProposals>();
    for(EntityProposals aep: additions){
        Utilities.addEntityProposals(entities, aep);
    }
    return entities;
}

/**
 * Folds the partial results of a strategy that failed to compose an entity into the running
 * list of best partial matches, then logs and prints the running list.
 * The entity result is preferred; the entity locator result is used only when the entity result is null.
 * @param bestpartialresults partial matches collected so far, may be null
 * @param eresult the strategy's entity result, may be null
 * @param elresult the strategy's entity locator result, may be null
 * @param source short strategy label used in the log lines ("EEL" or "SME")
 * @return the updated partial matches; null when there still are none
 */
private ArrayList<EntityProposals> mergePartialMatches(ArrayList<EntityProposals> bestpartialresults,
        ArrayList<EntityProposals> eresult, ArrayList<EntityProposals> elresult, String source) {
    ArrayList<EntityProposals> best = null;
    if(eresult!=null) best = removeRedundancy(eresult);
    else if(elresult!=null) best = removeRedundancy(elresult);
    if(best!=null){
        if(bestpartialresults==null){
            bestpartialresults = new ArrayList<EntityProposals>();
            bestpartialresults.addAll(best);
        }else{
            bestpartialresults.addAll(best);
            bestpartialresults = removeRedundancy(bestpartialresults);
        }
    }
    if(bestpartialresults!=null){
        LOGGER.debug(".."+source+" return partial matches");
        System.out.println(".."+source+" return partial matches");
        for(EntityProposals aep: bestpartialresults){
            LOGGER.debug("..:"+aep.toString());
            System.out.println("..:"+aep.toString());
        }
    }
    return bestpartialresults;
}

/**
 * Builds a plain (non reg exp) phrase from components[from..to): takes the first alternative
 * (the text before the first '|') of each component phrase, strips the characters ( : ? ) from it,
 * and joins the results with " of ".
 * @param components the components of the search phrases
 * @param from index of the first component to use
 * @param to index after the last component to use
 * @return the joined phrase, not trimmed
 */
private static String joinFirstAlternatives(ArrayList<EntityComponent> components, int from, int to) {
    String joined = "";
    for(int i = from; i < to; i++){
        String first = components.get(i).getPhrase().split("\\|")[0];
        first = first.replaceAll("[(:?)]", "");
        joined += first+" of ";
    }
    return joined.replaceFirst(" of $", "");
}
/**
 * Tells whether a result list consists of partial-match proposals only.
 * An entity counts as partial when its confidence score is not above {@code partial}.
 * @param entities the proposals to inspect
 * @return false as soon as one entity scores above {@code partial}, true otherwise (also when there is nothing to inspect)
 */
private boolean isOriginatedFromPartialResults(
        ArrayList<EntityProposals> entities) {
    boolean onlypartial = true;
    scan:
    for(EntityProposals proposals: entities){
        for(Entity candidate: proposals.getProposals()){
            if(candidate.getConfidenceScore() > partial){
                onlypartial = false; //one confident entity is enough
                break scan;
            }
        }
    }
    return onlypartial;
}
/**
 * Scales down the confidence score of every entity in the given partial results:
 * each score is multiplied by {@code partial}. The entities are modified in place.
 * @param bestpartialresults the partial results whose scores are to be lowered
 * @return the same list that was passed in
 */
private ArrayList<EntityProposals> lowerscore(
        ArrayList<EntityProposals> bestpartialresults) {
    for(EntityProposals proposals: bestpartialresults){
        for(Entity candidate: proposals.getProposals()){
            candidate.setConfidenceScore(partial * candidate.getConfidenceScore());
        }
    }
    return bestpartialresults;
}
/**
 * Removes redundant matches of parts. Of two proposals, the later one is redundant when the two
 * are equal or when the content of the earlier one contains the content of the later one; the
 * earlier one is redundant when its content is contained in the content of the later one.
 *
 * Redundant proposals are tracked by position and removed by position. Removing them with
 * remove(Object) is wrong here: a proposal flagged by several pairs would be removed several times,
 * each time taking away the first equal element still in the list, so that e.g. three identical
 * proposals left an empty list instead of one survivor.
 *
 * @param bestpartialresults the proposals to clean; the list is modified in place
 * @return the same list, holding only the unique, longest matches
 */
private ArrayList<EntityProposals> removeRedundancy(
        ArrayList<EntityProposals> bestpartialresults) {
    int size = bestpartialresults.size();
    boolean[] redundant = new boolean[size];
    for(int i = 0; i < size; i++){
        EntityProposals ep1 = bestpartialresults.get(i);
        for(int j = i+1; j < size; j++){
            EntityProposals ep2 = bestpartialresults.get(j);
            if(ep1.equals(ep2) || ep1.content().contains(ep2.content())){
                redundant[j] = true;
            }else if (ep2.content().contains(ep1.content())){
                redundant[i] = true;
            }
        }
    }
    //remove from the back so the positions still to be visited stay valid
    for(int i = size-1; i >= 0; i--){
        if(redundant[i]) bestpartialresults.remove(i);
    }
    return bestpartialresults;
}
/**
* find the best (covers the most concepts in the original search phrases) proposals
* P1:phrase=(?:lobe) entity=lobe score=1.0 and (part_of some phrase=(?:(?:caudal) (?:fin)) entity=caudal fin score=1.0)
* P2:phrase=(?:caudal) (?:fin) .*? (?:lobe) entity=caudal fin upper lobe score=1.0
P3:phrase=(?:caudal) (?:fin) .*? (?:lobe) entity=caudal fin lower lobe score=1.0
* @param proposals
* @param elocatorphrase
* @param entityphrase
* @return a set of best proposals
*/
/*private ArrayList<EntityProposals> best(ArrayList<EntityProposals> proposals) {
Hashtable<String, Set<Entity>> scores = new Hashtable<String, Set<Entity>>();
int max = -1;
for(EntityProposals ep: proposals){
for(Entity e: ep.getProposals()){
String tokens = getTokens(e, "");
String[] covered = tokens.trim().split("\\s+");
int c = (new HashSet<String>(Arrays.asList(covered))).size();
Set<Entity> ps = scores.get(c+"");
if(ps==null){
ps = new HashSet<Entity>();
}
ps.add(e);
scores.put(c+"", ps);
if(c>max) max = c;
}
}
EntityProposals best = new EntityProposals();
for(Entity e: scores.get(max+""))
best.add(e);
ArrayList<EntityProposals> results = new ArrayList<EntityProposals>();
results.add(best);
return results;
}*/
/**
 * Collects the tokens of the phrases an entity was matched on.
 * <ul>
 * <li>SimpleEntity: the text of getString() before the first '|' (the whole string when there is no '|'),
 *     with "(?:", ")" and ".*?" removed, followed by one space; the tokens argument is not used.</li>
 * <li>CompositeEntity: the tokens argument followed by the tokens of every member entity, each exactly once.
 *     Members are collected starting from an empty string, because handing the running total down
 *     would make nested composites repeat everything gathered so far.</li>
 * <li>REntity: the result for the wrapped entity.</li>
 * </ul>
 * @param e the entity to read the tokens from
 * @param tokens tokens collected so far; callers start with ""
 * @return the tokens; may contain a trailing space and multiple spaces
 */
private String getTokens(Entity e, String tokens) {
    if(e instanceof SimpleEntity){
        String s = e.getString();
        int bar = s.indexOf("|");
        //presumably getString() always has the form "phrase|..."; tolerate a missing '|' instead of failing in substring
        String matchedphrase = bar >= 0 ? s.substring(0, bar) : s;
        return matchedphrase.replaceAll("(\\(\\?:|\\)|\\.\\*\\?)", "")+" ";
    }
    if(e instanceof CompositeEntity){
        ArrayList<Entity> es = ((CompositeEntity) e).getEntities();
        for(Entity e1: es){
            tokens += getTokens(e1, "");
        }
        return tokens;
    }
    if(e instanceof REntity){
        return getTokens(((REntity) e).getEntity(), tokens);
    }
    return tokens;
}
/**
 * Generates the selected permutations of the component phrases and appends them to variations.
 * The index markers "(n)" that the recursive worker puts into the sequences are stripped from
 * every element of variations before returning, and each element is trimmed.
 * An empty component list adds nothing (the worker would otherwise look up the component at index -1).
 *
 * Examples:
 * 'posterior radials,anterior dorsal fin' generated 2 variations:
 * ..(?:(?:posterior|posterior side) (?:radials)) of (?:(?:fin) of (?:anterior|anterior side) (?:dorsal|dorsal side)|(?:anterior|anterior side) (?:dorsal|dorsal side) (?:fin))
 * ..(?:(?:fin) of (?:anterior|anterior side) (?:dorsal|dorsal side)|(?:anterior|anterior side) (?:dorsal|dorsal side) (?:fin)) (?:(?:posterior|posterior side) (?:radials))
 *
 * 'ventral radial crest,' generated 1 variation:
 * ..(?:(?:process|crest|ridge|tentacule|shelf|flange|ramus) of (?:ventral|ventral side) (?:radial|radius)|(?:ventral|ventral side) (?:radial|radius) (?:process|crest|ridge|tentacule|shelf|flange|ramus))
 *
 * 'posterior postfrontal,' generated 1 variation:
 * 1. (?:(?:posterior|posterior side) (?:postfrontal))
 *
 * @param components the components to permute; not modified
 * @param variations receives the generated variations
 */
public static void permutation(ArrayList<EntityComponent> components, ArrayList<String> variations) {
    if(!components.isEmpty()){
        permutation("", components, variations, clone(components), -1);
    }
    //remove indexes
    Pattern indexmarker = Pattern.compile("\\(-?\\d+\\)");
    for(int i = 0; i < variations.size(); i++){
        variations.set(i, indexmarker.matcher(variations.get(i)).replaceAll("").trim());
    }
}
/**
 * Recursive worker that collects permutations of a list of components.
 * Each call appends one of the remaining components to the sequence and recurses on the rest;
 * a finished sequence is kept unless it ends in a spatial component that is not the original component 0.
 * @param prefix the output sequence built so far
 * @param components the components still to be added to the sequence
 * @param variations the permutations collected so far
 * @param clone the original, complete list of components
 * @param lastindex index in clone of the component added last to prefix (-1 before the first one)
 */
private static void permutation(String prefix, ArrayList<EntityComponent> components, ArrayList<String> variations, ArrayList<EntityComponent> clone, int lastindex) {
    final int remaining = components.size();
    if (remaining == 0){
        //sequence complete: the last component can not be a spatial term, if there are multiple components (lastindex!=0)
        boolean acceptable = !clone.get(lastindex).isSpatial() || lastindex == 0;
        if(acceptable){
            String finished = prefix+"("+lastindex+")";
            variations.add(finished);
            if(debug_permutation) System.err.println("variation: "+finished);
        }
        return;
    }
    for (int pick = 0; pick < remaining; pick++){
        EntityComponent chosen = components.get(pick);
        //rest = components - chosen
        ArrayList<EntityComponent> rest = new ArrayList<EntityComponent>(components);
        rest.remove(pick);
        int originalindex = clone.indexOf(chosen);
        if(debug_permutation) System.err.println("prefix="+prefix+" new round: i="+pick+ " components size="+rest.size());
        String extended = newPrefix(prefix, lastindex, clone, originalindex, pick, components);
        if(debug_permutation) System.err.println("newprefix="+extended+" new round: i="+pick+ " components size="+rest.size());
        permutation(extended, rest, variations, clone, originalindex);
    }
}
/**
 * Extends the current sequence with the phrase of components.get(i).
 * The index marker of the previously added component, "(lastindex)", is written first; the new
 * phrase then follows either after " of " or after a single space. " of " is used when the previously
 * added component is a structure and comes before the new one in the original list
 * (i.e., a child is put before a parent organ).
 * @param oldprefix the current prefix
 * @param lastindex index in clone of the component added last to the prefix, negative when there is none
 * @param clone the original, complete list of components
 * @param newindex index in clone of the component to be added
 * @param i index in components of the component to be added
 * @param components the components still available at this step
 * @return the extended prefix, trimmed
 */
private static String newPrefix(String oldprefix, int lastindex, ArrayList<EntityComponent> clone, int newindex, int i,
        ArrayList<EntityComponent> components) {
    boolean childbeforeparent = lastindex>=0 && clone.get(lastindex).isStructure() && lastindex < newindex;
    String connector = childbeforeparent ? ") of " : ") ";
    String extended = oldprefix+"("+lastindex+connector+components.get(i).getPhrase();
    return extended.trim();
}
/**
 * Makes a shallow copy of a component list: a new list holding the same component objects in the same order.
 * @param components the list to copy
 * @return the copy
 */
private static ArrayList<EntityComponent> clone(
        ArrayList<EntityComponent> components) {
    return new ArrayList<EntityComponent>(components);
}
/**
* private class
* @author Hong Cui
*
*/
private class EntityComponents{
ArrayList<EntityComponent> components = new ArrayList<EntityComponent>(); //the order of the elements indicate the part of relation, 0 part of 1 part of 2 ...
/**
 * Builds the components for an entity phrase and its entity locator phrase:
 * the two phrases are joined and then split into entity components by joinAndSplit.
 * @param entity the entity phrase
 * @param locator the entity locator phrase
 */
public EntityComponents(String entity, String locator){
    this.components = joinAndSplit(entity, locator);
}
/**
 * Turns entityphrase + elocatorphrase into a list of EntityComponents, one per phrase obtained by
 * splitting the joined text on ',' and " of ".
 *
 * Each phrase is cut into parts: spatial terms/phrases and adjective organs are marked off with '#',
 * e.g. keeping a spatial term and the head noun it modifies together ("dorsal margin") and separating
 * adjective organs ('nasal') from the modified organ ('bone'). When nothing was marked off and the whole
 * phrase is not a known entity term, the phrase is split on spaces instead. The parts are permuted, and
 * the permutations are saved in the component both as a list and as one reg exp alternation "(?:p1|p2|...)".
 *
 * The reg exps used depend only on the dictionaries, not on the phrase, so they are compiled once
 * up front instead of once per phrase / per match / per part.
 *
 * @param entityphrase entities separated by ',', later entities are parent organs of the earlier ones
 * @param elocatorphrase entity locators separated by ',', later entities are parent organs of the earlier ones
 * @return the components, in the order of the phrases in the joined text
 */
private ArrayList<EntityComponent> joinAndSplit(String entityphrase,
        String elocatorphrase) {
    ArrayList<EntityComponent> components = new ArrayList<EntityComponent>();
    entityphrase = entityphrase+","+elocatorphrase; //join
    //group 1: single spatial term, optionally followed by a spatial head noun; group 2: adjective organ
    String singleptn = "((?:"+Dictionary.singlewordspatialtermptn +")\\b\\s*\\b(?:"+Dictionary.allSpatialHeadNouns()+")?\\b\\s*)|"
            + "\\b("+TermOutputerUtilities.adjectiveorganptn+")\\b\\s*";
    String spatialphrasesptn = "((?:"+singleptn+")+)"; //allow selection of either single spatial term, spatial phrase, or organadjective, or combination of spatial and organadjs
    Pattern p = Pattern.compile("(.*?)\\b"+spatialphrasesptn+"\\b(.*)");
    Pattern p1 = Pattern.compile(singleptn);
    Pattern singlespatial = Pattern.compile(Dictionary.singlewordspatialtermptn);
    String[] entityphrases = entityphrase.split("\\s*(,| of )\\s*");
    //order of the phrases matters
    for(String phrase: entityphrases){
        phrase = phrase.trim();
        if(phrase.length()==0) continue;
        String phrasecp = phrase;
        Matcher m = p.matcher(phrase);
        String temp = "";
        while(m.matches()){
            temp += m.group(1); //text before the spatial/adjective run
            //group 5 is the text after the run, assuming the dictionary patterns add no capturing groups of their own - TODO confirm
            phrase = m.group(5);
            String matched = m.group(2); //the run of spatial terms/phrases and adjective organs
            Matcher m1 = p1.matcher(matched);
            while(m1.find()){
                if(m1.group(1)!=null && m1.group(1).length()>0){ //spatial
                    //a spatial phrase (with head noun) is closed off as its own part; a lone spatial term stays attached to what follows
                    if(m1.group(1).trim().indexOf(" ")>0) temp +="#"+m1.group(1)+"#";
                    else temp +="#"+m1.group(1)+" ";
                    matched = matched.substring(m1.end(1)).trim();
                }
                if(m1.group(2)!=null && m1.group(2).length()>0){ //adjective organ
                    temp +="#"+m1.group(2)+"#";
                    matched = matched.substring(m1.end(2)).trim();
                }
                m1 = p1.matcher(matched); //continue on what is left of the run
            }
            m = p.matcher(phrase);
        }
        temp +=phrase.trim();//appending the original string to the tokens separated by #
        temp = temp.trim();
        if(debug_permutation) System.err.println("split&join: '"+phrasecp+"' =>'"+temp+"'");
        temp = temp.replaceAll("\\s+", " ").replaceAll("(^#+|#+$)", "");
        String[] temps =temp.split("\\s*#+\\s*");
        if(temps.length==1){ //if the split didn't split, force split on spaces
            ArrayList<FormalConcept> test = new TermSearcher().searchTerm(phrasecp, "entity");
            if(test==null) temps = temp.split("\\s+");
        }
        ArrayList<EntityComponent> thiscomponents = new ArrayList<EntityComponent>();
        //walk the parts from the last to the first
        for(int i = temps.length-1; i>=0; i--){
            String part = temps[i];
            part = part.trim();
            if(part.length()>0){
                EntityComponent ec = new EntityComponent(part);
                if(part.indexOf(" ")<0 && singlespatial.matcher(part).matches()){
                    ec.isSpatial(true);
                    ec.isStructure(false);
                }
                else{
                    ec.isStructure(true);
                    ec.isSpatial(false);
                }
                thiscomponents.add(ec);
            }
        }
        //permute parts in the phrase
        ArrayList<String> permutations = new ArrayList<String>();
        EntitySearcher1.permutation(thiscomponents, permutations);
        //save EntityComponent
        String thephrase = "";
        for(String permu : permutations){ //A B; B of A
            thephrase += permu+"|"; //A B|B of A
        }
        thephrase = "(?:"+thephrase.replaceFirst("\\|$", "").trim()+")";
        EntityComponent ec = new EntityComponent(thephrase);
        ec.isStructure(true); //each phrase representing a structure
        ec.setPermutations(permutations);
        components.add(ec);
    }
    return components;
}
/**
* sort organs so parent organs come later
* turn 'ventral radial process' to 'process, ventral radial'
* turn 'radial ventral region' to 'ventral region, radial'
* @param phrases: phrases without comma or 'of'
* @return sorted list of strings
*/
/*private ArrayList<String> sort(String[] phrases){
ArrayList<String> sorted = new ArrayList<String>();
for(String phrase: phrases){
Pattern p = Pattern.compile("(.*?)\\b("+Dictionary.spatialtermptn+"|"+TermOutputerUtilities.adjectiveorganptn+")\\b(.*)");
Matcher m = p.matcher(phrase);
String temp = "";
while(m.matches()){
temp += m.group(1)+"#"+m.group(2)+"#";
phrase = m.group(3);
m = p.matcher(phrase);
}
temp +=phrase.trim();//appending the original string to the tokens separated by #
String[] temps = temp.split("\\s*#\\s*");
for(int)
}
return sorted;
}*/
/**
* Set the syn ring for each component. Treat syn rings for different permutations the alternatives in the syn ring
* @param components
*/
/*private void setSynRings(ArrayList<EntityComponent> components) {
for(EntityComponent component: components){
String synring = "";
ArrayList<String> permus = component.getPermutations();
for(String permu : permus){ //A B; B A
//synring += getSynRing4Phrase(permu)+"|"; //(A|A1|A2) (B|B1)|(B|B1) (A|A1|A2)
synring += permu+"|"; //(A|A1|A2) (B|B1)|(B|B1) (A|A1|A2)
}
component.setSynRing("(?:"+synring.replaceFirst("\\|$", "").trim()+")");
}
}*/
/**
 * Gives access to the components built by the constructor.
 * @return the list of components held by this object (the list itself, not a copy)
 */
public ArrayList<EntityComponent> getComponents(){
    return components;
}
/**
 * @param index position of the wanted component in the component list
 * @return the component at that position
 */
public EntityComponent getComponent(int index){
    EntityComponent wanted = components.get(index);
    return wanted;
}
/**
 * @param c the component to look for
 * @return the position of c in the component list as reported by the list's indexOf
 */
public int indexOf(EntityComponent c){
    int position = components.indexOf(c);
    return position;
}
/**
 * Tells whether the first component contains a spatial term. Only the first component is
 * examined (the loop over all components is disabled in the original code).
 * Examples:
 * [dorsal radials, posterior dorsal fin] => true
 * [anterior process, maxilla] => true
 * @return true when the phrase of the first component contains a spatial term;
 *         false otherwise, including when there are no components at all
 */
public boolean containsSpatial(){
    if(components.isEmpty()) return false; //nothing to examine; get(0) would throw
    return components.get(0).getPhrase().matches(".*?\\b("+Dictionary.spatialtermptn+")\\b.*");
}
}
/**
* private class
* @author Hong Cui
*
*/
private class EntityComponent{
//String synring; //for the component and is the permutations concatenated as alternatives
String phrase; //e.g. posterior dorsal fin, or fin
ArrayList<String> permutations; // permutations of the parts (represented as synrings) in the phrase
boolean spatial = false;
boolean structure = false;
public EntityComponent(String phrase){ this.phrase = phrase;}
public String getPhrase(){return this.phrase;}
public void setPermutations(ArrayList<String> permutations) {
this.permutations = permutations;
}
public ArrayList<String> getPermutations() {
return this.permutations;
}
/**
*
* @param synring
*/
//public void setSynRing(String synring) {this.synring = synring;}
/**
* used only for one-word spatial terms
* @return
*/
public void isSpatial(boolean isspatial) {this.spatial = isspatial;}
/**
* used for one-word or n-word phrases
* @return
*/
public void isStructure(boolean isstructure) {this.structure = isstructure;}
//public String getSynRing(){ return this.synring;}
/**
* used only for one-word spatial terms
* @return
*/
public boolean isSpatial(){return spatial;}
/**
* used for one-word or n-word phrases
* @return
*/
public boolean isStructure(){return structure;}
}
/**
 * Manual test driver: parses a hard-coded XML file, runs one search with hard-coded
 * phrases and prints the proposals. Parse failures are logged and end the run quietly.
 * @param args not used
 */
public static void main(String[] args) {
    //Posterior radials in posterior-dorsal-fin
    EntitySearcher1 eso = new EntitySearcher1();
    String src = "C:/Users/updates/CharaParserTest/EQ-patterns_FixedGloss/target/test/011_patterns.xml_se063847a-767d-434d-aedd-fd4bce8f5cb3.xml";
    SAXBuilder builder = new SAXBuilder();
    Document xml = null;
    try {
        xml = builder.build(new File(src));
    } catch (JDOMException e) {
        LOGGER.error("", e);
    } catch (IOException e) {
        LOGGER.error("", e);
    }
    if(xml!=null){
        Element root = xml.getRootElement();
        String structid ="o1141";
        //other phrases tried: "posterior postfrontal", "heterocercal", "posterior supraorbital postfrontal",
        //and "posterior radials" with elocatorphrase "anterior dorsal fin"
        String entityphrase = "main";
        String elocatorphrase = "";
        String prep = "";
        ArrayList<EntityProposals> eps = eso.searchEntity(root, structid, entityphrase, elocatorphrase, entityphrase, prep);
        System.out.println("final result:");
        if(eps==null){
            //searchEntity returns null when nothing matched
            System.out.println("no match found");
        }else{
            for(EntityProposals ep: eps)
                System.out.println(ep.toString());
        }
    }
}
}
/*if((entityphrase.split("\\s").length>=2)&&(elocatorphrase=="")){
//try out the variations
SynRingVariation entityvariation = new SynRingVariation(entityphrase);
SynRingVariation elocatorvariation = null;
if(elocatorphrase==null || elocatorphrase.length()==0){
//elocatorvariation = new SynRingVariation(elocatorphrase);
}
if(elocatorvariation == null){ //try entityvariation alone
String spatial = entityvariation.getLeadSpaticalTermVariation(); //TODO
String head = entityvariation.getHeadNounVariation(); //TODO remove duplicates
// the below code passes all the spatial and entity variations to termsearcher and get all the matching entities.
ArrayList<FormalConcept> matches = TermSearcher.entityvariationtermsearch(spatial,head);
if(matches.size()>0)
{
EntityProposals entities = new EntityProposals();
for(int i =0; i <matches.size(); i++){
entities.add((Entity)matches.get(i));
}
return entities;
}
}
}*/