package org.molgenis.ontology.sorta.service.impl;
import com.google.common.base.Function;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.Sets;
import org.apache.commons.lang3.StringUtils;
import org.elasticsearch.common.collect.Iterables;
import org.molgenis.data.DataService;
import org.molgenis.data.Entity;
import org.molgenis.data.QueryRule;
import org.molgenis.data.semanticsearch.string.NGramDistanceAlgorithm;
import org.molgenis.data.semanticsearch.string.Stemmer;
import org.molgenis.data.support.QueryImpl;
import org.molgenis.ontology.core.meta.*;
import org.molgenis.ontology.roc.InformationContentService;
import org.molgenis.ontology.sorta.bean.OntologyTermHitEntity;
import org.molgenis.ontology.sorta.meta.OntologyTermHitMetaData;
import org.molgenis.ontology.sorta.service.SortaService;
import org.springframework.beans.factory.annotation.Autowired;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static java.util.Objects.requireNonNull;
import static org.molgenis.data.QueryRule.Operator.*;
import static org.molgenis.ontology.core.meta.OntologyMetaData.ONTOLOGY;
import static org.molgenis.ontology.core.meta.OntologyTermDynamicAnnotationMetaData.ONTOLOGY_TERM_DYNAMIC_ANNOTATION;
import static org.molgenis.ontology.core.meta.OntologyTermMetaData.ONTOLOGY_TERM;
import static org.molgenis.ontology.sorta.meta.OntologyTermHitMetaData.COMBINED_SCORE;
import static org.molgenis.ontology.sorta.meta.OntologyTermHitMetaData.SCORE;
/**
 * {@link SortaService} implementation that looks up ontologies and ontology terms through the {@link DataService}
 * and matches an input entity against the terms of one ontology.
 * <p>
 * Matching combines three strategies: exact matching on ontology term annotations, fuzzy matching on the term
 * synonyms and n-gram matching on the term synonyms. Every candidate is wrapped in an
 * {@link OntologyTermHitEntity} that carries a {@code SCORE} and a {@code COMBINED_SCORE}.
 */
public class SortaServiceImpl implements SortaService
{
	private static final Set<String> ELASTICSEARCH_RESERVED_WORDS = Sets.newHashSet("or", "and", "if");
	private static final String NON_WORD_SEPARATOR = "[^a-zA-Z0-9]";
	private static final String ILLEGAL_CHARACTERS_PATTERN = "[^a-zA-Z0-9 ]";
	private static final String FUZZY_MATCH_SIMILARITY = "~0.8";
	private static final String SINGLE_WHITESPACE = " ";
	private static final int MAX_NUMBER_MATCHES = 50;
	private static final int NUMBER_NGRAM_MATCHES = 10;

	// Global fields that are used by other classes
	public static final String SIGNIFICANT_VALUE = "Significant";
	public static final Character DEFAULT_SEPARATOR = ';';
	public static final String DEFAULT_MATCHING_NAME_FIELD = "Name";
	public static final String DEFAULT_MATCHING_SYNONYM_PREFIX_FIELD = "Synonym";
	public static final String DEFAULT_MATCHING_IDENTIFIER = "Identifier";

	private final DataService dataService;
	private final InformationContentService informationContentService;
	private final OntologyTermHitMetaData ontologyTermHitMetaData;
	private final OntologyTermSynonymFactory ontologyTermSynonymFactory;

	/**
	 * Creates the service. All collaborators are mandatory.
	 *
	 * @throws NullPointerException if any argument is {@code null}
	 */
	@Autowired
	public SortaServiceImpl(DataService dataService, InformationContentService informationContentService,
			OntologyTermHitMetaData ontologyTermHitMetaData,
			OntologyTermSynonymFactory ontologyTermSynonymFactory)
	{
		this.dataService = requireNonNull(dataService);
		this.informationContentService = requireNonNull(informationContentService);
		this.ontologyTermHitMetaData = requireNonNull(ontologyTermHitMetaData);
		this.ontologyTermSynonymFactory = requireNonNull(ontologyTermSynonymFactory);
	}

	/**
	 * Retrieves all ontology entities.
	 *
	 * @return a list with every entity found in the ontology repository
	 */
	@Override
	public Iterable<Entity> getAllOntologyEntities()
	{
		Stream<Entity> findAll = dataService.findAll(ONTOLOGY);
		return findAll.collect(Collectors.toList());
	}

	/**
	 * Retrieves the ontology with the given IRI.
	 *
	 * @param ontologyIri IRI of the ontology
	 * @return the result of the data service lookup for that IRI
	 */
	@Override
	public Entity getOntologyEntity(String ontologyIri)
	{
		return dataService.findOne(ONTOLOGY, new QueryImpl<Entity>().eq(OntologyMetaData.ONTOLOGY_IRI, ontologyIri));
	}

	/**
	 * Retrieves an ontology term by its IRI within the given ontology.
	 *
	 * @param ontologyTermIri IRI of the ontology term
	 * @param ontologyIri     IRI of the ontology the term belongs to
	 * @return the result of the term lookup, or {@code null} when the ontology lookup returned {@code null}
	 */
	@Override
	public Entity getOntologyTermEntity(String ontologyTermIri, String ontologyIri)
	{
		Entity ontologyEntity = getOntologyEntity(ontologyIri);
		if (ontologyEntity != null)
		{
			return dataService.findOne(ONTOLOGY_TERM,
					new QueryImpl<Entity>().eq(OntologyTermMetaData.ONTOLOGY_TERM_IRI, ontologyTermIri).and()
							.eq(OntologyTermMetaData.ONTOLOGY, ontologyEntity));
		}
		return null;
	}

	/**
	 * Finds the ontology terms of the given ontology that best match the input entity.
	 * <p>
	 * Attributes accepted by {@link #isAttrNameValidForLexicalMatch(String)} are matched lexically (fuzzy and n-gram)
	 * against the term synonyms; every other non-empty attribute, except the identifier attribute, is treated as an
	 * ontology annotation (name = attribute name, value = attribute value) and matched exactly.
	 *
	 * @param ontologyIri IRI of the ontology to search in
	 * @param inputEntity entity holding the values to match
	 * @return the matched terms sorted by descending {@code COMBINED_SCORE}; hits without a combined score are
	 * placed last
	 * @throws IllegalArgumentException if no ontology exists for {@code ontologyIri}
	 */
	@Override
	public Iterable<Entity> findOntologyTermEntities(String ontologyIri, Entity inputEntity)
	{
		Entity ontologyEntity = getOntologyEntity(ontologyIri);
		if (ontologyEntity == null)
		{
			throw new IllegalArgumentException("Ontology IRI " + ontologyIri + " does not exist in the database!");
		}

		// a list to store most relevant entities
		List<Entity> relevantEntities = new ArrayList<>();
		// query rules for ontology annotations, e.g. OMIM:124343
		List<QueryRule> rulesForOtherFields = new ArrayList<>();
		// query rules for ontology name and synonyms, e.g. name = proptosis, synonym = protruding eye
		List<QueryRule> rulesForOntologyTermFields = new ArrayList<>();
		List<QueryRule> rulesForOntologyTermFieldsNGram = new ArrayList<>();

		for (String attributeName : inputEntity.getAttributeNames())
		{
			String attributeValue = inputEntity.getString(attributeName);
			if (StringUtils.isEmpty(attributeValue) || attributeName.equalsIgnoreCase(DEFAULT_MATCHING_IDENTIFIER))
			{
				continue;
			}

			// The attribute name is either equal to 'Name' or contains the string 'Synonym' (case-insensitive)
			if (isAttrNameValidForLexicalMatch(attributeName))
			{
				String stemmedQueryString = stemQuery(attributeValue);
				if (StringUtils.isNotEmpty(stemmedQueryString))
				{
					rulesForOntologyTermFields
							.add(new QueryRule(OntologyTermMetaData.ONTOLOGY_TERM_SYNONYM, FUZZY_MATCH,
									fuzzyMatchQuerySyntax(stemmedQueryString)));
					rulesForOntologyTermFieldsNGram
							.add(new QueryRule(OntologyTermMetaData.ONTOLOGY_TERM_SYNONYM, FUZZY_MATCH_NGRAM,
									stemmedQueryString));
				}
			}
			else
			{
				QueryRule queryAnnotationName = new QueryRule(OntologyTermDynamicAnnotationMetaData.NAME, EQUALS,
						attributeName);
				QueryRule queryAnnotationValue = new QueryRule(OntologyTermDynamicAnnotationMetaData.VALUE, EQUALS,
						attributeValue);
				// ((name=OMIM Operator.AND value=124325) Operator.OR (name=HPO Operator.AND value=hp12435))
				if (!rulesForOtherFields.isEmpty())
				{
					rulesForOtherFields.add(new QueryRule(OR));
				}
				rulesForOtherFields.add(new QueryRule(
						Arrays.asList(queryAnnotationName, new QueryRule(AND), queryAnnotationValue)));
			}
		}

		// Find the ontology terms that have the same annotations as the input ontology annotations
		if (!rulesForOtherFields.isEmpty())
		{
			annotationMatchOntologyTerms(inputEntity, ontologyEntity, relevantEntities, rulesForOtherFields);
		}

		// Find the ontology terms based on the lexical similarities
		if (!rulesForOntologyTermFields.isEmpty())
		{
			// The annotation query is unbounded, so the remaining budget can be zero or negative; a non-positive
			// page size is not a meaningful limit, hence the fuzzy query is skipped in that case.
			int pageSize = MAX_NUMBER_MATCHES - relevantEntities.size();
			if (pageSize > 0)
			{
				lexicalMatchOntologyTerms(ontologyIri, inputEntity, ontologyEntity, pageSize,
						rulesForOntologyTermFields, relevantEntities);
			}
		}

		if (!rulesForOntologyTermFieldsNGram.isEmpty())
		{
			lexicalMatchOntologyTerms(ontologyIri, inputEntity, ontologyEntity, NUMBER_NGRAM_MATCHES,
					rulesForOntologyTermFieldsNGram, relevantEntities);
		}

		// Highest combined score first. Hits can lack a combined score (see calculateNGromOTAnnotations), so the
		// comparison is null-safe and puts such hits at the end instead of failing with a NullPointerException.
		relevantEntities.sort(Comparator.comparing((Entity entity) -> entity.getDouble(COMBINED_SCORE),
				Comparator.nullsLast(Comparator.<Double>reverseOrder())));
		return relevantEntities;
	}

	/**
	 * Adds to {@code relevantEntities} the terms of the ontology that carry at least one of the annotations selected
	 * by {@code rulesForOtherFields}.
	 *
	 * @param inputEntity         entity holding the values to match
	 * @param ontologyEntity      ontology the terms must belong to
	 * @param relevantEntities    result list that the hits are appended to
	 * @param rulesForOtherFields query rules selecting the annotation entities
	 */
	private void annotationMatchOntologyTerms(Entity inputEntity, Entity ontologyEntity, List<Entity> relevantEntities,
			List<QueryRule> rulesForOtherFields)
	{
		List<Entity> ontologyTermAnnotationEntities = dataService.findAll(ONTOLOGY_TERM_DYNAMIC_ANNOTATION,
				new QueryImpl<Entity>(rulesForOtherFields).pageSize(Integer.MAX_VALUE)).collect(Collectors.toList());

		if (!ontologyTermAnnotationEntities.isEmpty())
		{
			List<QueryRule> rules = Arrays
					.asList(new QueryRule(OntologyTermMetaData.ONTOLOGY, EQUALS, ontologyEntity), new QueryRule(AND),
							new QueryRule(OntologyTermMetaData.ONTOLOGY_TERM_DYNAMIC_ANNOTATION, IN,
									ontologyTermAnnotationEntities));

			Stream<Entity> ontologyTermEntities = dataService
					.findAll(ONTOLOGY_TERM, new QueryImpl<Entity>(rules).pageSize(Integer.MAX_VALUE));

			List<Entity> relevantOntologyTermEntities = ontologyTermEntities
					.map(ontologyTermEntity -> calculateNGromOTAnnotations(inputEntity, ontologyTermEntity))
					.collect(Collectors.toList());

			relevantEntities.addAll(relevantOntologyTermEntities);
		}
	}

	/**
	 * Queries the terms of the ontology with a DIS_MAX combination of the given synonym rules, scores every result
	 * and appends the hits that are not yet contained in {@code relevantEntities}.
	 *
	 * @param ontologyIri                IRI of the ontology, used for scoring
	 * @param inputEntity                entity holding the values to match
	 * @param ontologyEntity             ontology the terms must belong to
	 * @param pageSize                   maximum number of terms to request
	 * @param rulesForOntologyTermFields synonym query rules to combine
	 * @param relevantEntities           result list that new hits are appended to
	 */
	private void lexicalMatchOntologyTerms(String ontologyIri, Entity inputEntity, Entity ontologyEntity, int pageSize,
			List<QueryRule> rulesForOntologyTermFields, List<Entity> relevantEntities)
	{
		QueryRule disMaxQueryRule = new QueryRule(rulesForOntologyTermFields);
		disMaxQueryRule.setOperator(DIS_MAX);

		List<QueryRule> finalQueryRules = Arrays
				.asList(new QueryRule(OntologyTermMetaData.ONTOLOGY, EQUALS, ontologyEntity), new QueryRule(AND),
						disMaxQueryRule);

		Stream<Entity> lexicalMatchedOntologyTermEntities = dataService
				.findAll(ONTOLOGY_TERM, new QueryImpl<Entity>(finalQueryRules).pageSize(pageSize))
				.map(ontologyTerm -> addLexicalScoreToMatchedEntity(inputEntity, ontologyTerm, ontologyIri));

		// NOTE(review): de-duplication relies on equals() of the hit entities - confirm that
		// OntologyTermHitEntity defines value equality, otherwise duplicates are never detected.
		lexicalMatchedOntologyTermEntities.forEach(matchedEntity ->
		{
			if (!relevantEntities.contains(matchedEntity))
			{
				relevantEntities.add(matchedEntity);
			}
		});
	}

	/**
	 * Wraps the ontology term in a hit entity whose {@code SCORE} and {@code COMBINED_SCORE} are the highest values
	 * obtained over all lexically matchable, non-empty attributes of the input entity.
	 *
	 * @param inputEntity  entity holding the values to match
	 * @param ontologyTerm matched ontology term
	 * @param ontologyIri  IRI of the ontology the term belongs to
	 * @return hit entity with both scores set (0 when nothing could be scored)
	 */
	Entity addLexicalScoreToMatchedEntity(Entity inputEntity, Entity ontologyTerm, String ontologyIri)
	{
		double maxNgramScore = 0;
		double maxNgramIDFScore = 0;
		for (String inputAttrName : inputEntity.getAttributeNames())
		{
			String queryString = inputEntity.getString(inputAttrName);
			if (StringUtils.isNotEmpty(queryString) && isAttrNameValidForLexicalMatch(inputAttrName))
			{
				Entity topMatchedSynonymEntity = findSynonymWithHighestNgramScore(ontologyIri, queryString,
						ontologyTerm);
				// null means the ontology term has no synonyms, so there is nothing to score for this attribute
				if (topMatchedSynonymEntity == null)
				{
					continue;
				}
				if (maxNgramScore < topMatchedSynonymEntity.getDouble(SCORE))
				{
					maxNgramScore = topMatchedSynonymEntity.getDouble(SCORE);
				}
				if (maxNgramIDFScore < topMatchedSynonymEntity.getDouble(COMBINED_SCORE))
				{
					maxNgramIDFScore = topMatchedSynonymEntity.getDouble(COMBINED_SCORE);
				}
			}
		}
		OntologyTermHitEntity mapEntity = new OntologyTermHitEntity(ontologyTerm, ontologyTermHitMetaData);
		mapEntity.set(SCORE, maxNgramScore);
		mapEntity.set(COMBINED_SCORE, maxNgramIDFScore);
		return mapEntity;
	}

	/**
	 * A helper function to check if the ontology term (OT) contains the ontology annotations provided in input. If the
	 * OT has the same annotation (name and value compared case-insensitively), the OT is considered a good match and
	 * both similarity scores are set to 100. If no annotation matches, the hit is returned without any score being
	 * set by this method.
	 *
	 * @param inputEntity        entity holding the values to match
	 * @param ontologyTermEntity ontology term whose annotations are inspected
	 * @return hit entity wrapping the ontology term
	 */
	private Entity calculateNGromOTAnnotations(Entity inputEntity, Entity ontologyTermEntity)
	{
		OntologyTermHitEntity mapEntity = new OntologyTermHitEntity(ontologyTermEntity, ontologyTermHitMetaData);
		for (Entity annotationEntity : ontologyTermEntity
				.getEntities(OntologyTermMetaData.ONTOLOGY_TERM_DYNAMIC_ANNOTATION))
		{
			String annotationName = annotationEntity.getString(OntologyTermDynamicAnnotationMetaData.NAME);
			String annotationValue = annotationEntity.getString(OntologyTermDynamicAnnotationMetaData.VALUE);
			for (String attributeName : inputEntity.getAttributeNames())
			{
				if (StringUtils.isNotEmpty(inputEntity.getString(attributeName)) && StringUtils
						.equalsIgnoreCase(attributeName, annotationName) && StringUtils
						.equalsIgnoreCase(inputEntity.getString(attributeName), annotationValue))
				{
					mapEntity.set(SCORE, 100d);
					mapEntity.set(COMBINED_SCORE, 100d);
					return mapEntity;
				}
			}
		}
		return mapEntity;
	}

	/**
	 * A helper function to calculate the best NGram score from the synonyms of an ontology term.
	 *
	 * @param ontologyIri        IRI of the ontology, passed to the information content service
	 * @param queryString        input value to compare the synonyms with
	 * @param ontologyTermEntity ontology term whose synonyms are scored
	 * @return a synonym entity carrying the best (possibly combined) synonym, its {@code SCORE} and its
	 * {@code COMBINED_SCORE}, or {@code null} if the ontology term has no synonyms
	 */
	private Entity findSynonymWithHighestNgramScore(String ontologyIri, String queryString, Entity ontologyTermEntity)
	{
		Iterable<Entity> entities = ontologyTermEntity.getEntities(OntologyTermMetaData.ONTOLOGY_TERM_SYNONYM);
		if (!entities.iterator().hasNext())
		{
			return null;
		}

		String cleanedQueryString = removeIllegalCharWithSingleWhiteSpace(queryString);

		// Calculate the Ngram similarity score for all the synonyms and sort them in descending order
		List<Entity> synonymEntities = FluentIterable.from(entities).transform(new Function<Entity, Entity>()
		{
			@Override
			public Entity apply(Entity ontologyTermSynonymEntity)
			{
				Entity mapEntity = ontologyTermSynonymFactory.create();
				mapEntity.set(ontologyTermSynonymEntity);
				String ontologyTermSynonym = removeIllegalCharWithSingleWhiteSpace(ontologyTermSynonymEntity
						.getString(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR));
				mapEntity
						.set(SCORE, NGramDistanceAlgorithm.stringMatching(cleanedQueryString, ontologyTermSynonym));
				return mapEntity;
			}
		}).toSortedList(new Comparator<Entity>()
		{
			@Override
			public int compare(Entity entity_1, Entity entity_2)
			{
				return entity_2.getDouble(SCORE).compareTo(entity_1.getDouble(SCORE));
			}
		});

		// The list holds one element per synonym and at least one synonym exists, so index 0 is always present
		Entity firstMatchedSynonymEntity = synonymEntities.get(0);
		double topNgramScore = firstMatchedSynonymEntity.getDouble(SCORE);
		String topMatchedSynonym = firstMatchedSynonymEntity
				.getString(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR);

		// the algorithm to combine synonyms to re-calculate the similarity scores to deal with the case where the
		// input query string contains multiple words from different synonyms of the same ontology term. E.g.
		// query string "propotosis, protruding eyeball, Exophthalmos" contains three synonyms of OT (propotosis),
		// if it was matched to each of the synonyms, all the similarity score would be fairly low (25%), therefore
		// need to combine those synonyms to recalculate the similarity score.
		//
		// The idea of the algorithm is quite simple, we add up the current synonym (the most) and next synonym (the
		// second most), if the combined string yields a higher score, the synonyms will be combined together. The
		// same process is repeated until all the synonyms have been checked
		// A --> 30%
		// B --> 25%
		// C --> 20%
		//
		// if(score(a+b, query) > score(a)) combine
		// else move to next synonym
		for (Entity nextMatchedSynonymEntity : Iterables.skip(synonymEntities, 1))
		{
			String nextMatchedSynonym = nextMatchedSynonymEntity
					.getString(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR);

			StringBuilder tempCombinedSynonym = new StringBuilder();
			tempCombinedSynonym.append(topMatchedSynonym).append(SINGLE_WHITESPACE).append(nextMatchedSynonym);

			double newScore = NGramDistanceAlgorithm.stringMatching(cleanedQueryString,
					removeIllegalCharWithSingleWhiteSpace(tempCombinedSynonym.toString()));

			if (newScore > topNgramScore)
			{
				topNgramScore = newScore;
				topMatchedSynonym = tempCombinedSynonym.toString();
			}
		}

		firstMatchedSynonymEntity.set(OntologyTermSynonymMetaData.ONTOLOGY_TERM_SYNONYM_ATTR, topMatchedSynonym);
		firstMatchedSynonymEntity.set(SCORE, topNgramScore);

		// The similarity scores are adjusted based on the inverse document frequency of the words.
		// The idea is that all the words from query string are weighted (important words occur fewer times across
		// all ontology terms than common words), the final score should be compensated for according to the word
		// weight.
		Map<String, Double> weightedWordSimilarity = informationContentService
				.redistributedNGramScore(cleanedQueryString, ontologyIri);
		Set<String> synonymStemmedWords = informationContentService.createStemmedWordSet(topMatchedSynonym);
		Set<String> queryStemmedWords = informationContentService.createStemmedWordSet(cleanedQueryString);

		// Start from the plain n-gram score and add the weight of every query word shared with the synonym
		double combinedScore = topNgramScore;
		for (String word : queryStemmedWords)
		{
			if (synonymStemmedWords.contains(word) && weightedWordSimilarity.containsKey(word))
			{
				combinedScore = combinedScore + weightedWordSimilarity.get(word);
			}
		}
		firstMatchedSynonymEntity.set(COMBINED_SCORE, combinedScore);

		return firstMatchedSynonymEntity;
	}

	/**
	 * Turns a raw input value into a whitespace separated string of unique, stemmed words. The value is lower-cased,
	 * split on non-alphanumeric characters, and stop words, elasticsearch reserved words and words that are empty
	 * after stemming are dropped. The order of the words in the result is not defined because they are collected in
	 * a hash set.
	 *
	 * @param queryString raw input value
	 * @return the stemmed words separated by single spaces, or an empty string if no word is left
	 */
	private String stemQuery(String queryString)
	{
		StringBuilder stringBuilder = new StringBuilder();
		// Locale.ROOT keeps the lower-casing independent of the JVM default locale (e.g. Turkish 'I' -> dotless
		// 'ı', which the ASCII-only separator pattern would then split on)
		Set<String> uniqueTerms = Sets
				.newHashSet(queryString.toLowerCase(Locale.ROOT).trim().split(NON_WORD_SEPARATOR));
		uniqueTerms.removeAll(NGramDistanceAlgorithm.STOPWORDSLIST);
		for (String word : uniqueTerms)
		{
			if (StringUtils.isNotEmpty(word.trim()) && !(ELASTICSEARCH_RESERVED_WORDS.contains(word)))
			{
				String afterStem = Stemmer.stem(removeIllegalCharWithEmptyString(word));
				if (StringUtils.isNotEmpty(afterStem))
				{
					stringBuilder.append(afterStem).append(SINGLE_WHITESPACE);
				}
			}
		}
		return stringBuilder.toString().trim();
	}

	/**
	 * A helper function to produce fuzzy match query with 80% similarity in elasticsearch because PorterStem does not
	 * work in some cases, e.g. the stemming results for placenta and placental are different, therefore would be missed
	 * by elasticsearch. The similarity suffix is appended to every space separated word of the query string.
	 *
	 * @param queryString space separated words
	 * @return the words, each followed by the fuzzy similarity suffix, separated by single spaces
	 */
	private String fuzzyMatchQuerySyntax(String queryString)
	{
		StringBuilder stringBuilder = new StringBuilder();
		for (String word : queryString.split(SINGLE_WHITESPACE))
		{
			stringBuilder.append(word).append(FUZZY_MATCH_SIMILARITY).append(SINGLE_WHITESPACE);
		}
		return stringBuilder.toString().trim();
	}

	/**
	 * Replaces every character that is neither an ASCII letter, a digit nor a space with a single space.
	 */
	private static String removeIllegalCharWithSingleWhiteSpace(String string)
	{
		return string.replaceAll(ILLEGAL_CHARACTERS_PATTERN, SINGLE_WHITESPACE);
	}

	/**
	 * Removes every character that is neither an ASCII letter, a digit nor a space.
	 */
	private static String removeIllegalCharWithEmptyString(String string)
	{
		return string.replaceAll(ILLEGAL_CHARACTERS_PATTERN, StringUtils.EMPTY);
	}

	/**
	 * Tells whether an input attribute takes part in lexical matching: its name equals
	 * {@link #DEFAULT_MATCHING_NAME_FIELD} or contains {@link #DEFAULT_MATCHING_SYNONYM_PREFIX_FIELD} anywhere in the
	 * name, both compared case-insensitively.
	 *
	 * @param attr attribute name
	 * @return {@code true} if the attribute is a name or synonym attribute
	 */
	private boolean isAttrNameValidForLexicalMatch(String attr)
	{
		return StringUtils.equalsIgnoreCase(attr, DEFAULT_MATCHING_NAME_FIELD) || StringUtils
				.containsIgnoreCase(attr, DEFAULT_MATCHING_SYNONYM_PREFIX_FIELD);
	}
}