package org.molgenis.data.semanticsearch.service.impl;
import com.google.common.base.Splitter;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.spell.StringDistance;
import org.elasticsearch.common.base.Joiner;
import org.elasticsearch.common.collect.Lists;
import org.molgenis.data.DataService;
import org.molgenis.data.Entity;
import org.molgenis.data.MolgenisDataAccessException;
import org.molgenis.data.QueryRule;
import org.molgenis.data.QueryRule.Operator;
import org.molgenis.data.meta.MetaDataService;
import org.molgenis.data.meta.model.Attribute;
import org.molgenis.data.meta.model.AttributeMetadata;
import org.molgenis.data.meta.model.EntityType;
import org.molgenis.data.semanticsearch.explain.bean.ExplainedAttribute;
import org.molgenis.data.semanticsearch.explain.bean.ExplainedQueryString;
import org.molgenis.data.semanticsearch.explain.service.ElasticSearchExplainService;
import org.molgenis.data.semanticsearch.semantic.Hit;
import org.molgenis.data.semanticsearch.service.SemanticSearchService;
import org.molgenis.data.semanticsearch.string.NGramDistanceAlgorithm;
import org.molgenis.data.semanticsearch.string.Stemmer;
import org.molgenis.data.support.QueryImpl;
import org.molgenis.ontology.core.model.Ontology;
import org.molgenis.ontology.core.model.OntologyTerm;
import org.molgenis.ontology.core.service.OntologyService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static java.util.Objects.requireNonNull;
import static org.molgenis.data.meta.model.AttributeMetadata.ATTRIBUTE_META_DATA;
public class SemanticSearchServiceImpl implements SemanticSearchService
{
private static final Logger LOG = LoggerFactory.getLogger(SemanticSearchServiceImpl.class);
// Collaborators; all of them are supplied through the constructor, which rejects null values
private final DataService dataService;
private final OntologyService ontologyService;
private final MetaDataService metaDataService;
private final SemanticSearchServiceHelper semanticSearchServiceHelper;
private final ElasticSearchExplainService elasticSearchExplainService;
// Limit passed to the ontology service whenever ontology terms are looked up
private static final int MAX_NUM_TAGS = 100;
// Minimum score a hit must reach in findTags(Attribute, List) to be returned as a tag
private static final float CUTOFF = 0.4f;
// Splits text on every run of non-alphabetic characters
private Splitter termSplitter = Splitter.onPattern("[^\\p{IsAlphabetic}]+");
// Joins terms with a single space
private Joiner termJoiner = Joiner.on(' ');
// IRI of the ontology that decisionTreeToFindRelevantAttributes leaves out when it searches for tags
private static final String UNIT_ONTOLOGY_IRI = "http://purl.obolibrary.org/obo/uo.owl";
// We only explain the top 10 suggested attributes because beyond that the attributes are not high quality anymore
private static final int MAX_NUMBER_EXPLAINED_ATTRIBUTES = 10;
/**
 * Creates the service with all of its collaborators.
 *
 * @param dataService                 used to query attribute metadata
 * @param ontologyService             used to look up ontologies and ontology terms
 * @param metaDataService             used to resolve entity types by name
 * @param semanticSearchServiceHelper used to build the query rules and the expanded query map
 * @param elasticSearchExplainService used to explain why an attribute matched
 * @throws NullPointerException if any argument is {@code null}; the message names the offending argument
 */
@Autowired
public SemanticSearchServiceImpl(DataService dataService, OntologyService ontologyService,
        MetaDataService metaDataService, SemanticSearchServiceHelper semanticSearchServiceHelper,
        ElasticSearchExplainService elasticSearchExplainService)
{
    // Name each argument so a failed injection is immediately diagnosable from the stack trace
    this.dataService = requireNonNull(dataService, "dataService");
    this.ontologyService = requireNonNull(ontologyService, "ontologyService");
    this.metaDataService = requireNonNull(metaDataService, "metaDataService");
    this.semanticSearchServiceHelper = requireNonNull(semanticSearchServiceHelper, "semanticSearchServiceHelper");
    this.elasticSearchExplainService = requireNonNull(elasticSearchExplainService, "elasticSearchExplainService");
}
/**
 * Searches the attributes of {@code sourceEntityType} that match the given query terms and ontology terms.
 * <p>
 * The returned map keeps the order in which the attribute entities are delivered by the data service. Only the
 * first {@code MAX_NUMBER_EXPLAINED_ATTRIBUTES} attributes are explained, because the explain-API can be
 * computationally expensive; every later attribute is added without an explanation.
 *
 * @param sourceEntityType the entity type whose attributes are searched
 * @param queryTerms       the lexical query terms
 * @param ontologyTerms    the ontology terms used to expand the query
 * @return the matched attributes mapped to their (possibly unexplained) {@link ExplainedAttribute}
 */
@Override
public Map<Attribute, ExplainedAttribute> findAttributes(EntityType sourceEntityType, Set<String> queryTerms,
        Collection<OntologyTerm> ontologyTerms)
{
    Iterable<String> attributeIdentifiers = semanticSearchServiceHelper.getAttributeIdentifiers(sourceEntityType);
    QueryRule disMaxQueryRule = semanticSearchServiceHelper
            .createDisMaxQueryRuleForAttribute(queryTerms, ontologyTerms);

    // Always restrict to the attributes of the source entity type; add the dis-max rule only when it has content
    List<QueryRule> finalQueryRules = Lists
            .newArrayList(new QueryRule(AttributeMetadata.ID, Operator.IN, attributeIdentifiers));
    if (!disMaxQueryRule.getNestedRules().isEmpty())
    {
        finalQueryRules.add(new QueryRule(Operator.AND));
        finalQueryRules.add(disMaxQueryRule);
    }

    Stream<Entity> attributeEntities = dataService.findAll(ATTRIBUTE_META_DATA, new QueryImpl<>(finalQueryRules));
    Map<String, String> expandedQueryMap = semanticSearchServiceHelper
            .collectExpandedQueryMap(queryTerms, ontologyTerms);

    Map<Attribute, ExplainedAttribute> result = new LinkedHashMap<>();
    AtomicInteger position = new AtomicInteger(0);
    attributeEntities.forEach(entity ->
    {
        Attribute attribute = sourceEntityType.getAttribute(entity.getString(AttributeMetadata.NAME));
        ExplainedAttribute explainedAttribute;
        if (position.getAndIncrement() >= MAX_NUMBER_EXPLAINED_ATTRIBUTES)
        {
            // Beyond the explanation budget: record the attribute without explaining it
            explainedAttribute = ExplainedAttribute.create(attribute);
        }
        else
        {
            Set<ExplainedQueryString> explanations = convertAttributeEntityToExplainedAttribute(entity,
                    sourceEntityType, expandedQueryMap, finalQueryRules);
            boolean highQuality = isSingleMatchHighQuality(queryTerms,
                    Sets.newHashSet(expandedQueryMap.values()), explanations);
            explainedAttribute = ExplainedAttribute.create(attribute, explanations, highQuality);
        }
        result.put(attribute, explainedAttribute);
    });
    return result;
}
/**
 * Decides whether the explanations describe a high quality match.
 * <p>
 * The match is high quality when at least one query term is a good match (see
 * {@link #isGoodMatch(Map, String)}), or when there is at least one ontology term query that is not also a query
 * term and all of those remaining ontology term queries are good matches.
 * <p>
 * Neither {@code queryTerms} nor {@code ontologyTermQueries} is modified.
 *
 * @param queryTerms          the lexical query terms
 * @param ontologyTermQueries the queries derived from ontology terms
 * @param explanations        the explained query strings; tag names are compared in lower case
 * @return {@code true} if the match is considered high quality
 */
boolean isSingleMatchHighQuality(Collection<String> queryTerms, Collection<String> ontologyTermQueries,
        Iterable<ExplainedQueryString> explanations)
{
    Map<String, Double> matchedTags = new HashMap<>();
    for (ExplainedQueryString explanation : explanations)
    {
        matchedTags.put(explanation.getTagName().toLowerCase(), explanation.getScore());
    }

    // Work on a copy: the caller keeps ownership of ontologyTermQueries, which may even be immutable
    Set<String> remainingOntologyTermQueries = new HashSet<>(ontologyTermQueries);
    remainingOntologyTermQueries.removeAll(queryTerms);

    // anyMatch is false for an empty collection, so no explicit emptiness check is needed here
    if (queryTerms.stream().anyMatch(token -> isGoodMatch(matchedTags, token)))
    {
        return true;
    }
    // allMatch is true for an empty collection, hence the explicit emptiness check
    return !remainingOntologyTermQueries.isEmpty() && remainingOntologyTermQueries.stream()
            .allMatch(token -> isGoodMatch(matchedTags, token));
}
/**
 * Checks whether a label is matched perfectly by the given tags.
 * <p>
 * A label is a good match when the lower-cased label itself has a score whose integer part is 100, or when the
 * label contains at least one word and every space-separated word of it has such a score. Empty words, which
 * arise from leading or repeated spaces, are ignored, so a blank label is never a good match.
 *
 * @param matchedTags lower-cased tag names mapped to their scores
 * @param label       the label to check
 * @return {@code true} if the label, or each of its words, has a perfect score
 */
boolean isGoodMatch(Map<String, Double> matchedTags, String label)
{
    String normalizedLabel = label.toLowerCase();
    if (hasPerfectScore(matchedTags, normalizedLabel))
    {
        return true;
    }
    List<String> words = Arrays.stream(normalizedLabel.split(" ")).filter(word -> !word.isEmpty())
            .collect(Collectors.toList());
    // allMatch is vacuously true for an empty list, so a label without words must be rejected explicitly
    return !words.isEmpty() && words.stream().allMatch(word -> hasPerfectScore(matchedTags, word));
}

/** Returns whether {@code tag} is present in {@code matchedTags} with a score whose integer part is 100. */
private static boolean hasPerfectScore(Map<String, Double> matchedTags, String tag)
{
    Double score = matchedTags.get(tag);
    return score != null && score.intValue() == 100;
}
/**
 * Finds the attributes of {@code sourceEntityType} that are relevant for {@code targetAttribute}.
 * <p>
 * The ontology terms used for the search are chosen as follows:
 * <ol>
 * <li>If search terms are given, the non-blank search terms are escaped and looked up in all ontologies through
 * {@code OntologyService.findExcatOntologyTerms}; the result replaces {@code ontologyTermsFromTags}.</li>
 * <li>Otherwise, if {@code ontologyTermsFromTags} is null or empty, a tag is searched for the target attribute
 * in all ontologies except the unit ontology.</li>
 * <li>Otherwise {@code ontologyTermsFromTags} is used as is.</li>
 * </ol>
 *
 * @param sourceEntityType      the entity type whose attributes are searched
 * @param targetAttribute       the attribute to find matches for
 * @param ontologyTermsFromTags the ontology terms the target attribute is tagged with, may be null or empty
 * @param searchTerms           user defined search terms, may be null or empty
 * @return the matched attributes mapped to their explanation
 */
@Override
public Map<Attribute, ExplainedAttribute> decisionTreeToFindRelevantAttributes(EntityType sourceEntityType,
        Attribute targetAttribute, Collection<OntologyTerm> ontologyTermsFromTags, Set<String> searchTerms)
{
    Set<String> queryTerms = createLexicalSearchQueryTerms(targetAttribute, searchTerms);
    Collection<OntologyTerm> ontologyTerms = ontologyTermsFromTags;

    if (null != searchTerms && !searchTerms.isEmpty())
    {
        Set<String> escapedSearchTerms = searchTerms.stream().filter(StringUtils::isNotBlank)
                .map(QueryParser::escape).collect(Collectors.toSet());
        ontologyTerms = ontologyService
                .findExcatOntologyTerms(ontologyService.getAllOntologiesIds(), escapedSearchTerms, MAX_NUM_TAGS);
    }
    else if (null == ontologyTerms || ontologyTerms.isEmpty())
    {
        // Copy before removing: the list belongs to the ontology service and may be shared or immutable
        List<String> ontologyIds = new ArrayList<>(ontologyService.getAllOntologiesIds());
        Ontology unitOntology = ontologyService.getOntology(UNIT_ONTOLOGY_IRI);
        if (unitOntology != null)
        {
            ontologyIds.remove(unitOntology.getId());
        }
        Hit<OntologyTerm> ontologyTermHit = findTags(targetAttribute, ontologyIds);
        ontologyTerms =
                ontologyTermHit != null ? Arrays.asList(ontologyTermHit.getResult()) : Collections.emptyList();
    }
    return findAttributes(sourceEntityType, queryTerms, ontologyTerms);
}
/**
 * Builds the set of query terms for the lexical search. User defined search terms take precedence: when at
 * least one is given, the result is a copy of them and the target attribute is not consulted. Otherwise the
 * non-blank label and the non-blank description of the target attribute are used.
 *
 * @param targetAttribute the attribute whose label and description serve as fallback
 * @param searchTerms     user defined search terms, may be null or empty
 * @return a new, modifiable set of query terms
 */
public Set<String> createLexicalSearchQueryTerms(Attribute targetAttribute, Set<String> searchTerms)
{
    if (searchTerms != null && !searchTerms.isEmpty())
    {
        return new HashSet<>(searchTerms);
    }

    Set<String> fallbackTerms = new HashSet<>();
    String label = targetAttribute.getLabel();
    if (StringUtils.isNotBlank(label))
    {
        fallbackTerms.add(label);
    }
    String description = targetAttribute.getDescription();
    if (StringUtils.isNotBlank(description))
    {
        fallbackTerms.add(description);
    }
    return fallbackTerms;
}
/**
 * Explains why a single attribute entity matched the query, using the explain-API.
 *
 * @param attributeEntity        the matched attribute entity; its id and name are read
 * @param sourceEntityType       the entity type that must contain an attribute with the entity's name
 * @param collectExpanedQueryMap the expanded query map handed to the explain service
 * @param finalQueryRules        the query rules that produced the match
 * @return the query strings the explain service detected in the explanation
 * @throws MolgenisDataAccessException if the source entity type has no attribute with the entity's name
 */
public Set<ExplainedQueryString> convertAttributeEntityToExplainedAttribute(Entity attributeEntity,
        EntityType sourceEntityType, Map<String, String> collectExpanedQueryMap, List<QueryRule> finalQueryRules)
{
    String id = attributeEntity.getString(AttributeMetadata.ID);
    String name = attributeEntity.getString(AttributeMetadata.NAME);
    if (sourceEntityType.getAttribute(name) == null)
    {
        String message =
                "The attribute : " + name + " does not exsit in EntityType : " + sourceEntityType.getName();
        throw new MolgenisDataAccessException(message);
    }

    QueryImpl<Entity> attributeQuery = new QueryImpl<>(finalQueryRules);
    EntityType attributeMetadataType = dataService.getEntityType(ATTRIBUTE_META_DATA);
    Explanation explanation = elasticSearchExplainService.explain(attributeQuery, attributeMetadataType, id);
    return elasticSearchExplainService.findQueriesFromExplanation(collectExpanedQueryMap, explanation);
}
/**
 * Finds a tag for every atomic attribute of the named entity type.
 *
 * @param entity      the name of the entity type
 * @param ontologyIds the ids of the ontologies to search in
 * @return the attributes for which a tag was found, mapped to that tag, in the order of the atomic attributes
 */
@Override
public Map<Attribute, Hit<OntologyTerm>> findTags(String entity, List<String> ontologyIds)
{
    EntityType entityType = metaDataService.getEntityType(entity);
    Map<Attribute, Hit<OntologyTerm>> tagsByAttribute = new LinkedHashMap<>();
    for (Attribute attribute : entityType.getAtomicAttributes())
    {
        Hit<OntologyTerm> bestTag = findTags(attribute, ontologyIds);
        if (bestTag == null)
        {
            // Attributes without a sufficiently good tag are left out of the result
            continue;
        }
        tagsByAttribute.put(attribute, bestTag);
    }
    return tagsByAttribute;
}
/**
 * Finds the best ontology term tag for a single attribute.
 * <p>
 * The attribute's description (or its label when the description is null) is split into search terms. Up to
 * {@code MAX_NUM_TAGS} candidate ontology terms are requested from the ontology service and only those accepted
 * by {@code filterOntologyTerm} are kept. The remaining candidates are scored by their best matching synonym
 * and sorted in reverse natural order. Starting from the first hit, each further hit is combined with the
 * current result whenever the combined term scores higher than the current result.
 *
 * @param attribute   the attribute to tag
 * @param ontologyIds the ids of the ontologies to search in
 * @return the best (possibly combined) hit, or {@code null} if there is none with a score of at least
 * {@code CUTOFF}
 */
@Override
public Hit<OntologyTerm> findTags(Attribute attribute, List<String> ontologyIds)
{
    String description = attribute.getDescription() == null ? attribute.getLabel() : attribute.getDescription();
    Set<String> searchTerms = splitIntoTerms(description);
    Stemmer stemmer = new Stemmer();

    LOG.debug("findOntologyTerms({},{},{})", ontologyIds, searchTerms, MAX_NUM_TAGS);
    List<OntologyTerm> candidates = ontologyService.findOntologyTerms(ontologyIds, searchTerms, MAX_NUM_TAGS);
    LOG.debug("Candidates: {}", candidates);

    // The stemmed search terms are the same for every candidate, so compute them once instead of per candidate
    Set<String> stemmedSearchTerms = splitIntoTerms(Stemmer.stemAndJoin(searchTerms));
    List<Hit<OntologyTerm>> hits = candidates.stream()
            .filter(ontologyTerm -> filterOntologyTerm(stemmedSearchTerms, ontologyTerm, stemmer))
            .map(ontologyTerm -> Hit.<OntologyTerm>create(ontologyTerm,
                    bestMatchingSynonym(ontologyTerm, searchTerms).getScore()))
            .sorted(Ordering.natural().reverse())
            .collect(Collectors.toList());
    LOG.debug("Hits: {}", hits);

    Hit<OntologyTerm> result = null;
    String bestMatchingSynonym = null;
    for (Hit<OntologyTerm> hit : hits)
    {
        String bestMatchingSynonymForHit = bestMatchingSynonym(hit.getResult(), searchTerms).getResult();
        if (result == null)
        {
            // The first hit is the starting point for combining
            result = hit;
            bestMatchingSynonym = bestMatchingSynonymForHit;
        }
        else
        {
            // Try the combination of the current result with this hit; keep it only when it scores higher
            Set<String> jointTerms = Sets
                    .union(splitIntoTerms(bestMatchingSynonym), splitIntoTerms(bestMatchingSynonymForHit));
            String joinedSynonyms = termJoiner.join(jointTerms);
            Hit<OntologyTerm> joinedHit = Hit.create(OntologyTerm.and(result.getResult(), hit.getResult()),
                    distanceFrom(joinedSynonyms, searchTerms, stemmer));
            if (joinedHit.compareTo(result) > 0)
            {
                result = joinedHit;
                bestMatchingSynonym = bestMatchingSynonym + " " + bestMatchingSynonymForHit;
            }
        }
        LOG.debug("result: {}", result);
    }

    if (result != null && result.getScore() >= CUTOFF)
    {
        LOG.debug("Tag {} with {}", attribute, result);
        return result;
    }
    return null;
}
/**
 * Tells whether an ontology term is a candidate for the given attribute keywords: at least one of the term's
 * label/synonyms must, after stemming and splitting, yield a non-empty set of terms that are all contained in
 * {@code keywordsFromAttribute}.
 *
 * @param keywordsFromAttribute the keywords derived from the attribute
 * @param ontologyTerm          the ontology term to test
 * @param stemmer               not used by this implementation
 * @return {@code true} if at least one label or synonym is fully covered by the keywords
 */
private boolean filterOntologyTerm(Set<String> keywordsFromAttribute, OntologyTerm ontologyTerm, Stemmer stemmer)
{
    return semanticSearchServiceHelper.getOtLabelAndSynonyms(ontologyTerm).stream()
            .map(synonym -> splitIntoTerms(Stemmer.stemAndJoin(splitIntoTerms(synonym))))
            .anyMatch(stemmedTerms -> !stemmedTerms.isEmpty() && keywordsFromAttribute.containsAll(stemmedTerms));
}
/**
 * Computes the best matching synonym which is closest to a set of search terms.<br/>
 * Every synonym of the {@link OntologyTerm} is scored against the search terms with
 * {@code distanceFrom(String, Set, Stemmer)}, which stems both sides, and the synonym with the highest score
 * wins.
 *
 * @param ontologyTerm the {@link OntologyTerm}
 * @param searchTerms  the search terms
 * @return a hit holding the best matching synonym and its score
 * @throws NoSuchElementException if the ontology term has no synonyms
 */
public Hit<String> bestMatchingSynonym(OntologyTerm ontologyTerm, Set<String> searchTerms)
{
    Stemmer stemmer = new Stemmer();
    return ontologyTerm.getSynonyms().stream()
            .map(synonym -> Hit.<String>create(synonym, distanceFrom(synonym, searchTerms, stemmer)))
            .max(Comparator.naturalOrder())
            // Same exception type as Optional.get(), but with enough context to find the offending term
            .orElseThrow(() -> new NoSuchElementException(
                    "Ontology term has no synonyms to match against: " + ontologyTerm));
}
/**
 * Scores how well a synonym matches the search terms. Both sides are stemmed and joined, compared with
 * {@code NGramDistanceAlgorithm.stringMatching} and the outcome is divided by 100.
 *
 * @param synonym     the synonym to score
 * @param searchTerms the search terms to compare against
 * @param stemmer     not used by this implementation
 * @return the matching score divided by 100
 */
float distanceFrom(String synonym, Set<String> searchTerms, Stemmer stemmer)
{
    String stemmedSynonym = Stemmer.stemAndJoin(splitIntoTerms(synonym));
    String stemmedSearchTerms = Stemmer.stemAndJoin(searchTerms);
    float score = (float) NGramDistanceAlgorithm.stringMatching(stemmedSynonym, stemmedSearchTerms);
    float similarity = score / 100;
    LOG.debug("Similarity between: {} and {} is {}", stemmedSynonym, stemmedSearchTerms, similarity);
    return similarity;
}
/**
 * Splits a text into its lower-cased terms, leaving out empty strings and the stop words listed in
 * {@code NGramDistanceAlgorithm.STOPWORDSLIST}.
 *
 * @param description the text to split
 * @return the terms as a set
 */
private Set<String> splitIntoTerms(String description)
{
    Iterable<String> rawTokens = termSplitter.split(description);
    return FluentIterable.from(rawTokens)
            .transform(String::toLowerCase)
            .filter(token -> StringUtils.isNotEmpty(token) && !NGramDistanceAlgorithm.STOPWORDSLIST.contains(token))
            .toSet();
}
}