package org.deri.grefine.reconcile.rdf.factories; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; import org.deri.grefine.reconcile.model.ReconciliationCandidate; import org.deri.grefine.reconcile.model.ReconciliationRequest; import org.deri.grefine.reconcile.model.SearchResultItem; import org.deri.grefine.reconcile.model.ReconciliationRequestContext.PropertyContext; import org.deri.grefine.reconcile.rdf.factories.JenaTextSparqlQueryFactory.ScoredLabel; import org.deri.grefine.reconcile.util.StringUtils; import org.json.JSONException; import org.json.JSONWriter; import com.google.common.collect.ImmutableList; import com.hp.hpl.jena.query.QuerySolution; import com.hp.hpl.jena.query.ResultSet; import com.hp.hpl.jena.rdf.model.Literal; import com.hp.hpl.jena.rdf.model.Resource; public class PlainSparqlQueryFactory extends AbstractSparqlQueryFactory{ @Override public String getReconciliationSparqlQuery(ReconciliationRequest request, ImmutableList<String> searchPropertyUris) { String query = prepareQueryRE(request.getQueryString()); String typesFilter = getTypesFilter(request.getTypes()); String contextFilter = getContextFilter(request.getContext().getProperties()); String fulltextFilter = getFulltextFilter(query, searchPropertyUris); String labelClause = getLabelClause(searchPropertyUris.size()); return RECONCILE_QUERY_TEMPLATE.replace("[[LABEL_CLAUSE]]", labelClause) .replace("[[FULLTEXT_SEARCH_FILTER]]",fulltextFilter) .replace("[[TYPE_FILTER]]", typesFilter) .replace("[[CONTEXT_FILTER]]", contextFilter) .replace("[[LIMIT]]", String.valueOf(request.getLimit())); } @Override public List<ReconciliationCandidate> wrapReconciliationResultset(ResultSet resultSet, String queryString, ImmutableList<String> searchPropertyUris, int limit, double matchThreshold) { List<ReconciliationCandidate> candidates = new ArrayList<ReconciliationCandidate>(); boolean match = false; boolean moreThanOneMatch = false; Set<String> seen = new HashSet<String>(); while(resultSet.hasNext()){ QuerySolution solution = resultSet.nextSolution(); Resource entity = solution.getResource("entity"); String entityUri = entity.getURI(); if(seen.contains(entityUri)){ //already seen continue; } seen.add(entityUri); ScoredLabel scoredLabel = getPreferredLabel(solution, queryString, searchPropertyUris); if(scoredLabel.score >= matchThreshold){ if(match){ moreThanOneMatch = true; }else{ match = true; } } ReconciliationCandidate candidate = new ReconciliationCandidate(entity.getURI(), scoredLabel.label, new String[] {}, scoredLabel.score,match); candidates.add(candidate); } if(moreThanOneMatch){ //set all candidates as match =false for(ReconciliationCandidate candidate:candidates){ candidate.setMatch(false); } } //sort according to score Collections.sort(candidates, new Comparator<ReconciliationCandidate>() { @Override public int compare(ReconciliationCandidate o1, ReconciliationCandidate o2) { //descending order return Double.compare(o2.getScore(), o1.getScore()); } }); return candidates; } public String getExactMatchReconciliationSparqlQuery(ReconciliationRequest request, ImmutableList<String> searchPropertyUris){ String query = prepareQuery(request.getQueryString()); String typesFilter = getTypesFilter(request.getTypes()); String contextFilter = getContextFilter(request.getContext().getProperties()); String fulltextFilter = getExactMatchFulltextFilter(query, searchPropertyUris); String labelClause = ""; return RECONCILE_QUERY_TEMPLATE.replace("[[LABEL_CLAUSE]]", labelClause) .replace("[[FULLTEXT_SEARCH_FILTER]]",fulltextFilter) .replace("[[TYPE_FILTER]]", typesFilter) .replace("[[CONTEXT_FILTER]]", contextFilter) .replace("[[LIMIT]]", String.valueOf(request.getLimit())); } @Override public String getTypeSuggestSparqlQuery(String prefix, int limit) { return SUGGEST_TYPE_QUERY_TEMPLATE.replace("[[QUERY]]", prepareQueryRE(prefix)).replace("[[LIMIT]]", String.valueOf(limit)); } @Override public ImmutableList<SearchResultItem> wrapTypeSuggestResultSet(ResultSet resultSet, String prefix, int limit) { List<SearchResultItem> items = new ArrayList<SearchResultItem>(); while(resultSet.hasNext()){ QuerySolution solution = resultSet.nextSolution(); String type = solution.getResource("type").getURI(); String label = solution.getLiteral("label").getString(); double score = StringUtils.getLevenshteinScore(label, prefix); items.add(new SearchResultItem(type, label, score)); } Collections.sort(items, new Comparator<SearchResultItem>() { @Override public int compare(SearchResultItem o1, SearchResultItem o2) { //descending order return Double.compare(o2.getScore(), o1.getScore()); } }); return ImmutableList.copyOf(items); } public List<ReconciliationCandidate> wrapResultset(ResultSet resultSet,String queryString, double matchThreshold){ List<ReconciliationCandidate> candidates = new ArrayList<ReconciliationCandidate>(); while(resultSet.hasNext()){ QuerySolution solution = resultSet.nextSolution(); Resource entity = solution.getResource("entity"); ReconciliationCandidate candidate = new ReconciliationCandidate(entity.getURI(), queryString, new String[] {}, 1.0d,true); candidates.add(candidate); } if(candidates.size() > 1 || matchThreshold > 1 ){ //set all match =false for(ReconciliationCandidate candidate : candidates){ candidate.setMatch(false); } } return candidates; } @Override public String getPropertySuggestSparqlQuery(String prefix, String typeUri, int limit) { return SUGGEST_PROPERTY_WITH_SPECIFIC_SUBJECT_TYPE_QUERY_TEMPLATE.replace("[[QUERY]]", prefix) .replace("[[TYPE_URI]]", typeUri) .replace("[[LIMIT]]", String.valueOf(limit)); } @Override public String getPropertySuggestSparqlQuery(String prefix, int limit) { return SUGGEST_PROPERTY_QUERY_TEMPLATE.replace("[[QUERY]]", prefix) .replace("[[LIMIT]]", String.valueOf(limit)); } @Override public String getEntitySearchSparqlQuery(String prefix, ImmutableList<String> searchPropertyUris, int limit) { String labelClause = getLabelClause(searchPropertyUris.size()); StringBuilder regexPatternClause = new StringBuilder(); StringBuilder boundFilterClause = new StringBuilder(); for(int i =1;i<=searchPropertyUris.size();i++){ String propUri = searchPropertyUris.get(i-1); regexPatternClause.append(REGEX_SEARCH_PATTERN.replaceAll("\\[\\[INDEX\\]\\]", String.valueOf(i)) .replace("[[QUERY]]", prefix) .replace("[[PROPERTY_URI]]", propUri)); boundFilterClause.append(BOUND_LABEL_FILTER.replace("[[INDEX]]", String.valueOf(i))); } //remove the last additional " ||" boundFilterClause.setLength(boundFilterClause.length()-3); return SEARCH_ENTITY_QUERY_TEMPLATE.replace("[[LABEL_CLAUSE]]", labelClause) .replace("[[REGEX_SEARCH_PATTERN]]", regexPatternClause.toString()) .replace("[[BOUND_LABEL_FILTER]]", boundFilterClause.toString()) .replace("[[LIMIT]]", String.valueOf(limit)); } @Override public void write(JSONWriter writer) throws JSONException { writer.object(); writer.key("type"); writer.value("plain"); writer.endObject(); } /** * @param solution * @param queryString * @param searchPropertyUris * @return the best label... which is the one with the best score */ private ScoredLabel getPreferredLabel(QuerySolution solution, String queryString, ImmutableList<String> searchPropertyUris){ double maxScore = -1d; String bestLabel = ""; for(int i=1; i<=searchPropertyUris.size(); i++){ Literal label = solution.getLiteral("label" + i); double score; if(label!=null){ score = StringUtils.getLevenshteinScore(label.getString(), queryString); if(score>maxScore){ maxScore = score; bestLabel = label.getString(); } } } if(bestLabel.isEmpty()){ //fail... should never get here as bound restrictions are added to the SPARQL query used throw new RuntimeException("could not find label in the resultset for " + queryString); } return new ScoredLabel(bestLabel, maxScore); } private String getTypesFilter(String[] types){ if(types==null || types.length==0 || (types.length==1 && types[0].isEmpty())){ return ""; } String typesFilter = ""; if(types.length>0){ if(types.length==1){ typesFilter = getSimpleTypeFilter(types[0]); }else{ StringBuilder typesFilterBuilder = new StringBuilder(); typesFilterBuilder.append("{"); for(int i=0; i<types.length; i++){ typesFilterBuilder.append(TYPE_FILTER.replace("[[TYPE_URI]]", types[i])); } // remove the last additional UNION typesFilterBuilder.setLength(typesFilterBuilder.length()-6); typesFilterBuilder.append("}"); typesFilter = typesFilterBuilder.toString(); } } return typesFilter; } private String getFulltextFilter(String queryString, ImmutableList<String> searchPropertyUris){ if(searchPropertyUris.size()==1){ //simple optimization return SIMPLE_REGEX_SEARCH_PATTERN.replace("[[QUERY]]", queryString) .replace("[[PROPERTY_URI]]", searchPropertyUris.get(0)); } StringBuilder regexPatternClause = new StringBuilder(); StringBuilder boundFilterClause = new StringBuilder(); for(int i =1;i<=searchPropertyUris.size();i++){ String propUri = searchPropertyUris.get(i-1); regexPatternClause.append(REGEX_SEARCH_PATTERN.replaceAll("\\[\\[INDEX\\]\\]", String.valueOf(i)) .replace("[[QUERY]]", queryString) .replace("[[PROPERTY_URI]]", propUri)); boundFilterClause.append(BOUND_LABEL_FILTER.replace("[[INDEX]]", String.valueOf(i))); } //remove the last additional " ||" boundFilterClause.setLength(boundFilterClause.length()-3); return FULLTEXT_SEARCH_FILTER.replace("[[REGEX_SEARCH_PATTERN]]", regexPatternClause.toString()) .replace("[[BOUND_LABEL_FILTER]]", boundFilterClause.toString()); } private String getExactMatchFulltextFilter(String queryString, ImmutableList<String> searchPropertyUris){ if(searchPropertyUris.size()==1){ //simple optimization return SIMPLE_EXACT_MATCH_SEARCH_PATTERN.replace("[[QUERY]]", queryString) .replace("[[PROPERTY_URI]]", searchPropertyUris.get(0)); } StringBuilder exactMatchPatternClause = new StringBuilder(); for(int i =1;i<=searchPropertyUris.size();i++){ String propUri = searchPropertyUris.get(i-1); exactMatchPatternClause.append(EXACT_MATCH_SEARCH_PATTERN.replace("[[QUERY]]", queryString) .replace("[[PROPERTY_URI]]", propUri) .replaceAll("\\[\\[INDEX\\]\\]", String.valueOf(i))); } //remove the last additional "UNION " exactMatchPatternClause.setLength(exactMatchPatternClause.length()-6); return EXACT_MATCH_FULLTEXT_SEARCH_FILTER.replace("[[EXACT_MATCH_SEARCH_PATTERN]]", exactMatchPatternClause.toString()); } private String getLabelClause(int num){ StringBuilder labelClause = new StringBuilder(); for(int i=1;i<=num;i++){ labelClause.append(LABEL).append(i); } return labelClause.toString(); } private String getContextFilter(Set<PropertyContext> properties){ StringBuilder contextFilter = new StringBuilder(); for(PropertyContext prop:properties){ contextFilter.append( PROPERTY_FILTER.replace("[[PROPERTY_URI]]", prop.getPid()) .replace("[[VALUE]]", prop.getV().asSparqlValue()) ); } return contextFilter.toString(); } private String getSimpleTypeFilter(String typeUri){ return SIMPLE_TYPE_FILTER.replace("[[TYPE_URI]]", typeUri); } private String prepareQuery(String query){ return query.replaceAll("'", "\\\\'"); } private String prepareQueryRE(String query){ return query.replaceAll("'", "\\\\'").replaceAll("\\?", "\\\\\\\\?").replaceAll("\\.", "\\\\\\\\.").replaceAll("\\(","\\\\\\\\(") .replaceAll("\\)","\\\\\\\\)"); // return Pattern.quote(query.replaceAll("'", "\\\\'")); } private static final String RECONCILE_QUERY_TEMPLATE = "PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + "PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#> " + "SELECT ?entity[[LABEL_CLAUSE]] " + "WHERE" + "{" + "[[FULLTEXT_SEARCH_FILTER]]" + "[[TYPE_FILTER]]" + "[[CONTEXT_FILTER]]" + "FILTER isIRI(?entity). }LIMIT [[LIMIT]]"; private static final String LABEL = " ?label"; private static final String REGEX_SEARCH_PATTERN = "OPTIONAL{ " + "?entity <[[PROPERTY_URI]]> ?label[[INDEX]]. " + "FILTER regex(str(?label[[INDEX]]),'[[QUERY]]','i')" + "}"; private static final String FULLTEXT_SEARCH_FILTER = "{" + "[[REGEX_SEARCH_PATTERN]]" + "FILTER ([[BOUND_LABEL_FILTER]])" + "}" ; private static final String SIMPLE_REGEX_SEARCH_PATTERN = "?entity <[[PROPERTY_URI]]> ?label1. " + "FILTER regex(str(?label1),'[[QUERY]]','i'). " ; private static final String SIMPLE_EXACT_MATCH_SEARCH_PATTERN = "?entity <[[PROPERTY_URI]]> ?label. FILTER (str(?label) = '[[QUERY]]'). "; private static final String BOUND_LABEL_FILTER = " bound(?label[[INDEX]]) ||"; private static final String TYPE_FILTER = "{?entity rdf:type <[[TYPE_URI]]>. } UNION "; private static final String SIMPLE_TYPE_FILTER = "?entity rdf:type <[[TYPE_URI]]>. "; private static final String EXACT_MATCH_FULLTEXT_SEARCH_FILTER = "{" + "[[EXACT_MATCH_SEARCH_PATTERN]]" + "}" ; private static final String EXACT_MATCH_SEARCH_PATTERN = "{ " + "?entity <[[PROPERTY_URI]]> ?label[[INDEX]]. FILTER (str(?label[[INDEX]]) = '[[QUERY]]'). " + "}UNION "; private static final String PROPERTY_FILTER = "?entity <[[PROPERTY_URI]]> [[VALUE]]. "; private static final String SUGGEST_TYPE_QUERY_TEMPLATE = "SELECT DISTINCT ?type ?label " + "WHERE{" + "[] a ?type. " + "?type ?p ?label. " + "FILTER (?p=<http://www.w3.org/2000/01/rdf-schema#label> || ?p=<http://www.w3.org/2004/02/skos/core#prefLabel>). " + "FILTER regex(str(?label),'^[[QUERY]]','i')" + "} LIMIT [[LIMIT]]"; private static final String SUGGEST_PROPERTY_WITH_SPECIFIC_SUBJECT_TYPE_QUERY_TEMPLATE = "SELECT DISTINCT ?p ?label " + "WHERE{" + "[] a <[[TYPE_URI]]>;" + "?p ?v." + "?p ?label_prop ?label. " + "FILTER (?label_prop=<http://www.w3.org/2000/01/rdf-schema#label> || ?label_prop=<http://www.w3.org/2004/02/skos/core#prefLabel>). " + "FILTER regex(str(?label),'^[[QUERY]]','i')" + "} LIMIT [[LIMIT]]"; private static final String SUGGEST_PROPERTY_QUERY_TEMPLATE = "SELECT DISTINCT ?p ?label " + "WHERE{" + "[] ?p ?v." + "?p ?label_prop ?label. " + "FILTER (?label_prop=<http://www.w3.org/2000/01/rdf-schema#label> || ?label_prop=<http://www.w3.org/2004/02/skos/core#prefLabel>). " + "FILTER regex(str(?label),'^[[QUERY]]','i')" + "} LIMIT [[LIMIT]]"; private static final String SEARCH_ENTITY_QUERY_TEMPLATE = "SELECT ?entity[[LABEL_CLAUSE]] " + "WHERE" + "{" + "[[REGEX_SEARCH_PATTERN]]" + "FILTER ([[BOUND_LABEL_FILTER]]). " + "FILTER isIRI(?entity). }LIMIT [[LIMIT]]"; }