package org.deri.grefine.reconcile.sindice;
import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import org.deri.grefine.reconcile.rdf.factories.JenaTextSparqlQueryFactory;
import org.deri.grefine.reconcile.rdf.factories.SparqlQueryFactory;
import org.deri.grefine.reconcile.util.GRefineJsonUtilities;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.ImmutableList;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
public class SindiceBroker {
final static Logger logger = LoggerFactory.getLogger("SindiceBroker");
private final String sindiceSearchUrl = "http://api.sindice.com/v2/search";
private SparqlQueryFactory queryFactory = new JenaTextSparqlQueryFactory();
public List<String> guessDomain(String query, int limit, GRefineJsonUtilities jsonUtilities) {
Model model;
String domain;
SindiceQueryEndpoint endpoint;
ImmutableList<String> empty = ImmutableList.of();
List<String> domains = new LinkedList<String>();
try {
LinkedHashSet<String[]> urlPairs = getUrlsForSimpleTermSearch(query, GUESS_DOMAIN_SEARCH_LIMIT, jsonUtilities);
for(String[] pair: urlPairs){
domain = getDomainForUrl(pair[0], pair[1], jsonUtilities);
if(domains.contains(domain)){
continue;
}
model = getModelForUrl(pair[0], pair[1], jsonUtilities);
endpoint = new SindiceQueryEndpoint(queryFactory);
if(endpoint.hasResult(model, query, empty, limit - domains.size())){
domains.add(domain);
}
if(domains.size()>=limit){
break;
}
}
return domains;
} catch (Exception e) {
logger.error("error reconcling using Sindice API", e);
}
return domains;
}
public Model getModelForUrl(String url, String cacheUrl, GRefineJsonUtilities jsonUtilities) throws JSONException, IOException {
JSONObject sindiceCacheObj = jsonUtilities.getJSONObjectFromUrl(new URL(cacheUrl));
JSONArray rdfContentArr = sindiceCacheObj.getJSONObject(url).getJSONArray("explicit_content");
StringBuilder buff = new StringBuilder();
for(int j=0;j<rdfContentArr.length();j++){
String triple = rdfContentArr.getString(j);
buff.append(triple);
}
Model m = ModelFactory.createDefaultModel();
m.read(new StringReader(buff.toString()), null, "N-TRIPLE");
return m;
}
private String getDomainForUrl(String url, String cacheUrl, GRefineJsonUtilities jsonUtilities) throws JSONException, IOException {
JSONObject sindiceCacheObj = jsonUtilities.getJSONObjectFromUrl(new URL(cacheUrl));
String domain = sindiceCacheObj.getJSONObject(url).getString("domain");
return domain;
}
/**
* @param q
* @return LinkedHashSet of string-pairs. each pair is (in order) URL of the document
* matching the search for q and the Sindice cache URL for the
* document. the LinkedHashSet assures no duplication while keeping the order of addition
* @throws IOException
* @throws JSONException
*/
public LinkedHashSet<String[]> getUrlsForSimpleTermSearch(String q, int searchLimit, GRefineJsonUtilities jsonUtilities) throws JSONException, IOException {
return getUrlsForSimpleTermSearch(q, null, null, searchLimit, jsonUtilities);
}
public LinkedHashSet<String[]> getUrlsForSimpleTermSearch(String q, String domain, String type, int searchLimit, GRefineJsonUtilities jsonUtilities) throws JSONException, IOException {
LinkedHashSet<String[]> lst = new LinkedHashSet<String[]>();
URL url = buildUrl(q,domain, type, searchLimit);
JSONObject documentsObj = jsonUtilities.getJSONObjectFromUrl(url);
JSONArray entries = documentsObj.getJSONArray("entries");
int length = Math.min(entries.length(), searchLimit);
for (int i = 0; i < length; i++) {
String link = entries.getJSONObject(i).getString("link");
String cache = entries.getJSONObject(i).getString("cache");
lst.add(new String[] {link, cache + "&field=domain"});
}
return lst;
}
private URL buildUrl(String q, String domain, String type, int searchLimit) throws MalformedURLException {
try {
//TODO
if(type!=null){type = type.replace("#", "%23");}
String typeFilter = type==null?"":"class:\""+type+"\"";
String domainFilter = domain==null?"":"domain:"+URLEncoder.encode(domain,"UTF-8")+ "%20";
String fq =domainFilter + typeFilter;
//FIXME couldn't figure out how to tell Sindice how many items I want (itemsPerPage)
String query = String.format("q=%s&qt=%s&fq=%s",
URLEncoder.encode(q , "UTF-8"),
URLEncoder.encode("term", "UTF-8"),
fq);
return new URL(sindiceSearchUrl + "?" + query);
} catch (UnsupportedEncodingException e) {
// should never get here
throw new RuntimeException(e);
}
}
static class ModelAndDomain{
final String domain;
final Model model;
public ModelAndDomain(String domain, Model m) {
this.domain = domain;
this.model = m;
}
public String getDomain() {
return domain;
}
public Model getModel() {
return model;
}
}
static final int DEFAULT_SEARCH_LIMIT = 3;
static final int GUESS_DOMAIN_SEARCH_LIMIT = 8;
}