/*
* Copyright (c) 2009, MediaEvent Services GmbH & Co. KG
* http://mediaeventservices.com
*
* This file is part of Marbles.
*
* Marbles is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Marbles is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Marbles. If not, see <http://www.gnu.org/licenses/>.
*
*/
package de.fuberlin.wiwiss.marbles.loading;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.openrdf.model.BNode;
import org.openrdf.model.Resource;
import org.openrdf.repository.Repository;
import de.fuberlin.wiwiss.marbles.dataproviders.DataProvider;
import de.fuberlin.wiwiss.marbles.dataproviders.RevyuProvider;
import de.fuberlin.wiwiss.marbles.dataproviders.SindiceProvider;
/**
* Provides functionality to load URLs and to discover related data by means of data providers.
*
* @author Christian Becker
*/
public class SemanticWebClient {

    /**
     * Number of seconds to wait to load a URL
     */
    static final int CONNECTION_TIMEOUT = 20;

    /**
     * Number of seconds to wait for additional data once focal resource is loaded
     */
    static final int TIME_LIMIT_ADDITIONAL = 3;

    /**
     * Maximum number of steps for autonomous discovery
     */
    static final int MAX_STEPS = 1;

    /**
     * Maximum number of redirects to follow in the course of a single request for a document
     */
    static final int MAX_REDIRECTS = 2;

    /**
     * Maximum number of threads handed to the {@link DereferencingTaskQueue}
     */
    private static final int MAX_THREADS = 10;

    /**
     * Maximum file size handed to the {@link DereferencingTaskQueue}
     */
    private static final int MAX_FILE_SIZE = 500 * 1024;

    /**
     * Number of milliseconds to block on a batch's monitor before its pending state is re-checked
     */
    private static final int POLL_INTERVAL_MS = 100;

    private static final String DBPEDIA_RESOURCE_PREFIX = "http://dbpedia.org/resource/";
    private static final String DBPEDIA_DATA_PREFIX = "http://dbpedia.org/data/";

    private final Collection<DataProvider> dataProviders;
    private final DereferencingTaskQueue uriQueue;
    private final HttpClient httpClient;
    private final CacheController cacheController;

    /**
     * Constructs a new <code>SemanticWebClient</code> with its own multi-threaded
     * HTTP connection manager and dereferencing task queue.
     *
     * @param cacheController	Passed on to every {@link DereferencerBatch} created by this client
     * @param spongerProvider	Passed on to the {@link DereferencingTaskQueue}
     * @param dataProviders		Providers that are asked for query URLs when a resource is discovered
     */
    public SemanticWebClient(CacheController cacheController, SpongerProvider spongerProvider, Collection<DataProvider> dataProviders) {
        this.cacheController = cacheController;
        this.dataProviders = dataProviders;

        /* Set connection parameters */
        HttpConnectionManagerParams httpManagerParams = new HttpConnectionManagerParams();
        httpManagerParams.setConnectionTimeout(CONNECTION_TIMEOUT * 1000);
        httpManagerParams.setTcpNoDelay(true);
        httpManagerParams.setStaleCheckingEnabled(true);

        MultiThreadedHttpConnectionManager httpManager = new MultiThreadedHttpConnectionManager();
        httpManager.setParams(httpManagerParams);

        httpClient = new HttpClient(httpManager);
        uriQueue = new DereferencingTaskQueue(httpClient, spongerProvider, MAX_THREADS, MAX_FILE_SIZE);
    }

    /**
     * Builds list of URLs to be loaded to learn more about a resource.
     * The resource's own URI comes first (unless it is a blank node or <code>null</code>),
     * followed by the query URLs supplied by the data providers.
     *
     * @param resource	The resource to find URLs for
     * @return	The URLs to load; an invalid resource URI is reported and left out
     */
    private List<URI> getURLsForResource(Resource resource) {
        List<URI> urls = new ArrayList<URI>();

        /* Dereference the resource itself; checked explicitly instead of catching NullPointerException */
        String resourceUri = (resource == null) ? null : resource.toString();
        if (resourceUri != null && !(resource instanceof BNode)) {
            try {
                urls.add(new URI(resourceUri, true));

                /* Temporarily work around DBpedia 303 redirection bug that occurs when special characters are involved */
                if (resourceUri.startsWith(DBPEDIA_RESOURCE_PREFIX)) {
                    /* Only the leading prefix is rewritten; the remainder of the URI is kept as is */
                    String localName = resourceUri.substring(DBPEDIA_RESOURCE_PREFIX.length());
                    urls.add(new URI(DBPEDIA_DATA_PREFIX + localName + ".xml", true));
                }
            } catch (URIException e) {
                e.printStackTrace();
            }
        }

        /* and ask the data providers */
        for (DataProvider provider : dataProviders) {
            URI queryURL = provider.getQueryURL(resource);
            if (queryURL != null) {
                urls.add(queryURL);
            }
        }
        return urls;
    }

    /**
     * Blocks on the monitor of the given batch for at most {@link #POLL_INTERVAL_MS} milliseconds.
     * The caller must hold the batch's monitor.
     *
     * @param batch	The batch whose monitor is waited on
     * @return	<code>false</code> if the calling thread was interrupted, in which case its
     * 			interrupt status has been restored and the caller should stop waiting
     */
    private static boolean waitForProgress(DereferencerBatch batch) {
        try {
            batch.wait(POLL_INTERVAL_MS);
            return true;
        } catch (InterruptedException e) {
            /* Keep the interrupt visible to callers instead of swallowing it */
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Initiates a {@link DereferencerBatch} to retrieve data for a given resource
     *
     * @param resource	The resource to retrieve data for
     * @param wait		If true, the method blocks until nothing is pending at step 0 and until
     * 					either nothing is pending at all or {@link #TIME_LIMIT_ADDITIONAL} seconds
     * 					have passed; waiting also ends if the calling thread is interrupted
     * @return List of URLs queried in the process; these may be looked up in the metadata store for details
     */
    public List<URI> discoverResource(Resource resource, boolean wait) {
        List<URI> urlsToBeFetched = getURLsForResource(resource);
        DereferencerBatch dereferencerBatch = new DereferencerBatch(cacheController, uriQueue, dataProviders, resource, MAX_STEPS, MAX_REDIRECTS);

        /* provide URLs to dereferencer */
        for (URI url : urlsToBeFetched) {
            try {
                dereferencerBatch.loadURL(url, 0 /* step */, 0 /* redirect step */, false /* don't force reload */);
            } catch (URIException e) {
                e.printStackTrace();
            }
        }

        /* Initiate link retrieval from any previous data */
        dereferencerBatch.processLinks(1);

        /* Wait loop with timeout */
        long timeStarted = System.currentTimeMillis();
        System.err.println(Thread.currentThread().getName() + ": starting discoverResource() at " + timeStarted);

        if (wait) {
            /*
             * NOTE(review): the limit for additional data is measured from timeStarted, i.e. from
             * the start of the wait, not from the moment the step 0 requests have completed.
             */
            long additionalLimitMillis = TIME_LIMIT_ADDITIONAL * 1000L;
            synchronized (dereferencerBatch) {
                while (dereferencerBatch.hasPending(0)
                        || ((System.currentTimeMillis() - timeStarted < additionalLimitMillis) && dereferencerBatch.hasPending())) {
                    if (!waitForProgress(dereferencerBatch)) {
                        break;
                    }
                }
            }
        }

        System.err.println(Thread.currentThread().getName() + ": finished discoverResource() after " + ((System.currentTimeMillis() - timeStarted) / 1000) + "s");

        /*
         * We stop waiting here so that the data retrieved so far can be shown to the client.
         * Nonetheless, retrieval is not canceled - the client could refresh at a later time to get it
         * (AJAX automation would make a lot of sense here), and additional information can be incorporated
         * into subsequent views
         */
        return dereferencerBatch.getRetrievedURLs();
    }

    /**
     * Loads a given URL into the cache using a {@link DereferencerBatch}
     *
     * @param url	The URL to be loaded
     * @param wait	If true, the method returns after the request has been processed,
     * 				or earlier if the calling thread is interrupted
     */
    public void loadURL(URI url, boolean wait) {
        DereferencerBatch dereferencerBatch = new DereferencerBatch(cacheController, uriQueue, dataProviders, null, 0 /* maxSteps (!) */, MAX_REDIRECTS);

        /* Provide URLs to dereferencer */
        try {
            dereferencerBatch.loadURL(url, 0 /* step */, 0 /* redirect step */, true /* force reload */);
        } catch (URIException e) {
            e.printStackTrace();
            return;
        }

        if (wait) {
            synchronized (dereferencerBatch) {
                while (dereferencerBatch.hasPending()) {
                    if (!waitForProgress(dereferencerBatch)) {
                        break;
                    }
                }
            }
        }
    }
}