/*
* Copyright (c) 2009, MediaEvent Services GmbH & Co. KG
* http://mediaeventservices.com
*
* This file is part of Marbles.
*
* Marbles is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Marbles is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Marbles. If not, see <http://www.gnu.org/licenses/>.
*
*/
package de.fuberlin.wiwiss.marbles.loading;
import info.aduna.iteration.Iterations;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.Value;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;
import de.fuberlin.wiwiss.marbles.Constants;
import de.fuberlin.wiwiss.marbles.dataproviders.DataProvider;
/**
 * Starting with one URL, the DereferencerBatch handles the nested retrieval of data
 * by following known predicates in retrieved data, and processing retrieval results
 * with data providers.
 *
 * <p>A batch remembers every URL it has already handled (from cache or by queueing a
 * retrieval task), so each URL is processed at most once per batch.</p>
 *
 * <p>NOTE(review): the task and URL lists are plain <code>ArrayList</code>s without
 * synchronization, while {@link #dereferenced(DereferencingResult)} is documented as being
 * invoked by the dereferencer thread. Confirm how callers share a batch between threads.</p>
 *
 * @author Christian Becker
 */
public class DereferencerBatch implements DereferencingListener {

    /** Tasks that were accepted by the queue on behalf of this batch */
    private final List<ExtendedDereferencingTask> pendingTasks = new ArrayList<ExtendedDereferencingTask>();

    /** URLs (fragment cleared) that were served from cache or queued for retrieval */
    private final List<URI> retrievedURLs = new ArrayList<URI>();

    private final CacheController cacheController;
    private final Resource mainResource;
    private final DereferencingTaskQueue uriQueue;
    private final Collection<DataProvider> dataProviders;
    private final int maxSteps;
    private final int maxRedirects;

    /**
     * Constructs a new <code>DereferencerBatch</code>
     *
     * @param cacheController Cache used to look up existing URL data and redirects and to store retrieved data
     * @param uriQueue Queue that new retrieval tasks are submitted to
     * @param dataProviders Data providers that are asked for additional URLs while processing links
     * @param mainResource The focal resource whose links are followed
     * @param maxSteps Maximum distance from the focal resource; requests beyond it are ignored
     * @param maxRedirects Maximum number of redirects per individual request; requests beyond it are ignored
     */
    public DereferencerBatch(CacheController cacheController, DereferencingTaskQueue uriQueue, Collection<DataProvider> dataProviders, Resource mainResource, int maxSteps, int maxRedirects) {
        this.cacheController = cacheController;
        this.mainResource = mainResource;
        this.uriQueue = uriQueue;
        this.dataProviders = dataProviders;
        this.maxSteps = maxSteps;
        this.maxRedirects = maxRedirects;
    }

    /**
     * Loads URL if not yet loaded.
     *
     * <p>If the cache already holds data for the URL (and no reload is forced), a cached
     * redirect is followed, or else the cached data is scanned for further links. Otherwise
     * a retrieval task is submitted to the queue.</p>
     *
     * <p>Note that the fragment of the passed <code>url</code> object is cleared in place.</p>
     *
     * @param url The URL to load
     * @param step The distance from the focal resource
     * @param redirectCount The number of redirects performed in the course of this individual request
     * @param forceReload Set this to true if the URL should be loaded even if a valid copy is already in the cache
     * @throws URIException
     */
    public void loadURL(URI url, int step, int redirectCount, boolean forceReload) throws URIException {
        if (step > maxSteps || redirectCount > maxRedirects)
            return;

        /* Cut off local names from URI */
        url.setFragment("");

        /* force reload doesn't apply on batch level, as they are short-lived and this could cause infinite loops */
        if (retrievedURLs.contains(url))
            return;

        String urlString = url.toString();

        if (forceReload || !cacheController.hasURLData(urlString)) {
            /* No (usable) data about this URL; get it */
            ExtendedDereferencingTask task = new ExtendedDereferencingTask(this, urlString, step, redirectCount, forceReload);
            if (uriQueue.addTask(task)) {
                pendingTasks.add(task);
                retrievedURLs.add(url);
            }
            return;
        }

        /* Treat as retrieved when reading from cache */
        retrievedURLs.add(url);

        /* Process a cached redirect */
        String redirect = cacheController.getCachedRedirect(urlString);
        if (redirect != null) {
            URI redirectUrl = new URI(url, redirect, true);
            loadURL(redirectUrl, step, redirectCount + 1, forceReload);
            return;
        }

        /* Data is already loaded; try to find new links within it */
        try {
            org.openrdf.model.URI sesameUri = new URIImpl(urlString);
            processLinks(step + 1, sesameUri);
        }
        catch (IllegalArgumentException e) {
            e.printStackTrace();
        }
    }

    /**
     * Determines whether requests are pending at or below a specified step level
     * @param maxLevel Maximum step level to consider (inclusive)
     * @return true, if requests are pending
     */
    public boolean hasPending(int maxLevel) {
        for (ExtendedDereferencingTask task : pendingTasks) {
            if (task.getStep() <= maxLevel && !task.isDone())
                return true;
        }
        return false;
    }

    /**
     * Determines whether any requests are pending
     * @return true, if requests are pending
     */
    public boolean hasPending() {
        return hasPending(Integer.MAX_VALUE);
    }

    /*
     * TODO Determine whether a retrieval batch was executed successfully
     * Problem: To do this, {@link DereferencingResult} should be a member of {@link DereferencingTask}, not vice versa
     */
    /*public boolean wasSuccess() {
    boolean success = true;
    for (ExtendedDereferencingTask task : pendingTasks) {
    if (task.isDone() && task.) {
    pending = true;
    break;
    }
    }
    return pending;
    }*/

    /**
     * Called by {@link DereferencerThread} once data has been retrieved.
     * Handles insertion into cache, processes redirects, and initiates following of known links
     * for the retrieved URL using {@link #processLinks(int, Resource...)}
     *
     * <p>The task is marked as done and the waiting thread is notified even if cache insertion
     * or redirect handling throws, so that a waiter is not left with a task that never completes.</p>
     *
     * <p>NOTE(review): links are followed only after the waiter has been notified, so a waiter
     * may briefly observe no pending tasks before follow-up tasks are queued; confirm this is intended.</p>
     *
     * @param result The outcome of the retrieval; its task must be an {@link ExtendedDereferencingTask}
     */
    public void dereferenced(DereferencingResult result) {
        ExtendedDereferencingTask task = (ExtendedDereferencingTask) result.getTask();

        try {
            /* Add to cache - including header data for redirects */
            cacheController.addURLData(result.getURI(), result.getResultData(), result.getMethod());

            /* Handle known redirect */
            followRedirect(result, task);
        }
        finally {
            task.setDone(true);

            /* Wake up parent */
            synchronized(this) {
                notify();
            }
        }

        /* find new links */
        if (result.isSuccess()) {
            org.openrdf.model.URI context;
            try {
                context = new URIImpl(result.getURI());
            }
            catch (IllegalArgumentException e) {
                /* Not usable as a context URI; report it the same way loadURL does */
                e.printStackTrace();
                return;
            }
            processLinks(task.getStep() + 1, context);
        }
    }

    /**
     * Submits the redirect target of a retrieval result to {@link #loadURL(URI, int, int, boolean)},
     * if the response carries a redirect status code and a <code>location</code> header.
     *
     * @param result The retrieval result to inspect
     * @param task The task that produced the result; supplies step, redirect count and reload flag
     */
    private void followRedirect(DereferencingResult result, ExtendedDereferencingTask task) {
        /* against NullPointerException with getStatusCode() */
        if (null == result.getMethod() || null == result.getMethod().getStatusLine())
            return;

        int resultCode = result.getMethod().getStatusCode();
        if (!HttpStatusCodes.isRedirect(resultCode))
            return;

        Header locationHeader = result.getMethod().getResponseHeader("location");
        if (null == locationHeader)
            return;

        try {
            /* Resolve the location against the requested URI, as it may be relative */
            URI base = new URI(result.getURI(), true);
            loadURL(new URI(base, locationHeader.getValue(), true), task.getStep(), task.getRedirectStep() + 1, task.isForceReload());
        } catch (URIException e) {
            e.printStackTrace();
        }
    }

    /**
     * Identifies known links from loaded data and submits them to <code>{@link #loadURL(URI, int, int, boolean)}</code>
     *
     * <p>For every interesting predicate, statements having the main resource as subject or as
     * object are looked up in the given contexts; the resource at the other end of each statement
     * is fetched if it is a URI. Data providers are asked for additional URLs as well.</p>
     *
     * <p>NOTE(review): data providers are queried once per predicate although the call does not
     * depend on the predicate; repeated URLs are filtered by <code>loadURL</code>.</p>
     *
     * @param step Current step level
     * @param contexts Contexts that are to be considered to find links
     */
    public void processLinks(int step, Resource ... contexts) {
        if (step > maxSteps)
            return;

        RepositoryConnection conn = null;
        try {
            conn = cacheController.getDataRepository().getConnection();

            for (org.openrdf.model.URI predicate : Constants.interestingPredicates) {
                List<Statement> statementsList = new ArrayList<Statement>();
                collectStatements(conn, mainResource, predicate, null /* obj */, statementsList, contexts);

                /* Also include inverse properties */
                collectStatements(conn, null /* subj */, predicate, mainResource, statementsList, contexts);

                List<URI> urlsToBeFetched = new ArrayList<URI>();
                for (Statement st : statementsList) {
                    /* Pick the end of the statement that is not the main resource */
                    Value linked = (st.getSubject().equals(mainResource) ? st.getObject() : st.getSubject());
                    if (!(linked instanceof org.openrdf.model.URI))
                        continue;

                    try {
                        /* Compare URI against URI; a String would never match an element of this list */
                        URI candidate = new URI(linked.toString(), true);
                        if (!urlsToBeFetched.contains(candidate))
                            urlsToBeFetched.add(candidate);
                    } catch (URIException e) {
                        e.printStackTrace();
                    } catch (NullPointerException e) {
                        e.printStackTrace();
                    }
                }

                /* Ask data providers */
                for (DataProvider p : dataProviders) {
                    List<URI> newURLs = p.getURLsFromData(cacheController, conn, mainResource);
                    if (newURLs != null)
                        urlsToBeFetched.addAll(newURLs);
                }

                /* Load URLs */
                for (URI url : urlsToBeFetched) {
                    try {
                        loadURL(url, step, 0 /* redirectStep */, false);
                    } catch (URIException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
        catch (RepositoryException e) {
            e.printStackTrace();
        }
        finally {
            try {
                if (conn != null)
                    conn.close();
            } catch (RepositoryException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Appends all statements matching the given pattern (including inferred ones) to
     * <code>sink</code>, closing the repository result even if reading it fails.
     *
     * @param conn Open repository connection
     * @param subject Subject to match, or null for any
     * @param predicate Predicate to match
     * @param object Object to match, or null for any
     * @param sink List that matching statements are appended to
     * @param contexts Contexts to search
     * @throws RepositoryException if the lookup or closing the result fails
     */
    private void collectStatements(RepositoryConnection conn, Resource subject, org.openrdf.model.URI predicate, Value object, List<Statement> sink, Resource[] contexts) throws RepositoryException {
        RepositoryResult<Statement> statements = conn.getStatements(subject, predicate, object, true /* includeInferred */, contexts);
        try {
            Iterations.addAll(statements, sink);
        }
        finally {
            statements.close();
        }
    }

    /**
     * Returns the URLs handled by this batch so far, i.e. those served from cache or
     * queued for retrieval.
     *
     * @return the batch's internal list (not a copy)
     */
    public List<URI> getRetrievedURLs() {
        return retrievedURLs;
    }
}