/*
* Copyright (c) 2009, MediaEvent Services GmbH & Co. KG
* http://mediaeventservices.com
*
* This file is part of Marbles.
*
* Marbles is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Marbles is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Marbles. If not, see <http://www.gnu.org/licenses/>.
*
*/
package de.fuberlin.wiwiss.marbles.loading;
import java.util.Date;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HeaderElement;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.DateParseException;
import org.apache.commons.httpclient.util.DateUtil;
import org.openrdf.model.Graph;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.repository.Repository;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.sail.SailException;
import org.openrdf.sail.inferencer.InferencerConnection;
import de.fuberlin.wiwiss.marbles.Constants;
/**
* Implements caching of data retrieved from HTTP URLs using a Sesame repository
*
* Uses HTTP in RDF namespace, but currently does not follow the ontology as
* this would require deeper nesting with b-nodes, which is an overcomplication
* for the current use cases
*
* @see http://www.w3.org/TR/HTTP-in-RDF/
* @author Christian Becker
*/
public class CacheController {

    /**
     * The repository that holds retrieved data using URLs as graphs (contexts)
     */
    private final SailRepository dataRepository;

    /**
     * The repository that holds metadata about the contents of the {@link #dataRepository}
     */
    private final SailRepository metaDataRepository;

    /**
     * Context (named graph) in the metadata repository under which all cache
     * metadata statements are stored
     */
    private final URI contextCacheDataURI;

    /**
     * Response header fields that are to be stored in the metadata cache
     */
    private final static String[] cachedHeaderFields = {"cache-control", "expires", "pragma", "location", "content-type"};

    /**
     * Minimum interval (in milliseconds) between two retrievals of the same URL:
     * 24 hours. Within this window the cached copy is considered valid regardless
     * of response headers.
     */
    private final static long MIN_RETRIEVAL_INTERVAL_MILLIS = 1000L * 60 * 60 * 24;

    /**
     * Constructs a new <code>CacheController</code>
     * @param dataRepository The repository that holds retrieved data using URLs as graphs
     * @param metaDataRepository The repository that holds cache metadata (headers, status, retrieval date)
     */
    public CacheController(SailRepository dataRepository, SailRepository metaDataRepository) {
        this.dataRepository = dataRepository;
        this.metaDataRepository = metaDataRepository;
        contextCacheDataURI = metaDataRepository.getValueFactory().createURI(Constants.contextCacheData);
    }

    /**
     * Removes cached data and metadata for a given URL.
     * Base graphs (see {@link Constants#isBaseUrl}) are never removed.
     *
     * @param url The URL whose data is to be removed
     */
    public synchronized void removeData(String url) {
        /* Prevent deletion of base graphs */
        if (Constants.isBaseUrl(url))
            return;

        RepositoryConnection dataConn = null;
        RepositoryConnection metaDataConn = null;
        InferencerConnection inferencerConn = null;

        try {
            dataConn = dataRepository.getConnection();
            inferencerConn = (InferencerConnection) dataRepository.getSail().getConnection();
            metaDataConn = metaDataRepository.getConnection();

            URI urlDataContext = dataRepository.getValueFactory().createURI(url);
            URI urlInferencerContext = dataRepository.getSail().getValueFactory().createURI(url);
            URI urlMetadata = metaDataRepository.getValueFactory().createURI(url);

            /* Drop inferred statements for this context */
            inferencerConn.removeInferredStatement((Resource) null, null, null, urlInferencerContext);

            /*
             * Because inferencerConn now holds the transaction lock on the store,
             * we need to commit changes first or we'll run into a deadlock when removing statements
             * using dataConn. They could be removed using dataConn; but the problem
             * would remain for the adding of statements.
             */
            inferencerConn.commit();

            dataConn.remove((Resource) null, null, null, urlDataContext);
            metaDataConn.remove(urlMetadata, null, null, contextCacheDataURI);
        } catch (RepositoryException e) {
            e.printStackTrace();
        } catch (SailException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(dataConn);
            closeQuietly(metaDataConn);
            closeQuietly(inferencerConn);
        }
    }

    /**
     * Adds retrieved URL data to the cache, replacing any previously cached
     * data and metadata for that URL. Selected response headers (see
     * {@link #cachedHeaderFields}), the response status code, and the
     * retrieval date are recorded in the metadata repository.
     *
     * @param url The URL that was retrieved
     * @param data The retrieved data (may be <code>null</code> if none)
     * @param method Used to obtain metadata (may be <code>null</code>; only the retrieval date is stored then)
     */
    public synchronized void addURLData(String url, Graph data, HttpMethod method) {
        RepositoryConnection dataConn = null;
        InferencerConnection inferencerConn = null;
        RepositoryConnection metaDataConn = null;

        try {
            dataConn = dataRepository.getConnection();
            inferencerConn = (InferencerConnection) dataRepository.getSail().getConnection();
            metaDataConn = metaDataRepository.getConnection();

            URI urlDataContext = dataRepository.getValueFactory().createURI(url);
            /* Use the Sail's value factory for the inferencer context, consistent with removeData() */
            URI urlInferencerContext = dataRepository.getSail().getValueFactory().createURI(url);
            URI urlMetadata = metaDataRepository.getValueFactory().createURI(url);

            /* Remove cached data and previous metadata */
            inferencerConn.removeInferredStatement((Resource) null, null, null, urlInferencerContext);

            /*
             * Because inferencerConn now holds the transaction lock on the store,
             * we need to commit changes first or we'll run into a deadlock when removing statements
             * using dataConn. They could be removed using dataConn; but the problem
             * would remain for the adding of statements.
             */
            inferencerConn.commit();

            dataConn.remove((Resource) null, null, null, urlDataContext);
            metaDataConn.remove(urlMetadata, null, null, contextCacheDataURI);

            /* Add retrieved data */
            if (data != null)
                dataConn.add(data);

            /* Add metadata */
            if (method != null) {
                for (String headerField : cachedHeaderFields) {
                    Header header;
                    if (null != (header = method.getResponseHeader(headerField))) {
                        metaDataConn.add(urlMetadata,
                                metaDataRepository.getValueFactory().createURI(Constants.nsHTTP, headerField),
                                metaDataRepository.getValueFactory().createLiteral(header.getValue()),
                                contextCacheDataURI);
                    }
                }

                /* Add status code */
                if (null != method.getStatusLine()) /* or we'll run into a NullPointerException when calling getStatusCode() */
                    metaDataConn.add(urlMetadata,
                            metaDataRepository.getValueFactory().createURI(Constants.nsHTTP, "responseCode"),
                            metaDataRepository.getValueFactory().createLiteral(method.getStatusCode()),
                            contextCacheDataURI);
            }

            /* We'll make use of the date header to specify when the document was retrieved */
            metaDataConn.add(urlMetadata,
                    metaDataRepository.getValueFactory().createURI(Constants.nsHTTP, "date"),
                    metaDataRepository.getValueFactory().createLiteral(DateUtil.formatDate(new Date())),
                    contextCacheDataURI);
        } catch (IllegalArgumentException e) { /* thrown by createURI on malformed URLs */
            e.printStackTrace();
        } catch (RepositoryException e) {
            e.printStackTrace();
        } catch (SailException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(dataConn);
            closeQuietly(inferencerConn);
            closeQuietly(metaDataConn);
        }
    }

    /**
     * Determines whether the cache holds a valid copy of an URL's data.
     *
     * A copy retrieved within the last 24 hours is always considered valid;
     * beyond that, the cached <code>pragma</code>, <code>cache-control</code>
     * and <code>expires</code> response headers are consulted.
     *
     * @param url The URL of interest
     * @return true, if a valid copy is present
     */
    public boolean hasURLData(String url) {
        boolean hasData = false;
        RepositoryConnection metaDataConn = null;

        try {
            metaDataConn = metaDataRepository.getConnection();
            URI metaUrlContext = metaDataRepository.getValueFactory().createURI(url);
            Date now = new Date();

            /* This is always set, so if it's not set, the URL has not been loaded */
            String date = getCachedHeaderDataValue(metaDataConn, metaUrlContext, "date");
            if (date == null)
                return false;

            Date dateRetrieved = DateUtil.parseDate(date);

            /*
             * Due to performance considerations, don't retrieve an URL
             * twice within 24 hours (response headers are deliberately ignored here!!)
             */
            if (dateRetrieved.getTime() + MIN_RETRIEVAL_INTERVAL_MILLIS > now.getTime())
                return true;

            /*
             * Check several caching indicators
             * @see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html
             */
            String pragma = getCachedHeaderDataValue(metaDataConn, metaUrlContext, "pragma");
            if ((pragma != null
                    && pragma.equalsIgnoreCase("no-cache")))
                return false;

            Header cacheControlHeader = getCachedHeaderData(metaDataConn, metaUrlContext, "cache-control");
            if (cacheControlHeader != null) {
                for (HeaderElement element : cacheControlHeader.getElements()) {
                    String directive = element.getName();

                    if (directive.equalsIgnoreCase("private")
                            || directive.equalsIgnoreCase("no-cache")
                            || directive.equalsIgnoreCase("no-store")
                            || directive.equalsIgnoreCase("must-revalidate")
                            || directive.equalsIgnoreCase("proxy-revalidate"))
                        return false;

                    /*
                     * RFC 2616 spells the shared-cache directive "s-maxage";
                     * the historic "s-max-age" spelling is kept for compatibility.
                     */
                    if (directive.equalsIgnoreCase("max-age")
                            || directive.equalsIgnoreCase("s-maxage")
                            || directive.equalsIgnoreCase("s-max-age")) {
                        try {
                            long maxAge = Long.parseLong(element.getValue());
                            Date expiryDate = new Date(dateRetrieved.getTime() + maxAge * 1000);
                            if (now.after(expiryDate))
                                return false;
                        }
                        catch (NumberFormatException e) {
                            e.printStackTrace();
                        }
                    }
                }
            }

            String expires = getCachedHeaderDataValue(metaDataConn, metaUrlContext, "expires");
            if (expires != null) {
                Date expiryDate = DateUtil.parseDate(expires);
                if (now.after(expiryDate))
                    return false;
            }

            hasData = true;
        } catch (IllegalArgumentException e) { /* thrown by createURI on malformed URLs */
            e.printStackTrace();
        } catch (RepositoryException e) {
            e.printStackTrace();
        } catch (DateParseException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(metaDataConn);
        }

        return hasData;
    }

    /**
     * Provides redirection targets from the cache
     *
     * @param uri The URI of interest
     * @return The redirection target, or <code>null</code> if there is none
     */
    public String getCachedRedirect(String uri) {
        RepositoryConnection metaDataConn = null;
        String redirectLocation = null;

        try {
            metaDataConn = metaDataRepository.getConnection();
            Header header = getCachedHeaderData(metaDataConn, uri, "location");
            if (header != null)
                redirectLocation = header.getValue();
        } catch (RepositoryException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(metaDataConn);
        }

        return redirectLocation;
    }

    /**
     * Retrieves a cached response header field from the metadata cache
     *
     * @param metaDataConn A connection to the metadata repository
     * @param mainResource The resource of interest
     * @param headerField The header field of interest
     * @return Header data, or <code>null</code> if not cached
     * @throws RepositoryException
     */
    public Header getCachedHeaderData(RepositoryConnection metaDataConn, Resource mainResource, String headerField) throws RepositoryException {
        Header header = null;

        /* The predicate URI is built with the metadata repository's factory — the statements live there */
        RepositoryResult<Statement> results = metaDataConn.getStatements(mainResource,
                metaDataRepository.getValueFactory().createURI(Constants.nsHTTP, headerField),
                null, false, contextCacheDataURI);
        try {
            if (results.hasNext()) {
                Statement st = results.next();
                if (st.getObject() instanceof Literal)
                    header = new Header(headerField, ((Literal) st.getObject()).getLabel());
            }
        } finally {
            /* Release the result iterator even if iteration throws */
            results.close();
        }

        return header;
    }

    /**
     * Retrieves a cached response header field from the metadata cache
     *
     * @param metaDataConn A connection to the metadata repository
     * @param url The resource of interest
     * @param headerField The header field of interest
     * @return Header data, or <code>null</code> if not cached or the URL is malformed
     * @throws RepositoryException
     */
    public Header getCachedHeaderData(RepositoryConnection metaDataConn, String url, String headerField) throws RepositoryException {
        try {
            URI metaUrlContext = metaDataRepository.getValueFactory().createURI(url);
            return getCachedHeaderData(metaDataConn, metaUrlContext, headerField);
        }
        catch (IllegalArgumentException e) { /* thrown by createURI on malformed URLs */
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Retrieves the value of a cached response header field from the metadata cache
     *
     * @param metaDataConn A connection to the metadata repository
     * @param mainResource The resource of interest
     * @param headerField The header field of interest
     * @return Header value, or <code>null</code> if not cached
     * @throws RepositoryException
     */
    public String getCachedHeaderDataValue(RepositoryConnection metaDataConn, Resource mainResource, String headerField) throws RepositoryException {
        Header header = getCachedHeaderData(metaDataConn, mainResource, headerField);
        return (header == null ? null : header.getValue());
    }

    /**
     * Retrieves the value of a cached response header field from the metadata cache
     *
     * @param metaDataConn A connection to the metadata repository
     * @param url The resource of interest
     * @param headerField The header field of interest
     * @return Header value, or <code>null</code> if not cached or the URL is malformed
     * @throws RepositoryException
     */
    public String getCachedHeaderDataValue(RepositoryConnection metaDataConn, String url, String headerField) throws RepositoryException {
        Header header = getCachedHeaderData(metaDataConn, url, headerField);
        return (header == null ? null : header.getValue());
    }

    /**
     * @return The repository that holds retrieved data using URLs as graphs (contexts)
     */
    public Repository getDataRepository() {
        return dataRepository;
    }

    /**
     * @return The repository that holds metadata about the contents of the {@link #dataRepository}
     */
    public Repository getMetaDataRepository() {
        return metaDataRepository;
    }

    /**
     * Closes a repository connection, logging (not propagating) any failure.
     * @param conn The connection to close; may be <code>null</code>
     */
    private static void closeQuietly(RepositoryConnection conn) {
        if (conn != null) {
            try {
                conn.close();
            } catch (RepositoryException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Closes an inferencer (Sail) connection, logging (not propagating) any failure.
     * @param conn The connection to close; may be <code>null</code>
     */
    private static void closeQuietly(InferencerConnection conn) {
        if (conn != null) {
            try {
                conn.close();
            } catch (SailException e) {
                e.printStackTrace();
            }
        }
    }
}