package de.fuberlin.wiwiss.marbles.loading; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.commons.httpclient.params.HttpParams; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.openrdf.OpenRDFUtil; import org.openrdf.model.Graph; import org.openrdf.model.Resource; import org.openrdf.model.URI; import org.openrdf.model.impl.GraphImpl; import org.openrdf.rio.RDFFormat; import org.openrdf.rio.RDFHandlerException; import org.openrdf.rio.RDFParseException; import org.openrdf.rio.RDFParser; import org.openrdf.rio.Rio; import de.fuberlin.wiwiss.marbles.Constants; import de.fuberlin.wiwiss.ng4j.semwebclient.DereferencingTask; import de.fuberlin.wiwiss.ng4j.semwebclient.LimitedInputStream; /** * The DereferencerThread executes a given DereferencingTask. It opens a * HttpURLConnection, creates an InputStream and tries to parse it. If the * Thread is finished it delivers the retrieval result. * * @author Tobias Gau� * Adapted to the Apache HTTP Client and Sesame by Christian Becker */ public class DereferencerThread extends Thread { private final static int MAX_REDIRECTS = 3; private DereferencingTask task = null; private boolean stopped = false; private int maxfilesize = -1; private Log log = LogFactory.getLog(DereferencerThread.class); private HttpClient httpClient; private SpongerProvider spongerProvider; public DereferencerThread(HttpClient httpClient, SpongerProvider spongerProvider) { this.httpClient = httpClient; this.spongerProvider = spongerProvider; } public void run() { this.log.debug("Thread started."); while (!this.stopped) { if (hasTask()) { DereferencingResult result = dereference(); deliver(result); clearTask(); } try { synchronized (this) { if (this.stopped) { break; } wait(); } } catch (InterruptedException ex) { // Happens when the thread is stopped } } this.log.debug("Thread stopped."); } /** * @return Returns true if the DereferencerThread is available for new * tasks. */ public synchronized boolean isAvailable() { return !hasTask() && !this.stopped; } /** * Starts to execute the DereferencingTask task. Returns true if the * retrieval process is started false if the thread is unable to execute the * task. * * @param task * The task to execute. * @return */ public synchronized boolean startDereferencingIfAvailable( DereferencingTask task) { if (!isAvailable()) { return false; } this.task = task; this.notify(); return true; } /** * @return Returns true if the DereferencerThread is busy false if not. */ private boolean hasTask() { return task != null; } /** * Clears the DereferencerThreads tasks. */ private synchronized void clearTask() { task = null; } /** * Creates a new DereferencingResult which contains information about the * retrieval failure. * * @param errorCode * the error code * @param exception * the thrown exception * @return */ private DereferencingResult createNewUrisResult(int errorCode, ArrayList urilist) { return new DereferencingResult(task, errorCode, urilist); } /** * Delivers the retrieval result. * * @param result */ private /*synchronized*/ void deliver(DereferencingResult result) { if (stopped) { return; } task.getListener().dereferenced(result); } private HttpMethod getURL(String uriString, boolean externallyObtained) throws HttpException, IOException, URIException { GetMethod method = new GetMethod(); org.apache.commons.httpclient.URI uri = new org.apache.commons.httpclient.URI(uriString, true); method.setURI(uri); /* Set read timeout based on step level */ HttpMethodParams params = new HttpMethodParams(); params.setSoTimeout(task.getStep() == 0 ? 30 * 1000 : 15 * 1000); method.setParams(params); method.setRequestHeader( "Accept", "application/rdf+xml;q=1," + "text/xml;q=0.6,text/rdf+n3;q=0.9," + "application/octet-stream;q=0.5," + "application/xml;q=0.5," + "text/plain;q=0.5,application/x-turtle;q=0.5," + "application/x-trig;q=0.5," + "application/xhtml+xml;q=0.5," + "text/html;q=0.5" ); method.setRequestHeader("User-Agent", Constants.userAgent); /* Own redirect handling in order to tie it in with caching etc. */ method.setFollowRedirects(false); /* a little security by obscurity */ if (!(externallyObtained && (method.getURI().getHost().contains("localhost") || method.getURI().getHost().contains("127.") || method.getURI().getHost().contains("10.")))) httpClient.executeMethod(method); log.debug((method.getStatusLine() != null ? method.getStatusCode() : "(fail)") + " " + uriString); // + " (" + (contentType != null ? contentType.getValue() : "(null)") + ")"); return method; } private DereferencingResult dereference() { DereferencingResult result = new DereferencingResult(task, 0, null /* method */, null /* graph */, null /* exception */); HttpMethod method = null; try { method = getURL(task.getURI(), true); result.setMethod(method); Header contentType = method.getResponseHeader("Content-Type"); /* Interpret response for successful requests */ if (HttpStatusCodes.isSuccess(method.getStatusCode())) { RDFFormat format = guessFormat(contentType != null ? contentType.getValue() : null); Graph g = null; try { g = parseRdf(method, format); } catch (Exception e) { /* Couldn't parse this - try it again via the Sponger proxy. */ if (spongerProvider != null) { method.releaseConnection(); method = getURL(spongerProvider.getQueryURL(task.getURI()), false); result.setMethod(method); g = parseRdf(method, format); } } result.setResultData(g); } else if (!HttpStatusCodes.isRedirect(method.getStatusCode())) result.setResultCode(DereferencingResult.STATUS_PARSING_FAILED); } catch (URIException e) { e.printStackTrace(); log.debug(e.getMessage()); result.setResultCode(DereferencingResult.STATUS_MALFORMED_URL); result.setResultException(e); } catch (HttpException e) { log.debug(e.getMessage()); result.setResultCode(DereferencingResult.STATUS_PARSING_FAILED); result.setResultException(e); } catch (IOException e) { log.debug(e.getMessage()); result.setResultCode(DereferencingResult.STATUS_UNABLE_TO_CONNECT); result.setResultException(e); } catch (Exception e) { log.debug(e.getMessage()); result.setResultCode(DereferencingResult.STATUS_PARSING_FAILED); result.setResultException(e); } finally { if (method != null) method.releaseConnection(); } return result; } /** * Parses an RDF String. * @throws IOException * @throws RDFHandlerException * @throws RDFParseException */ private Graph parseRdf(HttpMethod method, RDFFormat format) throws RDFParseException, RDFHandlerException, IOException { LimitedInputStream lis = new LimitedInputStream(method.getResponseBodyAsStream(), maxfilesize); Graph graph = new GraphImpl(); URI urlContext = graph.getValueFactory().createURI(task.getURI().toString()); addData(graph, lis, format, task.getURI().toString(), urlContext); return graph; } /* Adapted from Sesame's RepositoryConnectionBase.addInputStreamOrReader() */ private void addData(Graph graph, InputStream is, RDFFormat dataFormat, String baseURI, Resource... contexts) throws RDFParseException, RDFHandlerException, IOException { OpenRDFUtil.verifyContextNotNull(contexts); RDFParser rdfParser = Rio.createParser(dataFormat, graph.getValueFactory()); rdfParser.setVerifyData(true); rdfParser.setStopAtFirstError(true); rdfParser.setDatatypeHandling(RDFParser.DatatypeHandling.IGNORE); RDFGraphInserter rdfInserter = new RDFGraphInserter(graph, contexts); rdfParser.setRDFHandler(rdfInserter); rdfParser.parse(is, baseURI); } /** * Tries to guess an RDF Format from a connection. * * @return */ private RDFFormat guessFormat(String contentType) { if (contentType == null || ContentTypes.isRDFXML(contentType)) return RDFFormat.RDFXML; else if (ContentTypes.isRDFN3(contentType)) return RDFFormat.N3; else if (ContentTypes.isRDFTTL(contentType)) return RDFFormat.TURTLE; else return RDFFormat.RDFXML; /* worth a try... */ } /** * Stops the UriConnector from retrieving the URI. */ public synchronized void stopThread() { stopped = true; interrupt(); } public synchronized void setMaxfilesize(int size){ maxfilesize = size; } }