/* $Id: ThrottledFetcher.java 988245 2010-08-23 18:39:35Z kwright $ */ /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.crawler.connectors.rss; import org.apache.manifoldcf.core.interfaces.*; import org.apache.manifoldcf.connectorcommon.interfaces.*; import org.apache.manifoldcf.connectorcommon.common.XThreadInputStream; import org.apache.manifoldcf.connectorcommon.common.InterruptibleSocketFactory; import org.apache.manifoldcf.agents.interfaces.*; import org.apache.manifoldcf.crawler.interfaces.*; import org.apache.manifoldcf.crawler.system.Logging; import org.apache.manifoldcf.crawler.system.ManifoldCF; import java.util.*; import java.io.*; import java.net.*; import org.apache.http.conn.HttpClientConnectionManager; import org.apache.http.client.HttpClient; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.protocol.HttpRequestExecutor; import org.apache.http.impl.client.HttpClients; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.CredentialsProvider; import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.http.config.SocketConfig; import 
org.apache.http.config.RegistryBuilder; import org.apache.http.conn.ssl.NoopHostnameVerifier; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.auth.AuthScope; import org.apache.http.auth.NTCredentials; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpRequestBase; import org.apache.http.impl.client.DefaultRedirectStrategy; import org.apache.http.util.EntityUtils; import org.apache.http.HttpStatus; import org.apache.http.HttpHost; import org.apache.http.Header; import org.apache.http.message.BasicHeader; import org.apache.http.client.HttpRequestRetryHandler; import org.apache.http.protocol.HttpContext; import org.apache.http.conn.ConnectTimeoutException; import org.apache.http.client.CircularRedirectException; import org.apache.http.NoHttpResponseException; import org.apache.http.HttpException; /** This class uses httpclient to fetch stuff from webservers. However, it additionally controls the fetch * rate in two ways: first, controlling the overall bandwidth used per server, and second, limiting the number * of simultaneous open connections per server. It's also capable of limiting the maximum number of fetches * per time period per server as well; however, this functionality is not strictly necessary at this time because * the CF scheduler does that at a higher layer. * An instance of this class would very probably need to have a lifetime consistent with the long-term nature * of these values, and be static. * This class sets up a different Http connection pool for each server, so that we can foist off onto the httpclient * library the task of limiting the number of connections. This means that we need periodic polling to determine * when idle pooled connections can be freed. 
*/ public class ThrottledFetcher { public static final String _rcsid = "@(#)$Id: ThrottledFetcher.java 988245 2010-08-23 18:39:35Z kwright $"; /** This flag determines whether we record everything to the disk, as a means of doing a web snapshot */ protected static final boolean recordEverything = false; /** The read chunk length */ protected static final int READ_CHUNK_LENGTH = 4096; /** This counter keeps track of the total outstanding handles across everything, because we do try to control that */ protected static int globalHandleCount = 0; /** This is the lock object for that global handle counter */ protected static Integer globalHandleCounterLock = new Integer(0); /** This hash maps the server string (without port) to a pool throttling object, where * we can track the statistics and make sure we throttle appropriately */ protected final Map<String,IConnectionThrottler> serverMap = new HashMap<String,IConnectionThrottler>(); /** Reference count for how many connections to this pool there are */ protected int refCount = 0; // Current host name private static String currentHost = null; static { // Find the current host name try { java.net.InetAddress addr = java.net.InetAddress.getLocalHost(); // Get hostname currentHost = addr.getHostName(); } catch (java.net.UnknownHostException e) { } } /** Note that we're about to need a handle (and make sure we have enough) */ protected static void registerGlobalHandle(int maxHandles) throws ManifoldCFException { try { synchronized (globalHandleCounterLock) { while (globalHandleCount >= maxHandles) { globalHandleCounterLock.wait(); } globalHandleCount++; } } catch (InterruptedException e) { throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); } } /** Note that we're done with a handle (so we can free it) */ protected static void releaseGlobalHandle() { synchronized (globalHandleCounterLock) { globalHandleCount--; globalHandleCounterLock.notifyAll(); } } /** Constructor. 
*/ public ThrottledFetcher() { } /** Establish a connection to a specified URL. * @param serverName is the FQDN of the server, e.g. foo.metacarta.com * @param connectionLimit is the maximum desired outstanding connections at any one time. * @param connectionTimeoutMilliseconds is the number of milliseconds to wait for the connection before timing out. */ public synchronized IThrottledConnection createConnection(IThreadContext threadContext, String throttleGroupName, String serverName, int connectionLimit, int connectionTimeoutMilliseconds, String proxyHost, int proxyPort, String proxyAuthDomain, String proxyAuthUsername, String proxyAuthPassword, IAbortActivity activities) throws ManifoldCFException, ServiceInterruption { IConnectionThrottler server; server = serverMap.get(serverName); if (server == null) { // Create a connection throttler for this server IThrottleGroups tg = ThrottleGroupsFactory.make(threadContext); server = tg.obtainConnectionThrottler(RSSConnector.rssThrottleGroupType, throttleGroupName, new String[]{serverName}); serverMap.put(serverName,server); } return new ThrottledConnection(serverName, server, connectionTimeoutMilliseconds,connectionLimit, proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword, activities); } /** Poll. This method is designed to allow idle connections to be closed and freed. */ public synchronized void poll() throws ManifoldCFException { // Nothing needed now; connections are released when we're done with them. } /** Note that there is a repository connection that is using this object. */ public synchronized void noteConnectionEstablished() { refCount++; } /** Connection pool no longer needed. Call this to indicate that this object no * longer needs to keep its pools available, for the moment. 
*/ public synchronized void noteConnectionReleased() { refCount--; if (refCount == 0) { // Since we don't have any actual pools here, this can be a no-op for now // MHL serverMap.clear(); } } /** This class represents an established connection to a URL. */ protected static class ThrottledConnection implements IThrottledConnection { /** The server fqdn */ protected final String serverName; /** The throttling object we use to track connections */ protected final IConnectionThrottler connectionThrottler; /** The throttling object we use to track fetches */ protected final IFetchThrottler fetchThrottler; /** Connection timeout in milliseconds */ protected final int connectionTimeoutMilliseconds; /** The client connection manager */ protected final HttpClientConnectionManager connectionManager; /** The httpclient */ protected final HttpClient httpClient; /** The method object */ protected HttpRequestBase executeMethod = null; /** The start-fetch time */ protected long startFetchTime = -1L; /** The error trace, if any */ protected Throwable throwable = null; /** The current URL being fetched */ protected String myUrl = null; /** The status code fetched, if any */ protected int statusCode = FETCH_NOT_TRIED; /** The kind of fetch we are doing */ protected String fetchType = null; /** The current bytes in the current fetch */ protected long fetchCounter = 0L; /** The thread that is actually doing the work */ protected ExecuteMethodThread methodThread = null; /** Set if thread has been started */ protected boolean threadStarted = false; /** Abort checker */ protected final AbortChecker abortChecker; /** Constructor. 
*/ public ThrottledConnection(String serverName, IConnectionThrottler connectionThrottler, int connectionTimeoutMilliseconds, int connectionLimit, String proxyHost, int proxyPort, String proxyAuthDomain, String proxyAuthUsername, String proxyAuthPassword, IAbortActivity activities) throws ManifoldCFException, ServiceInterruption { this.serverName = serverName; this.connectionThrottler = connectionThrottler; this.connectionTimeoutMilliseconds = connectionTimeoutMilliseconds; this.abortChecker = new AbortChecker(activities); // Create the https scheme for this connection javax.net.ssl.SSLSocketFactory httpsSocketFactory = KeystoreManagerFactory.getTrustingSecureSocketFactory();; SSLConnectionSocketFactory myFactory = new SSLConnectionSocketFactory(new InterruptibleSocketFactory(httpsSocketFactory,connectionTimeoutMilliseconds), NoopHostnameVerifier.INSTANCE); PoolingHttpClientConnectionManager poolingConnectionManager = new PoolingHttpClientConnectionManager(RegistryBuilder.<ConnectionSocketFactory>create() .register("http", PlainConnectionSocketFactory.getSocketFactory()) .register("https", myFactory) .build()); poolingConnectionManager.setDefaultMaxPerRoute(1); poolingConnectionManager.setValidateAfterInactivity(2000); poolingConnectionManager.setDefaultSocketConfig(SocketConfig.custom() .setTcpNoDelay(true) .setSoTimeout(connectionTimeoutMilliseconds) .build()); connectionManager = poolingConnectionManager; CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); RequestConfig.Builder requestBuilder = RequestConfig.custom() .setCircularRedirectsAllowed(true) .setSocketTimeout(connectionTimeoutMilliseconds) .setExpectContinueEnabled(true) .setConnectTimeout(connectionTimeoutMilliseconds) .setConnectionRequestTimeout(connectionTimeoutMilliseconds); // If there's a proxy, set that too. 
if (proxyHost != null && proxyHost.length() > 0) { // Configure proxy authentication if (proxyAuthUsername != null && proxyAuthUsername.length() > 0) { if (proxyAuthPassword == null) proxyAuthPassword = ""; if (proxyAuthDomain == null) proxyAuthDomain = ""; credentialsProvider.setCredentials( new AuthScope(proxyHost, proxyPort), new NTCredentials(proxyAuthUsername, proxyAuthPassword, currentHost, proxyAuthDomain)); } HttpHost proxy = new HttpHost(proxyHost, proxyPort); requestBuilder.setProxy(proxy); } httpClient = HttpClients.custom() .setConnectionManager(connectionManager) .setMaxConnTotal(1) .disableAutomaticRetries() .setDefaultRequestConfig(requestBuilder.build()) .setDefaultCredentialsProvider(credentialsProvider) .setRequestExecutor(new HttpRequestExecutor(connectionTimeoutMilliseconds)) .setRedirectStrategy(new DefaultRedirectStrategy()) .build(); registerGlobalHandle(connectionLimit); try { int result = connectionThrottler.waitConnectionAvailable(abortChecker); if (result != IConnectionThrottler.CONNECTION_FROM_CREATION) throw new IllegalStateException("Got back unexpected value from waitForAConnection() of "+result); } catch (InterruptedException e) { throw new ManifoldCFException(e.getMessage(),ManifoldCFException.INTERRUPTED); } catch (BreakException e) { abortChecker.rethrowExceptions(); } fetchThrottler = connectionThrottler.getNewConnectionFetchThrottler(); } /** Begin the fetch process. * @param fetchType is a short descriptive string describing the kind of fetch being requested. This * is used solely for logging purposes. 
*/ @Override public void beginFetch(String fetchType) throws ManifoldCFException, ServiceInterruption { this.fetchType = fetchType; fetchCounter = 0L; try { if (fetchThrottler.obtainFetchDocumentPermission(abortChecker) == false) throw new IllegalStateException("obtainFetchDocumentPermission() had unexpected return value"); } catch (InterruptedException e) { throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED); } catch (BreakException e) { abortChecker.rethrowExceptions(); } threadStarted = false; } /** Log the fetch of a number of bytes. */ public void logFetchCount(int count) { fetchCounter += (long)count; } /** Execute the fetch and get the return code. This method uses the * standard logging mechanism to keep track of the fetch attempt. It also * signals the following three conditions: ServiceInterruption (if a dynamic * error occurs), OK, or a static error code (for a condition where retry is * not likely to be helpful). The actual HTTP error code is NOT returned by * this method. * @param protocol is the protocol to use to perform the access, e.g. "http" * @param port is the port to use to perform the access, where -1 means "use the default" * @param urlPath is the path part of the url, e.g. "/robots.txt" * @param userAgent is the value of the userAgent header to use. * @param from is the value of the from header to use. * @param proxyHost is the proxy host, or null if none. * @param proxyPort is the proxy port, or -1 if none. * @param proxyAuthDomain is the proxy authentication domain, or null. * @param proxyAuthUsername is the proxy authentication user name, or null. * @param proxyAuthPassword is the proxy authentication password, or null. * @param lastETag is the requested lastETag header value. * @param lastModified is the requested lastModified header value. * @return the status code: success, static error, or dynamic error. 
*/ @Override public int executeFetch(String protocol, int port, String urlPath, String userAgent, String from, String lastETag, String lastModified) throws ManifoldCFException, ServiceInterruption { StringBuilder sb = new StringBuilder(protocol); sb.append("://").append(serverName); if (port != -1) sb.append(":").append(Integer.toString(port)); sb.append(urlPath); myUrl = sb.toString(); // Create the get method executeMethod = new HttpGet(myUrl); startFetchTime = System.currentTimeMillis(); // Set all appropriate headers executeMethod.setHeader(new BasicHeader("User-Agent",userAgent)); executeMethod.setHeader(new BasicHeader("From",from)); executeMethod.setHeader(new BasicHeader("Accept","*/*")); if (lastETag != null) executeMethod.setHeader(new BasicHeader("ETag",lastETag)); if (lastModified != null) executeMethod.setHeader(new BasicHeader("Last-Modified",lastModified)); // Create the execution thread. methodThread = new ExecuteMethodThread(this, fetchThrottler, httpClient, executeMethod); // Start the method thread, which will start the transaction try { methodThread.start(); threadStarted = true; // We want to wait until at least the execution has fired, and then figure out where we // stand try { statusCode = methodThread.getResponseCode(); long currentTime; switch (statusCode) { case HttpStatus.SC_OK: return STATUS_OK; case HttpStatus.SC_UNAUTHORIZED: case HttpStatus.SC_USE_PROXY: // Permanent errors that mean, "fetch not allowed" return STATUS_SITEERROR; case HttpStatus.SC_REQUEST_TIMEOUT: case HttpStatus.SC_GATEWAY_TIMEOUT: case HttpStatus.SC_SERVICE_UNAVAILABLE: // Temporary service interruption // May want to make the retry time a parameter someday currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Http response temporary error on '"+myUrl+"': "+Integer.toString(statusCode), null,currentTime + 60L * 60000L,currentTime + 1440L * 60000L,-1,false); case HttpStatus.SC_NOT_MODIFIED: return STATUS_NOCHANGE; case 
HttpStatus.SC_INTERNAL_SERVER_ERROR: // Fail for a while, but give up after 24 hours currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Http response internal server error on '"+myUrl+"': "+Integer.toString(statusCode), null,currentTime + 60L * 60000L,currentTime + 1440L * 60000L,-1,false); case HttpStatus.SC_GONE: case HttpStatus.SC_NOT_FOUND: case HttpStatus.SC_BAD_GATEWAY: case HttpStatus.SC_BAD_REQUEST: default: return STATUS_PAGEERROR; } } catch (InterruptedException e) { methodThread.interrupt(); methodThread = null; threadStarted = false; throw e; } } catch (InterruptedException e) { // Drop the current connection on the floor, so it cannot be reused. executeMethod = null; throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); } catch (java.net.MalformedURLException e) { throwable = new ManifoldCFException("Illegal URI: '"+myUrl+"'",e); statusCode = FETCH_BAD_URI; return STATUS_PAGEERROR; } catch (java.net.SocketTimeoutException e) { throwable = e; long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for IO for '"+myUrl+"': "+e.getMessage(), e, currentTime + 300000L, currentTime + 120L * 60000L,-1,false); } catch (ConnectTimeoutException e) { throwable = e; long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for connect for '"+myUrl+"': "+e.getMessage(), e, currentTime + 60L * 60000L, currentTime + 720L * 60000L,-1,false); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED); } catch (CircularRedirectException e) { throwable = e; statusCode = FETCH_CIRCULAR_REDIRECT; return STATUS_PAGEERROR; } catch (NoHttpResponseException e) { throwable = e; // Give up after 2 hours. 
long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for response for '"+myUrl+"'", e, currentTime + 15L * 60000L, currentTime + 120L * 60000L,-1,false); } catch (java.net.ConnectException e) { throwable = e; // Give up after 6 hours. long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for a connection for '"+myUrl+"'", e, currentTime + 1000000L, currentTime + 720L * 60000L,-1,false); } catch (java.net.NoRouteToHostException e) { // This exception means we know the IP address but can't get there. That's either a firewall issue, or it's something transient // with the network. Some degree of retry is probably wise. throwable = e; long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("No route to host for '"+myUrl+"'", e, currentTime + 1000000L, currentTime + 720L * 60000L,-1,false); } catch (HttpException e) { throwable = e; statusCode = FETCH_IO_ERROR; return STATUS_PAGEERROR; } catch (IOException e) { // Treat this as a bad url. We don't know what happened, but it isn't something we are going to naively // retry on. throwable = e; statusCode = FETCH_IO_ERROR; return STATUS_PAGEERROR; } catch (Throwable e) { Logging.connectors.debug("RSS: Caught an unexpected exception: "+e.getMessage(),e); throwable = e; statusCode = FETCH_UNKNOWN_ERROR; return STATUS_PAGEERROR; } } /** Get the http response code. *@return the response code. This is either an HTTP response code, or one of the codes above. */ @Override public int getResponseCode() throws ManifoldCFException, ServiceInterruption { return statusCode; } /** Get the response input stream. It is the responsibility of the caller * to close this stream when done. 
*/ @Override public InputStream getResponseBodyStream() throws ManifoldCFException, ServiceInterruption { if (executeMethod == null) throw new ManifoldCFException("Attempt to get an input stream when there is no method"); if (methodThread == null || threadStarted == false) throw new ManifoldCFException("Attempt to get an input stream when no method thread"); try { return methodThread.getSafeInputStream(); } catch (InterruptedException e) { methodThread.interrupt(); throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); } catch (java.net.SocketTimeoutException e) { long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for IO for '"+myUrl+"': "+e.getMessage(), e, currentTime + 300000L, currentTime + 120L * 60000L,-1,false); } catch (ConnectTimeoutException e) { long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for connect for '"+myUrl+"': "+e.getMessage(), e, currentTime + 60L * 60000L, currentTime + 720L * 60000L,-1,false); } catch (InterruptedIOException e) { methodThread.interrupt(); throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED); } catch (NoHttpResponseException e) { // Give up after 2 hours. long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for response for '"+myUrl+"'", e, currentTime + 15L * 60000L, currentTime + 120L * 60000L,-1,false); } catch (java.net.ConnectException e) { // Give up after 6 hours. long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for a stream connection for '"+myUrl+"'", e, currentTime + 1000000L, currentTime + 720L * 60000L,-1,false); } catch (java.net.NoRouteToHostException e) { // This exception means we know the IP address but can't get there. That's either a firewall issue, or it's something transient // with the network. Some degree of retry is probably wise. 
long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("No route to host for '"+myUrl+"'", e, currentTime + 1000000L, currentTime + 720L * 60000L,-1,false); } catch (HttpException e) { throw new ManifoldCFException("Http exception reading stream: "+e.getMessage(),e); } catch (IOException e) { throw new ManifoldCFException("I/O exception reading stream: "+e.getMessage(),e); } } /** Get a specified response header, if it exists. *@param headerName is the name of the header. *@return the header value, or null if it doesn't exist. */ @Override public String getResponseHeader(String headerName) throws ManifoldCFException, ServiceInterruption { if (executeMethod == null) throw new ManifoldCFException("Attempt to get a header when there is no method"); if (methodThread == null || threadStarted == false) throw new ManifoldCFException("Attempt to get a header when no method thread"); try { return methodThread.getFirstHeader(headerName); } catch (InterruptedException e) { methodThread.interrupt(); throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED); } catch (java.net.SocketTimeoutException e) { long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for IO for '"+myUrl+"': "+e.getMessage(), e, currentTime + 300000L, currentTime + 120L * 60000L,-1,false); } catch (ConnectTimeoutException e) { long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for connect for '"+myUrl+"': "+e.getMessage(), e, currentTime + 60L * 60000L, currentTime + 720L * 60000L,-1,false); } catch (InterruptedIOException e) { methodThread.interrupt(); throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED); } catch (NoHttpResponseException e) { // Give up after 2 hours. 
long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for response for '"+myUrl+"'", e, currentTime + 15L * 60000L, currentTime + 120L * 60000L,-1,false); } catch (java.net.ConnectException e) { // Give up after 6 hours. long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("Timed out waiting for a connection for '"+myUrl+"'", e, currentTime + 1000000L, currentTime + 720L * 60000L,-1,false); } catch (java.net.NoRouteToHostException e) { // This exception means we know the IP address but can't get there. That's either a firewall issue, or it's something transient // with the network. Some degree of retry is probably wise. long currentTime = System.currentTimeMillis(); throw new ServiceInterruption("No route to host for '"+myUrl+"'", e, currentTime + 1000000L, currentTime + 720L * 60000L,-1,false); } catch (HttpException e) { throw new ManifoldCFException("Http exception reading response: "+e.getMessage(),e); } catch (IOException e) { throw new ManifoldCFException("I/O exception reading response: "+e.getMessage(),e); } } /** Done with the fetch. Call this when the fetch has been completed. A log entry will be generated * describing what was done. 
*/ @Override public void doneFetch(IProcessActivity activities) throws ManifoldCFException { if (fetchType != null) { if (methodThread != null && threadStarted) methodThread.abort(); long endTime = System.currentTimeMillis(); activities.recordActivity(new Long(startFetchTime),RSSConnector.ACTIVITY_FETCH, new Long(fetchCounter),myUrl,Integer.toString(statusCode),(throwable==null)?null:throwable.getMessage(),null); Logging.connectors.info("RSS: FETCH "+fetchType+"|"+myUrl+"|"+new Long(startFetchTime).toString()+"+"+new Long(endTime-startFetchTime).toString()+"|"+ Integer.toString(statusCode)+"|"+new Long(fetchCounter).toString()+"|"+((throwable==null)?"":(throwable.getClass().getName()+"| "+throwable.getMessage()))); if (throwable != null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Fetch exception for '"+myUrl+"'",throwable); } // Shut down (join) the connection thread, if any, and if it started if (methodThread != null) { if (threadStarted) { try { methodThread.finishUp(); } catch (InterruptedException e) { throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED); } threadStarted = false; } methodThread = null; } executeMethod = null; throwable = null; startFetchTime = -1L; myUrl = null; statusCode = -1; fetchType = null; } } /** Close the connection. Call this to end this server connection. */ @Override public void close() throws ManifoldCFException { // Clean up the connection pool. This should do the necessary bookkeeping to release the one connection that's sitting there. connectionManager.shutdown(); connectionThrottler.noteConnectionDestroyed(); releaseGlobalHandle(); } } /** This class throttles an input stream based on the specified byte rate parameters. The * throttling takes place across all streams that are open to the server in question. 
*/ protected static class ThrottledInputstream extends InputStream { /** Throttled connection */ protected final ThrottledConnection throttledConnection; /** Stream throttler */ protected final IStreamThrottler streamThrottler; /** The stream we are wrapping. */ protected final InputStream inputStream; /** Constructor. */ public ThrottledInputstream(ThrottledConnection throttledConnection, IStreamThrottler streamThrottler, InputStream is) { this.throttledConnection = throttledConnection; this.streamThrottler = streamThrottler; this.inputStream = is; } /** Read a byte. */ public int read() throws IOException { byte[] byteArray = new byte[1]; int count = read(byteArray,0,1); if (count == -1) return count; return (int)byteArray[0]; } /** Read lots of bytes. */ public int read(byte[] b) throws IOException { return read(b,0,b.length); } /** Read lots of specific bytes. */ public int read(byte[] b, int off, int len) throws IOException { int totalCount = 0; while (len > ThrottledFetcher.READ_CHUNK_LENGTH) { int amt = basicRead(b,off,ThrottledFetcher.READ_CHUNK_LENGTH,totalCount); if (amt == -1) { if (totalCount == 0) return amt; return totalCount; } totalCount += amt; off += amt; len -= amt; } if (len > 0) { int amt = basicRead(b,off,len,totalCount); if (amt == -1) { if (totalCount == 0) return amt; return totalCount; } return totalCount + amt; } return totalCount; } /** Basic read, which uses the server object to throttle activity. 
*/ protected int basicRead(byte[] b, int off, int len, int totalSoFar) throws IOException { try { if (streamThrottler.obtainReadPermission(len) == false) throw new IllegalStateException("Throttler shut down while still active"); int amt = 0; try { amt = inputStream.read(b,off,len); return amt; } finally { if (amt == -1) streamThrottler.releaseReadPermission(len,0); else { streamThrottler.releaseReadPermission(len,amt); throttledConnection.logFetchCount(amt); } } } catch (InterruptedException e) { InterruptedIOException e2 = new InterruptedIOException("Interrupted"); e2.bytesTransferred = totalSoFar; throw e2; } } /** Skip */ public long skip(long n) throws IOException { // Not sure whether we should bother doing anything with this; it's not used. return inputStream.skip(n); } /** Get available. */ public int available() throws IOException { return inputStream.available(); } /** Mark. */ public void mark(int readLimit) { inputStream.mark(readLimit); } /** Reset. */ public void reset() throws IOException { inputStream.reset(); } /** Check if mark is supported. */ public boolean markSupported() { return inputStream.markSupported(); } /** Close. */ public void close() throws IOException { try { inputStream.close(); } finally { streamThrottler.closeStream(); } } } /** This thread does the actual socket communication with the server. * It's set up so that it can be abandoned at shutdown time. * * The way it works is as follows: * - it starts the transaction * - it receives the response, and saves that for the calling class to inspect * - it transfers the data part to an input stream provided to the calling class * - it shuts the connection down * * If there is an error, the sequence is aborted, and an exception is recorded * for the calling class to examine. * * The calling class basically accepts the sequence above. It starts the * thread, and tries to get a response code. If instead an exception is seen, * the exception is thrown up the stack. 
*/
  protected static class ExecuteMethodThread extends Thread {
    /** The connection */
    protected final ThrottledConnection theConnection;
    /** The fetch throttler */
    protected final IFetchThrottler fetchThrottler;
    /** Client and method, all preconfigured */
    protected final HttpClient httpClient;
    protected final HttpRequestBase executeMethod;

    /** The response returned by httpClient.execute(), or null if none has been obtained. */
    protected HttpResponse response = null;
    /** Exception recorded while executing the request, or null. */
    protected Throwable responseException = null;
    /** Cross-thread stream handed to callers; null when the response supplied no body stream. */
    protected XThreadInputStream threadStream = null;
    /** The (throttled) response body stream; closed and nulled when run() finishes. */
    protected InputStream bodyStream = null;
    /** Set once the stream setup step has completed without an exception (even if there is no stream). */
    protected boolean streamCreated = false;
    /** Exception recorded while setting up the body stream, or null. */
    protected Throwable streamException = null;
    /** Set by abort(); checked before the request is executed and before the stream is set up. */
    protected boolean abortThread = false;

    /** Exception thrown by executeMethod.abort() during cleanup, or null. */
    protected Throwable shutdownException = null;

    /** Anything that escaped the body of run(), or null. */
    protected Throwable generalException = null;

    /** Constructor.  The thread is marked as a daemon thread.
    *@param theConnection is the throttled connection passed to the throttled body stream.
    *@param fetchThrottler is used to create the fetch stream throttler for the body stream.
    *@param httpClient is the preconfigured client used to execute the request.
    *@param executeMethod is the preconfigured request to execute.
    */
    public ExecuteMethodThread(ThrottledConnection theConnection, IFetchThrottler fetchThrottler,
      HttpClient httpClient, HttpRequestBase executeMethod) {
      super();
      setDaemon(true);
      this.theConnection = theConnection;
      this.fetchThrottler = fetchThrottler;
      this.httpClient = httpClient;
      this.executeMethod = executeMethod;
    }

    /** Execute the request, publish the response (or the exception), set up the body stream,
    * pump the content through it, and finally abort the request and wake all waiters.
    * Note that this object's monitor is held while httpClient.execute() runs, so the
    * synchronized accessors and abort() block until that call returns.
    */
    public void run() {
      try {
        try {
          // Call the execute method appropriately
          synchronized (this) {
            if (!abortThread) {
              try {
                response = httpClient.execute(executeMethod);
              } catch (java.net.SocketTimeoutException e) {
                responseException = e;
              } catch (ConnectTimeoutException e) {
                responseException = e;
              } catch (InterruptedIOException e) {
                // Any other interrupted I/O is not recorded; it propagates to the outer handler.
                throw e;
              } catch (Throwable e) {
                responseException = e;
              }
              this.notifyAll();
            }
          }

          // Start the transfer of the content
          if (responseException == null) {
            synchronized (this) {
              if (!abortThread) {
                try {
                  // getEntity() returns null when the response carries no entity.  Treat that
                  // exactly like a null content stream: no stream is created, but stream setup
                  // is still reported as complete, instead of recording a NullPointerException.
                  HttpEntity entity = response.getEntity();
                  bodyStream = (entity == null) ? null : entity.getContent();
                  if (bodyStream != null) {
                    bodyStream = new ThrottledInputstream(theConnection,fetchThrottler.createFetchStream(),bodyStream);
                    threadStream = new XThreadInputStream(bodyStream);
                  }
                  streamCreated = true;
                } catch (java.net.SocketTimeoutException e) {
                  streamException = e;
                } catch (ConnectTimeoutException e) {
                  streamException = e;
                } catch (InterruptedIOException e) {
                  throw e;
                } catch (Throwable e) {
                  streamException = e;
                }
                this.notifyAll();
              }
            }
          }

          if (responseException == null && streamException == null) {
            if (threadStream != null) {
              // Stuff the content until we are done
              threadStream.stuffQueue();
            }
          }

        } finally {
          if (bodyStream != null) {
            try {
              bodyStream.close();
            } catch (IOException e) {
              // Deliberately ignored: the request is aborted immediately below regardless.
            }
            bodyStream = null;
          }
          synchronized (this) {
            try {
              executeMethod.abort();
            } catch (Throwable e) {
              shutdownException = e;
            }
            this.notifyAll();
          }
        }
      } catch (Throwable e) {
        // We catch exceptions here that should ONLY be InterruptedExceptions, as a result of the thread being aborted.
        this.generalException = e;
      }
    }

    /** Wait for the response, and return its status code.
    *@return the HTTP status code from the response's status line.
    * Throws the recorded response exception, if there is one, instead of returning.
    */
    public int getResponseCode()
      throws InterruptedException, IOException, HttpException {
      // Must wait until the response object is there
      while (true) {
        synchronized (this) {
          checkException(responseException);
          if (response != null)
            return response.getStatusLine().getStatusCode();
          wait();
        }
      }
    }

    /** Wait for the response, and return the value of the first header with the given name.
    *@param headerName is the name of the header to look up.
    *@return the header value, or null if the response has no such header.
    * Throws the recorded response exception, if there is one, instead of returning.
    */
    public String getFirstHeader(String headerName)
      throws InterruptedException, IOException, HttpException {
      // Must wait for the response object to appear
      while (true) {
        synchronized (this) {
          checkException(responseException);
          if (response != null) {
            Header h = response.getFirstHeader(headerName);
            if (h == null)
              return null;
            return h.getValue();
          }
          wait();
        }
      }
    }

    /** Wait for the body stream to be set up, and return the cross-thread stream.
    *@return the stream, or null when the response supplied no entity or no content stream.
    * Throws IllegalStateException if a response exception was recorded (callers are expected
    * to check the response first), or the recorded stream exception, if there is one.
    */
    public InputStream getSafeInputStream()
      throws InterruptedException, IOException, HttpException {
      // Must wait until stream is created, or until we note an exception was thrown.
      while (true) {
        synchronized (this) {
          if (responseException != null)
            throw new IllegalStateException("Check for response before getting stream");
          checkException(streamException);
          if (streamCreated)
            return threadStream;
          wait();
        }
      }
    }

    /** Abort the cross-thread stream, if one was created, and flag the thread so that any
    * step of run() that has not started yet is skipped.
    */
    public void abort() {
      // This will be called during the finally
      // block in the case where all is well (and
      // the stream completed) and in the case where
      // there were exceptions.
      synchronized (this) {
        if (streamCreated) {
          if (threadStream != null)
            threadStream.abort();
        }
        abortThread = true;
      }
    }

    /** Wait for this thread to terminate.
    */
    public void finishUp()
      throws InterruptedException {
      join();
    }

    /** Rethrow a recorded exception, if there is one.
    *@param exception is the recorded exception, or null if nothing should be thrown.
    */
    protected synchronized void checkException(Throwable exception)
      throws IOException, HttpException {
      if (exception != null) {
        // Rethrow the recorded exception as its most specific known type.  The field it came
        // from is NOT cleared here, so a later call rethrows the same exception again.
        Throwable e = exception;
        if (e instanceof IOException)
          throw (IOException)e;
        else if (e instanceof HttpException)
          throw (HttpException)e;
        else if (e instanceof RuntimeException)
          throw (RuntimeException)e;
        else if (e instanceof Error)
          throw (Error)e;
        else
          throw new RuntimeException("Unhandled exception of type: "+e.getClass().getName(),e);
      }
    }

  }

  /** This class furnishes an abort signal whenever the job activity says it should.
  * It should never be invoked from a background thread, only from a ManifoldCF thread.
  */
  protected static class AbortChecker implements IBreakCheck {
    protected final IAbortActivity activities;

    /** The service interruption that caused a break, saved for rethrowExceptions(). */
    protected ServiceInterruption serviceInterruption = null;
    /** The ManifoldCF exception that caused a break, saved for rethrowExceptions(). */
    protected ManifoldCFException mcfException = null;

    /** Constructor.
    *@param activities is used to check whether the job is still active.
    */
    public AbortChecker(IAbortActivity activities) {
      this.activities = activities;
    }

    /** Check whether the job is still active.
    *@return 1000L when the job is still active (presumably the time in milliseconds until
    * the next check; confirm against IBreakCheck).
    * Throws BreakException when the check fails with a ServiceInterruption or a non-interrupt
    * ManifoldCFException (saved for rethrowExceptions()), and InterruptedException when the
    * ManifoldCFException carries the INTERRUPTED error code.
    */
    @Override
    public long abortCheck()
      throws BreakException, InterruptedException {
      try {
        activities.checkJobStillActive();
        return 1000L;
      } catch (ServiceInterruption e) {
        serviceInterruption = e;
        throw new BreakException("Break requested: "+e.getMessage(),e);
      } catch (ManifoldCFException e) {
        if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
          // InterruptedException has no cause-taking constructor, so chain the cause explicitly.
          InterruptedException ie = new InterruptedException("Interrupted: "+e.getMessage());
          ie.initCause(e);
          throw ie;
        }
        mcfException = e;
        throw new BreakException("Error during break check: "+e.getMessage(),e);
      }
    }

    /** Rethrow whichever exception caused the break, if any; a service interruption takes
    * precedence over a ManifoldCF exception.
    */
    public void rethrowExceptions()
      throws ManifoldCFException, ServiceInterruption {
      if (serviceInterruption != null)
        throw serviceInterruption;
      if (mcfException != null)
        throw mcfException;
    }
  }

}