/*
 * Copyright 2014 Bibliotheca Alexandrina.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.archive.wayback.liveweb;

import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.IOException;
import java.net.ConnectException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.util.logging.Logger;

import org.apache.commons.httpclient.ConnectTimeoutException;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NoHttpResponseException;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.archive.io.arc.ARCRecord;
import org.archive.wayback.core.Resource;
import org.archive.wayback.exception.LiveDocumentNotAvailableException;
import org.archive.wayback.exception.LiveWebCacheUnavailableException;
import org.archive.wayback.exception.LiveWebTimeoutException;
import org.archive.wayback.exception.ResourceNotAvailableException;
import org.archive.wayback.resourcestore.resourcefile.ArcResource;
import org.archive.wayback.resourcestore.resourcefile.ResourceFactory;

/**
 * Fetches a resource from the live web through a standard HTTP proxy
 * server (e.g. Squid) and wraps the response in a synthetic ARC record.
 *
 * <p>All per-request state is kept in local variables, so
 * {@link #getCachedResource(URL, long, boolean)} does not share a response
 * or resource object between calls.</p>
 *
 * @author Mohamed Elsayed
 * @see LiveWebCache
 * @see ArcRemoteLiveWebCache
 */
public class StdRemoteLiveWebCache implements LiveWebCache {
    private static final Logger LOGGER = Logger.getLogger(
            StdRemoteLiveWebCache.class.getName() );

    /** Socket, connect and connection-request timeout, in milliseconds. */
    private static final int TIMEOUT_MS = 10000;

    /**
     * Charset used both to measure and to serialize the synthetic record,
     * so that the declared length always matches the emitted bytes.
     */
    private static final String RECORD_CHARSET = "UTF-8";

    /** Placeholder IP, date and MIME type fields of the ARC record header. */
    private static final String ARC_HEADER_FIELDS =
            "0.0.0.0 10000000000000 text/plain";

    protected MultiThreadedHttpConnectionManager connectionManager;
    protected HostConfiguration hostConfiguration;
    protected HttpClient httpClient;
    protected String requestPrefix;

    /**
     * Initializes and configures the connection objects. The proxy itself
     * is supplied later through {@link #setProxyHostPort(String)}.
     */
    public StdRemoteLiveWebCache() {
        connectionManager = new MultiThreadedHttpConnectionManager();
        hostConfiguration = new HostConfiguration();
        HttpClientParams params = new HttpClientParams();
        params.setParameter( HttpClientParams.RETRY_HANDLER,
                new NoRetryHandler() );
        httpClient = new HttpClient( params, connectionManager );
        httpClient.setHostConfiguration( hostConfiguration );
    }

    /**
     * Gets a resource object from the live web through the configured
     * proxy. All timeouts are set to 10 seconds.
     *
     * <p>A 404 answer is turned into a synthetic {@code 200 OK} response
     * whose body allows every user agent. A 502 answer is reported as
     * {@link LiveDocumentNotAvailableException} and a 504 answer as
     * {@link LiveWebTimeoutException}. The body is only read for a 200
     * answer; any other status yields a record with an empty body.</p>
     *
     * @param url to fetch from the live web.
     * @param maxCacheMS maximum age of resource to return - not used by
     *        this implementation
     * @param bUseOlder if true, return documents older than maxCacheMS if
     *        a more recent copy is not available - not used by this
     *        implementation
     *
     * @return Resource for url
     *
     * @throws LiveDocumentNotAvailableException if the resource cannot be
     *         retrieved from the live web, but all proxying and caching
     *         mechanisms functioned properly
     * @throws LiveWebCacheUnavailableException if no proxy is configured,
     *         or there was a problem either accessing the live web or in
     *         proxying to the live web
     * @throws LiveWebTimeoutException if there is no response from the
     *         live web before a timeout occurred.
     * @throws IOException for the usual reasons
     *
     * @see org.archive.wayback.liveweb.LiveWebCache#getCachedResource(java.net.URL, long, boolean)
     */
    @Override
    public Resource getCachedResource( URL url, long maxCacheMS,
            boolean bUseOlder ) throws LiveDocumentNotAvailableException,
            LiveWebCacheUnavailableException, LiveWebTimeoutException,
            IOException {
        String urlStr = url.toExternalForm();
        if ( requestPrefix != null ) {
            urlStr = requestPrefix + urlStr;
        }

        // HttpHost rejects a null host name with an unchecked exception;
        // report a missing proxy through the declared exception instead.
        String proxyHost = hostConfiguration.getProxyHost();
        if ( proxyHost == null ) {
            throw new LiveWebCacheUnavailableException(
                    "No proxy configured : " + urlStr );
        }
        HttpHost proxy =
                new HttpHost( proxyHost, hostConfiguration.getProxyPort() );

        RequestConfig reqConf = RequestConfig.custom()
                .setProxy( proxy )
                .setSocketTimeout( TIMEOUT_MS )
                .setConnectTimeout( TIMEOUT_MS )
                .setConnectionRequestTimeout( TIMEOUT_MS )
                .build();

        // Build the request before the client so that a malformed URI
        // cannot leave an unclosed client behind.
        HttpGet httpGet = new HttpGet( urlStr );
        CloseableHttpClient client = HttpClients.custom()
                .setDefaultRequestConfig( reqConf ).build();
        CloseableHttpResponse response = null;

        try {
            response = client.execute( httpGet );
            int status = response.getStatusLine().getStatusCode();

            if ( status == 502 ) {
                throw new LiveDocumentNotAvailableException( urlStr );
            } else if ( status == 504 ) {
                throw new LiveWebTimeoutException( "Timeout:" + urlStr );
            }

            StringBuilder header = new StringBuilder();
            String bodyStr = "";

            if ( status == 404 ) {
                // A missing robots.txt is presented as an allow-all file.
                header.append( "HTTP/1.0 200 OK\n" );
                bodyStr = "User-agent: *\nAllow: /\n";
            } else {
                // The status line must always be present, otherwise the
                // record starts with an ordinary header line.
                // Plain concatenation keeps the digits locale independent.
                header.append( response.getStatusLine().getProtocolVersion() )
                      .append( ' ' ).append( status )
                      .append( ' ' )
                      .append( response.getStatusLine().getReasonPhrase() )
                      .append( '\n' );
                if ( status == 200 ) {
                    HttpEntity entity = response.getEntity();
                    if ( entity != null ) {
                        bodyStr = EntityUtils.toString( entity );
                    }
                }
            }

            for ( Header h : response.getAllHeaders() ) {
                header.append( h.toString() ).append( '\n' );
            }
            header.append( '\n' );
            String httpHeaderStr = header.toString();

            // The ARC length field counts bytes, not characters.
            int length = httpHeaderStr.getBytes( RECORD_CHARSET ).length
                    + bodyStr.getBytes( RECORD_CHARSET ).length;

            // TODO: the first line of the record should be enhanced; the
            // IP, date and MIME type are placeholders.
            String responseStr = urlStr + " " + ARC_HEADER_FIELDS + " "
                    + length + "\n" + httpHeaderStr + bodyStr;

            ByteArrayInputStream bais = new ByteArrayInputStream(
                    responseStr.getBytes( RECORD_CHARSET ) );

            // TODO: Should not use ARCRecord
            ARCRecord r = new ARCRecord( bais, "id", 0L, false, false, true );
            return ( ArcResource )
                    ResourceFactory.ARCArchiveRecordToResource( r, null );

        } catch ( ResourceNotAvailableException e ) {
            throw new LiveDocumentNotAvailableException( urlStr );
        } catch ( org.apache.http.NoHttpResponseException e ) {
            throw new LiveWebCacheUnavailableException(
                    "No Http Response for " + urlStr );
        } catch ( NoHttpResponseException e ) {
            // Legacy commons-httpclient 3.x type, kept for compatibility.
            throw new LiveWebCacheUnavailableException(
                    "No Http Response for " + urlStr );
        } catch ( ConnectException e ) {
            throw new LiveWebCacheUnavailableException(
                    e.getLocalizedMessage() + " : " + urlStr );
        } catch ( SocketException e ) {
            throw new LiveWebCacheUnavailableException(
                    e.getLocalizedMessage() + " : " + urlStr );
        } catch ( SocketTimeoutException e ) {
            throw new LiveWebTimeoutException(
                    e.getLocalizedMessage() + " : " + urlStr );
        } catch ( org.apache.http.conn.ConnectTimeoutException e ) {
            throw new LiveWebTimeoutException(
                    e.getLocalizedMessage() + " : " + urlStr );
        } catch ( ConnectTimeoutException e ) {
            // Legacy commons-httpclient 3.x type, kept for compatibility.
            throw new LiveWebTimeoutException(
                    e.getLocalizedMessage() + " : " + urlStr );
        } finally {
            closeQuietly( response );
            closeQuietly( client );
        }
    }

    /**
     * Closes a resource if it was opened, logging instead of propagating
     * a failure so that it cannot mask the outcome of the request.
     *
     * @param closeable resource to close, may be null
     */
    private static void closeQuietly( Closeable closeable ) {
        if ( closeable == null ) {
            return;
        }
        try {
            closeable.close();
        } catch ( IOException e ) {
            LOGGER.warning( "Failed to close HTTP resource: "
                    + e.getLocalizedMessage() );
        }
    }

    /**
     * Sets proxy and port (proxy:port). A value that is null or is not in
     * {@code host:port} form is ignored and logged.
     *
     * @param hostPort to proxy requests through - ex. "localhost:3128"
     * @throws NumberFormatException if the port part is not an integer
     */
    public void setProxyHostPort( String hostPort ) {
        int colonIdx = ( hostPort == null ) ? -1 : hostPort.indexOf( ':' );
        if ( colonIdx > 0 ) {
            String host = hostPort.substring( 0, colonIdx );
            int port = Integer.parseInt( hostPort.substring( colonIdx + 1 ) );
            hostConfiguration.setProxy( host, port );
        } else {
            LOGGER.warning( "Ignoring proxy setting, expected host:port"
                    + " but got: " + hostPort );
        }
    }

    /**
     * Releases the connections held by the connection manager created in
     * the constructor.
     *
     * @see org.archive.wayback.liveweb.LiveWebCache#shutdown()
     */
    @Override
    public void shutdown() {
        connectionManager.shutdown();
    }
}