/*
* Copyright 2000-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jetspeed.services.urlmanager;
//standard Java stuff
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Hashtable;
import java.util.Vector;
//turbine stuff
import org.apache.jetspeed.services.resources.JetspeedResources;
//jetspeed stuff
import org.apache.jetspeed.cache.disk.DiskCacheEntry;
import org.apache.jetspeed.cache.disk.DiskCacheUtils;
import org.apache.jetspeed.cache.disk.JetspeedDiskCache;
import org.apache.jetspeed.services.logging.JetspeedLogFactoryService;
import org.apache.jetspeed.services.logging.JetspeedLogger;
/**
<p>
Handles fetching URLs and if for some reason anything happens add it to the
BadURLManager. There are also some util methods for downloading URLs that don't
use the Disk Cache.
</p>
@author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
@author <a href="mailto:sgala@hisitech.com">Santiago Gala</a>
@version $Id: URLFetcher.java,v 1.14 2004/02/23 03:30:47 jford Exp $
*/
public class URLFetcher
{
/**
* Static initialization of the logger for this class
*/
private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(URLFetcher.class.getName());
/**
URLs that Jetspeed is currently trying to fetch in real time.
*/
private static Hashtable realtime_urls = new Hashtable();
/**
*
*/
static final boolean shouldFetchNow =
JetspeedResources.getBoolean( JetspeedResources.CACHE_REQUIRE_CACHED_KEY );
static {
//Looking for redirected channels...
java.net.HttpURLConnection.setFollowRedirects(true);
}
public static final Reader fetch( String url ) throws IOException {
return fetch ( url, false );
}
/**
Try and fetch a URL as and get the content as a String and possibly add
the URL to the BadURLManager if anything goes wrong.
@param url The URL to fetch
@param force if set to true then do not use force this entry to be in the cache...
IE do not use CACHE_REQUIRE_CACHED
*/
public static final Reader fetch( String url,
boolean force ) throws IOException {
if ( ! URLManager.isOK( url ) ) {
throw new URLNotAvailableException( url );
}
//SGP
if( force == false && DiskCacheUtils.isCached( url ) == true)
{
logger.info( "The url " +
url +
" is fetched from the Cache" );
return JetspeedDiskCache.getInstance().getEntry( url ).getReader();
}
//do cache required checking
if ( shouldFetchNow &&
DiskCacheUtils.isCached( url ) == false &&
isRealtimeURL( url ) == false &&
force == false ) {
logger.info( "The url " +
url +
" is not in the cache and will be fetched now because you have configured -> " +
JetspeedResources.CACHE_REQUIRE_CACHED_KEY );
//it is possible that two thread request the same URL.
//The refresh call in JetspeedDiskCache takes care of this.
JetspeedDiskCache.getInstance().refresh( url );
//thow an Exception that this isn't in the cache.
throw new ContentNotAvailableException( url );
}
if( isRealtimeURL( url ) == true ) {
addRealtimeURL( url );
synchronized(url.intern())
{
try
{
//We wait for other thread to load
url.intern().wait();
} catch (InterruptedException e)
{
logger.info("Wait Interrupted");
} finally
{
removeRealtimeURL( url );
}
}
// We try again
return URLFetcher.fetch( url, force );
} else {
addRealtimeURL( url );
}
try {
URL content;
// Determine the URL's protocol
String protocol = url.substring(0, url.indexOf(":/"));
// Check if a proxy is set. If no port is set, use the default port (-1)
String proxyHost = URLManager.getProxyHost( protocol );
if (proxyHost != null)
{
// Open the URL using a proxy
content = new URL(protocol,
proxyHost,
URLManager.getProxyPort( protocol ),
url);
}
else
{
content = new URL( url );
}
URLConnection conn = content.openConnection();
return getReader( conn );
} catch ( Throwable t ) {
String reason = "";
if ( t instanceof MalformedURLException ) {
reason = "The URL is Malformed.";
} else {
reason = t.toString();
}
//if the URL couldn't be fetched because it is remote AND
//it is not in the cache, add it to the bad URL list.
if ( DiskCacheUtils.isCached( url ) == false ) {
//Reported up there...
//logger.error( t );
URLManager.register( url, URLManagerService.STATUS_BAD, reason );
} else {
//it is in the cache, remove it (could be broken in cache).
//next time we could be luckier.
JetspeedDiskCache.getInstance().remove(url);
}
throw new URLNotAvailableException( reason, url );
} finally {
removeRealtimeURL( url );
}
}
/**
Try and fetch a URL if the copy in the cache has expired and add
the URL to the BadURLManager if anything goes wrong.
@param url The URL to fetch
@param force if set to true then do not use force this entry to be in the cache...
IE do not use CACHE_REQUIRE_CACHED
*/
public static final boolean refresh( String url) throws IOException {
if ( ! URLManager.isOK( url ) ) {
if( DiskCacheUtils.isCached(url) )
JetspeedDiskCache.getInstance().remove(url);
throw new URLNotAvailableException( url );
}
if(isRealtimeURL(url)) {
return false;
}
DiskCacheEntry dce = null;
if( DiskCacheUtils.isCached(url) ) {
try {
dce = JetspeedDiskCache.getInstance().getEntry( url );
if(!dce.hasExpired())
{
return false;
}
addRealtimeURL( url );
//only update this if the URL on which it is based is newer
//than the one on disk.
URL sock;
// Determine the URL's protocol
String protocol = url.substring(0, url.indexOf(":/"));
// Check if a proxy is set. If no port is set, use the default port (-1)
String proxyHost = URLManager.getProxyHost( protocol );
if (proxyHost != null)
{
// Open the URL using a proxy
sock = new URL(protocol,
proxyHost,
URLManager.getProxyPort( protocol ),
url);
}
else
{
sock = new URL( url );
}
URLConnection conn = null;
conn = sock.openConnection();
File file = dce.getFile();
long mod = dce.getLastModified();
long filesize = 0;
if(file != null)
{
filesize = file.length();
}
if(mod > 0 || filesize > 0)
conn.setIfModifiedSince(mod);
conn.connect();
long last = conn.getLastModified();
long expires = conn.getExpiration();
int clength = conn.getContentLength();
int respCode = 200;
if(conn instanceof HttpURLConnection) {
respCode = ( ( HttpURLConnection )conn ).getResponseCode();
}
if (respCode != 304 /*NOT MODIFIED*/ &&
(clength == -1 || clength > 0) &&
( last == 0 ||
last > dce.getLastModified()) ) {
logger.info( "URLFetcher: Found updated URL: " +
url +
" Modified " + last + " Expires: " + expires +
" CLength: " + clength );
//force this URL to update.
JetspeedDiskCache.getInstance().getEntry( url, getReader( conn ) );
//Trying to deal with a problem under FreeBSD
conn.getInputStream().close();
//Set the last modified and expiration times for entry
//FIXME: 0 is used in FileWatcher to mean not initialized...
if(last > 0)
dce.setLastModified(last);
else
dce.setLastModified( System.currentTimeMillis() );
dce.setExpirationTime(expires);
//removeRealtimeURL( url ); (done in finally)
return true;
//now make sure that the entry that depends on this HREF
//is updated in the PortletFactory.
} else {
if(last > 0)
dce.setLastModified(last);
else
dce.setLastModified( System.currentTimeMillis() );
dce.setExpirationTime(expires);
logger.info( "DiskCacheDaemon: URL still valid: " + url +
" Modified " + last + " Expires: " + expires +
" CLength: " + clength);
//removeRealtimeURL( url ); (done in finally)
return false;
}
} catch (Throwable e) {
//Add as a Bad URL
logger.error("Throwable", e);
URLManager.register( url,
URLManagerService.STATUS_BAD,
e.toString() );
} finally {
removeRealtimeURL( url );
}
} else {
logger.info( "URLFetcher: Cache miss during validation! Forcing url: " + url );
removeRealtimeURL( url );
JetspeedDiskCache.getInstance().getEntry( url, true );
return true;
}
return false;
}
/**
*
* Return a Reader for a given HTTP connection.
* If the connection first line contains a XML declaration
* with encoding, honor this encoding.
* If not, use the encoding from the HTTP connection,
* taking ISO-8859-1 as default.
*
*/
static final Reader getReader( URLConnection conn )
throws IOException, UnsupportedEncodingException {
String enc = conn.getContentEncoding();
if( enc == null ) {
enc = "ISO-8859-1";
}
// Some XML files come with a encoding attribute inside,
// different than the HTTP encoding. We will have
// to start reading the Reader, read the attribute and rewind
// the stream, generating a new reader with the "true" encoding
BufferedInputStream is = new BufferedInputStream( conn.getInputStream() );
//If document is XML, find the encoding and give it priority over
//the one returned by the connection
//we mark for resetting later. We need a big number to ensure
// stack of streams don't read it to fill buffers.
is.mark( 20480 );
BufferedReader asciiReader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
String decl = asciiReader.readLine();
//System.err.println( "Line: " + decl );
String key = "encoding=\"";
//decl nul means that the connection got reset...
if( decl != null ) {
int off = decl.indexOf( key );
if( off > 0 ) {
enc = decl.substring( off + key.length(),
decl.indexOf( '"' , off + key.length()) );
}
}
logger.info("URLFetcher: found URL with encoding -> " + enc );
//Reset the bytes read
is.reset();
Reader rdr = new InputStreamReader( is,
enc );
return rdr;
}
/**
Add a URL that is downloading in realtime
*/
static final void addRealtimeURL( String url ) {
synchronized( realtime_urls )
{
Vector threads = (Vector) realtime_urls.get( url);
if(threads != null)
{
if(!threads.contains(Thread.currentThread()))
{
threads.addElement(Thread.currentThread() );
}
} else {
threads = new Vector();
threads.addElement(Thread.currentThread());
realtime_urls.put( url, threads );
}
}
}
/**
Remove a URL because it isn't downloading anymore.
*/
static final void removeRealtimeURL( String url ) {
synchronized( realtime_urls )
{
Vector threads = (Vector) realtime_urls.get( url);
if(threads != null)
synchronized( threads )
{
Thread realLoader = (Thread) threads.firstElement();
if(realLoader == Thread.currentThread())
{
synchronized(url.intern())
{
realtime_urls.remove(url);
url.intern().notifyAll();
}
} else {
threads.removeElement(Thread.currentThread());
}
}
}
}
/**
Return true if this URL isn't downloading in realtime.
*/
static final boolean isRealtimeURL( String url ) {
synchronized( realtime_urls ) {
return realtime_urls.get( url ) != null;
}
}
/**
Return the list of realtime URLs for debug
*/
public static final Hashtable getRealtimeURLs() {
synchronized(realtime_urls) {
return realtime_urls;
}
}
}