package org.benow.java.rest; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.net.ConnectException; import java.net.HttpURLConnection; import java.net.URL; import java.nio.charset.Charset; import java.util.zip.GZIPInputStream; import org.w3c.dom.Document; /** * Utility class to load pages and XML documents from a site. Uses caching * and gzip compression for performance. Implements throttling of requests, * if specified. * * @author andy * */ public class DocumentLoader { /** * The maximum age of files in the cache. Files older than this will be deleted. */ public static long MAX_CACHE_AGE_MILLIS = (2 * 7 * 24 * 60 * 60 * 1000); private File cacheDir; private boolean cacheDisabled; private final URL baseURL; private long lastLoad; private long loadInterval = 0; private String agent; // charset for string read private Charset charset = null; private static long cacheTrimTime = 0; public DocumentLoader(URL baseURL) { this.baseURL = baseURL; this.cacheDisabled = false; this.cacheDir = new File("var/cache/" + baseURL.getHost() + (baseURL.getPort() != -1 ? "-" + baseURL.getPort() : "")); } public void setCharset( Charset charset) { this.charset = charset; } private void trimCache() { long oldTime = System.currentTimeMillis() - MAX_CACHE_AGE_MILLIS; if (cacheDir != null && cacheDir.exists()) { for (File curr : cacheDir.listFiles()) { if (!curr.getName().endsWith(".stamp")) { if (curr.isFile() && curr.lastModified() < oldTime) { curr.delete(); File stampFile = getStampFile(curr); if (stampFile.exists()) stampFile.delete(); } } } } } private File getStampFile( File cacheFile) { return new File(cacheFile.getParentFile(), cacheFile.getName() + ".stamp"); } /** * Changes the cache directory from the default * @param cacheDir */ public void setCacheDir( File cacheDir) { this.cacheDir = cacheDir; } public void setUserAgent( String agent) { this.agent = agent; } /** * set the minimum time between requests (in millis). If requests arrive * more frequently than this interval, then the thread will be stalled, * throttling requests. Set to 0 or less to disable. * @param loadInterval */ public void setLoadInterval( long loadInterval) { this.loadInterval = loadInterval; } public InputStream loadStream( String urlStr) throws IOException { // trim cache on vm startup or daily if (System.currentTimeMillis() - cacheTrimTime > 24 * 60 * 60 * 1000) trimCache(); /* * urls with extended characters fail: * http://www.discogs.com/artist/St%C3%A9phane+Pompougnac * http://www.discogs.com/artist/St%C3%A9phane+Pompougnac urlStr = URLEncoder.encode(urlStr, "UTF-8"); */ if (urlStr.contains(" ")) { urlStr = urlStr.replace(" ", "+"); } URL url = new URL(baseURL, urlStr); InputStream result = null; long lastModified = -1; System.out.println("Hitting url: " + url); HttpURLConnection.setFollowRedirects(true); HttpURLConnection conn = (HttpURLConnection) url.openConnection(); // must accept gzip: http://www.discogs.com/help/api // http://www.oreillynet.com/onjava/blog/2004/07/optimizing_http_downloads_in_j.html conn.setRequestProperty("Accept-Encoding", "gzip, deflate"); if (agent != null) conn.setRequestProperty("User-Agent", agent); try { conn.connect(); } catch (ConnectException e) { System.err.println("Error connecting to: " + url); result = loadFromCache(url, -1); if (result != null) System.err.println("Returning result directly from cache."); return result; } // use last modified, if supported by server String lmStr = conn.getHeaderField("Last-Modified"); /* Map<String, List<String>> fields = conn.getHeaderFields(); for (String field : fields.keySet()) { List<String> vals = fields.get(field); String val = null; for (int i = 0; i < vals.size(); i++) { if (val == null) val = vals.get(i); else val = val + "," + vals.get(i); } System.out.println(field + ": " + val); } */ if (lmStr != null) { try { lastModified = Long.parseLong(lmStr); } catch (NumberFormatException e) { System.err.println("Error in Last-Modified header: " + lmStr + ". Loading new."); } } result = loadFromCache(url, lastModified); if (result == null) { String encoding = conn.getContentEncoding(); InputStream in = conn.getInputStream(); if (encoding != null && encoding.equalsIgnoreCase("gzip")) in = new GZIPInputStream(in); if (loadInterval > 0 && lastLoad != 0 && System.currentTimeMillis() - lastLoad < loadInterval) { long wait = System.currentTimeMillis() - lastLoad; System.out.println("Waiting for " + wait + " ms before fetch. Must be " + loadInterval + "ms between requests."); try { Thread.sleep(wait); } catch (InterruptedException e) { // ignore } } in = saveToCache(in, url, lastModified); lastLoad = System.currentTimeMillis(); result = in; } return result; } public static String readFromStream( InputStream inputStream) throws IOException { String read = ""; BufferedReader in = new BufferedReader(new InputStreamReader(inputStream)); String line = in.readLine(); while (line != null) { read += line + "\n"; line = in.readLine(); } return read; } private InputStream saveToCache( InputStream in, URL url, long lastModified) throws IOException { if (cacheDisabled) return in; File cacheFile = urlToCacheFile(url); cacheFile.getParentFile().mkdirs(); OutputStream out = new BufferedOutputStream(new FileOutputStream(cacheFile)); try { byte[] buff = new byte[512]; int read = in.read(buff); while (read > 0) { out.write(buff, 0, read); read = in.read(buff); } } finally { out.flush(); out.close(); } if (lastModified > 0) { File stampFile = getStampFile(cacheFile); stampFile.createNewFile(); stampFile.setLastModified(lastModified); } return new BufferedInputStream(new FileInputStream(cacheFile)); } private InputStream loadFromCache( URL url, long lastModified) throws IOException { if (cacheDisabled) return null; File cacheFile = urlToCacheFile(url); if (!cacheFile.exists()) return null; File stampFile = getStampFile(cacheFile); if (lastModified >= 0 && stampFile.exists()) { // don't load from cache if current last modified is newer if (stampFile.lastModified() < lastModified) return null; } return new BufferedInputStream(new FileInputStream(cacheFile)); } private File urlToCacheFile( URL url) { String fn = url.getFile() + ".xml"; fn = fn.replace("&", "-"); fn = fn.replace("/", "-"); fn = fn.replace("?", "-"); fn = fn.replace("'", "-"); fn = fn.replace("\"", "-"); fn = fn.replace(";", "-"); fn = fn.replace(":", "-"); File cacheFile = new File(cacheDir, fn); return cacheFile; } /** * Turns caching off. Responses will not be cached and results will not be returned from the cache. */ public void disableCaching() { cacheDisabled = true; } public void setCacheEnabled(boolean b) { cacheDisabled = !b; } public String loadString( String urlStr) throws IOException { InputStream in = loadStream(urlStr); try { return readFromStream(in); } finally { in.close(); } } public Document loadDocument( String urlStr) throws IOException { // http://www.velocityreviews.com/forums/t143346-xml-and-invalid-byte-utf-8-a.html String read = loadString(urlStr); if (charset != null) read = new String(read.getBytes(), charset); return XML.loadDocument(read); } }