/*==========================================================================*\ | $Id: WebContent.java,v 1.2 2010/02/23 17:06:36 stedwar2 Exp $ |*-------------------------------------------------------------------------*| | Copyright (C) 2007-2010 Virginia Tech | | This file is part of the Student-Library. | | The Student-Library is free software; you can redistribute it and/or | modify it under the terms of the GNU Lesser General Public License as | published by the Free Software Foundation; either version 3 of the | License, or (at your option) any later version. | | The Student-Library is distributed in the hope that it will be useful, | but WITHOUT ANY WARRANTY; without even the implied warranty of | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | GNU Lesser General Public License for more details. | | You should have received a copy of the GNU Lesser General Public License | along with the Student-Library; if not, see <http://www.gnu.org/licenses/>. \*==========================================================================*/ package student.web.internal; import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.regex.Matcher; import java.util.regex.Pattern; //------------------------------------------------------------------------- /** * Static helper methods to retrieve web content, with built-in * concurrency-protected caching of results. * * @author Stephen Edwards * @author Last changed by $Author: stedwar2 $ * @version $Revision: 1.2 $, $Date: 2010/02/23 17:06:36 $ */ public class WebContent { public static String get(String url) throws MalformedURLException { URL realUrl = new URL(url); return get(realUrl, url); } public static String get(URL url) { return get(url, url.toString()); } public static String get(URL url, String urlAsString) { if (urlAsString == null) { urlAsString = url.toString(); } String result = READ_MARKER; boolean mustRead = false; synchronized (cache) { // This comparison intentionally uses == rather than equals(), // since we really, really mean object identity in this case while (result == READ_MARKER) { // Look in the cache result = cache.get(urlAsString); if (result == null) { // If the cache has no value for this url, // mark the cache to indicate some thread (this one) is // going to read the value cache.put(urlAsString, READ_MARKER); // Now remember that we're the thread that needs to read mustRead = true; } // This comparison intentionally uses == rather than equals(), // since we really, really mean object identity in this case else if (result == READ_MARKER) { // Inside here, we know that some other thread is currently // trying to read the content of this URL, but hasn't put // it in the cache yet. try { cache.wait(); } catch (InterruptedException e) { // Now the reader should be done } // Now, let the while loop repeat to try again } } } // Check to see if this thread must read the URL's content to // insert into the cache if (mustRead) { // Now, check to see if we need to delay the read to prevent // throttling by the host. boolean canRead = false; while (!canRead) { String host = url.getHost(); long sleepTime = 0L; synchronized (lastAccess) { Long last = lastAccess.get(host); if (last == null) { // No limits on this host canRead = true; } else { // Impose a delay of 2 seconds, in condition above sleepTime = System.currentTimeMillis() - last.longValue(); if (sleepTime > 1000) { canRead = true; lastAccess.put(host, System.currentTimeMillis()); } } } if (!canRead && sleepTime > 0L) { try { Thread.sleep(sleepTime); } catch (InterruptedException e) { // waking up } } } IOException exception = null; try { URLConnection connection = prepConnectionFor(url); if (connection instanceof HttpURLConnection && ((HttpURLConnection)connection).getResponseCode() == 999) { // System.out.println("Yahoo 999 error received on " + url); // Yahoo is being a pain InputStream errStream = ((HttpURLConnection)connection).getErrorStream(); if (errStream != null) { getYahooErrorCookies(errStream); // Try again // System.out.println("Attempting second load of " + url); connection = prepConnectionFor(url); } } // System.out.println("fetching from: " + url); result = getContentFrom(connection.getInputStream()); // System.out.println("Content =\n" + result); } catch (IOException e) { exception = e; } // If there was some failure, just force it to an empty string synchronized (cache) { if (result == null) { cache.remove(urlAsString); } else { cache.put(urlAsString, result); } cache.notifyAll(); } if (exception != null) { throw new RuntimeException(exception); } } return result; } private static URLConnection prepConnectionFor(URL url) throws IOException { java.net.URLConnection connection = url.openConnection(); // Use a browser-like user agent, so that servers that // refuse connections from generic programs might still // provide a useful response connection.setRequestProperty("User-Agent", USER_AGENT); if (cookies != null && url.getHost() != null) { String host = url.getHost().toLowerCase(); synchronized (cookies) { for (Cookie cookie : cookies) { if (host.endsWith(cookie.host)) { // System.out.println("adding cookie " + cookie.value); connection.setRequestProperty( "Cookie", cookie.value); break; } } } } connection.connect(); return connection; } private static void getYahooErrorCookies(InputStream stream) throws IOException { String content = getContentFrom(stream); Pattern p = Pattern.compile("<a href=\"([^\"]*)\">let us know</a>"); Matcher m = p.matcher(content); if (m.find()) { String newUrl = m.group(1); // System.out.println("Attempting to get cookies from " + newUrl); getCookiesFrom(newUrl); } } private static String getContentFrom(InputStream stream) { Scanner in = new Scanner(stream); in.useDelimiter("\\z"); StringBuffer sb = new StringBuffer(4096); while (in.hasNext()) { sb.append(in.next()); } in.close(); return sb.toString(); } private static void getCookiesFrom(String url) throws IOException { URLConnection connection = (new URL(url)).openConnection(); connection.connect(); Map<String, List<String>> headers = connection.getHeaderFields(); List<String> responseCookies = headers.get("Set-Cookie"); if (responseCookies != null) { for (String cookieVal : responseCookies) { String[] segments = cookieVal.split("\\s*;\\s*"); if (segments != null && segments.length >= 1) { for (String segment : segments) { if (segment.toLowerCase() .startsWith("domain=")) { String host = segment.substring( "domain=".length()); // System.out.println("loading live cookie = " // + segments[0] + " for host = " + host); synchronized (cookies) { for (int i = 0; i < cookies.size(); i++) { if (host.equals(cookies.get(i).host)) { cookies.remove(i); i--; } } cookies.add(new Cookie(host, segments[0])); } break; } } } } } } private static class Cookie { public String host; public String value; public Cookie(String h, String v) { host = h; value = v; } } //~ Instance/static variables ............................................. private static final String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"; private static final String READ_MARKER = "READ_MARKER"; // Unlimited size to cache, but soft references mean content can // be garage-collected as needed private static final Map<String, String> cache = new MRUMap<String, String>(0, 180); private static final Map<String, Long> lastAccess = new HashMap<String, Long>(10); private static List<Cookie> cookies = new ArrayList<Cookie>(); static { lastAccess.put("news.search.yahoo.com", System.currentTimeMillis()); // URL cookieUrl = WebContent.class.getClassLoader() // .getResource("webBrowserCookies.txt"); // if (cookieUrl != null) // { // cookies = new Vector<Cookie>(); // Scanner in = new Scanner(WebContent.class.getClassLoader() // .getResourceAsStream("webBrowserCookies.txt")); // while (in.hasNextLine()) // { // String host = in.next(); // String cookie = in.nextLine().trim(); // cookies.add(new Cookie(host, cookie)); //// System.out.println("loaded cookie: host=" + host //// + ", value=" + cookie); // } // } } }