package ecologylab.net;
import java.io.File;
import java.io.IOException;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.StringTokenizer;
import ecologylab.collections.CollectionTools;
import ecologylab.generic.Debug;
import ecologylab.generic.IntSlot;
import ecologylab.generic.StringTools;
import ecologylab.io.Files;
import ecologylab.platformspecifics.FundamentalPlatformSpecifics;
import ecologylab.serialization.SIMPLTranslationException;
import ecologylab.serialization.SimplTypesScope;
import ecologylab.serialization.formatenums.Format;
/**
* Extends the URL with many features for the convenience and power of network programmers. New
* class for manipulating and displaying URLs.
*
* Uses lazy evaluation to minimize storage allocation.
*
* @author andruid
* @author eunyee
* @author madhur
*/
public class ParsedURL extends Debug implements MimeType
{
private static final String NOT_IN_THE_FORMAT_OF_A_WEB_ADDRESS = " is not in the format of a web address";
private static final String DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)";
/**
 * this is the no hash url, that is, the one with # and anything after it stripped out.
 */
protected URL url = null;
/**
 * If this is built from an entity of the local file system, store a reference to the object for
 * that here.
 */
File file;
/**
 * URL with hash, that is, a reference to an anchor within the document.
 */
protected URL hashUrl = null;
/**
 * Directory that the document referred to by the URL resides in.
 */
protected URL directory = null;
// Lazily-computed ParsedURL form of directory; see directoryPURL().
private ParsedURL directoryPURL;
/**
 * String representation of the URL.
 */
protected String string = null;
/**
 * Shorter version of the string, for printing in tight spaces.
 */
String shortString;
/* lower case of the url string; lazily computed by lc() */
protected String lc = null;
/* suffix string of the url; lazily computed by suffix() */
protected String suffix = null;
/* domain value string of the url; lazily computed by domain() */
protected String domain = null;
// When false, toString() omits the "file://" prefix for file-based ParsedURLs.
protected boolean includePrefix = true;
// Process-wide cookie store, installed as the JVM-default CookieHandler below.
public static CookieManager cookieManager = new CookieManager();
static
{
//cookieManager.setCookiePolicy(CookiePolicy.ACCEPT_ALL);
CookieHandler.setDefault(cookieManager);
}
/**
 * Construct from a URL. For "file" protocol URLs, also derive and store the corresponding
 * File object. For other protocols, any #fragment is kept in hashUrl while url holds the
 * fragment-free form.
 *
 * @param url source URL; must not be null.
 */
public ParsedURL(URL url)
{
String hash = url.getRef();
if ("file".equals(url.getProtocol()))
{
String urlString = url.toString();
if (urlString.startsWith("file://")) // this should be the case...
{
// strip the 7-char "file://" prefix to recover the file-system path
this.file = new File(urlString.substring(7));
}
else // if not, try our hardest to make a good file
{
this.file = new File(url.getHost()+url.getPath());
}
this.url = url;
}
else if (hash == null)
{
// no fragment: url and hashUrl are the same object
this.url = url;
this.hashUrl = url;
}
else
{
this.hashUrl = url;
try
{
// form no hash url (toss hash)
this.url = new URL(url.getProtocol(), url.getHost(), url.getPort(), url.getFile());
}
catch (MalformedURLException e)
{
// should not happen: all components came from an already-valid URL
e.printStackTrace();
}
}
}
/**
 * @return true if this refers to a file that exists, or if this does not refer to a file
 *         at all.
 */
public boolean isNotFileOrExists()
{
	if (file == null)
		return true;
	return file.exists();
}
/**
* Create a ParsedURL from a file. If the file is a directory, append "/" to the path, so that
* relative URLs will be formed properly later.
*
* @param file
*/
public ParsedURL(File file)
{
try
{
String urlString = "file://" + file.getAbsolutePath();
urlString = urlString.replace('\\', '/');
if (file.isDirectory())
urlString += "/";
this.url = new URL(urlString);
}
catch (MalformedURLException e)
{
e.printStackTrace();
}
this.file = file;
}
/*
* Constructor with a url string parameter. get absolute URL with getAbsolute() method.
*/
/*
* public ParsedURL(String urlString) { // The second parameter of getAbolute method is error
* description. this.url = getAbsolute(urlString, "").url(); }
*/
// /////////////////////////////////////////////////////////////////////
/**
 * Create a PURL from an absolute address. (Do it the quick and dirty way, providing less error
 * handling.) NB: Only call this method if you are *sure* a MalformedURLException would never be
 * produced.
 *
 * @param webAddr absolute url string.
 * @return the ParsedURL, or null on failure.
 */
public static ParsedURL getAbsolute(String webAddr)
{
	String errorDescriptor = "getAbsolute(String) ";
	return getAbsolute(webAddr, errorDescriptor);
}
/**
 * Create a PURL from a URI by way of its string form.
 *
 * @param uri source URI.
 * @return the ParsedURL, or null if the URI's string form is malformed.
 */
public static ParsedURL get(URI uri)
{
	String address = uri.toString();
	return getAbsolute(address);
}
/**
 * Create a PURL from an absolute address.
 *
 * @param webAddr
 *          url string; rejected when null or 7 characters or shorter (too short to carry a
 *          protocol plus host).
 * @param errorDescriptor
 *          context string printed in the trace when the conversion from string to URL fails.
 * @return ParsedURL from webAddr, or null if the param is malformed.
 */
public static ParsedURL getAbsolute(String webAddr, String errorDescriptor)
{
	if (webAddr != null && webAddr.length() > 7)
	{
		try
		{
			URL candidate = new URL(webAddr);
			// Java accepts some addresses it should not; filter those out.
			return isUndetectedMalformedURL(candidate) ? null : new ParsedURL(candidate);
		}
		catch (MalformedURLException e)
		{
			String descriptor = "".equals(errorDescriptor) ? errorDescriptor : "\n" + errorDescriptor;
			Debug.error(webAddr, NOT_IN_THE_FORMAT_OF_A_WEB_ADDRESS + "." + descriptor);
		}
	}
	else
	{
		println("ERROR: ParsedURL.getAbsolute() webAddr is null or too short: [" + webAddr + "]");
		//Thread.dumpStack(); //We don't really need such a hostile message.
	}
	return null;
}
/**
 * Determines whether a URL is malformed in a way Java's URL constructor fails to detect:
 * a non-file URL with an empty (or "/") host, or a file URL with an empty path or a
 * "localhost" host.
 *
 * @param url URL that already passed construction.
 * @return true when the URL should be rejected anyway.
 */
private static boolean isUndetectedMalformedURL(URL url)
{
	// originally checked against "file:", but on OS X, we just get "file"; this is probably true
	// everywhere else too, but I will leave "file:" for the time being. -Zach
	String protocol = url.getProtocol();
	boolean isFileProtocol = "file".equals(protocol) || "file:".equals(protocol);
	String host = url.getHost().trim();
	if (isFileProtocol)
		return "".equals(url.getPath().trim()) || "localhost".equalsIgnoreCase(host);
	return "".equals(host) || "/".equals(host);
}
/**
 * Form a ParsedURL, based on a relative path, using this as the base. File-based ParsedURLs
 * stay file-based; ".." components are resolved through the canonical path when possible.
 *
 * @param relativeURLPath
 *          Path relative to this.
 * @param errorDescriptor
 *          context string for error traces.
 * @return New ParsedURL based on this and the relative path.
 */
public final ParsedURL getRelative(String relativeURLPath, String errorDescriptor)
{
	if (!isFile())
		return getRelative(url, relativeURLPath, errorDescriptor);
	File relativeFile = Files.newFile(file, relativeURLPath);
	if (relativeFile.getAbsolutePath().contains(".."))
	{
		// remove ..'s from the path via canonicalization
		try
		{
			return new ParsedURL(relativeFile.getCanonicalFile());
		}
		catch (IOException e)
		{
			e.printStackTrace(); // fall through and use the non-canonical path
		}
	}
	return new ParsedURL(relativeFile);
}
/**
 * Form a ParsedURL, based on a relative path, using this as the base. No error descriptor is
 * attached to failure traces.
 *
 * @param relativeURLPath
 *          Path relative to this.
 * @return New ParsedURL based on this and the relative path.
 */
public final ParsedURL getRelative(String relativeURLPath)
{
	String noErrorDescriptor = "";
	return getRelative(relativeURLPath, noErrorDescriptor);
}
/**
 * Form a new ParsedURL, relative from a supplied base URL. Checks to see if the relativePath
 * starts w a protocol spec. If so, calls getAbsolute(). Otherwise, forms a relative URL using the
 * URL base.
 *
 * @param base base URL to resolve against.
 * @param relativeURLPath relative (or absolute http/ftp) address.
 * @param errorDescriptor context string for error traces.
 * @return New ParsedURL, or null on failure or null input.
 */
public static ParsedURL getRelative(URL base, String relativeURLPath, String errorDescriptor)
{
	if (relativeURLPath == null)
		return null;
	boolean absolute = relativeURLPath.startsWith("http://") || relativeURLPath.startsWith("ftp://");
	if (absolute)
		return getAbsolute(relativeURLPath, errorDescriptor);
	try
	{
		return new ParsedURL(new URL(base, relativeURLPath));
	}
	catch (MalformedURLException e)
	{
		String descriptor = "".equals(errorDescriptor) ? errorDescriptor : "\n" + errorDescriptor;
		Debug.error(relativeURLPath, NOT_IN_THE_FORMAT_OF_A_WEB_ADDRESS + "[" + base + "]."
				+ descriptor);
		return null;
	}
}
/**
 * Use this as the source of stuff to translate from XML.
 *
 * @param translationScope
 *          Translations that specify package + class names for translating.
 * @return ElementState object derived from XML at the InputStream of this.
 * @throws SIMPLTranslationException on deserialization failure.
 */
public Object translateFromXML(SimplTypesScope translationScope)
		throws SIMPLTranslationException
{
	Format xml = Format.XML;
	return translationScope.deserialize(this, xml);
}
/**
 * Resolve path against base as a plain URL.
 *
 * @param base base URL to resolve against.
 * @param path relative or absolute spec; null yields null.
 * @param error when non-null, a MalformedURLException is escalated to an Error carrying this
 *          description; when null, failure yields null.
 * @return resolved URL, or null.
 */
public static URL getURL(URL base, String path, String error)
{
	// ??? might want to allow this default behaviour ???
	if (path == null)
		return null;
	try
	{
		return new URL(base, path);
	}
	catch (MalformedURLException e)
	{
		if (error == null)
			return null;
		throw new Error(e + "\n" + error + " " + base + " -> " + path);
	}
}
/**
 * Uses lazy evaluation to minimize storage allocation.
 *
 * @return The URL as a String; for file-based ParsedURLs, a forward-slashed path with an
 *         optional "file://" prefix (controlled by includePrefix).
 */
@Override
public String toString()
{
	String cached = string;
	if (cached == null)
	{
		if (isFile())
		{
			String slashed = file.toString().replace('\\', '/');
			cached = includePrefix ? "file://" + slashed : slashed;
		}
		else if (url == null)
			cached = "weirdly null";
		else
			cached = StringTools.pageString(url);
		string = cached;
	}
	return cached;
}
/**
 * Uses lazy evaluation to minimize storage allocation.
 *
 * @return Lower case rendition of the URL String.
 */
public String lc()
{
	if (lc == null)
		lc = toString().toLowerCase();
	return lc;
}
/**
 * Uses lazy evaluation to minimize storage allocation.
 *
 * @return The suffix of the filename, in lower case; never null (empty string when there is
 *         no suffix).
 */
public String suffix()
{
	String result = suffix;
	if (result == null)
	{
		// Robustness fix: url can be null when construction from a File failed with a
		// MalformedURLException; previously this dereferenced url unconditionally and NPEd.
		// Fall back to the file path when the url is unavailable.
		String path = (url != null) ? url.getPath()
				: (file != null) ? file.getPath() : null;
		if (path != null)
		{
			result = suffix(path.toLowerCase());
		}
		// never hand out null: callers pass the result straight to containsKey()
		if (result == null)
			result = "";
		suffix = result;
	}
	return result;
}
/**
 * Form a ParsedURL based on this, if this is a directory. Otherwise, form the ParsedURL from the
 * parent of this. Process files carefully to propagate their file-ness.
 *
 * @return ParsedURL for the directory containing this (or this itself, when it already is a
 *         directory); cached after the first call.
 */
public ParsedURL directoryPURL()
{
ParsedURL result = directoryPURL;
if (result == null)
{
if (isFile())
{
// file-based: stay file-based, using the parent directory when this is a plain file
if (file.isDirectory())
result = this;
else
{
File parent = file.getParentFile();
result = new ParsedURL(parent);
}
}
else
{
// NOTE(review): directory() can return null after a MalformedURLException, in which
// case new ParsedURL(null) would NPE -- confirm directory() cannot fail here.
result = new ParsedURL(directory());
}
this.directoryPURL = result;
}
return result;
}
/**
 * Get the URL for the directory associated with this. Requires looking for slash at the end,
 * looking for a suffix or arguments. As a result, we sometimes add a slash at the end, sometimes
 * peel off the filename. Result is cached a la lazy evaluation.
 *
 * @return Directory URL; null only if URL re-construction unexpectedly fails.
 */
public URL directory()
{
URL result = this.directory;
if (result == null)
{
// a trailing slash means this already names a directory
if (StringTools.endsWithSlash(toString()))
result = this.url;
if (result == null)
{
String suffix = suffix();
try
{
String path = url.getPath();
String args = url.getQuery();
String protocol = url.getProtocol();
String host = url.getHost();
int port = url.getPort();
if (suffix.length() == 0)
{ // this is a directory that is unterminated by slash; we need to fix that
if (path.length() == 0)
result = new URL(protocol, host, port, "/");
else
{
if ((args == null) || (args.length() == 0))
result = new URL(protocol, host, port, path + '/');
else
// this is a tricky executable with no suffix
{
// result = null;
// drop down into the next block, and peel off that suffix-less executable name
}
}
}
// else
if (result == null)
{ // you have a suffix, so we need to trim off the filename
int lastSlashIndex = path.lastIndexOf('/');
if (lastSlashIndex == -1)
// suffix, but not within any subdirectory
result = new URL(protocol, host, port, "/");
else
{
String pathThroughLastSlash = path.substring(0, lastSlashIndex + 1);
result = new URL(protocol, host, port, pathThroughLastSlash);
}
}
}
catch (MalformedURLException e)
{
debug("Unexpected ERROR forming directory.");
e.printStackTrace();
}
}
this.directory = result;
}
return result;
}
/**
 * Uses lazy evaluation to minimize storage allocation.
 *
 * @return The domain of the URL, or null when no URL is available.
 */
public String domain()
{
	if (domain == null && url != null)
		domain = StringTools.domain(url);
	return domain;
}
/**
 * @return true when this ParsedURL holds neither a URL nor a File.
 */
public boolean isNull()
{
	boolean hasUrl = (url != null);
	boolean hasFile = (file != null);
	return !hasUrl && !hasFile;
}
/**
 * @param lc path or address to extract the suffix from.
 * @return The suffix of the filename, in whatever case is found in the input string; empty
 *         when there is no dot, or the last dot lies within a directory component.
 */
public static String suffix(String lc)
{
	int afterDot = lc.lastIndexOf('.') + 1;
	int lastSlash = lc.lastIndexOf('/');
	boolean noDot = (afterDot == 0);
	boolean dotInsideDirectory = (afterDot < lastSlash);
	if (noDot || dotInsideDirectory)
		return "";
	return lc.substring(afterDot);
}
/**
 * @return the filename portion of this URL (including a leading '/', excluding the suffix),
 *         or empty string when there is no dotted filename. Preserves the historical output
 *         shape; only previously-crashing inputs now return "".
 */
public String filename()
{
	String pageString = noAnchorNoQueryPageString();
	int lastDot = pageString.lastIndexOf('.');
	int lastSlash = pageString.lastIndexOf('/');
	// Bug fix: lastDot == -1 (no dot) and lastSlash == -1 (no slash) previously fell through
	// to substring(-1, ...) and threw StringIndexOutOfBoundsException.
	if (lastDot <= 0 || lastDot < lastSlash)
		return "";
	int start = (lastSlash == -1) ? 0 : lastSlash;
	return pageString.substring(start, lastDot);
}
/**
 * @return the underlying (fragment-free) URL; may be null for a file-based ParsedURL whose
 *         URL construction failed.
 */
public final URL url()
{
	return this.url;
}
/**
 * @return the URL including any #fragment, falling back to the fragment-free URL when no
 *         fragment form was stored.
 */
public final URL hashUrl()
{
	return (hashUrl != null) ? hashUrl : url();
}
/**
 * @return the URL string with both the anchor (#...) and the query (?...) stripped.
 */
public String noAnchorNoQueryPageString()
{
	return StringTools.noAnchorNoQueryPageString(this.url);
}
/**
 * @return the URL string with the anchor (#...) stripped, query retained.
 */
public String noAnchorPageString()
{
	return StringTools.noAnchorPageString(this.url);
}
/**
 * @param s candidate suffix.
 * @return true if the lower-cased URL string ends with the argument.
 */
public final boolean hasSuffix(String s)
{
	String lowerCased = lc();
	return lowerCased.endsWith(s);
}
// Suffixes of document types the system cannot process (office docs, media, archives, ...).
final static String unsupportedMimeStrings[] =
{ "ai", "bmp", "eps", "ps",
"psd", "svg", "tif", "vrml", "doc", "xls", "pps", "ppt", "adp", "rtf", "vbs", "vsd", "wht",
"aif", "aiff", "aifc", "au", "mp3", "wav", "ra", "ram", "wm", "wma", "wmf", "wmp", "wms",
"wmv", "wmx", "wmz", "avi", "mov", "mpa", "mpeg", "mpg", "ppj", "swf", "spl", "qdb", "cab",
"chm", "gzip", "hqx", "jar", "lzh", "tar", "zip", "wpd", "xsl", };
// NOTE(review): raw HashMaps throughout this section; used only for containsKey() lookups.
final static HashMap unsupportedMimes = CollectionTools
.buildHashMapFromStrings(unsupportedMimeStrings);
static final String[] unsupportedProtocolStrings =
{ "mailto", "vbscript", "news",
"rtsp", "https", };
static final HashMap unsupportedProtocols = CollectionTools
.buildHashMapFromStrings(unsupportedProtocolStrings);
static final String[] supportedProtocolStrings =
{ "http", "ftp", "file", };
static final HashMap supportedProtocols = CollectionTools
.buildHashMapFromStrings(supportedProtocolStrings);
// Image suffixes; assigned in the static initializer below (platform-specific when available).
static final String[] imgSuffixStrings;
static final String[] SOME_IMG_SUFFIXES = { "jpg", "jpeg", "pjpg", "pjpeg", "gif", "png", };
/*
 * { "jpg", "jpeg", "pjpg", "pjpeg", "gif", "png", };
 */
static final HashMap imgSuffixMap; // built from lower-cased strings in the static initializer
static final String[] jpegMimeStrings =
{ "jpg", "JPG", "jpeg", "JPEG",
"pjpg", "pjpeg", };
static final String[] gifMimeStrings =
{ "gif", "GIF", };
static final String[] pngMimeStrings =
{ "png", "PNG", };
static final HashMap jpegSuffixMap = CollectionTools
.buildHashMapFromStrings(jpegMimeStrings);
static final String[] htmlSuffixStrings =
{ "html", "htm", "stm", "php",
"jhtml", "jsp", "asp", "txt", "shtml", "pl", "plx", "exe" };
// Suffixes of image formats that never carry an alpha channel (see isNoAlpha()).
static final String[] noAlphaSuffixStrings =
{
"bmp", "BMP", "wbmp", "WBMP",
"jpg", "JPG", "jpeg", "JPEG",
"pjpg", "PJPG", "pjpeg", "PJPEG",
};
static final HashMap noAlphaSuffixMap = CollectionTools
.buildHashMapFromStrings(noAlphaSuffixStrings);
static final HashMap htmlSuffixMap = CollectionTools
.buildHashMapFromStrings(htmlSuffixStrings);
static final String[] pdfMimeStrings =
{ "pdf" };
static final HashMap pdfSuffixMap = CollectionTools
.buildHashMapFromStrings(pdfMimeStrings);
static final String[] rssMimeStrings =
{ "rss", "xml" };
static final HashMap rssSuffixMap = CollectionTools
.buildHashMapFromStrings(rssMimeStrings);
// Maps each known suffix to its MimeType index constant (PDF, HTML, RSS, JPG, GIF, PNG).
static final HashMap<String, IntSlot> suffixesToMap = new HashMap<String, IntSlot>();
static
{
// Prefer the platform's image-reader format names; fall back to a fixed list when absent.
String[] platformSpecificImgFormats = null;
try
{
platformSpecificImgFormats = FundamentalPlatformSpecifics.get().getReaderFormatNames();
} catch (Throwable e)
{
// best-effort: platform specifics may be unavailable in some deployments; ignored
}
imgSuffixStrings = (platformSpecificImgFormats == null) ? SOME_IMG_SUFFIXES : platformSpecificImgFormats;
imgSuffixMap = CollectionTools.buildHashMapFromLCStrings(imgSuffixStrings);
for (int i = 0; i < pdfMimeStrings.length; i++)
CollectionTools.stringIntMapEntry(suffixesToMap, pdfMimeStrings[i], PDF);
for (int i = 0; i < htmlSuffixStrings.length; i++)
CollectionTools.stringIntMapEntry(suffixesToMap, htmlSuffixStrings[i], HTML);
for (int i = 0; i < rssMimeStrings.length; i++)
CollectionTools.stringIntMapEntry(suffixesToMap, rssMimeStrings[i], RSS);
for (int i = 0; i < jpegMimeStrings.length; i++)
CollectionTools.stringIntMapEntry(suffixesToMap, jpegMimeStrings[i], JPG);
for (int i = 0; i < gifMimeStrings.length; i++)
CollectionTools.stringIntMapEntry(suffixesToMap, gifMimeStrings[i], GIF);
for (int i = 0; i < pngMimeStrings.length; i++)
CollectionTools.stringIntMapEntry(suffixesToMap, pngMimeStrings[i], PNG);
}
/**
 * Create a ParsedURL from an address string found while parsing HTML (an <code>a</code>
 * element's <code>href</code>, an <code>img</code> element's <code>src</code>, ...).
 * <code>javascript:</code> URLs are mined for embedded absolute URLs when possible; the result
 * never has protocol <code>javascript:</code>.
 *
 * @param addressString
 *          relative or absolute url string.
 * @return the resulting ParsedURL; may be null.
 */
public ParsedURL createFromHTML(String addressString)
{
	boolean fromSearchPage = false;
	return createFromHTML(addressString, fromSearchPage);
}
/**
 * Create a ParsedURL from an address string found while parsing HTML, using this as the
 * context for relative addresses. <code>javascript:</code> URLs are mined for embedded
 * absolute URLs when possible; the result never has protocol <code>javascript:</code>.
 *
 * @param addressString
 *          relative or absolute url string.
 * @param fromSearchPage
 *          If false, then add <code>/</code> to the end of the URL if it seems to be a directory.
 * @return the resulting ParsedURL; may be null.
 */
public ParsedURL createFromHTML(String addressString, boolean fromSearchPage)
{
	ParsedURL context = this;
	return createFromHTML(context, addressString, fromSearchPage);
}
/**
 * Resolve addressString against url and wrap the result.
 *
 * @param url base URL to resolve against.
 * @param addressString relative or absolute address.
 * @return the resulting ParsedURL, or null when resolution fails.
 */
protected static ParsedURL get(URL url, String addressString)
{
	try
	{
		return new ParsedURL(new URL(url, addressString));
	}
	catch (MalformedURLException e)
	{
		// Fixed garbled diagnostic (was "cant from url from:").
		println("ParsedURL.get() can't form URL from: " + addressString);
		// e.printStackTrace();
	}
	return null;
}
/**
 * Called while processing (parsing) HTML. Used to create new <code>ParsedURL</code>s from
 * urlStrings in response to such as the <code>a</code> element's <code>href</code> attribute, the
 * <code>img</code> element's <code>src</code> attribute, etc.
 * <p>
 * Does processing of some fancy stuff, like, in the case of <code>javascript:</code> URLs, it
 * mines them for embedded absolute URLs, if possible, and uses only those embedded URLs.
 *
 * @param contextPURL
 *          base ParsedURL that relative addresses are resolved against; may be null.
 *
 * @param addressString
 *          This may be specify a relative or absolute url.
 *
 * @param fromSearchPage
 *          If false, then add <code>/</code> to the end of the URL if it seems to be a directory.
 *
 * @return The resulting ParsedURL. It may be null. It will never have protocol
 *         <code>javascript:</code>.
 */
public static ParsedURL createFromHTML(ParsedURL contextPURL, String addressString,
boolean fromSearchPage)
{
if ((addressString == null) || (addressString.length() == 0))
return null;
// fragment-only and mailto links never yield a new crawlable document
if (addressString.startsWith("#") || addressString.startsWith("mailto"))
{
// return get(contextPURL.url(), addressString);
return null;
}
String lc = addressString.toLowerCase();
boolean javascript = lc.startsWith("javascript:");
// mine urls from javascript quoted strings
if (javascript)
{
// !!! Could do an even better job here of mining quoted
// !!! javascript strings.
// println("Container.newURL("+s);
int http = lc.lastIndexOf("http://");
// TODO learn to mine PDFs as well as html!!
int html = lc.lastIndexOf(".html");
int pdf = lc.lastIndexOf(".pdf");
// println("Container.newURL() checking javascript url:="+s+
// " http="+http+" html="+html);
if (http > -1)
{ // seek absolute web addrs
// take the span from the embedded http:// through ".html" or ".pdf", whichever applies
if ((html > -1) && (http < html))
{
int end = html + 5;
addressString = addressString.substring(http, end);
// println("Container.newURL fixed javascript:= " + s);
lc = lc.substring(http, end);
javascript = false;
}
else if ((pdf > -1) && (http < pdf))
{
int end = pdf + 4;
addressString = addressString.substring(http, end);
// println("Container.newURL fixed javascript:= " + s);
lc = lc.substring(http, end);
javascript = false;
}
}
else
{
// seek relative addresses
// need to find the bounds of a quoted string, if there is one
}
// !!! What we should really do here is find quoted strings
// (usually with single quote, but perhaps double as well)
// (use regular expressions?? - are they fast enough?)
// and look at each one to see if either protocol is supported
// or suffix is htmlMime or imgMime.
}
// if no absolute URL could be mined out of the javascript, give up on it entirely
if (javascript)
return null;
char argDelim = '?';
// url string always keep hash string.
String hashString = StringTools.EMPTY_STRING;
if (fromSearchPage)
{
// handle embedded http://
int lastHttp = addressString.lastIndexOf("http://");
// usually ? but could be &
if (lastHttp > 0)
{
// this is search engine crap
addressString = addressString.substring(lastHttp);
// debugA("now addressString="+addressString);
// handle any embedded args (for google mess)
argDelim = '&';
}
}
else
{
// TODO do we really need to do any of this???????????????????????
// 1) peel off hash
int hashPos = addressString.indexOf('#');
// String hashString= StringTools.EMPTY_STRING;
if (hashPos > -1)
{
hashString = addressString.substring(hashPos);
addressString = addressString.substring(0, hashPos);
}
// 2) peel off args
int argPos = addressString.indexOf(argDelim);
String argString = StringTools.EMPTY_STRING;
if (argPos > -1)
{
argString = addressString.substring(argPos);
addressString = addressString.substring(0, argPos);
}
// This seems uneccessary, crawling any wikimedia based site will break by adding an extra
// slash.
// else
// {
// // 3) if what's left is a directory (w/o a mime type),add slash
// int endingSlash = addressString.lastIndexOf('/');
// int lastChar = addressString.length() - 1;
// if (endingSlash == -1)
// endingSlash++;
// if ((lastChar > 0) &&
// (lastChar != endingSlash) &&
// (addressString.substring(endingSlash).indexOf('.') == -1))
// addressString += '/';
// }
// 4) put back what we peeled off
addressString = addressString + argString + hashString;
}
int protocolEnd = addressString.indexOf(":");
if (protocolEnd != -1)
{
// this is an absolute URL; check for supported protocol
String protocol = addressString.substring(0, protocolEnd);
if (protocolIsUnsupported(protocol))
return null;
}
ParsedURL parsedUrl;
if (contextPURL == null || addressString.startsWith("http://"))
{
parsedUrl = getAbsolute(addressString, "in createFromHTML()");
}
else
{
// relative address: resolve against the context's directory
ParsedURL directoryPURL = contextPURL.directoryPURL();
parsedUrl = directoryPURL.getRelative(addressString);
}
return parsedUrl;
}
/**
 * @return A String version of the URL path, in which all punctuation characters have been changed
 *         into spaces.
 */
public String removePunctuation()
{
	String rendered = toString();
	return StringTools.removePunctuation(rendered);
}
/**
 * @param other ParsedURL to compare against; may be null.
 * @return true if they have same domains. false if they have different domains, if other is
 *         null, or if this has no domain.
 */
public boolean sameDomain(ParsedURL other)
{
	if (other == null)
		return false;
	// Robustness fix: domain() returns null when no URL is available; previously that NPEd.
	String thisDomain = domain();
	return thisDomain != null && thisDomain.equals(other.domain());
}
/**
 * @param other ParsedURL to compare against; may be null.
 * @return true if they have same hosts. false if they have different hosts, or if either side
 *         has no URL.
 */
public boolean sameHost(ParsedURL other)
{
	// Robustness fix: guard both URLs; previously a null url on either side NPEd.
	if (other == null || url == null || other.url() == null)
		return false;
	return url.getHost().equals(other.url().getHost());
}
/**
 * Use unsupportedMimes and protocolIsSupported to determine if this is content fit for
 * processing.
 *
 * @return true if this seems to be a web addr we can crawl to. (currently that means html).
 **/
public boolean crawlable()
{
	if (!protocolIsSupported())
		return false;
	return !unsupportedMimes.containsKey(suffix());
}
/**
 * Check whether this URL's protocol is supported. Currently http, ftp and file are.
 *
 * @return true when a URL is present and its protocol is supported.
 */
public boolean protocolIsSupported()
{
	if (url == null)
		return false;
	return protocolIsSupported(url.getProtocol());
}
/**
 * Check whether the given protocol is supported. Currently http, ftp and file are.
 *
 * @param protocol protocol name, e.g. "http".
 * @return true when the protocol is in the supported set.
 */
public static boolean protocolIsSupported(String protocol)
{
	boolean supported = supportedProtocols.containsKey(protocol);
	return supported;
}
/**
 * Check whether this URL's protocol is explicitly unsupported (mailto, vbscript, news, rtsp,
 * https).
 *
 * @return true when a URL is present and its protocol is in the unsupported set.
 */
public boolean protocolIsUnsupported()
{
	if (url == null)
		return false;
	return protocolIsUnsupported(url.getProtocol());
}
/**
 * Check whether the given protocol is explicitly unsupported (mailto, vbscript, news, rtsp,
 * https).
 *
 * @param protocol protocol name, e.g. "mailto".
 * @return true when the protocol is in the unsupported set.
 */
public static boolean protocolIsUnsupported(String protocol)
{
	boolean unsupported = unsupportedProtocols.containsKey(protocol);
	return unsupported;
}
/**
 * @return true if this is an image file.
 */
public boolean isImg()
{
	String thisSuffix = suffix();
	return isImageSuffix(thisSuffix);
}
/**
 * @param thatSuffix
 *          candidate suffix, lower case expected.
 * @return true if the suffix passed in is one for an image type that we can handle.
 */
public static boolean isImageSuffix(String thatSuffix)
{
	boolean recognized = imgSuffixMap.containsKey(thatSuffix);
	return recognized;
}
/**
 * @return true if this is a JPEG image file.
 */
public boolean isJpeg()
{
	String thisSuffix = suffix();
	return jpegSuffixMap.containsKey(thisSuffix);
}
/**
 * @return true if we can tell the image file wont have alpha, just from its suffix. This is
 *         currently the case for jpeg and bmp.
 */
public boolean isNoAlpha()
{
	String thisSuffix = suffix();
	return noAlphaSuffixMap.containsKey(thisSuffix);
}
/**
 * Test type of document this refers to.
 *
 * @return true if this refers to an HTML file
 */
public boolean isHTML()
{
	String thisSuffix = suffix();
	return htmlSuffixMap.containsKey(thisSuffix);
}
/**
 * Test type of document this refers to.
 *
 * @return true if this refers to a PDF file
 */
public boolean isPDF()
{
	String thisSuffix = suffix();
	return pdfSuffixMap.containsKey(thisSuffix);
}
/**
 * Test type of document this refers to.
 *
 * @return true if this refers to an RSS feed
 */
public boolean isRSS()
{
	String thisSuffix = suffix();
	return rssSuffixMap.containsKey(thisSuffix);
}
// Cached MimeType index; -1 means not yet computed.
int mimeIndex = -1;
/**
 * Get the MimeType index by looking up suffix(); cached after the first call.
 *
 * @return one of the MimeType constants, or UNKNOWN_MIME when the suffix is not recognized.
 */
public int mimeIndex()
{
	if (mimeIndex == -1)
	{
		IntSlot slot = suffixesToMap.get(suffix());
		mimeIndex = (slot == null) ? UNKNOWN_MIME : slot.value;
	}
	return mimeIndex;
}
/**
 * Look up the MimeType index for a location string by its dotted suffix.
 *
 * @param location address or path ending in a suffix.
 * @return one of the MimeType constants, or UNKNOWN_MIME when no recognizable suffix exists.
 */
public static int mimeIndex(String location)
{
	int afterLastDot = location.lastIndexOf('.') + 1;
	// need a dot, and at least one character after it
	if (afterLastDot <= 0 || afterLastDot >= location.length())
		return UNKNOWN_MIME;
	IntSlot mimeSlot = suffixesToMap.get(location.substring(afterLastDot));
	return (mimeSlot != null) ? mimeSlot.value : UNKNOWN_MIME;
}
/**
 * Get Media MimeType indexes. Media MimeTypes are currently text and all kinds of images such as
 * JPG, GIF, and PNG.
 *
 * @return this mimeIndex() clamped to at most MimeType.UNKNOWN_MIME.
 */
public int mediaMimeIndex()
{
	int index = mimeIndex();
	return (index >= MimeType.UNKNOWN_MIME) ? MimeType.UNKNOWN_MIME : index;
}
/**
 * @return true when this suffix is in the unsupportedMimes set, false otherwise.
 */
public boolean isUnsupported()
{
	String thisSuffix = suffix();
	return unsupportedMimes.containsKey(thisSuffix);
}
/**
 * Inverse of isUnsupported().
 *
 * @return true when this suffix is NOT in the unsupportedMimes set.
 */
public boolean supportedMime()
{
	boolean unsupported = isUnsupported();
	return !unsupported;
}
/**
 * @return The directory of this, with protocol and host (and port, when explicit).
 */
public String directoryString()
{
	String path = pathDirectoryString();
	String host = url.getHost();
	String protocol = url.getProtocol();
	int portNum = url.getPort();
	String port = (portNum == -1) ? "" : ":" + portNum;
	// presize: protocol + "://" + host + port + path
	int capacity = protocol.length() + 3 + host.length() + port.length() + path.length();
	StringBuilder result = new StringBuilder(capacity);
	result.append(protocol).append("://").append(host).append(port).append(path);
	return result.toString();
}
/**
 * @return The directory of this, without protocol and host: the URL path with any trailing
 *         filename (a component containing a dot after the last slash) trimmed off.
 */
public String pathDirectoryString()
{
	String path = url.getPath();
	int lastSlash = path.lastIndexOf('/');
	int lastDot = path.lastIndexOf('.');
	if (lastDot > lastSlash)
	{
		// Robustness fix: when there is a dot but no slash at all, substring(0, -1)
		// previously threw StringIndexOutOfBoundsException; treat as no directory.
		path = (lastSlash < 0) ? "" : path.substring(0, lastSlash);
	}
	return path;
}
/**
 * @return the URL's file portion (path plus query), or null when no URL is present.
 */
public String path()
{
	if (url == null)
		return null;
	return url.getFile();
}
/**
 * Return true if the other object is either a ParsedURL or a URL that refers to the same location
 * as this. Note: this is our own implementation. It is *much* faster and slightly less careful
 * than JavaSoft's. Checks port, host, file, protocol, and query. Ignores ref = hash.
 * <p>
 * NOTE(review): accepting URL and File arguments makes this equals() asymmetric with respect
 * to those classes' own equals() -- a known trade-off here.
 */
@Override
public boolean equals(Object other)
{
if (other == null)
return false;
boolean otherIsPURL = other instanceof ParsedURL;
boolean otherIsFile = other instanceof File;
if (otherIsPURL || otherIsFile)
{
// file-based comparison takes precedence when either side carries a File
File otherFile = otherIsFile ? (File) other : ((ParsedURL) other).file;
if (file != null)
{
return file.equals(otherFile);
}
if (otherFile != null)
return false; // other has file but this does not
}
else if (!(other instanceof URL))
return false; // not a PURL or an URL
URL url = this.url;
URL otherURL = otherIsPURL ? ((ParsedURL) other).url : (URL) other;
if (url == null && otherURL == null)
return true;
if (url == null || otherURL == null)
return false;
// compare port
if (url.getPort() != otherURL.getPort())
return false;
// compare host
if (!url.getHost().equals(otherURL.getHost()))
return false;
// compare file (path + query)
if (!url.getFile().equals(otherURL.getFile()))
return false;
// compare protocol
if (!url.getProtocol().equals(otherURL.getProtocol()))
return false;
// compare arguments
return bothNullOrEqual(url.getQuery(), otherURL.getQuery());
}
/**
 * Null-safe string equality: true when both are null, the same reference, or equal strings.
 */
private static boolean bothNullOrEqual(String a, String b)
{
	if (a == b)
		return true; // both null, or same reference
	return (a != null) && a.equals(b);
}
/**
 * Hash this by its URL, falling back to the File; -1 when neither is present.
 */
@Override
public int hashCode()
{
	if (url == null && file == null)
		debug("help!");
	if (url != null)
		return url.hashCode();
	return (file != null) ? file.hashCode() : -1;
}
/**
 * A shorter string for displaying in the modeline for debugging, and in popup messages:
 * host + "/.../" + last path component. Cached after the first call.
 */
public String shortString()
{
	String result = this.shortString;
	if (result == null)
	{
		if (url == null)
			result = "null";
		else
		{
			String filePart = url.getFile();
			int lastSlash = filePart.lastIndexOf('/');
			result = url.getHost() + "/.../" + filePart.substring(lastSlash + 1);
		}
		this.shortString = result;
	}
	return result;
}
/**
 * True if this ParsedURL represents an entity on the local file system.
 *
 * @return true if this is a local File object.
 */
public boolean isFile()
{
	return this.file != null;
}
/**
 * @return The file system object associated with this, if this is an entity on the local file
 *         system, or null, otherwise.
 */
public File file()
{
	return this.file;
}
/**
 * Form a new ParsedURL from this, and the args passed in. A question mark is appended to the
 * String form of this, and then args are appended.
 *
 * @param args query string, without the leading '?'; appended un-encoded.
 * @return ParsedURL with args after ?, or null when the result is malformed.
 */
public ParsedURL withArgs(String args)
{
	String withQuery = toString() + "?" + args;
	try
	{
		return new ParsedURL(new URL(withQuery));
	}
	catch (MalformedURLException e)
	{
		return null;
	}
}
/**
 * Returns the name of the file or directory denoted by this abstract pathname. This is just the
 * last name in the pathname's name sequence. If the pathname's name sequence is empty, then the
 * empty string is returned.
 * <p/>
 * Analagous to File.getName().
 *
 * @return Name of this, without directory, host, or protocol.
 */
public String getName()
{
	String path = this.url.getPath();
	int lastSlash = path.lastIndexOf('/');
	return (lastSlash > -1) ? path.substring(lastSlash + 1) : path;
}
/**
 * Basic ConnectionHelper. Does *nothing special* when encountering directories, re-directs, ...
 */
private static final ConnectionAdapter connectionAdapter = new ConnectionAdapter();
// Set the URLConnection timeout a little smaller than our DownloadMonitor timeout.
/** Milliseconds to wait while establishing a connection (15 seconds). */
public static final int CONNECT_TIMEOUT = 15000;
/** Milliseconds to wait for reads on an open connection (25 seconds). */
public static final int READ_TIMEOUT = 25000;
/**
 * Create a connection, using the standard timeouts (CONNECT_TIMEOUT = 15 seconds to connect,
 * READ_TIMEOUT = 25 seconds to read), and the super-basic ConnectionAdapter, which does
 * *nothing special* when encountering directories, re-directs, ...
 *
 * @return A new PURLConnection for this, already connected.
 */
public PURLConnection connect()
{
	return connect(connectionAdapter);
}
/**
 * Create a connection with the default ConnectionAdapter and standard timeouts, identifying
 * ourselves with the supplied user agent.
 *
 * @param userAgentName Value for the User-Agent header; if null, the default user agent is
 *        used downstream.
 * @return A new PURLConnection for this, already connected.
 */
public PURLConnection connect(String userAgentName)
{
	return connect(connectionAdapter, userAgentName);
}
/**
 * Create a connection, using the standard timeouts (CONNECT_TIMEOUT = 15 seconds to connect,
 * READ_TIMEOUT = 25 seconds to read) and the default user agent.
 *
 * @param connectionHelper Callback object for handling special cases (directories,
 *        re-directs, ...) during connect.
 * @return A new PURLConnection for this, already connected.
 */
public PURLConnection connect(ConnectionHelper connectionHelper)
{
	return connect(connectionHelper, DEFAULT_USER_AGENT, CONNECT_TIMEOUT, READ_TIMEOUT);
}
/**
 * Create a connection with the given helper and user agent, using standard timeouts.
 * When userAgentString is null, falls back to the default user agent.
 *
 * @param connectionHelper Callback object for handling special cases during connect.
 * @param userAgentString Value for the User-Agent header, or null for the default.
 * @return A new PURLConnection for this, already connected.
 */
public PURLConnection connect(ConnectionHelper connectionHelper, String userAgentString)
{
	if (userAgentString == null)
		return connect(connectionHelper);
	return connect(connectionHelper, userAgentString, CONNECT_TIMEOUT, READ_TIMEOUT);
}
/**
 * Create a connection, with full control over helper, user agent, and timeouts.
 *
 * @param connectionHelper Callback object for handling special cases during connect.
 * @param userAgent Value for the User-Agent header.
 * @param connectionTimeout Milliseconds to wait while establishing the connection.
 * @param readTimeout Milliseconds to wait for reads on the open connection.
 * @return A new PURLConnection for this, already connected.
 */
public PURLConnection connect(ConnectionHelper connectionHelper, String userAgent,
		int connectionTimeout, int readTimeout)
{
	PURLConnection purlConnection = new PURLConnection(this);
	purlConnection.connect(connectionHelper, userAgent, connectionTimeout, readTimeout);
	return purlConnection;
}
/**
 * Free some memory resources. They can be re-allocated through subsequent lazy evaluation. The
 * object is still fully functional after this call.
 */
public void resetCaches()
{
	// Drop all lazily-derived representations; each is rebuilt on demand.
	this.directory = null;
	this.string = null;
	this.shortString = null;
	this.lc = null;
	this.suffix = null;
	this.domain = null;
	if (directoryPURL != null)
	{
		// Recycle the cached directory ParsedURL before dropping the reference.
		this.directoryPURL.recycle();
		this.directoryPURL = null;
	}
	// TODO -- is this too aggressive?!
	this.hashUrl = null;
}
/**
 * Free <b>all</b> resources associated with this, rendering it no longer usable.
 */
public void recycle()
{
	resetCaches();
	this.url = null;
	this.file = null;
}
/**
 * @return The host component of the URL, or null when there is no URL (file-based).
 */
public String host()
{
	if (url == null)
		return null;
	return url.getHost();
}
/**
 *
 * @return A lightweight object corresponding to this: the URL when present, else the File.
 */
public Object shadow()
{
	if (url != null)
		return url;
	return file;
}
/**
 * Form a new ParsedURL keeping only the query arguments whose names start with one of
 * argsToKeep; all other arguments are dropped.
 *
 * @param argsToKeep Prefixes of argument tokens to retain (e.g. "id" keeps "id=33344").
 * @return A new ParsedURL with the filtered query, or this when there is no URL or no query.
 */
public ParsedURL filterArgs(String...argsToKeep)
{
	if (url != null)
	{
		String query = url.getQuery();
		// Guard against a null query, as ignoreArgs() does; StringTokenizer(null, ...) NPEs.
		if (query == null)
			return this;
		StringTokenizer tokenizer = new StringTokenizer(query, "&");
		if (!tokenizer.hasMoreElements())
			return this;
		StringBuilder resultQuery = new StringBuilder(noAnchorNoQueryPageString()); // initialize w base URL
		boolean first = true;
		while (tokenizer.hasMoreElements())
		{
			String token = tokenizer.nextToken();
			for (String argToKeep: argsToKeep)
			{
				if (token.startsWith(argToKeep))
				{
					if (first)
					{
						first = false;
						resultQuery.append('?');
					}
					else
						resultQuery.append('&');
					resultQuery.append(token);
					break; // append each token at most once, even if it matches several prefixes
				}
			}
		}
		return getAbsolute(resultQuery.toString());
	}
	return this;
}
/**
 * Form a new ParsedURL dropping every query argument whose name appears as a key in
 * argsToIgnore; all other arguments are retained.
 *
 * @param argsToIgnore Map whose keys are argument names to drop.
 * @return A new ParsedURL with the filtered query, or this when there is no URL or no query.
 */
public ParsedURL ignoreArgs(HashMap<String, String> argsToIgnore)
{
	if (url == null)
		return this;
	String query = url.getQuery();
	if (query == null)
		return this;
	StringTokenizer tokenizer = new StringTokenizer(query, "&");
	if (!tokenizer.hasMoreElements())
		return this;
	StringBuilder rebuilt = new StringBuilder(noAnchorNoQueryPageString()); // start from base URL
	boolean appendedAny = false;
	while (tokenizer.hasMoreElements())
	{
		String param = tokenizer.nextToken();
		int eq = param.indexOf('=');
		String name = (eq == -1) ? param : param.substring(0, eq);
		if (argsToIgnore.containsKey(name))
			continue;
		rebuilt.append(appendedAny ? '&' : '?');
		appendedAny = true;
		rebuilt.append(param);
	}
	return getAbsolute(rebuilt.toString());
}
/**
 * @return The query part of the URL (after ?), or null when there is no query or no URL.
 */
public String query()
{
	// Guard against url == null (file-based ParsedURLs), consistent with host().
	return (url == null) ? null : url.getQuery();
}
/**
 * Manual smoke test: prints the query component of a sample URL, plus a URL-encoded "?".
 */
static public void main(String[] args)
{
	try
	{
		URL u = new URL("http://acm.org/citation.cfm?id=33344");
		// URLEncoder.encode(String) is deprecated (uses the platform default charset);
		// specify UTF-8 explicitly, per the W3C recommendation in the URLEncoder docs.
		System.out.println("query: " + u.getQuery() + "\n" + URLEncoder.encode("?", "UTF-8"));
	}
	catch (IOException e)
	{
		// Covers both MalformedURLException and UnsupportedEncodingException.
		e.printStackTrace();
	}
}
/**
 * Extract arguments from the "query" portion of the URL (the part after ?).
 *
 * @param keepEmptyParams Whether parameters with empty values are retained — presumably;
 *        semantics are delegated to StringTools.doubleSplit, verify there.
 *
 * @return HashMap of String name / value pairs.
 */
public HashMap<String, String> extractParams(boolean keepEmptyParams)
{
	return StringTools.doubleSplit(url, keepEmptyParams);
}
/**
 * Form a new ParsedURL using the base of this, while forming the query from a map of
 * name / value pairs.
 *
 * @param newParamMap Map of name / value pairs for the new query.
 *
 * @return A new ParsedURL based on this one and the input argument map, or this, if the new
 *         map is empty and this had no params to begin with.
 */
public ParsedURL updateParams(HashMap<String, String> newParamMap)
{
	HashMap<String, String> previousParams = extractParams(true);
	String encodedArgs = StringTools.unDoubleSplit(newParamMap);
	String baseString = StringTools.noAnchorPageString(url, false);
	if (encodedArgs != null && encodedArgs.length() > 0)
	{
		//TODO -- check to see if args are the same or different.
		return getAbsolute(baseString + '?' + encodedArgs);
	}
	if (previousParams != null && previousParams.size() != 0)
		return getAbsolute(baseString);
	return this;
}
/**
 * Form a new ParsedURL identical to this, but with a different host. Protocol, port (when
 * explicit), and file part are preserved.
 *
 * @param newHost Replacement host; must be non-null and non-empty.
 * @return A new ParsedURL with the replaced host, or null if newHost is empty/null or the
 *         rebuilt URL is malformed.
 */
public ParsedURL changeHost(String newHost)
{
	if (newHost == null || newHost.length() == 0)
		return null;
	int port = url.getPort();
	try
	{
		URL rehosted;
		if (port > 0)
			rehosted = new URL(url.getProtocol(), newHost, port, url.getFile());
		else
			rehosted = new URL(url.getProtocol(), newHost, url.getFile());
		return new ParsedURL(rehosted);
	}
	catch (MalformedURLException e)
	{
		e.printStackTrace();
		return null;
	}
}
/**
 * Set the includePrefix flag. The field is declared elsewhere in this class; its effect
 * depends on how includePrefix is consumed there — TODO confirm semantics at usage sites.
 *
 * @param includePrefix New value for the flag.
 */
public void setIncludePrefix(boolean includePrefix)
{
	this.includePrefix = includePrefix;
}
}