package ecologylab.net; import java.io.File; import java.io.IOException; import java.net.CookieHandler; import java.net.CookieManager; import java.net.MalformedURLException; import java.net.URI; import java.net.URL; import java.net.URLEncoder; import java.util.HashMap; import java.util.StringTokenizer; import ecologylab.collections.CollectionTools; import ecologylab.generic.Debug; import ecologylab.generic.IntSlot; import ecologylab.generic.StringTools; import ecologylab.io.Files; import ecologylab.platformspecifics.FundamentalPlatformSpecifics; import ecologylab.serialization.SIMPLTranslationException; import ecologylab.serialization.SimplTypesScope; import ecologylab.serialization.formatenums.Format; /** * Extends the URL with many features for the convenience and power of network programmers. New * class for manipulating and displaying URLs. * * Uses lazy evaluation to minimize storage allocation. * * @author andruid * @author eunyee * @author madhur */ public class ParsedURL extends Debug implements MimeType { private static final String NOT_IN_THE_FORMAT_OF_A_WEB_ADDRESS = " is not in the format of a web address"; private static final String DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)"; /** * this is the no hash url, that is, the one with # and anything after it stripped out. */ protected URL url = null; /** * If this is built from an entity of the local file system, store a reference to the object for * that here. */ File file; /** * URL with hash, that is, a reference to an anchor within the document. */ protected URL hashUrl = null; /** * Directory that the document referred to by the URL resides in. */ protected URL directory = null; private ParsedURL directoryPURL; /** * String representation of the URL. */ protected String string = null; /** * Shorter version of the string, for printing in tight spaces. */ String shortString; /* lower case of the url string */ protected String lc = null; /* suffix string of the url */ protected String suffix = null; /* domain value string of the ulr */ protected String domain = null; protected boolean includePrefix = true; public static CookieManager cookieManager = new CookieManager(); static { //cookieManager.setCookiePolicy(CookiePolicy.ACCEPT_ALL); CookieHandler.setDefault(cookieManager); } public ParsedURL(URL url) { String hash = url.getRef(); if ("file".equals(url.getProtocol())) { String urlString = url.toString(); if (urlString.startsWith("file://")) // this should be the case... { this.file = new File(urlString.substring(7)); } else // if not, try our hardest to make a good file { this.file = new File(url.getHost()+url.getPath()); } this.url = url; } else if (hash == null) { this.url = url; this.hashUrl = url; } else { this.hashUrl = url; try { // form no hash url (toss hash) this.url = new URL(url.getProtocol(), url.getHost(), url.getPort(), url.getFile()); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /** * * @return true if this refers to a file, and that file exists. Also true if this does not refer to a file. */ public boolean isNotFileOrExists() { return (file == null) || file.exists(); } /** * Create a ParsedURL from a file. If the file is a directory, append "/" to the path, so that * relative URLs will be formed properly later. 
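   * <p>
   * A minimal usage sketch (hypothetical paths, not from the original docs); the trailing slash on
   * a directory URL is what lets relative resolution stay inside that directory:
   * <pre>{@code
   * ParsedURL dirPurl = new ParsedURL(new File("/data/images"));  // internal URL: file:///data/images/
   * ParsedURL child   = dirPurl.getRelative("thumb.png");         // expected to resolve inside /data/images
   * }</pre>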
   *
   * @param file
   */
  public ParsedURL(File file)
  {
    try
    {
      String urlString = "file://" + file.getAbsolutePath();
      urlString = urlString.replace('\\', '/');
      if (file.isDirectory())
        urlString += "/";
      this.url = new URL(urlString);
    }
    catch (MalformedURLException e)
    {
      e.printStackTrace();
    }
    this.file = file;
  }

  /*
   * Constructor with a url string parameter. Get an absolute URL with the getAbsolute() method.
   */
  /*
   * public ParsedURL(String urlString) {
   *   // The second parameter of the getAbsolute method is an error description.
   *   this.url = getAbsolute(urlString, "").url();
   * }
   */

  // /////////////////////////////////////////////////////////////////////
  /**
   * Create a PURL from an absolute address. (Do it the quick and dirty way, providing less error
   * handling.) NB: Only call this method if you are *sure* a MalformedURLException would never be
   * produced.
   */
  public static ParsedURL getAbsolute(String webAddr)
  {
    return getAbsolute(webAddr, "getAbsolute(String) ");
  }

  public static ParsedURL get(URI uri)
  {
    return getAbsolute(uri.toString());
  }

  /**
   * Create a PURL from an absolute address.
   * 
   * @param webAddr
   *          url string
   * @param errorDescriptor
   *          printed in the trace output if something goes wrong converting the url string to a
   *          URL.
   * @return ParsedURL from the url string parameter named webAddr, or null if the param is
   *         malformed.
   */
  public static ParsedURL getAbsolute(String webAddr, String errorDescriptor)
  {
    if (webAddr == null || webAddr.length() <= 7)
    {
      println("ERROR: ParsedURL.getAbsolute() webAddr is null or too short: [" + webAddr + "]");
      // Thread.dumpStack(); // We don't really need such a hostile message.
    }
    else
    {
      try
      {
        URL url = new URL(webAddr);
        if (isUndetectedMalformedURL(url))
          return null;
        return new ParsedURL(url);
      }
      catch (MalformedURLException e)
      {
        if (!"".equals(errorDescriptor))
          errorDescriptor = "\n" + errorDescriptor;
        Debug.error(webAddr, NOT_IN_THE_FORMAT_OF_A_WEB_ADDRESS + "." + errorDescriptor);
      }
    }
    return null;
  }

  /**
   * Determines whether a URL is malformed in ways that Java fails to detect.
   * 
   * @param url
   * @return
   */
  private static boolean isUndetectedMalformedURL(URL url)
  {
    // originally checked against "file:", but on OS X, we just get "file"; this is probably true
    // everywhere else too, but I will leave "file:" for the time being. -Zach
    boolean isFileProtocol = "file".equals(url.getProtocol()) || "file:".equals(url.getProtocol());
    String host = url.getHost().trim();
    return ((!isFileProtocol && ("".equals(host) || "/".equals(host)))
        || (isFileProtocol && ("".equals(url.getPath().trim()) || "localhost".equalsIgnoreCase(host))));
  }

  /**
   * Form a ParsedURL, based on a relative path, using this as the base.
   * 
   * @param relativeURLPath
   *          Path relative to this.
   * @param errorDescriptor
   * 
   * @return New ParsedURL based on this and the relative path.
   */
  public final ParsedURL getRelative(String relativeURLPath, String errorDescriptor)
  {
    if (isFile())
    {
      File newFile = Files.newFile(file, relativeURLPath);
      // remove ..'s from path
      if (newFile.getAbsolutePath().contains(".."))
      {
        try
        {
          File canonicalFile = newFile.getCanonicalFile();
          return new ParsedURL(canonicalFile);
        }
        catch (IOException e)
        {
          e.printStackTrace();
        }
      }
      return new ParsedURL(newFile);
    }
    else
      return getRelative(url, relativeURLPath, errorDescriptor);
  }

  /**
   * Form a ParsedURL, based on a relative path, using this as the base.
   * 
   * @param relativeURLPath
   *          Path relative to this.
   * 
   * @return New ParsedURL based on this and the relative path.
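   * <p>
   * A hedged example (hypothetical URLs): relative paths resolve against this URL, while fully
   * qualified addresses should go through {@link #getAbsolute(String, String)} instead.
   * <pre>{@code
   * ParsedURL page = ParsedURL.getAbsolute("http://example.com/docs/index.html");
   * ParsedURL css  = page.getRelative("style/main.css");  // http://example.com/docs/style/main.css
   * }</pre>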
*/ public final ParsedURL getRelative(String relativeURLPath) { return getRelative(relativeURLPath, ""); } /** * Form a new ParsedURL, relative from a supplied base URL. Checks to see if the relativePath * starts w a protocol spec. If so, calls getAbsolute(). Otherwise, forms a relative URL using the * URL base. * * @param relativeURLPath * @param errorDescriptor * @return New ParsedURL */ public static ParsedURL getRelative(URL base, String relativeURLPath, String errorDescriptor) { if (relativeURLPath == null) return null; ParsedURL result = null; if (!relativeURLPath.startsWith("http://") && !relativeURLPath.startsWith("ftp://")) { try { URL resultURL = new URL(base, relativeURLPath); result = new ParsedURL(resultURL); } catch (MalformedURLException e) { if (!"".equals(errorDescriptor)) errorDescriptor = "\n" + errorDescriptor; Debug.error(relativeURLPath, NOT_IN_THE_FORMAT_OF_A_WEB_ADDRESS + "[" + base + "]." + errorDescriptor); } } else return getAbsolute(relativeURLPath, errorDescriptor); return result; } /** * Use this as the source of stuff to translate from XML * * @param translationScope * Translations that specify package + class names for translating. * @return ElementState object derived from XML at the InputStream of this. * @throws SIMPLTranslationException */ public Object translateFromXML(SimplTypesScope translationScope) throws SIMPLTranslationException { return translationScope.deserialize(this, Format.XML); } public static URL getURL(URL base, String path, String error) { // ??? might want to allow this default behaviour ??? if (path == null) return null; try { // System.err.println("\nGENERIC - base, path, error = \n" + base + "\n" + path); URL newURL = new URL(base, path); // System.err.println("\nNEW URL = " + newURL); return newURL; } catch (MalformedURLException e) { if (error != null) throw new Error(e + "\n" + error + " " + base + " -> " + path); return null; } } /** * Uses lazy evaluation to minimize storage allocation. * * @return The URL as a String. */ @Override public String toString() { String result = string; if (result == null) { if (isFile() && includePrefix) result = "file://" + file.toString().replace('\\', '/'); else if (isFile() && !includePrefix) result = file.toString().replace('\\', '/'); else if (url == null) result = "weirdly null"; else result = StringTools.pageString(url); string = result; } return result; } /** * Uses lazy evaluation to minimize storage allocation. * * @return Lower case rendition of the URL String. */ public String lc() { String result = lc; if (result == null) { result = toString().toLowerCase(); lc = result; } return result; } /** * Uses lazy evaluation to minimize storage allocation. * * @return The suffix of the filename, in lower case. */ public String suffix() { String result = suffix; if (result == null) { String path = url.getPath(); if (path != null) { result = suffix(path.toLowerCase()); } // TODO make sure that there isnt code somewhere testing suffix for null! if (result == null) result = ""; suffix = result; } return result; } /** * Form a ParsedURL based on this, if this is a directory. Otherwise, form the ParsedURL from the * parent of this. Process files carefully to propagate their file-ness. 
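   * <p>
   * For instance (hypothetical URL), the directory of a document URL drops the file name:
   * <pre>{@code
   * ParsedURL doc = ParsedURL.getAbsolute("http://example.com/papers/purl.pdf");
   * ParsedURL dir = doc.directoryPURL();  // http://example.com/papers/
   * }</pre>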
* * @return */ public ParsedURL directoryPURL() { ParsedURL result = directoryPURL; if (result == null) { if (isFile()) { if (file.isDirectory()) result = this; else { File parent = file.getParentFile(); result = new ParsedURL(parent); } } else { result = new ParsedURL(directory()); } this.directoryPURL = result; } return result; } /** * Get the URL for the directory associated with this. Requires looking for slash at the end, * looking for a suffix or arguments. As a result, we sometimes add a slash at the end, sometimes * peel off the filename. Result is cached a la lazy evaluation. * * @return Directory URL */ public URL directory() { URL result = this.directory; if (result == null) { if (StringTools.endsWithSlash(toString())) result = this.url; if (result == null) { String suffix = suffix(); try { String path = url.getPath(); String args = url.getQuery(); String protocol = url.getProtocol(); String host = url.getHost(); int port = url.getPort(); if (suffix.length() == 0) { // this is a directory that is unterminated by slash; we need to fix that if (path.length() == 0) result = new URL(protocol, host, port, "/"); else { if ((args == null) || (args.length() == 0)) result = new URL(protocol, host, port, path + '/'); else // this is a tricky executable with no suffix { // result = null; // drop down into the next block, and peel off that suffix-less executable name } } } // else if (result == null) { // you have a suffix, so we need to trim off the filename int lastSlashIndex = path.lastIndexOf('/'); if (lastSlashIndex == -1) // suffix, but not within any subdirectory result = new URL(protocol, host, port, "/"); else { String pathThroughLastSlash = path.substring(0, lastSlashIndex + 1); result = new URL(protocol, host, port, pathThroughLastSlash); } } } catch (MalformedURLException e) { debug("Unexpected ERROR forming directory."); e.printStackTrace(); } } this.directory = result; } return result; } /** * Uses lazy evaluation to minimize storage allocation. * * @return The domain of the URL. */ public String domain() { String result = domain; if (result == null && (url != null)) { result = StringTools.domain(url); domain = result; } return result; } public boolean isNull() { return url == null && file == null; } /** * @return The suffix of the filename, in whatever case is found in the input string. */ public static String suffix(String lc) { int afterDot = lc.lastIndexOf('.') + 1; int lastSlash = lc.lastIndexOf('/'); String result = ((afterDot == 0) || (afterDot < lastSlash)) ? "" : lc.substring(afterDot); return result; } public String filename() { String lowerCase = noAnchorNoQueryPageString(); int lastDot = lowerCase.lastIndexOf('.'); int lastSlash = lowerCase.lastIndexOf('/'); String result = (lastDot == 0 || (lastDot < lastSlash)) ? "" : lowerCase.substring(lastSlash, lastDot); return result; } /** * Uses lazy evaluation to minimize storage allocation. * * @return the URL. */ public final URL url() { return url; } public final URL hashUrl() { if (hashUrl == null) return url(); else return hashUrl; } /* * return noAnchor no query page string */ public String noAnchorNoQueryPageString() { return StringTools.noAnchorNoQueryPageString(url); } /* * return no anchor no page string. */ public String noAnchorPageString() { return StringTools.noAnchorPageString(url); } /** * @return true if the suffix of this is equal to that of the argument. 
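   * <p>
   * The test is made against the lower-cased URL string, e.g. (hypothetical URL):
   * <pre>{@code
   * ParsedURL purl = ParsedURL.getAbsolute("http://example.com/chart.PNG");
   * boolean isPng  = purl.hasSuffix(".png");  // true
   * }</pre>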
*/ public final boolean hasSuffix(String s) { return lc().endsWith(s); // return suffix().equals(s); } final static String unsupportedMimeStrings[] = { "ai", "bmp", "eps", "ps", "psd", "svg", "tif", "vrml", "doc", "xls", "pps", "ppt", "adp", "rtf", "vbs", "vsd", "wht", "aif", "aiff", "aifc", "au", "mp3", "wav", "ra", "ram", "wm", "wma", "wmf", "wmp", "wms", "wmv", "wmx", "wmz", "avi", "mov", "mpa", "mpeg", "mpg", "ppj", "swf", "spl", "qdb", "cab", "chm", "gzip", "hqx", "jar", "lzh", "tar", "zip", "wpd", "xsl", }; final static HashMap unsupportedMimes = CollectionTools .buildHashMapFromStrings(unsupportedMimeStrings); static final String[] unsupportedProtocolStrings = { "mailto", "vbscript", "news", "rtsp", "https", }; static final HashMap unsupportedProtocols = CollectionTools .buildHashMapFromStrings(unsupportedProtocolStrings); static final String[] supportedProtocolStrings = { "http", "ftp", "file", }; static final HashMap supportedProtocols = CollectionTools .buildHashMapFromStrings(supportedProtocolStrings); static final String[] imgSuffixStrings; static final String[] SOME_IMG_SUFFIXES = { "jpg", "jpeg", "pjpg", "pjpeg", "gif", "png", }; /* * { "jpg", "jpeg", "pjpg", "pjpeg", "gif", "png", }; */ static final HashMap imgSuffixMap; // case static final String[] jpegMimeStrings = { "jpg", "JPG", "jpeg", "JPEG", "pjpg", "pjpeg", }; static final String[] gifMimeStrings = { "gif", "GIF", }; static final String[] pngMimeStrings = { "png", "PNG", }; static final HashMap jpegSuffixMap = CollectionTools .buildHashMapFromStrings(jpegMimeStrings); static final String[] htmlSuffixStrings = { "html", "htm", "stm", "php", "jhtml", "jsp", "asp", "txt", "shtml", "pl", "plx", "exe" }; static final String[] noAlphaSuffixStrings = { "bmp", "BMP", "wbmp", "WBMP", "jpg", "JPG", "jpeg", "JPEG", "pjpg", "PJPG", "pjpeg", "PJPEG", }; static final HashMap noAlphaSuffixMap = CollectionTools .buildHashMapFromStrings(noAlphaSuffixStrings); static final HashMap htmlSuffixMap = CollectionTools .buildHashMapFromStrings(htmlSuffixStrings); static final String[] pdfMimeStrings = { "pdf" }; static final HashMap pdfSuffixMap = CollectionTools .buildHashMapFromStrings(pdfMimeStrings); static final String[] rssMimeStrings = { "rss", "xml" }; static final HashMap rssSuffixMap = CollectionTools .buildHashMapFromStrings(rssMimeStrings); static final HashMap<String, IntSlot> suffixesToMap = new HashMap<String, IntSlot>(); static { String[] platformSpecificImgFormats = null; try { platformSpecificImgFormats = FundamentalPlatformSpecifics.get().getReaderFormatNames(); } catch (Throwable e) { } imgSuffixStrings = (platformSpecificImgFormats == null) ? 
        SOME_IMG_SUFFIXES : platformSpecificImgFormats;
    imgSuffixMap = CollectionTools.buildHashMapFromLCStrings(imgSuffixStrings);

    for (int i = 0; i < pdfMimeStrings.length; i++)
      CollectionTools.stringIntMapEntry(suffixesToMap, pdfMimeStrings[i], PDF);
    for (int i = 0; i < htmlSuffixStrings.length; i++)
      CollectionTools.stringIntMapEntry(suffixesToMap, htmlSuffixStrings[i], HTML);
    for (int i = 0; i < rssMimeStrings.length; i++)
      CollectionTools.stringIntMapEntry(suffixesToMap, rssMimeStrings[i], RSS);
    for (int i = 0; i < jpegMimeStrings.length; i++)
      CollectionTools.stringIntMapEntry(suffixesToMap, jpegMimeStrings[i], JPG);
    for (int i = 0; i < gifMimeStrings.length; i++)
      CollectionTools.stringIntMapEntry(suffixesToMap, gifMimeStrings[i], GIF);
    for (int i = 0; i < pngMimeStrings.length; i++)
      CollectionTools.stringIntMapEntry(suffixesToMap, pngMimeStrings[i], PNG);
  }

  /**
   * Called while processing (parsing) HTML. Used to create new <code>ParsedURL</code>s from url
   * strings found in attributes such as the <code>a</code> element's <code>href</code> and the
   * <code>img</code> element's <code>src</code>.
   * <p>
   * Does processing of some fancy stuff, like, in the case of <code>javascript:</code> URLs, it
   * mines them for embedded absolute URLs, if possible, and uses only those embedded URLs.
   * 
   * @param addressString
   *          This may specify a relative or absolute url.
   * 
   * @return The resulting ParsedURL. It may be null. It will never have protocol
   *         <code>javascript:</code>.
   */
  public ParsedURL createFromHTML(String addressString)
  {
    return createFromHTML(addressString, false);
  }

  /**
   * Called while processing (parsing) HTML. Used to create new <code>ParsedURL</code>s from url
   * strings found in attributes such as the <code>a</code> element's <code>href</code> and the
   * <code>img</code> element's <code>src</code>.
   * <p>
   * Does processing of some fancy stuff, like, in the case of <code>javascript:</code> URLs, it
   * mines them for embedded absolute URLs, if possible, and uses only those embedded URLs.
   * 
   * @param addressString
   *          This may specify a relative or absolute url.
   * 
   * @param fromSearchPage
   *          If false, then add <code>/</code> to the end of the URL if it seems to be a directory.
   * 
   * @return The resulting ParsedURL. It may be null. It will never have protocol
   *         <code>javascript:</code>.
   */
  public ParsedURL createFromHTML(String addressString, boolean fromSearchPage)
  {
    return createFromHTML(this, addressString, fromSearchPage);
  }

  protected static ParsedURL get(URL url, String addressString)
  {
    try
    {
      return new ParsedURL(new URL(url, addressString));
    }
    catch (MalformedURLException e)
    {
      println("ParsedURL.get() can't form URL from: " + /* url +"\n\taddressString = "+ */addressString);
      // e.printStackTrace();
    }
    return null;
  }

  /**
   * Called while processing (parsing) HTML. Used to create new <code>ParsedURL</code>s from url
   * strings found in attributes such as the <code>a</code> element's <code>href</code> and the
   * <code>img</code> element's <code>src</code>.
   * <p>
   * Does processing of some fancy stuff, like, in the case of <code>javascript:</code> URLs, it
   * mines them for embedded absolute URLs, if possible, and uses only those embedded URLs.
   * 
   * @param addressString
   *          This may specify a relative or absolute url.
   * 
   * @param fromSearchPage
   *          If false, then add <code>/</code> to the end of the URL if it seems to be a directory.
   * 
   * @return The resulting ParsedURL. It may be null. It will never have protocol
   *         <code>javascript:</code>.
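   * <p>
   * A hedged illustration (hypothetical context and href values):
   * <pre>{@code
   * ParsedURL context = ParsedURL.getAbsolute("http://example.com/blog/post.html");
   * ParsedURL a = ParsedURL.createFromHTML(context, "about.html", false);                 // http://example.com/blog/about.html
   * ParsedURL b = ParsedURL.createFromHTML(context, "mailto:someone@example.com", false); // null
   * }</pre>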
*/ public static ParsedURL createFromHTML(ParsedURL contextPURL, String addressString, boolean fromSearchPage) { if ((addressString == null) || (addressString.length() == 0)) return null; if (addressString.startsWith("#") || addressString.startsWith("mailto")) { // return get(contextPURL.url(), addressString); return null; } String lc = addressString.toLowerCase(); boolean javascript = lc.startsWith("javascript:"); // mine urls from javascript quoted strings if (javascript) { // !!! Could do an even better job here of mining quoted // !!! javascript strings. // println("Container.newURL("+s); int http = lc.lastIndexOf("http://"); // TODO learn to mine PDFs as well as html!! int html = lc.lastIndexOf(".html"); int pdf = lc.lastIndexOf(".pdf"); // println("Container.newURL() checking javascript url:="+s+ // " http="+http+" html="+html); if (http > -1) { // seek absolute web addrs if ((html > -1) && (http < html)) { int end = html + 5; addressString = addressString.substring(http, end); // println("Container.newURL fixed javascript:= " + s); lc = lc.substring(http, end); javascript = false; } else if ((pdf > -1) && (http < pdf)) { int end = pdf + 4; addressString = addressString.substring(http, end); // println("Container.newURL fixed javascript:= " + s); lc = lc.substring(http, end); javascript = false; } } else { // seek relative addresses // need to find the bounds of a quoted string, if there is one } // !!! What we should really do here is find quoted strings // (usually with single quote, but perhaps double as well) // (use regular expressions?? - are they fast enough?) // and look at each one to see if either protocol is supported // or suffix is htmlMime or imgMime. } if (javascript) return null; char argDelim = '?'; // url string always keep hash string. String hashString = StringTools.EMPTY_STRING; if (fromSearchPage) { // handle embedded http:// int lastHttp = addressString.lastIndexOf("http://"); // usually ? but could be & if (lastHttp > 0) { // this is search engine crap addressString = addressString.substring(lastHttp); // debugA("now addressString="+addressString); // handle any embedded args (for google mess) argDelim = '&'; } } else { // TODO do we really need to do any of this??????????????????????? // 1) peel off hash int hashPos = addressString.indexOf('#'); // String hashString= StringTools.EMPTY_STRING; if (hashPos > -1) { hashString = addressString.substring(hashPos); addressString = addressString.substring(0, hashPos); } // 2) peel off args int argPos = addressString.indexOf(argDelim); String argString = StringTools.EMPTY_STRING; if (argPos > -1) { argString = addressString.substring(argPos); addressString = addressString.substring(0, argPos); } // This seems uneccessary, crawling any wikimedia based site will break by adding an extra // slash. 
// else // { // // 3) if what's left is a directory (w/o a mime type),add slash // int endingSlash = addressString.lastIndexOf('/'); // int lastChar = addressString.length() - 1; // if (endingSlash == -1) // endingSlash++; // if ((lastChar > 0) && // (lastChar != endingSlash) && // (addressString.substring(endingSlash).indexOf('.') == -1)) // addressString += '/'; // } // 4) put back what we peeled off addressString = addressString + argString + hashString; } int protocolEnd = addressString.indexOf(":"); if (protocolEnd != -1) { // this is an absolute URL; check for supported protocol String protocol = addressString.substring(0, protocolEnd); if (protocolIsUnsupported(protocol)) return null; } ParsedURL parsedUrl; if (contextPURL == null || addressString.startsWith("http://")) { parsedUrl = getAbsolute(addressString, "in createFromHTML()"); } else { ParsedURL directoryPURL = contextPURL.directoryPURL(); parsedUrl = directoryPURL.getRelative(addressString); } return parsedUrl; } /** * * @return A String version of the URL path, in which all punctuation characters have been changed * into spaces. */ public String removePunctuation() { return StringTools.removePunctuation(toString()); } /** * @return true if they have same domains. false if they have different domains. */ public boolean sameDomain(ParsedURL other) { return (other != null) && domain().equals(other.domain()); } /** * @return true if they have same hosts. false if they have different hosts. */ public boolean sameHost(ParsedURL other) { return (other != null) && url.getHost().equals(other.url().getHost()); } /** * Use unsupportedMimes and protocolIsSupported to determine if this is content fit for * processing. * * @return true if this seems to be a web addr we can crawl to. (currently that means html). **/ public boolean crawlable() { return protocolIsSupported() && !unsupportedMimes.containsKey(suffix()); } /** * Check whether the protocol is supported or not. Currently, only http and ftp are. */ public boolean protocolIsSupported() { return (url != null) && protocolIsSupported(url.getProtocol()); } /** * Check whether the protocol is supported or not. Currently, only http and ftp are. */ public static boolean protocolIsSupported(String protocol) { return supportedProtocols.containsKey(protocol); } /** * Check whether the protocol is supported or not. Currently, only http and ftp are. */ public boolean protocolIsUnsupported() { return (url != null) && protocolIsUnsupported(url.getProtocol()); } /** * Check whether the protocol is supported or not. Currently, only http and ftp are. */ public static boolean protocolIsUnsupported(String protocol) { return unsupportedProtocols.containsKey(protocol); } /** * @return true if this is an image file. */ public boolean isImg() { return isImageSuffix(suffix()); } /** * * @param thatSuffix * @return true if the suffix passed in is one for an image type that we can handle. */ public static boolean isImageSuffix(String thatSuffix) { return imgSuffixMap.containsKey(thatSuffix); } /** * @return true if this is a JPEG image file. */ public boolean isJpeg() { return jpegSuffixMap.containsKey(suffix()); } /** * @return true if we can tell the image file wont have alpha, just from its suffix. This is * currently the case for jpeg and bmp. */ public boolean isNoAlpha() { return noAlphaSuffixMap.containsKey(suffix()); } /** * Test type of document this refers to. 
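   * <p>
   * These checks are purely suffix-based, e.g. (hypothetical URL):
   * <pre>{@code
   * ParsedURL feed = ParsedURL.getAbsolute("http://example.com/news/feed.rss");
   * feed.isHTML();  // false
   * feed.isRSS();   // true -- "rss" is in the RSS suffix map
   * }</pre>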
* * @return true if this refers to an HTML file */ public boolean isHTML() { return htmlSuffixMap.containsKey(suffix()); } /** * Test type of document this refers to. * * @return true if this refers to a PDF file */ public boolean isPDF() { return pdfSuffixMap.containsKey(suffix()); } /** * Test type of document this refers to. * * @return true if this refers to an RSS feed */ public boolean isRSS() { return rssSuffixMap.containsKey(suffix()); } int mimeIndex = -1; /** * Get MimeType index by seeing suffix(). * * @param parsedURL */ public int mimeIndex() { if (mimeIndex == -1) { String suffix = suffix(); IntSlot mimeSlot = suffixesToMap.get(suffix); mimeIndex = (mimeSlot != null) ? mimeSlot.value : UNKNOWN_MIME; return mimeIndex; } else return mimeIndex; } public static int mimeIndex(String location) { int afterLastDot = location.lastIndexOf('.') + 1; int result = UNKNOWN_MIME; if ((afterLastDot > 0) && (location.length() > afterLastDot)) { String suffix = location.substring(afterLastDot); IntSlot mimeSlot = suffixesToMap.get(suffix); if (mimeSlot != null) result = mimeSlot.value; } return result; } /** * Get Media MimeType indexes. Media MimeTypes are currently text and all kinds of images such as * JPG, GIF, and PNG. * * @param parsedURL */ public int mediaMimeIndex() { return (mimeIndex() >= MimeType.UNKNOWN_MIME) ? MimeType.UNKNOWN_MIME : mimeIndex(); } /* * Check the suffix whether it is in the unsupportedMimes or not. If it is in the * unsupportedMimes, return true, and if it is not, return false. */ public boolean isUnsupported() { return unsupportedMimes.containsKey(suffix()); } /* * return the inverse of isUnsupported(). Then, if the suffix is in the unsupportedMimes, return * false, and if it is not, return true. */ public boolean supportedMime() { return !isUnsupported(); } /** * @return The directory of this, with protocol and host. */ public String directoryString() { String path = pathDirectoryString(); int portNum = url.getPort(); String port = (portNum == -1) ? "" : ":" + portNum; String host = url.getHost(); String protocol = url.getProtocol(); int stringLength = protocol.length() + 3 + host.length() + port.length() + path.length(); StringBuffer buffy = new StringBuffer(stringLength); buffy.append(protocol).append("://").append(host).append(port).append(path); return buffy.toString(); // dont copy; wont reuse buffy } /** * * @return The directory of this, without protocol and host. */ public String pathDirectoryString() { String path = url.getPath(); int lastSlash = path.lastIndexOf("/"); int lastDot = path.lastIndexOf("."); if (lastDot > lastSlash) path = path.substring(0, lastSlash); return path; } public String path() { return (url == null) ? null : url.getFile(); } /** * Return true if the other object is either a ParsedURL or a URL that refers to the same location * as this. Note: this is our own implementation. It is *much* faster and slightly less careful * than JavaSoft's. Checks port, host, file, protocol, and query. Ignores ref = hash. */ @Override public boolean equals(Object other) { if (other == null) return false; boolean otherIsPURL = other instanceof ParsedURL; boolean otherIsFile = other instanceof File; if (otherIsPURL || otherIsFile) { File otherFile = otherIsFile ? (File) other : ((ParsedURL) other).file; if (file != null) { return file.equals(otherFile); } if (otherFile != null) return false; // other has file but this does not } else if (!(other instanceof URL)) return false; // not a PURL or an URL URL url = this.url; URL otherURL = otherIsPURL ? 
        ((ParsedURL) other).url : (URL) other;

    if (url == null && otherURL == null)
      return true;
    if (url == null || otherURL == null)
      return false;

    // compare port
    if (url.getPort() != otherURL.getPort())
      return false;
    // compare host
    if (!url.getHost().equals(otherURL.getHost()))
      return false;
    // compare file
    if (!url.getFile().equals(otherURL.getFile()))
      return false;
    // compare protocol
    if (!url.getProtocol().equals(otherURL.getProtocol()))
      return false;
    // compare arguments
    return bothNullOrEqual(url.getQuery(), otherURL.getQuery());
  }

  private static boolean bothNullOrEqual(String a, String b)
  {
    return ((a == b) || // both are null or the same string
        ((a != null) && a.equals(b))); // now safe to use a.equals()
  }

  /**
   * Hash this by its URL.
   */
  @Override
  public int hashCode()
  {
    if (url == null && file == null)
      debug("help!");
    return (url != null) ? url.hashCode() : (file != null) ? file.hashCode() : -1;
  }

  /**
   * A shorter string for displaying in the modeline for debugging, and in popup messages.
   */
  public String shortString()
  {
    String shortString = this.shortString;
    if (shortString == null)
    {
      URL url = this.url;
      if (url == null)
        shortString = "null";
      else
      {
        String file = url.getFile();
        shortString = url.getHost() + "/.../" + file.substring(file.lastIndexOf('/') + 1);
      }
      this.shortString = shortString;
    }
    return shortString;
  }

  /**
   * True if this ParsedURL represents an entity on the local file system.
   * 
   * @return true if this is a local File object.
   */
  public boolean isFile()
  {
    return file != null;
  }

  /**
   * @return The file system object associated with this, if this is an entity on the local file
   *         system, or null, otherwise.
   */
  public File file()
  {
    return file;
  }

  /**
   * Form a new ParsedURL from this, and the args passed in. A question mark is appended to the
   * String form of this, and then args are appended.
   * 
   * @param args
   * @return ParsedURL with args after ?
   */
  public ParsedURL withArgs(String args)
  {
    try
    {
      URL url = new URL(toString() + "?" + args);
      return new ParsedURL(url);
    }
    catch (MalformedURLException e)
    {
      return null;
    }
  }

  /**
   * Returns the name of the file or directory denoted by this abstract pathname. This is just the
   * last name in the pathname's name sequence. If the pathname's name sequence is empty, then the
   * empty string is returned.
   * <p/>
   * Analogous to File.getName().
   * 
   * @return Name of this, without directory, host, or protocol.
   */
  public String getName()
  {
    URL url = this.url;
    String path = url.getPath();
    int lastSlash = path.lastIndexOf('/');
    if (lastSlash > -1)
    {
      path = path.substring(lastSlash + 1);
    }
    return path;
  }

  /**
   * Basic ConnectionHelper. Does *nothing special* when encountering directories, re-directs, ...
   */
  private static final ConnectionAdapter connectionAdapter = new ConnectionAdapter();

  // Set the URLConnection timeouts a little smaller than our DownloadMonitor timeout.
  public static final int CONNECT_TIMEOUT = 15000;

  public static final int READ_TIMEOUT = 25000;

  /**
   * Create a connection, using the standard CONNECT_TIMEOUT and READ_TIMEOUT values, and the
   * super-basic ConnectionAdapter, which does *nothing special* when encountering directories,
   * re-directs, ...
   * 
   * @return
   */
  public PURLConnection connect()
  {
    return connect(connectionAdapter);
  }

  public PURLConnection connect(String userAgentName)
  {
    return connect(connectionAdapter, userAgentName);
  }

  /**
   * Create a connection, using the standard CONNECT_TIMEOUT and READ_TIMEOUT values.
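   * <p>
   * Typical use, given some ParsedURL purl (a sketch; stream handling and cleanup omitted):
   * <pre>{@code
   * PURLConnection connection = purl.connect();  // default ConnectionAdapter, default timeouts
   * }</pre>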
* * @param connectionHelper * @return */ public PURLConnection connect(ConnectionHelper connectionHelper) { return connect(connectionHelper, DEFAULT_USER_AGENT, CONNECT_TIMEOUT, READ_TIMEOUT); } public PURLConnection connect(ConnectionHelper connectionHelper, String userAgentString) { return (userAgentString != null) ? connect(connectionHelper, userAgentString, CONNECT_TIMEOUT, READ_TIMEOUT) : connect(connectionHelper); } /** * Create a connection. * * @param connectionHelper * @param userAgent * TODO * @param connectionTimeout * @param readTimeout * @return */ public PURLConnection connect(ConnectionHelper connectionHelper, String userAgent, int connectionTimeout, int readTimeout) { PURLConnection result = new PURLConnection(this); result.connect(connectionHelper, userAgent, connectionTimeout, readTimeout); return result; } /** * Free some memory resources. They can be re-allocated through subsequent lazy evaluation. The * object is still fully functional after this call. */ public void resetCaches() { this.directory = null; this.string = null; this.shortString = null; this.lc = null; this.suffix = null; this.domain = null; if (directoryPURL != null) { this.directoryPURL.recycle(); this.directoryPURL = null; } // TODO -- is this too agressive?! this.hashUrl = null; } /** * Free <b>all</b> all resources associated with this, rendering it no longer usable. */ public void recycle() { resetCaches(); url = null; file = null; } public String host() { return (url == null) ? null : url.getHost(); } /** * * @return A lightweight object corresponding to this, either a URL or a File */ public Object shadow() { return (url != null) ? url : file; } public ParsedURL filterArgs(String...argsToKeep) { if (url != null) { String query = url.getQuery(); StringTokenizer tokenizer = new StringTokenizer(query, "&"); if (!tokenizer.hasMoreElements()) return this; StringBuilder resultQuery = new StringBuilder(noAnchorNoQueryPageString()); // initialize w base URL boolean first = true; while (tokenizer.hasMoreElements()) { String token = tokenizer.nextToken(); for (String argToKeep: argsToKeep) { if (token.startsWith(argToKeep)) { if (first) { first = false; resultQuery.append('?'); } else resultQuery.append('&'); resultQuery.append(token); } } } return getAbsolute(resultQuery.toString()); } return this; } public ParsedURL ignoreArgs(HashMap<String, String> argsToIgnore) { if (url != null) { String query = url.getQuery(); if (query !=null) { StringTokenizer tokenizer = new StringTokenizer(query, "&"); if (!tokenizer.hasMoreElements()) return this; StringBuilder resultQuery = new StringBuilder(noAnchorNoQueryPageString()); // initialize w base URL boolean first = true; while (tokenizer.hasMoreElements()) { String token = tokenizer.nextToken(); int argEnd = token.indexOf('='); String arg = argEnd == -1 ? token : token.substring(0, argEnd); if (!argsToIgnore.containsKey(arg)) { if (first) { first = false; resultQuery.append('?'); } else resultQuery.append('&'); resultQuery.append(token); } } return getAbsolute(resultQuery.toString()); } } return this; } public String query() { return url.getQuery(); } static public void main(String[] args) { try { URL u = new URL("http://acm.org/citation.cfm?id=33344"); System.out.println("query: " + u.getQuery() + "\n" + URLEncoder.encode("?")); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * Extract arguments from the "query" portion of the URL (the part after ?). 
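   * <p>
   * A hedged sketch (hypothetical URL; the actual splitting is delegated to StringTools.doubleSplit):
   * <pre>{@code
   * ParsedURL purl = ParsedURL.getAbsolute("http://example.com/search?q=purl&page=2");
   * HashMap<String, String> params = purl.extractParams(false);  // expected: q -> "purl", page -> "2"
   * }</pre>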
* @param keepEmptyParams TODO * * @return HashMap of String name / value pairs. */ public HashMap<String, String> extractParams(boolean keepEmptyParams) { return StringTools.doubleSplit(url, keepEmptyParams); } /** * Form a new ParsedURL using the base of this, while forming the query from a map of name / value pairs. * * @param newParamMap Map of name / value pairs. * * @return A new ParsedURL based on this one and the input argument map, or this, if that map is the same as in this. */ public ParsedURL updateParams(HashMap<String, String> newParamMap) { HashMap<String, String> oldParamMap = extractParams(true); String newArgString = StringTools.unDoubleSplit(newParamMap); String noArgsNoQuery= StringTools.noAnchorPageString(url, false); ParsedURL result = this; if (newArgString != null && newArgString.length() > 0) { //TODO -- check to see if args are the same or different. result = getAbsolute(noArgsNoQuery + '?' + newArgString); } else if (oldParamMap != null && oldParamMap.size() != 0) { result = getAbsolute(noArgsNoQuery); } return result; } public ParsedURL changeHost(String newHost) { ParsedURL result = null; if (newHost != null && newHost.length() > 0) { int port = url.getPort(); try { URL newURL = (port > 0) ? new URL(url.getProtocol(), newHost, port, url.getFile()) : new URL(url.getProtocol(), newHost, url.getFile()); result = new ParsedURL(newURL); } catch (MalformedURLException e) { e.printStackTrace(); } } return result; } public void setIncludePrefix(boolean includePrefix) { this.includePrefix = includePrefix; } }