package org.archive.url;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
//import org.apache.http.client.utils.Punycode;

/**
 * Static helpers that split a URL string into its components (scheme,
 * user info, host, port, path, query, fragment) using a regular expression
 * derived from RFC 2396 Appendix B, and hand the pieces to {@link HandyURL}.
 */
public class URLParser {

    /**
     * RFC 2396-inspired regex.
     *
     * From the RFC Appendix B:
     * <pre>
     * URI Generic Syntax                August 1998
     *
     * B. Parsing a URI Reference with a Regular Expression
     *
     * As described in Section 4.3, the generic URI syntax is not sufficient
     * to disambiguate the components of some forms of URI.  Since the
     * "greedy algorithm" described in that section is identical to the
     * disambiguation method used by POSIX regular expressions, it is
     * natural and commonplace to use a regular expression for parsing the
     * potential four components and fragment identifier of a URI reference.
     *
     * The following line is the regular expression for breaking-down a URI
     * reference into its components.
     *
     *      ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
     *       12            3  4          5       6  7        8 9
     *
     * The numbers in the second line above are only to assist readability;
     * they indicate the reference points for each subexpression (i.e., each
     * paired parenthesis).  We refer to the value matched for subexpression
     * &lt;n&gt; as $&lt;n&gt;.  For example, matching the above expression to
     *
     *      http://www.ics.uci.edu/pub/ietf/uri/#Related
     *
     * results in the following subexpression matches:
     *
     *      $1 = http:
     *      $2 = http
     *      $3 = //www.ics.uci.edu
     *      $4 = www.ics.uci.edu
     *      $5 = /pub/ietf/uri/
     *      $6 = &lt;undefined&gt;
     *      $7 = &lt;undefined&gt;
     *      $8 = #Related
     *      $9 = Related
     *
     * where &lt;undefined&gt; indicates that the component is not present, as is
     * the case for the query component in the above example.  Therefore, we
     * can determine the value of the four components and fragment as
     *
     *      scheme    = $2
     *      authority = $4
     *      path      = $5
     *      query     = $7
     *      fragment  = $9
     * </pre>
     *
     * --
     * <p>Below differs from the rfc regex in that...
     * (1) it has java escaping of regex characters
     * (2) we allow a URI made of a fragment only (Added extra
     * group so indexing is off by one after scheme).
     * (3) scheme is limited to legal scheme characters
     */
    public static final Pattern RFC2396REGEX = Pattern.compile(
            "^(([a-zA-Z][a-zA-Z0-9\\+\\-\\.]*):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?");
    //        12                                 34  5          6       7   8          9 A
    //                                       2 1             54        6          87 3      A9
    // The first row marks where each group opens, the second where it closes.
    // 1: scheme:
    // 2: scheme
    // 3: //authority/path?query
    // 4: //authority
    // 5: authority
    // 6: path
    // 7: ?query
    // 8: query
    // 9: #fragment
    // A: fragment

    public static final String COMMERCIAL_AT = "@";
    public static final char PERCENT_SIGN = '%';
    public static final char COLON = ':';

    /** Regex matching runs of TAB, CR, LF and Unicode line/paragraph separators. */
    public static final String STRAY_SPACING = "[\n\r\t\\p{Zl}\\p{Zp}\u0085]+";

    /** Precompiled form of {@link #STRAY_SPACING}, so parse() does not recompile it per call. */
    private static final Pattern STRAY_SPACING_PATTERN = Pattern.compile(STRAY_SPACING);

    /**
     * Pattern that looks for case of three or more slashes after the
     * scheme.  If found, we replace them with two only as mozilla does.
     */
    static final Pattern HTTP_SCHEME_SLASHES =
            Pattern.compile("^(https?://)/+(.*)");

    /**
     * ARC/WARC specific DNS resolution record.
     */
    public static final String DNS_SCHEME = "dns:";
    /**
     * ARC header record.
     */
    public static final String FILEDESC_SCHEME = "filedesc:";
    /**
     * WARC header record.
     */
    public static final String WARCINFO_SCHEME = "warcinfo:";
    /**
     * HTTP
     */
    public static final String HTTP_SCHEME = "http://";
    /**
     * HTTPS
     */
    public static final String HTTPS_SCHEME = "https://";
    /**
     * FTP
     */
    public static final String FTP_SCHEME = "ftp://";
    /**
     * MMS
     */
    public static final String MMS_SCHEME = "mms://";
    /**
     * RTSP
     */
    public static final String RTSP_SCHEME = "rtsp://";

    /**
     * Default scheme to assume if unspecified. No context implied...
     */
    public static final String DEFAULT_SCHEME = HTTP_SCHEME;

    /**
     * go brewster
     */
    public static final String WAIS_SCHEME = "wais://";

    /**
     * array of static Strings for all "known" schemes
     */
    public static final String ALL_SCHEMES[] = {
        HTTP_SCHEME,
        HTTPS_SCHEME,
        FTP_SCHEME,
        MMS_SCHEME,
        RTSP_SCHEME,
        WAIS_SCHEME
    };

    /** Case-insensitive match for a string that starts with one of the known schemes. */
    public static final Pattern ALL_SCHEMES_PATTERN =
            Pattern.compile("(?i)^(http|https|ftp|mms|rtsp|wais)://.*");

    /**
     * Attempt to find the scheme (http://, https://, etc) from a given URL.
     * The comparison is case-sensitive against {@link #ALL_SCHEMES}.
     *
     * @param url URL String to parse for a scheme.
     * @return the scheme, including trailing "://" if known, null otherwise
     *         (including when {@code url} is null).
     */
    public static String urlToScheme(final String url) {
        if (url == null) {
            return null;
        }
        for (final String scheme : ALL_SCHEMES) {
            if (url.startsWith(scheme)) {
                return scheme;
            }
        }
        return null;
    }

    /**
     * Prefixes {@link #DEFAULT_SCHEME} to the argument unless it already
     * starts with one of the schemes recognised by
     * {@link #ALL_SCHEMES_PATTERN}.
     *
     * @param urlString URL, possibly without a scheme; may be null.
     * @return null if the argument is null, the argument itself if it already
     *         starts with a known scheme, otherwise the argument with
     *         {@link #DEFAULT_SCHEME} prepended.
     */
    public static String addDefaultSchemeIfNeeded(String urlString) {
        if (urlString == null) {
            return null;
        }
        // lookingAt() rather than matches(): the pattern ends in ".*", and "."
        // does not match line terminators, so matches() would miss a URL that
        // has a known scheme but contains a line break and we would prepend a
        // second scheme to it. Only the prefix matters here.
        if (ALL_SCHEMES_PATTERN.matcher(urlString).lookingAt()) {
            return urlString;
        }
        return DEFAULT_SCHEME + urlString;
    }

    /**
     * Parses a URL string into a {@link HandyURL}.
     *
     * <p>Steps, in order: trim the input; remove every run of characters
     * matched by {@link #STRAY_SPACING}; return an opaque HandyURL for
     * {@code dns:}, {@code filedesc:} and {@code warcinfo:} records; prepend
     * the default scheme if no known scheme is present; collapse three or
     * more slashes after {@code http:}/{@code https:} to two; split the result
     * with {@link #RFC2396REGEX}; finally split the authority into user name,
     * password, host and port.
     *
     * <p>The first colon after the user info is taken as the port separator,
     * so bracketed IPv6 literals are not handled by this method.
     *
     * @param urlString the URL to parse; must not be null (a null argument
     *        fails with a NullPointerException on the initial trim).
     * @return the parsed URL; the port is {@code HandyURL.DEFAULT_PORT} when
     *         the authority carries no port.
     * @throws URIException if the string does not match the URI regex or the
     *         port is not a valid integer.
     */
    public static HandyURL parse(String urlString) throws URIException {
        // first strip leading or trailing spaces:
        // TODO: this strips too much - stripping non-printables
        urlString = urlString.trim();

        // then remove leading, trailing, and internal TAB, CR, LF:
        urlString = STRAY_SPACING_PATTERN.matcher(urlString).replaceAll("");

        // check for non-standard URLs:
        if (urlString.startsWith(DNS_SCHEME)
                || urlString.startsWith(FILEDESC_SCHEME)
                || urlString.startsWith(WARCINFO_SCHEME)) {
            HandyURL h = new HandyURL();
            // TODO: we could set the authority - to allow SURT stuff to work..
            h.setOpaque(urlString);
            return h;
        }

        // add http:// if no scheme is present..
        urlString = addDefaultSchemeIfNeeded(urlString);

        // replace leading http:/// with http://
        Matcher slashes = HTTP_SCHEME_SLASHES.matcher(urlString);
        if (slashes.matches()) {
            urlString = slashes.group(1) + slashes.group(2);
        }

        // cross fingers, toes, eyes...
        Matcher matcher = RFC2396REGEX.matcher(urlString);
        if (!matcher.matches()) {
            throw new URIException("No Regex URI Match:" + urlString);
        }
        String uriScheme = matcher.group(2);
        String uriAuthority = matcher.group(5);
        String uriPath = matcher.group(6);
        // Group 8 is the bare query; group 7 would include the leading '?'.
        String uriQuery = matcher.group(8);
        String uriFragment = matcher.group(10);

        // Split Authority into USER:PASS@HOST:PORT
        String userName = null;
        String userPass = null;
        String hostname = null;
        int port = HandyURL.DEFAULT_PORT;

        // At this point the string always starts with "<known scheme>://", so
        // the authority group takes part in the match; the null check only
        // guards against that precondition changing.
        if (uriAuthority != null) {
            int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
            // Only look for the port colon after the user info, whose own
            // colon separates user name and password.
            int portColonIndex = uriAuthority.indexOf(COLON, Math.max(atIndex, 0));

            int hostStart = atIndex + 1; // 0 when there is no user info
            int hostEnd = (portColonIndex < 0) ? uriAuthority.length() : portColonIndex;
            hostname = uriAuthority.substring(hostStart, hostEnd);

            if (portColonIndex >= 0) {
                port = parsePort(uriAuthority.substring(portColonIndex + 1), urlString);
            }

            if (atIndex >= 0) {
                // Exclude the '@' delimiter itself from the user info.
                String userInfo = uriAuthority.substring(0, atIndex);
                int passColonIndex = userInfo.indexOf(COLON);
                if (passColonIndex < 0) {
                    // no password:
                    userName = userInfo;
                } else {
                    // Exclude the ':' delimiter from both halves.
                    userName = userInfo.substring(0, passColonIndex);
                    userPass = userInfo.substring(passColonIndex + 1);
                }
            }
        }

        return new HandyURL(uriScheme, userName, userPass, hostname,
                port, uriPath, uriQuery, uriFragment);
    }

    /** Converts the text after the port colon to an int, reporting bad values as URIException. */
    private static int parsePort(String portText, String urlString) throws URIException {
        try {
            return Integer.parseInt(portText);
        } catch (NumberFormatException e) {
            throw new URIException(String.format("Bad port(%s) in (%s)",
                    portText, urlString));
        }
    }
}