package com.grendelscan.commons.http;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.NameValuePair;
import org.apache.http.message.BasicNameValuePair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.grendelscan.commons.StringUtils;
import com.grendelscan.commons.formatting.encoding.UrlEncodingUtils;
import com.grendelscan.commons.http.factories.UriFactory;
/**
 * Static helpers for picking apart, normalizing, escaping and rebuilding URI strings.
 *
 * Most methods work on the eight components produced by {@link #parseUriString(String)} (scheme, host, port, directory, file, session, query, fragment). A component that is absent from the input is
 * represented by the empty string, never by {@code null}.
 */
public class URIStringUtils
{
    private static final Logger LOGGER = LoggerFactory.getLogger(URIStringUtils.class);

    // Positions of the individual components inside the array returned by parseUriString.
    private static final int INDEX_SCHEME = 0;
    private static final int INDEX_HOST = 1;
    private static final int INDEX_PORT = 2;
    private static final int INDEX_DIRECTORY = 3;
    private static final int INDEX_FILE = 4;
    private static final int INDEX_SESSION = 5;
    private static final int INDEX_QUERY = 6;
    private static final int INDEX_FRAGMENT = 7;
    private static final int COMPONENT_COUNT = 8;

    // Layout matched by urlPattern: http :// host :443 /full/ filename ;session ?query #fragment
    // Capture group n + 1 corresponds to component index n.
    // NOTE(review): the session group only excludes '?', so it can swallow a '#fragment' that follows a session ID.
    private final static Pattern urlPattern = Pattern.compile("^(?:" + // beginning of optional hostname group
            "([a-z]+)://" + // scheme
            "([a-z0-9\\-\\.]++)" + // host
            "(?::" + // colon for port and beginning of optional port group
            "(\\d++)" + // port
            ")?" + // end of optional port group
            ")?+" + // end of optional hostname group
            "([^\\?#]*/)?" + // directory
            "([^\\?/#;]*+)" + // file
            "(?:;([^?]+=[^?]+))?" + // session ID
            "(?:\\?([^#]*))?" + // query
            "(?:#(.*))?$", // fragment
            Pattern.CASE_INSENSITIVE);

    // A "//" that is not the separator following an http, https or ftp scheme.
    private static final Pattern DOUBLE_SLASH_DIR = Pattern.compile("(?<!(?:https?|ftp):)//", Pattern.CASE_INSENSITIVE);

    // Characters that get escaped by escapeUri: control characters and space, 0x7f-0xff, the
    // punctuation <>#{}|\^[]`"'() and any '%' that does not start a two-digit hex escape.
    private static final Pattern badUriCharPattern = Pattern.compile("((?:[\\x00-\\x20\\x7f-\\xff\\{\\}\\\\\"'`^#|\\[\\]<>\\(\\)])|(?:%(?![0-9a-f]{2})))", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    // Runs of characters in the range 0x00-0x20 (control characters and space).
    private static final Pattern WHITESPACE = Pattern.compile("[\\x00-\\x20]+");

    /**
     * Verifies that a string is a syntactically valid, absolute http or https URI.
     *
     * @param uri
     *            the URI string to check
     * @throws URISyntaxException
     *             if {@link URI} rejects the string, the URI is not absolute, or its scheme is neither http nor https
     */
    public static void assertAbsoluteHttpAndValid(final String uri) throws URISyntaxException
    {
        URI testUri = new URI(uri);
        if (!testUri.isAbsolute())
        {
            throw new URISyntaxException(uri, "Not absolute URI");
        }
        String scheme = testUri.getScheme();
        if (!(scheme.equalsIgnoreCase("http") || scheme.equalsIgnoreCase("https")))
        {
            throw new URISyntaxException(uri, "Try using a web protocol");
        }
    }

    /**
     * Removes every character in the range 0x00-0x20 (control characters and spaces) from the string.
     *
     * @param uri
     *            the string to clean; may be {@code null}
     * @return the cleaned string, or {@code null} if the input was {@code null}
     */
    public static String cleanupWhitespace(final String uri)
    {
        if (uri == null)
        {
            return uri;
        }
        return WHITESPACE.matcher(uri).replaceAll("");
    }

    /**
     * Replaces every match of capture group 1 of {@code p} with its escaped form: a space becomes "+", anything else becomes "%XX" (upper-case hex of the matched character's code point).
     *
     * The input is processed in a single pass. The replacements never contain characters that the URI pattern matches, so this gives the same result as repeatedly rescanning the string.
     *
     * @param string
     *            the text to escape; may be {@code null}
     * @param p
     *            pattern whose group 1 captures one character to escape
     * @return the escaped text, or {@code null} if the input was {@code null}
     */
    private static String escapeString(final String string, final Pattern p)
    {
        if (string == null)
        {
            return null;
        }
        Matcher m = p.matcher(string);
        // StringBuffer (not StringBuilder) because that is what Matcher.appendReplacement accepts on older JDKs.
        StringBuffer escaped = new StringBuffer(string.length() + 16);
        while (m.find())
        {
            String badchar = m.group(1);
            String replacement;
            if (badchar.equals(" "))
            {
                replacement = "+";
            }
            else
            {
                replacement = String.format("%%%02X", badchar.codePointAt(0));
            }
            m.appendReplacement(escaped, Matcher.quoteReplacement(replacement));
        }
        m.appendTail(escaped);
        return escaped.toString();
    }

    /**
     * Escapes characters that are not allowed to appear literally in a URI. Existing, well-formed "%XX" escapes are left alone; a '%' that is not followed by two hex digits becomes "%25".
     *
     * @param uri
     *            the URI string to escape; may be {@code null}
     * @return the escaped string, or {@code null} if the input was {@code null}
     */
    public static String escapeUri(final String uri)
    {
        return escapeString(uri, badUriCharPattern);
    }

    /**
     * Adds a trailing slash to a URL for hostname-only URLs.
     *
     * The check counts '/' characters starting at index 1; if fewer than three are found, a slash is appended and a warning is logged.
     *
     * @param uri
     *            the URI string; may be {@code null}
     * @return the URI, with a slash appended if fewer than three slashes were found, or {@code null} if the input was {@code null}
     */
    public static String fixBaseUri(final String uri)
    {
        if (uri == null)
        {
            return null;
        }
        int pos = 0;
        int count = 0;
        while (pos < uri.length() && count < 3)
        {
            pos = uri.indexOf('/', pos + 1);
            if (pos < 0)
            {
                break;
            }
            count++;
        }
        if (count < 3)
        {
            LOGGER.warn("Appended slash to base URI {}", uri);
            return uri + "/";
        }
        return uri;
    }

    /**
     * Lists the URI of every directory level on the path of the given URI, from the root down to the URI's own directory. Each entry ends with a slash.
     *
     * @param baseUri
     *            the URI whose directory chain is wanted
     * @return the directory URIs, shortest first
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static List<String> getAllDirectoryURIs(final String baseUri) throws URISyntaxException
    {
        List<String> uris = new ArrayList<String>();
        String[] components = parseUriString(baseUri);
        StringBuilder currentPath = new StringBuilder(hostPrefix(components));
        // The directory starts with "/", so the first section is empty and yields the root directory.
        for (String section : components[INDEX_DIRECTORY].split("/"))
        {
            currentPath.append(section).append('/');
            uris.add(currentPath.toString());
        }
        return uris;
    }

    /**
     * This only returns the directory, not the host, port, scheme, etc.
     *
     * @param uri
     *            the URI to parse
     * @return the directory part including its trailing slash, or "" if there is none
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getDirectory(final String uri) throws URISyntaxException
    {
        return parseUriString(uri)[INDEX_DIRECTORY];
    }

    /**
     * Returns the URI scheme, host, port, and path. No file, query or fragment is included.
     *
     * @param uri
     *            the URI to parse
     * @return scheme, host, port (only if present in the input) and directory
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getDirectoryUri(final String uri) throws URISyntaxException
    {
        String[] components = parseUriString(uri);
        return hostPrefix(components) + components[INDEX_DIRECTORY];
    }

    /**
     * @param uri
     *            the URI to parse
     * @return the file name part of the path, or "" if there is none
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getFilename(final String uri) throws URISyntaxException
    {
        return parseUriString(uri)[INDEX_FILE];
    }

    /**
     * Returns the URI scheme, host, port, path and file. No query or fragment is included. Forces a trailing slash if it's only a host with no path.
     *
     * Only the part before the query/fragment is inspected when deciding whether a slash is needed, so a '/' inside the query string does not suppress it.
     *
     * @param uri
     *            the URI string
     * @return the URI cut off at the first '?' or '#', with a trailing slash added when it contains no path
     */
    public static String getFileUri(final String uri)
    {
        int end = uri.length();
        int queryStart = uri.indexOf('?');
        if (queryStart >= 0)
        {
            end = queryStart;
        }
        int fragmentStart = uri.indexOf('#');
        if (fragmentStart >= 0 && fragmentStart < end)
        {
            end = fragmentStart;
        }
        String fileUri = uri.substring(0, end);
        // Locale-independent, case-insensitive test for a leading "http" (covers https as well).
        boolean absolute = fileUri.regionMatches(true, 0, "http", 0, 4);
        int lastSlash = fileUri.lastIndexOf('/');
        // For absolute URIs index 7 is the last possible position of the "//" after the scheme.
        boolean needsSlash = absolute ? lastSlash <= 7 : lastSlash == -1;
        return needsSlash ? fileUri + "/" : fileUri;
    }

    /**
     * Finds the value of the first query parameter with the given name (compared case-insensitively).
     *
     * @param URI
     *            the URI whose query is searched
     * @param parameterName
     *            the parameter name to look for
     * @return the value of the first match, or {@code null} if no parameter has that name
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getFirstQueryParameter(final String URI, final String parameterName) throws URISyntaxException
    {
        for (NameValuePair param : getQueryParametersFromUri(URI))
        {
            if (param.getName().equalsIgnoreCase(parameterName))
            {
                return param.getValue();
            }
        }
        return null;
    }

    /**
     * @param uri
     *            the URI to parse
     * @return the fragment without the leading '#', or "" if there is none
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getFragment(final String uri) throws URISyntaxException
    {
        return parseUriString(uri)[INDEX_FRAGMENT];
    }

    /**
     * @param uri
     *            the URI to parse
     * @return the host, or "" if the URI is relative
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getHost(final String uri) throws URISyntaxException
    {
        return parseUriString(uri)[INDEX_HOST];
    }

    /**
     * @param uri
     *            the URI to parse
     * @return The scheme, host and port (if in original URI), followed by a slash
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getHostUri(final String uri) throws URISyntaxException
    {
        return getHostUriWithoutTrailingSlash(uri) + "/";
    }

    /**
     * @param uri
     *            the URI to parse
     * @return The scheme, host and port (if in original URI), but no slash
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getHostUriWithoutTrailingSlash(final String uri) throws URISyntaxException
    {
        return hostPrefix(parseUriString(uri));
    }

    /**
     * Returns the port of the URI, falling back to a default when none is given: 0 for a URI without a scheme, 443 for https and 80 for every other scheme.
     *
     * @param uri
     *            the URI to parse
     * @return the explicit or default port
     * @throws URISyntaxException
     *             if the URI cannot be parsed or its port does not fit into an int
     */
    public static int getPort(final String uri) throws URISyntaxException
    {
        String[] components = parseUriString(uri);
        String portString = components[INDEX_PORT];
        if (portString.isEmpty())
        {
            String scheme = components[INDEX_SCHEME];
            if (scheme.isEmpty())
            {
                return 0;
            }
            return scheme.equalsIgnoreCase("https") ? 443 : 80;
        }
        try
        {
            return Integer.parseInt(portString);
        }
        catch (NumberFormatException e)
        {
            // The pattern guarantees digits only, so this can only be an overflow.
            URISyntaxException syntaxException = new URISyntaxException(uri, "Port is out of range: " + portString);
            syntaxException.initCause(e);
            throw syntaxException;
        }
    }

    /**
     * @param uri
     *            the URI to parse
     * @return the query without the leading '?', or "" if there is none
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getQuery(final String uri) throws URISyntaxException
    {
        return parseUriString(uri)[INDEX_QUERY];
    }

    /**
     * Splits a raw query string into name/value pairs. Pairs are separated by '&'; the name is everything before the first '=' of a pair. A pair without '=' gets an empty value. Nothing is decoded.
     *
     * @param query
     *            the query string without the leading '?'; may be {@code null} or empty
     * @return the parameters in their original order; empty if the query is {@code null} or empty
     */
    public static List<NameValuePair> getQueryParametersFromQuery(final String query)
    {
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        if (query == null || query.isEmpty())
        {
            return params;
        }
        for (String rawPair : query.split("&"))
        {
            int eqPos = rawPair.indexOf('=');
            if (eqPos >= 0)
            {
                params.add(new BasicNameValuePair(rawPair.substring(0, eqPos), rawPair.substring(eqPos + 1)));
            }
            else
            {
                params.add(new BasicNameValuePair(rawPair, ""));
            }
        }
        return params;
    }

    /**
     * Extracts the query of a URI and splits it with {@link #getQueryParametersFromQuery(String)}.
     *
     * @param URI
     *            the URI to parse
     * @return the query parameters in their original order
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static List<NameValuePair> getQueryParametersFromUri(final String URI) throws URISyntaxException
    {
        return getQueryParametersFromQuery(getQuery(URI));
    }

    /**
     * Returns the directory, file, query and fragment of a URI, i.e. everything after the host and port. The session component is not included.
     *
     * @param uri
     *            the URI to parse
     * @return the relative part of the URI
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getRelativeUri(final String uri) throws URISyntaxException
    {
        String[] components = parseUriString(uri);
        StringBuilder sb = new StringBuilder();
        sb.append(components[INDEX_DIRECTORY]).append(components[INDEX_FILE]);
        if (!components[INDEX_QUERY].isEmpty())
        {
            sb.append('?').append(components[INDEX_QUERY]);
        }
        if (!components[INDEX_FRAGMENT].isEmpty())
        {
            sb.append('#').append(components[INDEX_FRAGMENT]);
        }
        return sb.toString();
    }

    /**
     * @param uri
     *            the URI to parse
     * @return the scheme as written in the input, or "" if the URI is relative
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getScheme(final String uri) throws URISyntaxException
    {
        return parseUriString(uri)[INDEX_SCHEME];
    }

    /**
     * Will return something like http://www.grendel-scan.com:80 It doesn't check to see if this is an absolute URL, so you should check if it could be relative.
     *
     * If the URI has no explicit port, the default port of http (80) or https (443) is used; for any other scheme the port and its colon are left out.
     *
     * @param uri
     *            the URI to parse
     * @return scheme, host and port
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getSchemeHostPort(final String uri) throws URISyntaxException
    {
        String[] components = parseUriString(uri);
        String port = components[INDEX_PORT];
        if (port.isEmpty())
        {
            port = defaultPortFor(components[INDEX_SCHEME]);
        }
        StringBuilder sb = new StringBuilder();
        sb.append(components[INDEX_SCHEME]).append("://").append(components[INDEX_HOST]);
        if (!port.isEmpty())
        {
            sb.append(':').append(port);
        }
        return sb.toString();
    }

    /**
     * @param uri
     *            the URI to parse
     * @return the session component (the "name=value" after a ';' in the path), or "" if there is none
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String getSession(final String uri) throws URISyntaxException
    {
        return parseUriString(uri)[INDEX_SESSION];
    }

    /**
     * @param uri
     *            the URI string to check
     * @return {@code true} if {@link URI} considers the string an absolute URI
     * @throws URISyntaxException
     *             if {@link URI} rejects the string
     */
    public static boolean isAbsolute(final String uri) throws URISyntaxException
    {
        return new URI(uri).isAbsolute();
    }

    /**
     * Checks to see if a URI is usable from a web point of view. Currently, it checks for a scheme of http, https or nothing, and a host if there is a scheme. mailto: and javascript: URLs are never
     * usable.
     *
     * @param uri
     *            the URI string; may be {@code null}
     * @return {@code true} if the URI is relative or an http/https URI with a host
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static boolean isUsableUri(final String uri) throws URISyntaxException
    {
        if (uri == null || uri.isEmpty())
        {
            return false;
        }
        // regionMatches(ignoreCase) is locale-independent, unlike toLowerCase() with the default locale.
        if (uri.regionMatches(true, 0, "javascript:", 0, 11) || uri.regionMatches(true, 0, "mailto:", 0, 7))
        {
            return false;
        }
        String[] components = parseUriString(uri);
        String scheme = components[INDEX_SCHEME];
        if (scheme.isEmpty())
        {
            return true;
        }
        boolean webScheme = scheme.equalsIgnoreCase("http") || scheme.equalsIgnoreCase("https");
        return webScheme && !components[INDEX_HOST].isEmpty();
    }

    /**
     * Manual smoke test: prints the parsed components of a set of sample URIs to standard output.
     *
     * @param args
     *            ignored
     */
    public static void main(final String[] args)
    {
        printComponents("asdf");
        printComponents("asdf?fds");
        printComponents("as/df");
        printComponents("http://localhost");
        printComponents("http://localhost:88");
        printComponents("http://localhost:88#fds");
        printComponents("http://localhost:88?q=a&b=c");
        printComponents("http://127.0.0.1:88?q=a&b=c");
        printComponents("/fdsa/1111/asdf.fds?q=a&b=c");
        printComponents("/asdf?q=a&b=c");
        printComponents("/asdf?q=a&b=c#fdsa");
        printComponents("/");
        printComponents("/asdf");
        printComponents("#asdf");
        printComponents("http://www.example.com/classics/index.html");
        printComponents("http://www.example.com/classics/;JSESS=123456");
        printComponents("http://www.example.com/classics/;veryodd/index.html");
        printComponents("http://www.example.com/classics/index.html;JSESS=123456");
        printComponents("http://www.example.com/classics/index.html;JSESS=123456?a=1&b=2");
        printComponents("http://www.example.com/classics/index.html;JSESS=123456?a=1&b=2&c=asdf;fdsa");
        printComponents("http://www.example.com/classics/index.html" + "?a=1&b=2&c=asdf;fdsa");
    }

    /**
     * Takes the URI and adds a port (if the scheme is http or https), sorts the parameter order, removes duplicate parameter names (if this breaks your stuff, follow the standard (RTFRFC)), removes
     * extra ampersands (&amp;) in the query, sets the scheme and host to all lower case (path and query can be case sensitive depending on OS and web server), and removes the fragment (since it isn't
     * relevant for requests).
     *
     * @param uri
     *            the URI to normalize
     * @return the normalized URI, without fragment
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String normalizeUri(final String uri) throws URISyntaxException
    {
        String[] components = parseUriString(uri);
        String port = components[INDEX_PORT];
        if (port.isEmpty())
        {
            port = defaultPortFor(components[INDEX_SCHEME]);
        }

        StringBuilder normalized = new StringBuilder(uri.length() + 8);
        if (!components[INDEX_SCHEME].isEmpty())
        {
            // Locale.ROOT keeps the result independent of the JVM's default locale.
            normalized.append(components[INDEX_SCHEME].toLowerCase(Locale.ROOT)).append("://");
        }
        normalized.append(components[INDEX_HOST].toLowerCase(Locale.ROOT));
        if (!port.isEmpty())
        {
            normalized.append(':').append(port);
        }
        normalized.append(components[INDEX_DIRECTORY]);
        normalized.append(components[INDEX_FILE]);
        if (!components[INDEX_SESSION].isEmpty())
        {
            normalized.append(';').append(components[INDEX_SESSION]);
        }
        if (!components[INDEX_QUERY].isEmpty())
        {
            String query = normalizeUriQuery(components[INDEX_QUERY]);
            if (!query.isEmpty())
            {
                normalized.append('?').append(query);
            }
        }
        // The fragment is intentionally left out; it is never sent to the server.
        return normalized.toString();
    }

    /**
     * Normalizes a query string: parameters are sorted case-insensitively, empty segments (from repeated or leading/trailing '&amp;') are dropped, only the first parameter of each name is kept, and
     * every parameter is written as "name=value" (a parameter without '=' gets an empty value).
     *
     * @param query
     *            the query string without the leading '?'; may be {@code null} or empty
     * @return the normalized query; "" if the input is {@code null} or contains no parameters
     */
    public static String normalizeUriQuery(final String query)
    {
        if (query == null || query.isEmpty())
        {
            return "";
        }
        String[] parameters = query.split("&");
        Arrays.sort(parameters, String.CASE_INSENSITIVE_ORDER);
        Set<String> seenNames = new HashSet<String>(parameters.length);
        StringBuilder normalized = new StringBuilder(query.length());
        for (String parameter : parameters)
        {
            if (parameter.isEmpty())
            {
                continue;
            }
            int eq = parameter.indexOf('=');
            String name = eq >= 0 ? parameter.substring(0, eq) : parameter;
            String value = eq >= 0 ? parameter.substring(eq + 1) : "";
            // Set.add returns false when the name was already present, i.e. for a duplicate.
            if (!seenNames.add(name))
            {
                continue;
            }
            if (normalized.length() > 0)
            {
                normalized.append('&');
            }
            normalized.append(name).append('=').append(value);
        }
        return normalized.toString();
    }

    /**
     * Splits a URI string into its eight components using {@code urlPattern}.
     *
     * @param uri
     *            the URI string to parse
     * @return an array indexed by the {@code INDEX_*} constants; missing components are ""
     * @throws URISyntaxException
     *             if the string is {@code null} or does not match the pattern
     */
    private static String[] parseUriString(final String uri) throws URISyntaxException
    {
        if (uri == null)
        {
            throw new URISyntaxException("null", "URI string is null");
        }
        Matcher m = urlPattern.matcher(uri);
        if (!m.matches())
        {
            throw new URISyntaxException(uri, uri + " did not match regex");
        }
        String[] components = new String[COMPONENT_COUNT];
        for (int index = 0; index < COMPONENT_COUNT; index++)
        {
            // Capture groups are 1-based and laid out in the same order as the INDEX_* constants.
            String group = m.group(index + 1);
            components[index] = group == null ? "" : group;
        }
        return components;
    }

    /**
     * Prints the parsed components of a URI to standard output; parse failures are logged.
     *
     * @param uri
     *            the URI to parse and print
     */
    private static void printComponents(final String uri)
    {
        try
        {
            String[] comps = parseUriString(uri);
            System.out.println(uri + "\n" + "Scheme: " + comps[INDEX_SCHEME] + "\n" + "Host: " + comps[INDEX_HOST] + "\n" + "Port: " + comps[INDEX_PORT] + "\n" + "Dir: " + comps[INDEX_DIRECTORY] + "\n" + "File: " + comps[INDEX_FILE] + "\n"
                    + "Session: " + comps[INDEX_SESSION] + "\n" + "Query: " + comps[INDEX_QUERY] + "\n" + "Frag: " + comps[INDEX_FRAGMENT] + "\n" + "\n\n");
        }
        catch (URISyntaxException e)
        {
            LOGGER.warn("Could not parse URI " + uri, e);
        }
    }

    /**
     * Joins scheme, host and (if present) port of parsed components into "scheme://host[:port]".
     *
     * @param components
     *            array produced by {@link #parseUriString(String)}
     * @return the joined prefix, without trailing slash
     */
    private static String hostPrefix(final String[] components)
    {
        StringBuilder sb = new StringBuilder();
        sb.append(components[INDEX_SCHEME]).append("://").append(components[INDEX_HOST]);
        if (!components[INDEX_PORT].isEmpty())
        {
            sb.append(':').append(components[INDEX_PORT]);
        }
        return sb.toString();
    }

    /**
     * @param scheme
     *            a URI scheme, compared case-insensitively
     * @return "443" for https, "80" for http, "" for anything else
     */
    private static String defaultPortFor(final String scheme)
    {
        if ("https".equalsIgnoreCase(scheme))
        {
            return "443";
        }
        if ("http".equalsIgnoreCase(scheme))
        {
            return "80";
        }
        return "";
    }

    /**
     * Rebuilds a URI string from its components, in the same order in which {@link #parseUriString(String)} reads them: scheme, host, port, directory, file, ";session", "?query", "#fragment".
     *
     * @param components
     *            array indexed by the {@code INDEX_*} constants
     * @return the assembled URI
     */
    private static String reconstituteUri(final String[] components)
    {
        StringBuilder sb = new StringBuilder();
        if (!components[INDEX_SCHEME].isEmpty())
        {
            sb.append(components[INDEX_SCHEME]).append("://");
        }
        sb.append(components[INDEX_HOST]);
        if (!components[INDEX_PORT].isEmpty())
        {
            sb.append(':').append(components[INDEX_PORT]);
        }
        sb.append(components[INDEX_DIRECTORY]);
        sb.append(components[INDEX_FILE]);
        // The session ID belongs to the path, so it has to precede the query.
        if (!components[INDEX_SESSION].isEmpty())
        {
            sb.append(';').append(components[INDEX_SESSION]);
        }
        if (!components[INDEX_QUERY].isEmpty())
        {
            sb.append('?').append(components[INDEX_QUERY]);
        }
        if (!components[INDEX_FRAGMENT].isEmpty())
        {
            sb.append('#').append(components[INDEX_FRAGMENT]);
        }
        return sb.toString();
    }

    /**
     * Collapses every "//" into "/" except the one that directly follows an http:, https: or ftp: scheme.
     *
     * @param uri
     *            the URI string; may be {@code null}
     * @return the cleaned string, or {@code null} if the input was {@code null}
     */
    public static String removeDoubleSlashesFromDir(final String uri)
    {
        if (uri == null)
        {
            return uri;
        }
        return DOUBLE_SLASH_DIR.matcher(uri).replaceAll("/");
    }

    /**
     * Removes every occurrence of a query parameter (with or without a value) from a URI.
     *
     * A URI without a query is returned unchanged. Otherwise the result is built from {@link #getFileUri(String)}, so a fragment in the input is dropped; the "?" is omitted when no parameters remain.
     *
     * @param uri
     *            the URI to modify
     * @param parameterName
     *            exact (case-sensitive) name of the parameter to remove
     * @return the URI without that parameter
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String removeQueryParameter(final String uri, final String parameterName) throws URISyntaxException
    {
        String query = getQuery(uri);
        if (query.isEmpty())
        {
            return uri;
        }
        // Match a whole "name" or "name=value" segment; the lookahead stops "a" from matching "ab=1".
        String segment = "(?:&|^)" + Pattern.quote(parameterName) + "(?:=[^&]*)?(?=&|$)";
        String remaining = query.replaceAll(segment, "").replaceFirst("^&", "");
        String fileUri = getFileUri(uri);
        return remaining.isEmpty() ? fileUri : fileUri + "?" + remaining;
    }

    /**
     * @param uri
     *            the URI to modify
     * @param newFilename
     *            the file name to put in place of the current one
     * @return the URI with its file component replaced
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String replaceFilename(final String uri, final String newFilename) throws URISyntaxException
    {
        String[] components = parseUriString(uri);
        components[INDEX_FILE] = newFilename;
        return reconstituteUri(components);
    }

    /**
     * @param uri
     *            the URI to modify
     * @param newQuery
     *            the query (without '?') to put in place of the current one
     * @return the URI with its query component replaced
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String replaceQuery(final String uri, final String newQuery) throws URISyntaxException
    {
        String[] components = parseUriString(uri);
        components[INDEX_QUERY] = newQuery;
        return reconstituteUri(components);
    }

    /**
     * Replaces the value of every occurrence of a query parameter with the encoded form of a new value. Parameters that are not present are not added.
     *
     * A URI without a query is returned unchanged. Otherwise the result is built from {@link #getFileUri(String)}, so a fragment in the input is dropped.
     *
     * @param uri
     *            the URI to modify
     * @param parameterName
     *            exact (case-sensitive) name of the parameter to change
     * @param parameterValue
     *            the new, unencoded value
     * @return the URI with the parameter value replaced
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String replaceQueryParameter(final String uri, final String parameterName, final String parameterValue) throws URISyntaxException
    {
        String query = getQuery(uri);
        if (query.isEmpty())
        {
            return uri;
        }
        String encodedValue = new String(UrlEncodingUtils.encodeForParam(parameterValue.getBytes()));
        // $1 restores the "&" (or nothing at the start); the rest is quoted so '$' and '\' in the value stay literal.
        String replacement = "$1" + Matcher.quoteReplacement(parameterName + "=" + encodedValue);
        String newQuery = query.replaceAll("(^|&)" + Pattern.quote(parameterName) + "=[^&]*", replacement);
        return getFileUri(uri) + "?" + newQuery;
    }

    /**
     * @param uri
     *            the URI to modify
     * @param newSession
     *            the session component (without ';') to put in place of the current one
     * @return the URI with its session component replaced
     * @throws URISyntaxException
     *             if the URI cannot be parsed
     */
    public static String replaceSession(final String uri, final String newSession) throws URISyntaxException
    {
        String[] components = parseUriString(uri);
        components[INDEX_SESSION] = newSession;
        return reconstituteUri(components);
    }

    /**
     * Builds a query string ("name=value&amp;name=value") from name/value pairs, encoding names and values with {@code UrlEncodingUtils.encodeForParam}. Pairs with a {@code null} name are skipped; a
     * {@code null} value is written as an empty value.
     *
     * @param pairs
     *            the parameters to encode
     * @return the encoded query string, without a leading '?'
     */
    public static String urlEncode(final Iterable<NameValuePair> pairs)
    {
        StringBuilder buf = new StringBuilder();
        for (NameValuePair pair : pairs)
        {
            if (pair.getName() == null)
            {
                continue;
            }
            if (buf.length() > 0)
            {
                buf.append('&');
            }
            buf.append(new String(UrlEncodingUtils.encodeForParam(pair.getName().getBytes())));
            buf.append('=');
            if (pair.getValue() != null)
            {
                buf.append(new String(UrlEncodingUtils.encodeForParam(pair.getValue().getBytes())));
            }
        }
        return buf.toString();
    }

    /**
     * Array variant of {@link #urlEncode(Iterable)}.
     *
     * @param pairs
     *            the parameters to encode
     * @return the encoded query string, without a leading '?'
     */
    public static String urlEncode(final NameValuePair[] pairs)
    {
        return urlEncode(Arrays.asList(pairs));
    }

    /**
     * Checks whether {@code UriFactory.makeUri} accepts the string and, optionally, whether the result is absolute.
     *
     * @param URI
     *            the URI string to validate
     * @param mustBeAbsolute
     *            if {@code true}, a relative URI is reported as invalid
     * @return {@code true} if the URI could be built and satisfies the absoluteness requirement
     */
    public static boolean validateURISyntax(final String URI, final boolean mustBeAbsolute)
    {
        try
        {
            URI parsed = UriFactory.makeUri(URI, false);
            return !mustBeAbsolute || parsed.isAbsolute();
        }
        catch (URISyntaxException e)
        {
            return false;
        }
    }
}