package org.altbeacon.beacon.utils;

import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Provides encoding / decoding functions for the URL beacon
 * https://github.com/google/uribeacon
 *
 * The compressed form is one scheme byte (http/https, with or without "www.")
 * followed by the remaining URL characters, where a recognised top level
 * domain (optionally followed by a slash) is replaced by a single byte.
 */
public class UrlBeaconUrlCompressor {

    // Host part is written as label(.label)* with an optional trailing dot. This accepts the
    // same strings as a repeated "label + optional dot" group, but has no nested quantifier,
    // so input that does not match is rejected without exponential backtracking.
    private static final String EDDYSTONE_URL_REGEX =
            "^((?i)http|https):\\/\\/((?i)www\\.)?([0-9a-zA-Z_-]+(?:\\.[0-9a-zA-Z_-]+)*\\.?)(/?)([./0-9a-zA-Z_-]*)";

    // Compiled once; Pattern instances are immutable and safe to share.
    private static final Pattern EDDYSTONE_URL_PATTERN = Pattern.compile(EDDYSTONE_URL_REGEX);

    // Break into components
    private static final int EDDYSTONE_URL_PROTOCOL_GROUP = 1;
    private static final int EDDYSTONE_URL_WWW_GROUP = 2;
    private static final int EDDYSTONE_URL_FQDN_GROUP = 3;
    private static final int EDDYSTONE_URL_SLASH_GROUP = 4;
    private static final int EDDYSTONE_URL_PATH_GROUP = 5;

    private static final String URL_PROTOCOL_HTTP_WWW_DOT = "http://www.";
    private static final String URL_PROTOCOL_HTTPS_WWW_DOT = "https://www.";
    private static final String URL_PROTOCOL_HTTP = "http";
    private static final String URL_PROTOCOL_HTTP_COLON_SLASH_SLASH = "http://";
    private static final String URL_PROTOCOL_HTTPS_COLON_SLASH_SLASH = "https://";
    private static final String URL_HOST_WWW = "www.";
    private static final String URL_TLD_DOT_COM = ".com";
    private static final String URL_TLD_DOT_ORG = ".org";
    private static final String URL_TLD_DOT_EDU = ".edu";
    private static final String URL_TLD_DOT_NET = ".net";
    private static final String URL_TLD_DOT_INFO = ".info";
    private static final String URL_TLD_DOT_BIZ = ".biz";
    private static final String URL_TLD_DOT_GOV = ".gov";
    private static final String URL_TLD_DOT_COM_SLASH = ".com/";
    private static final String URL_TLD_DOT_ORG_SLASH = ".org/";
    private static final String URL_TLD_DOT_EDU_SLASH = ".edu/";
    private static final String URL_TLD_DOT_NET_SLASH = ".net/";
    private static final String URL_TLD_DOT_INFO_SLASH = ".info/";
    private static final String URL_TLD_DOT_BIZ_SLASH = ".biz/";
    private static final String URL_TLD_DOT_GOV_SLASH = ".gov/";

    private static final byte EDDYSTONE_URL_PROTOCOL_HTTP_WWW = 0x00;
    private static final byte EDDYSTONE_URL_PROTOCOL_HTTPS_WWW = 0x01;
    private static final byte EDDYSTONE_URL_PROTOCOL_HTTP = 0x02;
    private static final byte EDDYSTONE_URL_PROTOCOL_HTTPS = 0x03;

    private static final byte EDDYSTONE_URL_COM_SLASH = 0x00;
    private static final byte EDDYSTONE_URL_ORG_SLASH = 0x01;
    private static final byte EDDYSTONE_URL_EDU_SLASH = 0x02;
    private static final byte EDDYSTONE_URL_NET_SLASH = 0x03;
    private static final byte EDDYSTONE_URL_INFO_SLASH = 0x04;
    private static final byte EDDYSTONE_URL_BIZ_SLASH = 0x05;
    private static final byte EDDYSTONE_URL_GOV_SLASH = 0x06;
    private static final byte EDDYSTONE_URL_COM = 0x07;
    private static final byte EDDYSTONE_URL_ORG = 0x08;
    private static final byte EDDYSTONE_URL_EDU = 0x09;
    private static final byte EDDYSTONE_URL_NET = 0x0a;
    private static final byte EDDYSTONE_URL_INFO = 0x0b;
    private static final byte EDDYSTONE_URL_BIZ = 0x0c;
    private static final byte EDDYSTONE_URL_GOV = 0x0d;

    // Sentinel returned when a TLD has no single-byte encoding.
    private static final byte TLD_NOT_ENCODABLE = (byte) 0xff;

    // Maps from the top level domains (with or without trailing slash)
    // to the associated encoded byte.
    private static class TLDMapEntry {
        public final String tld;
        public final byte encodedByte;

        public TLDMapEntry(String topLevelDomain, byte encodedTLDByte) {
            tld = topLevelDomain;
            encodedByte = encodedTLDByte;
        }
    }

    private static final List<TLDMapEntry> tldMap;

    static {
        tldMap = new ArrayList<>();
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_COM_SLASH, EDDYSTONE_URL_COM_SLASH));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_ORG_SLASH, EDDYSTONE_URL_ORG_SLASH));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_EDU_SLASH, EDDYSTONE_URL_EDU_SLASH));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_NET_SLASH, EDDYSTONE_URL_NET_SLASH));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_INFO_SLASH, EDDYSTONE_URL_INFO_SLASH));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_BIZ_SLASH, EDDYSTONE_URL_BIZ_SLASH));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_GOV_SLASH, EDDYSTONE_URL_GOV_SLASH));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_COM, EDDYSTONE_URL_COM));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_ORG, EDDYSTONE_URL_ORG));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_EDU, EDDYSTONE_URL_EDU));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_NET, EDDYSTONE_URL_NET));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_INFO, EDDYSTONE_URL_INFO));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_BIZ, EDDYSTONE_URL_BIZ));
        tldMap.add(new TLDMapEntry(URL_TLD_DOT_GOV, EDDYSTONE_URL_GOV));
    }

    /**
     * Looks up the single-byte encoding of a top level domain.
     *
     * @param tld the TLD with leading dot and optional trailing slash, e.g. ".com" or ".com/";
     *            compared case-insensitively
     * @return the encoded byte, or {@link #TLD_NOT_ENCODABLE} if the TLD is not in the table
     */
    private static byte encodedByteForTopLevelDomain(String tld) {
        for (TLDMapEntry entry : tldMap) {
            if (entry.tld.equalsIgnoreCase(tld)) {
                return entry.encodedByte;
            }
        }
        return TLD_NOT_ENCODABLE;
    }

    /**
     * Looks up the top level domain text for an encoded byte.
     *
     * @param encodedByte the byte read from a compressed URL
     * @return the TLD text (e.g. ".com/"), or null if the byte is not a TLD code
     */
    private static String topLevelDomainForByte(byte encodedByte) {
        for (TLDMapEntry entry : tldMap) {
            if (entry.encodedByte == encodedByte) {
                return entry.tld;
            }
        }
        return null;
    }

    /**
     * Copies the characters of {@code text} into {@code buffer} starting at {@code offset},
     * one byte per character, independent of the platform charset.
     *
     * Only called with text matched by the URL regex, which admits ASCII characters only.
     *
     * @return the offset just past the last byte written
     */
    private static int writeAscii(String text, byte[] buffer, int offset) {
        int length = text.length();
        for (int i = 0; i < length; i++) {
            buffer[offset + i] = (byte) text.charAt(i);
        }
        return offset + length;
    }

    /**
     * Converts the given URL string into a byte array "compressed" version of the URL.
     *
     * The URL must start with one of the following (matched case-insensitively):
     *
     *      http://www.
     *      https://www.
     *      http://
     *      https://
     *
     * and may otherwise contain only letters, digits, '_', '-', '.' and '/'.
     *
     * The first output byte encodes the scheme and the presence of "www.". The hostname
     * is written in lower case. If the hostname has more than one label and its last
     * label is one of the following TLDs, that TLD is replaced by a single byte:
     *
     *      .com
     *      .org
     *      .edu
     *      .net
     *      .info
     *      .biz
     *      .gov
     *
     * If a slash directly follows the hostname, the "slash" version of the TLD byte is
     * used and the slash itself is not written. If the hostname doesn't end with one of
     * these TLDs, that's fine; it is written out literally. The path is copied unchanged.
     *
     * Regex groups:
     *
     *      1: http or https
     *      2: www. or absent
     *      3: hostname (without the leading www.), including the TLD and an optional trailing dot
     *      4: the slash directly after the hostname, or empty
     *      5: path without leading slash, or empty
     *
     * @param urlString the URL to compress
     * @return the compressed bytes
     * @throws MalformedURLException if urlString is null or does not match the form above
     */
    public static byte[] compress(String urlString) throws MalformedURLException {
        if (urlString == null) {
            throw new MalformedURLException("URL must not be null");
        }
        Matcher urlMatcher = EDDYSTONE_URL_PATTERN.matcher(urlString);
        if (!urlMatcher.matches()) {
            throw new MalformedURLException("URL cannot be compressed: " + urlString);
        }

        // The compressed bytes can't be longer than the original string: every matched
        // character is ASCII and the scheme alone shrinks to one byte.
        byte[] byteBuffer = new byte[urlString.length()];
        int byteBufferIndex = 0;

        // www.
        boolean haswww = (urlMatcher.group(EDDYSTONE_URL_WWW_GROUP) != null);

        // Protocol. Locale.ROOT keeps the result ASCII regardless of the default locale.
        String protocol = urlMatcher.group(EDDYSTONE_URL_PROTOCOL_GROUP).toLowerCase(Locale.ROOT);
        if (protocol.equals(URL_PROTOCOL_HTTP)) {
            byteBuffer[byteBufferIndex] = (haswww ? EDDYSTONE_URL_PROTOCOL_HTTP_WWW : EDDYSTONE_URL_PROTOCOL_HTTP);
        } else {
            byteBuffer[byteBufferIndex] = (haswww ? EDDYSTONE_URL_PROTOCOL_HTTPS_WWW : EDDYSTONE_URL_PROTOCOL_HTTPS);
        }
        byteBufferIndex++;

        // Fully-qualified domain name (FQDN). This includes the hostname and any other components after the dots
        // but BEFORE the first single slash in the URL. Lower-cased with Locale.ROOT so that, for example,
        // 'I' never turns into a non-ASCII dotless i under a Turkish default locale.
        String hostname = urlMatcher.group(EDDYSTONE_URL_FQDN_GROUP).toLowerCase(Locale.ROOT);
        String[] domains = hostname.split(Pattern.quote("."));
        // Group 4 is "(/?)", so it always participates: it is "/" or "".
        String slash = urlMatcher.group(EDDYSTONE_URL_SLASH_GROUP);
        boolean consumedSlash = false;

        // Write the hostname/subdomains prior to the last one. If there's only one (e. g. http://localhost)
        // then that's the only thing to write out.
        int writableDomainsCount = (domains.length == 1 ? 1 : domains.length - 1);
        for (int domainIndex = 0; domainIndex < writableDomainsCount; domainIndex++) {
            // Write out leading period, if necessary.
            if (domainIndex > 0) {
                byteBuffer[byteBufferIndex++] = '.';
            }
            byteBufferIndex = writeAscii(domains[domainIndex], byteBuffer, byteBufferIndex);
        }

        // Is the TLD one that we can encode?
        if (domains.length > 1) {
            String tld = "." + domains[domains.length - 1];
            byte encodedTLDByte = encodedByteForTopLevelDomain(tld + slash);
            if (encodedTLDByte != TLD_NOT_ENCODABLE) {
                // The TLD byte stands for the TLD plus the slash (if any), so the slash is done.
                byteBuffer[byteBufferIndex++] = encodedTLDByte;
                consumedSlash = true;
            } else {
                byteBufferIndex = writeAscii(tld, byteBuffer, byteBufferIndex);
            }
        }

        // Optional slash.
        if (!consumedSlash) {
            byteBufferIndex = writeAscii(slash, byteBuffer, byteBufferIndex);
        }

        // Path.
        String path = urlMatcher.group(EDDYSTONE_URL_PATH_GROUP);
        if (path != null) {
            byteBufferIndex = writeAscii(path, byteBuffer, byteBufferIndex);
        }

        // Copy the result.
        return Arrays.copyOf(byteBuffer, byteBufferIndex);
    }

    /**
     * Expands a compressed URL back into its string form.
     *
     * The low four bits of the first byte select the scheme prefix; if they match none of
     * the four known codes, no prefix is written. Each following byte is expanded to its
     * TLD text if it is a TLD code and is otherwise appended as a single character.
     * Decoding stops at the second of two consecutive zero bytes (the first zero has
     * already been expanded as a TLD code by then).
     *
     * @param compressedURL the compressed bytes; must be non-null and non-empty, since the
     *                      first element is read unconditionally
     * @return the expanded URL string
     */
    public static String uncompress(byte[] compressedURL) {
        StringBuilder url = new StringBuilder();
        switch (compressedURL[0] & 0x0f) {
            case EDDYSTONE_URL_PROTOCOL_HTTP_WWW:
                url.append(URL_PROTOCOL_HTTP_WWW_DOT);
                break;
            case EDDYSTONE_URL_PROTOCOL_HTTPS_WWW:
                url.append(URL_PROTOCOL_HTTPS_WWW_DOT);
                break;
            case EDDYSTONE_URL_PROTOCOL_HTTP:
                url.append(URL_PROTOCOL_HTTP_COLON_SLASH_SLASH);
                break;
            case EDDYSTONE_URL_PROTOCOL_HTTPS:
                url.append(URL_PROTOCOL_HTTPS_COLON_SLASH_SLASH);
                break;
            default:
                break;
        }
        byte lastByte = -1;
        for (int i = 1; i < compressedURL.length; i++) {
            byte b = compressedURL[i];

            if (lastByte == 0 && b == 0) {
                break;
            }
            lastByte = b;
            String tld = topLevelDomainForByte(b);
            if (tld != null) {
                url.append(tld);
            } else {
                url.append((char) b);
            }
        }
        return url.toString();
    }
}