/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.util; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.Vector; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import com.google.common.collect.Lists; import com.google.common.collect.TreeMultimap; public class URLPattern { public static interface URLFragment { public boolean isWildcard(); public boolean isMatch(String urlFragment); public int getMatchCount(); } /* * URLPathFragment definition */ public static class URLPathFragment implements URLFragment { boolean _isLeafNode = false; ArrayList<String> _matches = new ArrayList<String>(); public URLPathFragment(String partString, boolean isLeafNode) { _matches.add(partString); _isLeafNode = isLeafNode; } public boolean isWildcard() { return false; } public boolean isMatch(String urlFragment) { return _matches.get(0).equals(urlFragment); } public int getMatchCount() { return _matches.size(); } public ArrayList<String> getMatches() { return _matches; } public void addMatch(String pathPart, boolean isLeafNode) { _matches.add(pathPart); _isLeafNode |= isLeafNode; } @Override public String toString() { if (_matches.size() == 1) { return "/" + _matches.get(0); } else { String partOut = "/("; int partCount = 0; for (String part : _matches) { if (partCount++ != 0) partOut += "|"; partOut += part; } partOut += ")"; return partOut; } } } public static class URLQueryFragment implements URLFragment { String _key = null; Set<String> _values = new TreeSet<String>(); public URLQueryFragment(String key, String value) { _key = key; _values.add(value); } public static String[] separateKeyValue(String queryPart) { String arrayOut[] = new String[2]; int indexOfSep = queryPart.indexOf("="); if (indexOfSep != -1) { arrayOut[0] = queryPart.substring(0, indexOfSep); arrayOut[1] = ((indexOfSep + 1) != queryPart.length()) ? queryPart .substring(indexOfSep + 1) : ""; } else { arrayOut[0] = queryPart; arrayOut[1] = ""; } return arrayOut; } public void addMatch(String value, boolean isLeafNode) { _values.add(value); } @Override public int getMatchCount() { return _values.size(); } @Override public boolean isWildcard() { return false; } @Override public boolean isMatch(String pathFragment) { String keyValuePair[] = separateKeyValue(pathFragment); return keyValuePair[0].equals(_key); } public String getKey() { return _key; } public Set<String> getValues() { return _values; } } /* * Wildcard definition */ public static class WildcardPathFragment extends URLPathFragment { public WildcardPathFragment(boolean isLeafNode) { super("[^/?]*", isLeafNode); } public boolean isWildcard() { return true; } } /* * component vector */ ArrayList<URLPathFragment> _pathFragments = new ArrayList<URLPathFragment>(); TreeMap<String, URLQueryFragment> _queryFragments = new TreeMap<String, URLQueryFragment>(); ArrayList<TextBytes> _matches = new ArrayList<TextBytes>(); public URLPattern(String url, ArrayList<String> pathComponents, ArrayList<String> queryComponents) { for (String pathComponenet : pathComponents) { _pathFragments.add(new URLPathFragment( escapeReservedCharacters(pathComponenet), false)); } for (String queryComponent : queryComponents) { String keyValuePair[] = URLQueryFragment.separateKeyValue(queryComponent); if (queryComponents.size() == 1 && (keyValuePair[1] == null || keyValuePair[1].length() == 0)) { // NOOP? } else { URLQueryFragment fragment = _queryFragments.get(keyValuePair[0]); if (fragment == null) { _queryFragments.put(keyValuePair[0], new URLQueryFragment( keyValuePair[0], keyValuePair[1])); } else { fragment.addMatch(keyValuePair[1], false); } } } _matches.add(new TextBytes(url)); } public int isMatch(ArrayList<String> pathComponents, ArrayList<String> queryComponents) { return addOrComputeMatch(pathComponents, queryComponents, true); } public int getMatchCount() { return _matches.size(); } public String getMatchAt(int index) { return _matches.get(index).toString(); } public ArrayList<TextBytes> getMatches() { return _matches; } public void addURL(String url, ArrayList<String> pathComponents, ArrayList<String> queryComponents) { addOrComputeMatch(pathComponents, queryComponents, false); _matches.add(new TextBytes(url)); } private int addOrComputeMatch(ArrayList<String> pathComponents, ArrayList<String> queryComponents, boolean checkOnly) { if (pathComponents.size() == _pathFragments.size() && queryComponents.size() == _queryFragments.size()) { // all query parameters must match ... ? for (String queryPart : queryComponents) { String keyValuePair[] = URLQueryFragment.separateKeyValue(queryPart); if (!_queryFragments.containsKey(keyValuePair[0])) { return 0; } } Set<Integer> pathMatchIndexes = new HashSet<Integer>(); for (int i = 0; i < _pathFragments.size(); ++i) { if (!_pathFragments.get(i).isWildcard() && _pathFragments.get(i).isMatch( escapeReservedCharacters(pathComponents.get(i)))) { pathMatchIndexes.add(i); } } if ((pathMatchIndexes.size() != 0 || _pathFragments.size() == 0) && !checkOnly) { for (int i = 0; i < _pathFragments.size(); ++i) { if (!pathMatchIndexes.contains(i)) { if (!_pathFragments.get(i).isWildcard()) { _pathFragments.get(i).addMatch( escapeReservedCharacters(pathComponents.get(i)), false); } } } // add query parameters for (String queryPart : queryComponents) { String keyValuePair[] = URLQueryFragment.separateKeyValue(queryPart); _queryFragments.get(keyValuePair[0]).addMatch(keyValuePair[1], false); } } return (_pathFragments.size() == 0) ? 1 : pathMatchIndexes.size(); } return 0; } public int getPathLength() { return _pathFragments.size(); } public int getQueryLength() { return _queryFragments.size(); } public URLPathFragment getURLPartAt(int index) { return _pathFragments.get(index); } private void makePartAtIndexWildcard(int index) { if (index < _pathFragments.size()) { URLPathFragment partAtIndex = _pathFragments.get(index); if (!partAtIndex.isWildcard()) { _pathFragments.set(index, new WildcardPathFragment(false)); } } } public void compact() { for (int i = 0; i < _pathFragments.size(); ++i) { URLPathFragment part = _pathFragments.get(i); if (!part.isWildcard() && (part.getMatchCount() > 3 || valuesLookNumeric(part.getMatches()))) { makePartAtIndexWildcard(i); } } } public static String escapeReservedCharacters(String incoming) { incoming = incoming.replaceAll("([\\?\\\\\\[\\]\\+\\.\\*\\(\\)\\{\\}])", "\\\\$1"); return incoming; } static Pattern isAllDigits = Pattern.compile("^[0-9]*$"); static Pattern isAllHex = Pattern.compile("^[a-fA-F0-9-]*$"); public static boolean valuesLookNumeric(Iterable<String> values) { for (String str : values) { if (!isAllDigits.matcher(str).matches() && !isAllHex.matcher(str).matches()) { return false; } } return true; } // destructoid.com static Pattern containsEscapeChar = Pattern.compile("[%+]"); public static boolean valuesContainsEscapeChars(Iterable<String> values) { for (String str : values) { if (containsEscapeChar.matcher(str).find()) { return true; } } return false; } public String generateRegEx() { String regExOut = "http://[^/]*"; for (URLPathFragment component : _pathFragments) { regExOut += component.toString(); } if (!regExOut.endsWith("/")) { regExOut += "/*"; } if (_queryFragments.size() != 0) { regExOut += "\\?"; } int queryItemCount = 0; for (URLQueryFragment fragment : _queryFragments.values()) { if (queryItemCount++ != 0) { regExOut += "&"; } regExOut += escapeReservedCharacters(fragment.getKey()); Set<String> values = fragment.getValues(); String firstValue = values.iterator().next(); if (values.size() == 1 && valuesLookNumeric(values)) { regExOut += "=*[^&]*"; } else { if (values.size() > 1 || !firstValue.equals("")) { if (values.size() > 3 || firstValue.equals("") || valuesContainsEscapeChars(values) || (values.size() > 1 && valuesLookNumeric(values))) { if (firstValue.equals("")) { regExOut += "[=]*[^&]*"; } else { regExOut += "=[^&]+"; } } else { if (firstValue.equals("")) { regExOut += "[=]*("; } else { regExOut += "=("; } int valueCount = 0; for (String value : values) { if (!value.equals("")) { if (valueCount++ != 0) { regExOut += "|"; } regExOut += escapeReservedCharacters(value); } } regExOut += ")"; if (firstValue.equals("")) { regExOut += "*"; } } } } regExOut += "[&]*"; } regExOut += ".*"; return regExOut; } @Override public String toString() { String debugOut = "Pattern: " + generateRegEx(); debugOut += "\n"; debugOut += "Matched:\n"; for (TextBytes url : _matches) { debugOut += url + "\n"; } debugOut += "\n"; return debugOut; } public static TreeMultimap<String, String> extractQueryParts(String content) { String key = null; String value = null; int mark = -1; TreeMultimap<String, String> map = TreeMultimap.create(); for (int i = 0; i < content.length(); i++) { char c = content.charAt(i); switch (c) { case '&': value = content.substring(mark + 1, i); mark = i; if (key != null) { map.put(key, value); key = null; } break; case '=': if (key != null) break; key = content.substring(mark + 1, i); mark = i; break; case '+': break; } } if (key != null) { value = content.substring(mark + 1); map.put(key, value); } else if (mark < content.length()) { key = content.substring(mark + 1); map.put(key, ""); } return map; } public static ArrayList<String> extractQueryPartsAsArrayList( String queryFragment) { ArrayList<String> arrayListOut = new ArrayList<String>(); TreeMultimap<String, String> queryMap = extractQueryParts(queryFragment); for (Map.Entry<String, String> entry : queryMap.entries()) { String keyValue = entry.getKey(); if (entry.getValue().length() != 0) { keyValue += "="; keyValue += entry.getValue(); } arrayListOut.add(keyValue); } return arrayListOut; } public static String normalizeQueryURL(String url) { GoogleURL urlObject = new GoogleURL(url); if (urlObject.isValid() && urlObject.has_query()) { StringBuilder urlOut = new StringBuilder(); urlOut.append(urlObject.getScheme()); urlOut.append("://"); if (urlObject.getUserName() != GoogleURL.emptyString) { urlOut.append(urlObject.getUserName()); if (urlObject.getPassword() != GoogleURL.emptyString) { urlOut.append(":"); urlOut.append(urlObject.getPassword()); } urlOut.append("@"); } urlOut.append(urlObject.getHost()); if (urlObject.getPort() != GoogleURL.emptyString) { urlOut.append(":"); urlOut.append(urlObject.getPort()); } if (urlObject.getPath() != GoogleURL.emptyString) { urlOut.append(urlObject.getPath()); } if (urlObject.getQuery() != GoogleURL.emptyString) { urlOut.append("?"); TreeMultimap<String, String> queryMap = extractQueryParts(urlObject .getQuery()); int partCount = 0; for (Map.Entry<String, String> entry : queryMap.entries()) { if (partCount++ != 0) urlOut.append("&"); urlOut.append(entry.getKey()); if (entry.getValue().length() != 0) { urlOut.append("="); urlOut.append(entry.getValue()); } } } String urlFinal = urlOut.toString(); if (urlFinal.endsWith("=") || urlFinal.endsWith("&")) { urlFinal = urlFinal.substring(0, urlFinal.length() - 1); } return urlOut.toString(); } return url; } public static class URLPatternBuilder { Vector<URLPattern> _patterns = new Vector<URLPattern>(); public void addPath(String url) { GoogleURL urlObject = new GoogleURL(url); if (urlObject.isValid()) { ArrayList<String> pathPartsVector = new ArrayList<String>(); ArrayList<String> queryPartsVector = new ArrayList<String>(); if (!urlObject.getPath().equals("/")) { String path = urlObject.getPath(); if (path.length() != 0) { String pathParts[] = path.substring(1).split("/"); pathPartsVector = Lists.newArrayList(pathParts); } } if (urlObject.has_query()) { String query = urlObject.getQuery(); queryPartsVector = extractQueryPartsAsArrayList(query); if (queryPartsVector.size() == 1 && queryPartsVector.get(0).indexOf("=") == -1) { queryPartsVector.clear(); } } int highestMatchCount = 0; URLPattern matchedPattern = null; for (URLPattern pattern : _patterns) { if (pattern.getPathLength() == pathPartsVector.size() && pattern.getQueryLength() == queryPartsVector.size()) { int matchCount = pattern.isMatch(pathPartsVector, queryPartsVector); if (matchCount > highestMatchCount) { highestMatchCount = matchCount; matchedPattern = pattern; } } } if (matchedPattern != null) { matchedPattern.addURL(url, pathPartsVector, queryPartsVector); } else { _patterns.add(new URLPattern(url, pathPartsVector, queryPartsVector)); } } } public void consolidatePatterns() { for (URLPattern pattern : _patterns) { pattern.compact(); } } public Vector<URLPattern> getPatterns() { return _patterns; } void dumpPatterns() { Collections.sort(_patterns, new Comparator<URLPattern>() { @Override public int compare(URLPattern o1, URLPattern o2) { return ((Integer) o2.getMatchCount()).compareTo(o1.getMatchCount()); } }); for (URLPattern pattern : _patterns) { if (pattern.getMatchCount() > 1) { System.out.println(pattern.toString()); } } } } public static class URLPatternMatcher { private Pattern pattern; public URLPatternMatcher(String regularExpression) throws PatternSyntaxException { pattern = Pattern.compile(regularExpression); } public boolean matches(String url) { url = URLPattern.normalizeQueryURL(url); return pattern.matcher(url).matches(); } } }