/* * Copyright (C) 2007 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Imported from AOSP on 2011-01-12 by JRV. * Domain patterns updated from IANA on 2010-01-12 * * */ package com.fsck.k9droidtn.helper; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Commonly used regular expression patterns. */ public class Regex { /** * Regular expression to match all IANA top-level domains. * List accurate as of 2011/01/12. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ public static final String TOP_LEVEL_DOMAIN_STR = "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(biz|b[abdefghijmnorstvwyz])" + "|(cat|com|coop|c[acdfghiklmnoruvxyz])" + "|d[ejkmoz]" + "|(edu|e[cegrstu])" + "|f[ijkmor]" + "|(gov|g[abdefghilmnpqrstuwy])" + "|h[kmnrtu]" + "|(info|int|i[delmnoqrst])" + "|(jobs|j[emop])" + "|k[eghimnprwyz]" + "|l[abcikrstuvy]" + "|(mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])" + "|(name|net|n[acefgilopruz])" + "|(org|om)" + "|(pro|p[aefghklmnrstwy])" + "|qa" + "|r[eosuw]" + "|s[abcdeghijklmnortuvyz]" + "|(tel|travel|t[cdfghjklmnoprtvwz])" + "|u[agksyz]" + "|v[aceginu]" + "|w[fs]" + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah)" + "|y[et]" + "|z[amw])"; /** * Regular expression pattern to match all IANA top-level domains. */ public static final Pattern TOP_LEVEL_DOMAIN_PATTERN = Pattern.compile(TOP_LEVEL_DOMAIN_STR); /** * Goegular expression to match all IANA top-level domains for WEB_URL. * List accurate as of 2011/01/12. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = "(?:" + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(?:biz|b[abdefghijmnorstvwyz])" + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])" + "|d[ejkmoz]" + "|(?:edu|e[cegrstu])" + "|f[ijkmor]" + "|(?:gov|g[abdefghilmnpqrstuwy])" + "|h[kmnrtu]" + "|(?:info|int|i[delmnoqrst])" + "|(?:jobs|j[emop])" + "|k[eghimnprwyz]" + "|l[abcikrstuvy]" + "|(?:mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])" + "|(?:name|net|n[acefgilopruz])" + "|(?:org|om)" + "|(?:pro|p[aefghklmnrstwy])" + "|qa" + "|r[eosuw]" + "|s[abcdeghijklmnortuvyz]" + "|(?:tel|travel|t[cdfghjklmnoprtvwz])" + "|u[agksyz]" + "|v[aceginu]" + "|w[fs]" + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah)" + "|y[et]" + "|z[amw]))"; /* This comprises most common used Unicode characters allowed in IRI * as detailed in RFC 3987. * Specifically, those two byte Unicode characters are not included. */ public static final String GOOD_IRI_CHAR = "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF"; /** * Regular expression pattern to match most part of RFC 3987 * Internationalized URLs, aka IRIs. Commonly used Unicode characters are * added. */ public static final Pattern WEB_URL_PATTERN = Pattern.compile( "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+" // named host + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL + "|(?:(?:25[0-5]|2[0-4]" // or ip address + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" + "|[1-9][0-9]|[0-9])))" + "(?:\\:\\d{1,5})?)" // plus option port number + "(\\/(?:(?:[" + GOOD_IRI_CHAR + "\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" + "(?:\\b|$)"); // and finally, a word boundary or end of // input. This is to stop foo.sure from // matching as foo.su public static final Pattern IP_ADDRESS_PATTERN = Pattern.compile( "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]" + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]" + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" + "|[1-9][0-9]|[0-9]))"); public static final Pattern DOMAIN_NAME_PATTERN = Pattern.compile( "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+" + TOP_LEVEL_DOMAIN_PATTERN + ")|" + IP_ADDRESS_PATTERN + ")"); public static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile( "[a-zA-Z0-9\\+\\.\\_\\%\\-\\+]{1,256}" + "\\@" + "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" + "(" + "\\." + "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,25}" + ")+" ); /** * This pattern is intended for searching for things that look like they * might be phone numbers in arbitrary text, not for validating whether * something is in fact a phone number. It will miss many things that * are legitimate phone numbers. * * <p> The pattern matches the following: * <ul> * <li>Optionally, a + sign followed immediately by one or more digits. Spaces, dots, or dashes * may follow. * <li>Optionally, sets of digits in parentheses, separated by spaces, dots, or dashes. * <li>A string starting and ending with a digit, containing digits, spaces, dots, and/or dashes. * </ul> */ public static final Pattern PHONE_PATTERN = Pattern.compile( // sdd = space, dot, or dash "(\\+[0-9]+[\\- \\.]*)?" // +<digits><sdd>* + "(\\([0-9]+\\)[\\- \\.]*)?" // (<digits>)<sdd>* + "([0-9][0-9\\- \\.][0-9\\- \\.]+[0-9])"); // <digit><digit|sdd>+<digit> /** * Convenience method to take all of the non-null matching groups in a * regex Matcher and return them as a concatenated string. * * @param matcher The Matcher object from which grouped text will * be extracted * * @return A String comprising all of the non-null matched * groups concatenated together */ public static final String concatGroups(Matcher matcher) { StringBuilder b = new StringBuilder(); final int numGroups = matcher.groupCount(); for (int i = 1; i <= numGroups; i++) { String s = matcher.group(i); System.err.println("Group(" + i + ") : " + s); if (s != null) { b.append(s); } } return b.toString(); } /** * Convenience method to return only the digits and plus signs * in the matching string. * * @param matcher The Matcher object from which digits and plus will * be extracted * * @return A String comprising all of the digits and plus in * the match */ public static final String digitsAndPlusOnly(Matcher matcher) { StringBuilder buffer = new StringBuilder(); String matchingRegion = matcher.group(); for (int i = 0, size = matchingRegion.length(); i < size; i++) { char character = matchingRegion.charAt(i); if (character == '+' || Character.isDigit(character)) { buffer.append(character); } } return buffer.toString(); } }