// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
//
// TagSoup is licensed under the Apache License,
// Version 2.0. You may obtain a copy of this license at
// http://www.apache.org/licenses/LICENSE-2.0 . You may also have
// additional legal rights not granted by this license.
//
// TagSoup is distributed in the hope that it will be useful, but
// unless required by applicable law or agreed to in writing, TagSoup
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, either express or implied; not even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
package com.onegravity.rteditor.converter.tagsoup;
import java.util.regex.Pattern;
/**
* We use this class instead of android.util.Patterns since we want to use the latest version of the top level domain list from
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt (Version 2014091501, Last Updated Tue Sep 16 07:07:01 2014 UTC)
*/
public class Patterns {

    /**
     * Contents of a regex character class (without the surrounding brackets) listing the
     * characters accepted in host labels and paths: ASCII letters and digits plus the
     * Unicode ranges U+00A0 to U+D7FF, U+F900 to U+FDCF and U+FDF0 to U+FFEF.
     */
    public static final String GOOD_IRI_CHAR =
            "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";

    /**
     * Regex alternation of the accepted top level domains, grouped by first letter
     * (generic TLDs, two-letter codes and punycode {@code xn--} entries).
     *
     * <p>NOTE: the string intentionally ends with one more closing parenthesis than it
     * opens. That extra parenthesis closes the non-capturing "named host" group that
     * {@link #WEB_URL} opens right before appending this constant, so the fragment is
     * not a balanced regex on its own and must not be "fixed" in isolation.
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
            "(?:"
            + "(?:axa|aero|army|arpa|asia|actor|archi|audio|autos|active|agency|academy|auction|airforce|attorney|associates|accountants|a[cdefgilmnoqrstuwxz])"
            + "|(?:bar|bid|bio|biz|bmw|boo|bzh|beer|best|bike|blue|buzz|black|build|bayern|berlin|bargains|boutique|brussels|builders|business|bnpparibas|blackfriday|b[abdefghijmnorstvwyz])"
            // "college" was previously misspelled as "collee", so *.college URLs never matched.
            + "|(?:cab|cal|cat|ceo|com|camp|care|cash|cern|city|club|cool|coop|cards|cheap|citic|click|codes|cymru|camera|career|center|chrome|church|claims|clinic|coffee|condos|credit|capital|caravan|careers|channel|college|cologne|company|cooking|country|cruises|capetown|catering|cleaning|clothing|computer|christmas|community|consulting|creditcard|cuisinella|contractors|construction|cancerresearch|c[acdfghiklmnoruvwxyz])"
            + "|(?:dad|day|dnp|desi|diet|dance|deals|dating|degree|dental|direct|durban|dentist|digital|domains|democrat|diamonds|discount|directory|d[ejkmoz])"
            + "|(?:eat|edu|esq|eus|email|estate|events|expert|exposed|engineer|exchange|education|equipment|engineering|enterprises|e[cegrstu])"
            + "|(?:fly|foo|frl|fail|farm|fish|fund|futbol|finance|fishing|fitness|flights|florist|frogans|feedback|financial|furniture|foundation|f[ijkmor])"
            + "|(?:gal|gle|gmo|gmx|gop|gov|gbiz|gent|gift|guru|gifts|gives|glass|globo|gmail|green|gripe|guide|global|google|gratis|gallery|guitars|graphics|g[abdefghilmnpqrstuwy])"
            + "|(?:hiv|how|haus|help|here|host|homes|horse|house|hiphop|hamburg|holiday|hosting|holdings|healthcare|h[kmnrtu])"
            + "|(?:ing|ink|int|immo|info|insure|institute|immobilien|industries|investments|international|i[delmnoqrst])"
            + "|(?:jobs|jetzt|joburg|juegos|j[emop])"
            + "|(?:kim|krd|kiwi|kred|koeln|kaufen|kitchen|k[eghimnprwyz])"
            + "|(?:land|lgbt|life|limo|link|ltda|luxe|lease|loans|lotto|lawyer|london|luxury|lacaixa|limited|lighting|l[abcikrstuvy])"
            + "|(?:mil|moe|mov|meet|meme|menu|mini|mobi|moda|mango|media|miami|maison|market|monash|moscow|museum|mortgage|marketing|melbourne|management|motorcycles|m[acdeghklmnopqrstuvwxyz])"
            + "|(?:net|new|ngo|nhk|nra|nrw|nyc|name|navy|nexus|ninja|nagoya|network|neustar|n[acefgilopruz])"
            + "|(?:ong|onl|ooo|org|ovh|otsuka|okinawa|organic|om)"
            + "|(?:pro|pub|pics|pink|post|prod|prof|paris|parts|photo|pizza|place|praxi|press|photos|physio|partners|pharmacy|pictures|plumbing|property|properties|photography|productions|p[aefghklmnrstwy])"
            + "|(?:qpon|quebec|qa)"
            + "|(?:red|ren|rio|rest|rich|rsvp|ruhr|rehab|reise|rocks|rodeo|reisen|repair|report|ryukyu|realtor|recipes|rentals|reviews|republican|restaurant|r[eosuw])"
            + "|(?:sca|scb|soy|sarl|scot|sexy|sohu|surf|shoes|solar|space|schule|social|supply|suzuki|schmidt|shiksha|singles|spiegel|support|surgery|systems|saarland|services|software|supplies|solutions|s[abcdeghijklmnortuvxyz])"
            + "|(?:tax|tel|top|tips|town|toys|tatar|tirol|today|tokyo|tools|trade|tattoo|tienda|travel|training|technology|t[cdfghjklmnoprtvwz])"
            + "|(?:uno|uol|university|u[agksyz])"
            + "|(?:vet|vote|voto|vegas|vodka|viajes|villas|vision|voting|voyage|ventures|vacations|vlaanderen|versicherung|v[aceginu])"
            + "|(?:wed|wme|wtc|wtf|wang|wien|wiki|wales|watch|works|webcam|website|whoswho|williamhill|w[fs])"
            + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah)"
            + "|(?:xxx|xyz)"
            + "|(?:yachts|yandex|youtube|yokohama|y[et])"
            // The final ')' is the intentional extra one described in the Javadoc above.
            + "|(?:zip|zone|z[amw])))";

    /**
     * Compiled pattern that recognises web URLs.
     *
     * <p>Capturing groups:
     * <ol>
     *   <li>optional scheme and user info, followed by the host and the optional port</li>
     *   <li>the scheme ({@code http}, {@code https}, {@code rtsp} or their capitalised
     *       forms), when present</li>
     *   <li>the host: either a dotted name ending in one of the TLDs from
     *       {@link #TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL} or a dotted-quad IP address</li>
     *   <li>the optional path/query/fragment part, starting with {@code /}</li>
     * </ol>
     */
    public static final Pattern WEB_URL = Pattern.compile(
            // optional "scheme://", itself optionally followed by "user[:password]@"
            // (user: 1-64 characters, password: 1-25 characters, percent-escapes allowed)
            "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
            + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
            + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
            // named host: one or more dot-terminated labels followed by a known TLD;
            // the group opened here is closed by the extra ')' at the end of the TLD string
            + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+"
            + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL
            // or a dotted-quad IP address
            + "|(?:(?:25[0-5]|2[0-4]"
            + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
            + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
            + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
            + "|[1-9][0-9]|[0-9])))"
            // optional port number (1 to 5 digits)
            + "(?:\\:\\d{1,5})?)"
            // optional path / query parameters / fragment
            + "(\\/(?:(?:[" + GOOD_IRI_CHAR + "\\;\\/\\?\\:\\@\\&\\=\\#\\~"
            + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
            // and finally, a word boundary or end of input. This is to stop
            // foo.sure from matching as foo.su
            + "(?:\\b|$)");
}