/** * Copyright 2002 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.language.de.preprocess; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import marytts.util.MaryUtils; import marytts.util.dom.MaryDomUtils; import org.apache.log4j.Logger; import org.w3c.dom.Document; import org.w3c.dom.Element; /** * An expansion pattern implementation for internet (URI or email) patterns. * * @author Marc Schröder */ public class NetEP extends ExpansionPattern { private final String[] _knownTypes = { "net", "net:email", "net:uri" }; /** * Every subclass has its own list knownTypes, an internal string representation of known types. These are possible values of * the <code>type</code> attribute to the <code>say-as</code> element, as defined in MaryXML.dtd. If there is more than one * known type, the first type (<code>knownTypes[0]</code>) is expected to be the most general one, of which the others are * specializations. */ private final List<String> knownTypes = Arrays.asList(_knownTypes); public List<String> knownTypes() { return knownTypes; } // Domain-specific primitives: /* * Email syntax is specified in http://www.faqs.org/rfcs/rfc2822.html (see end of this file for excerpt) */ protected final String aText = "[A-Za-z0-9\\!\\#\\$\\%\\&\\'\\*\\+\\-\\/\\=\\?\\^\\_\\`\\{\\|\\}\\~]+"; protected final String dotAtomText = "(?:" + aText + "(?:\\." + aText + ")*)"; protected final String sNetEmail = "(?:(" + dotAtomText + ")\\@(" + dotAtomText + "))"; /* * For the URI regular expression, see the excerpt from RFC2396 as found at http://www.ietf.org/rfc/rfc2396.txt at the bottom * of this file. */ // protected final String domainSuffix = // "(?:ad|ae|af|ag|ai|al|am|an|ao|aq|ar|arpa|as|at|au|aw|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cf|cg|ch|ci|ck|cl|cm|cn|co|com|cr|cs|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|eh|es|et|fi|fj|fk|fm|fo|fr|fx|ga|gb|gd|ge|gf|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|in|int|io|iq|ir|is|it|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|mg|mh|mil|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nato|nc|ne|net|nf|ng|ni|nl|no|np|nr|nt|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pt|pw|py|qa|re|ro|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zr|zw)"; protected final String domainSuffix = "(?:com|edu|net|info|biz|org|de|eu|uk|ie|fr|au|jp|at|ch|ws|tv|cc)"; protected final String domain = "(?:(?:[A-Za-z0-9\\-]+\\.)+" + domainSuffix + ")"; protected final String path = "(?:(?:/~?[A-Za-z0-9\\-\\.\\_]+)+/?)"; protected final String sNetUri = "(?:(?:(?:http|ftp)://)?(" + domain + ")(" + path + ")?)"; // protected final String sNetUriSubstructure = "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"; // in this, $4 is www.xy.com and $5 is /pub/data/myfile.html // We don't use sMatchingChars here, but override isCandidate(). protected final Pattern reNetEmail = Pattern.compile(sNetEmail); protected final Pattern reNetUri = Pattern.compile(sNetUri); // protected final Pattern reNetUriSubstructure = Pattern.compile(sNetUriSubstructure); private final Pattern reMatchingChars = null; public Pattern reMatchingChars() { return reMatchingChars; } /** * Every subclass has its own logger. The important point is that if several threads are accessing the variable at the same * time, the logger needs to be thread-safe or it will produce rubbish. */ private Logger logger = MaryUtils.getLogger("NetEP"); public NetEP() { super(); } protected boolean isCandidate(Element t) { String s = MaryDomUtils.tokenText(t); return (s.indexOf('@') != -1 || s.indexOf('.') != -1 || s.indexOf('/') != -1 || s.indexOf(':') != -1 || s.equals("http") || s.equals("ftp") || s.equals("mailto")); } /** * Inform whether this module performs a full expansion of the input, or whether other patterns should be applied after this * one. * * @return false */ protected boolean doesFullExpansion() { return false; } protected int canDealWith(String s, int type) { return match(s, type); } protected int match(String s, int type) { switch (type) { case 0: if (matchNetEmail(s)) return 1; if (matchNetUri(s)) return 2; break; case 1: if (matchNetEmail(s)) return 1; break; case 2: if (matchNetUri(s)) return 2; break; } return -1; } protected List<Element> expand(List<Element> tokens, String s, int type) { if (tokens == null) throw new NullPointerException("Received null argument"); if (tokens.isEmpty()) throw new IllegalArgumentException("Received empty list"); Element firstOld = (Element) tokens.get(0); Document doc = firstOld.getOwnerDocument(); // we expect type to be one of the return values of match(): List<Element> expanded = null; switch (type) { case 1: expanded = expandNetEmail(doc, s); break; case 2: expanded = expandNetUri(doc, s); break; } replaceTokens(tokens, expanded); // Slow down the new part, // so the spelled out form will be understandable. // slowDown((Element)expanded.get(0), // (Element)expanded.get(expanded.size()-1)); return expanded; } private boolean matchNetEmail(String s) { return reNetEmail.matcher(s).matches(); } private boolean matchNetUri(String s) { return reNetUri.matcher(s).matches(); } protected List<Element> expandNetEmail(Document doc, String s) { ArrayList<Element> exp = new ArrayList<Element>(); Matcher reMatcher = reNetEmail.matcher(s); if (!reMatcher.find()) return null; String localPart = reMatcher.group(1); String localPartExpanded = abbrev.ruleExpandAbbrev(localPart, true); // true = sayPuncutation exp.addAll(makeNewTokens(doc, localPartExpanded, true, // create mtu localPart, true)); // force accents exp.add(MaryDomUtils.createBoundary(doc)); exp.addAll(makeNewTokens(doc, "at['?{t]", true, // create mtu "@", true)); // force accents String domain = reMatcher.group(2); exp.addAll(expandDomain(doc, domain)); return exp; } protected List<Element> expandNetUri(Document doc, String s) { ArrayList<Element> exp = new ArrayList<Element>(); Matcher reMatcher = reNetUri.matcher(s); if (!reMatcher.find()) return null; String domain = reMatcher.group(1); exp.addAll(expandDomain(doc, domain)); String path = reMatcher.group(2); if (path != null && path.length() > 0) { String pathExpanded = abbrev.ruleExpandAbbrev(path, true); // true = sayPunctuation exp.addAll(makeNewTokens(doc, pathExpanded, true, // create mtu path, true)); // force accents exp.add(MaryDomUtils.createBoundary(doc)); } return exp; } private List<Element> expandDomain(Document doc, String domain) { logger.debug("Expanding domain `" + domain + "'"); ArrayList<Element> exp = new ArrayList<Element>(); String domainSuffix = null; String toExpand; if (domain.endsWith(".com")) { toExpand = domain.substring(0, domain.length() - 4); domainSuffix = " dot['dOt] com['kOm]"; } else if (domain.lastIndexOf(".") != -1 && domain.lastIndexOf(".") == domain.length() - 3) { // Domains consisting of two characters (de, ch, uk, ...) // are to be spelled out toExpand = domain.substring(0, domain.length() - 3); logger.debug("toExpand = `" + toExpand + "'"); domainSuffix = " " + domain.substring(domain.length() - 2, domain.length() - 1) + " " + domain.substring(domain.length() - 1); logger.debug("domainSuffix = `" + domainSuffix + "'"); } else { toExpand = domain; } String domainExpanded = toExpand.replaceAll("\\.", " "); logger.debug("domainExpanded = `" + domainExpanded + "'"); if (domainSuffix != null) domainExpanded += domainSuffix; logger.debug("domainExpanded with suffix = `" + domainExpanded + "'"); exp.addAll(makeNewTokens(doc, domainExpanded, true, // create mtu domain, true)); // force accents // System.err.println("Expanded `" + s + "' as `" + domainExpanded + "'"); exp.add(MaryDomUtils.createBoundary(doc)); return exp; } } /* * Request for Comments: 2822 3. Syntax * * 3.2.4. Atom * * Several productions in structured header field bodies are simply strings of certain basic characters. Such productions are * called atoms. * * Some of the structured header field bodies also allow the period character (".", ASCII value 46) within runs of atext. An * additional "dot-atom" token is defined for those purposes. * * atext = ALPHA / DIGIT / ; Any character except controls, "!" / "#" / ; SP, and specials. "$" / "%" / ; Used for atoms "&" / "'" * / "*" / "+" / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}" / "~" * * atom = [CFWS] 1*atext [CFWS] * * dot-atom = [CFWS] dot-atom-text [CFWS] * * dot-atom-text = 1*atext *("." 1*atext) * * Both atom and dot-atom are interpreted as a single unit, comprised of the string of characters that make it up. Semantically, * the optional comments and FWS surrounding the rest of the characters are not part of the atom; the atom is only the run of * atext characters in an atom, or the atext and "." characters in a dot-atom. * * 3.4. Address Specification * * Addresses occur in several message header fields to indicate senders and recipients of messages. An address may either be an * individual mailbox, or a group of mailboxes. * * address = mailbox / group * * mailbox = name-addr / addr-spec * * name-addr = [display-name] angle-addr * * angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr * * group = display-name ":" [mailbox-list / CFWS] ";" [CFWS] * * display-name = phrase * * mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list * * address-list = (address *("," address)) / obs-addr-list * * A mailbox receives mail. It is a conceptual entity which does not necessarily pertain to file storage. For example, some sites * may choose to print mail on a printer and deliver the output to the addressee's desk. Normally, a mailbox is comprised of two * parts: (1) an optional display name that indicates the name of the recipient (which could be a person or a system) that could * be displayed to the user of a mail application, and (2) an addr-spec address enclosed in angle brackets ("<" and ">"). There is * also an alternate simple form of a mailbox where the addr-spec address appears alone, without the recipient's name or the angle * brackets. The Internet addr-spec address is described in section 3.4.1. * * Note: Some legacy implementations used the simple form where the addr-spec appears without the angle brackets, but included the * name of the recipient in parentheses as a comment following the addr-spec. Since the meaning of the information in a comment is * unspecified, implementations SHOULD use the full name-addr form of the mailbox, instead of the legacy form, to specify the * display name associated with a mailbox. Also, because some legacy implementations interpret the comment, comments generally * SHOULD NOT be used in address fields to avoid confusing such implementations. * * When it is desirable to treat several mailboxes as a single unit (i.e., in a distribution list), the group construct can be * used. The group construct allows the sender to indicate a named group of recipients. This is done by giving a display name for * the group, followed by a colon, followed by a comma separated list of any number of mailboxes (including zero and one), and * ending with a semicolon. Because the list of mailboxes can be empty, using the group construct is also a simple way to * communicate to recipients that the message was sent to one or more named sets of recipients, without actually providing the * individual mailbox address for each of those recipients. * * 3.4.1. Addr-spec specification * * An addr-spec is a specific Internet identifier that contains a locally interpreted string followed by the at-sign character * ("@", ASCII value 64) followed by an Internet domain. The locally interpreted string is either a quoted-string or a dot-atom. * If the string can be represented as a dot-atom (that is, it contains no characters other than atext characters or "." * surrounded by atext * * characters), then the dot-atom form SHOULD be used and the quoted-string form SHOULD NOT be used. Comments and folding white * space SHOULD NOT be used around the "@" in the addr-spec. * * addr-spec = local-part "@" domain * * local-part = dot-atom / quoted-string / obs-local-part * * domain = dot-atom / domain-literal / obs-domain * * domain-literal = [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS] * * dcontent = dtext / quoted-pair * * dtext = NO-WS-CTL / ; Non white space controls * * %d33-90 / ; The rest of the US-ASCII %d94-126 ; characters not including "[", ; "]", or "\" * * The domain portion identifies the point to which the mail is delivered. In the dot-atom form, this is interpreted as an * Internet domain name (either a host name or a mail exchanger name) as described in [STD3, STD13, STD14]. In the domain-literal * form, the domain is interpreted as the literal Internet address of the particular host. In both cases, how addressing is used * and how messages are transported to a particular host is covered in the mail transport document [RFC2821]. These mechanisms are * outside of the scope of this document. * * The local-part portion is a domain dependent string. In addresses, it is simply interpreted on the particular host as a name of * a particular mailbox. */ /* * RFC 2396 URI Generic Syntax August 1998 * * * B. Parsing a URI Reference with a Regular Expression * * As described in Section 4.3, the generic URI syntax is not sufficient to disambiguate the components of some forms of URI. * Since the "greedy algorithm" described in that section is identical to the disambiguation method used by POSIX regular * expressions, it is natural and commonplace to use a regular expression for parsing the potential four components and fragment * identifier of a URI reference. * * The following line is the regular expression for breaking-down a URI reference into its components. * * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 12 3 4 5 6 7 8 9 * * The numbers in the second line above are only to assist readability; they indicate the reference points for each subexpression * (i.e., each paired parenthesis). We refer to the value matched for subexpression <n> as $<n>. For example, matching the above * expression to * * http://www.ics.uci.edu/pub/ietf/uri/#Related * * results in the following subexpression matches: * * $1 = http: $2 = http $3 = //www.ics.uci.edu $4 = www.ics.uci.edu $5 = /pub/ietf/uri/ $6 = <undefined> $7 = <undefined> $8 = * #Related $9 = Related * * where <undefined> indicates that the component is not present, as is the case for the query component in the above example. * Therefore, we can determine the value of the four components and fragment as * * scheme = $2 authority = $4 path = $5 query = $7 fragment = $9 * * and, going in the opposite direction, we can recreate a URI reference from its components using the algorithm in step 7 of * Section 5.2. * * * * * * Berners-Lee, et. al. Standards Track [Page 29] */