/* * Copyright 2014 michael-simons.eu. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package ac.simons.autolinker; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.parser.Tag; /** * Turns valid ASCII email adresses into anchor text. The label can be * obsfuscated, the email address can be encoded to hex. * * @author Michael J. Simons, 2014-12-27 */ public class EmailAddressAutoLinker implements AutoLinker { /** * Regex according to http://www.w3.org/Protocols/rfc822/, Originally * written by Cal Henderson * (http://iamcal.com/publish/articles/php/parsing_email/), Translated to * Ruby by Tim Fletcher, with changes suggested by Dan Kubb. Translated to * Java by Michael J. Simons */ private final static String VALID_EMAIL_ADDRESS_REGEX = "(?:[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+|\\x22(?:[^\\x0d\\x22\\x5c\\x80-\\xff]|\\x5c[\\x00-\\x7f])*\\x22)(?:\\x2e(?:[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+|\\x22(?:[^\\x0d\\x22\\x5c\\x80-\\xff]|\\x5c[\\x00-\\x7f])*\\x22))*\\x40(?:[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+|\\x5b(?:[^\\x0d\\x5b-\\x5d\\x80-\\xff]|\\x5c[\\x00-\\x7f])*\\x5d)(?:\\x2e(?:[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+|\\x5b(?:[^\\x0d\\x5b-\\x5d\\x80-\\xff]|\\x5c[\\x00-\\x7f])*\\x5d))*"; public final static Pattern VALID_EMAIL_ADDRESS = Pattern.compile(String.format("\\A%s\\z", VALID_EMAIL_ADDRESS_REGEX)); public final static Pattern VALID_EMAIL_ADRESS_ML = Pattern.compile(String.format("(?m)(?<![^\\s])%s", VALID_EMAIL_ADDRESS_REGEX)); public final static Pattern AT_SIGNS = Pattern.compile("[@\uFF20\\x40]"); /** * A flag if the addresses in the mailto: protocoll should be hex-encoded */ private final boolean hexEncodeEmailAddress; /** * A flag if the labels should be obfuscated */ private final boolean obfuscateEmailAddress; /** * Instantiates a new email address autolinker. * * @param hexEncodeEmailAddress Should mailto: Addresses be hex-encoded? * @param obfuscateEmailAddress Should labels be obfuscated? */ public EmailAddressAutoLinker( boolean hexEncodeEmailAddress, boolean obfuscateEmailAddress ) { this.hexEncodeEmailAddress = hexEncodeEmailAddress; this.obfuscateEmailAddress = obfuscateEmailAddress; } @Override public List<Node> createLinks(final TextNode textNode) { final List<Node> rv = new ArrayList<>(); int start = 0; final String nodeText = textNode.getWholeText(); final String baseUri = textNode.baseUri(); final Matcher matcher = VALID_EMAIL_ADRESS_ML.matcher(nodeText); while (matcher.find()) { final String emailAddress = matcher.group(); if (!(new String(emailAddress.getBytes(), StandardCharsets.US_ASCII)).equals(emailAddress)) { continue; } final String textBefore = nodeText.substring(start, matcher.start()); if (!textBefore.isEmpty()) { rv.add(new TextNode(textBefore, baseUri)); } final Element newAnchor = new Element(Tag.valueOf("a"), baseUri); newAnchor.attr("href", String.format("%s%s", "mailto:", hexEncodeEmailAddress ? hexEncodeEmailAddress(emailAddress) : emailAddress)); newAnchor.appendChild(new TextNode(obfuscateEmailAddress ? obfuscateEmailAddress(emailAddress) : emailAddress, baseUri)); rv.add(newAnchor); start = matcher.end(); } // Add a new textnode for everything after final String textAfter = nodeText.substring(start); if (!textAfter.isEmpty()) { rv.add(new TextNode(textAfter, baseUri)); } return rv; } /** * Obfuscates an email address. @ will be replaced throught " [AT] " and . * through " [DOT] ". The email address is lowercased before processing. * * @param emailAddress The email address to obfuscate * @return An obfuscated email address */ public String obfuscateEmailAddress(final String emailAddress) { return AT_SIGNS.matcher(emailAddress.toLowerCase()).replaceAll(" [AT] ").replaceAll("\\.", " [DOT] "); } /** * Hex encodes an email addess, leaving the '@' intact. Browsers are able to * decode this and maybe it's stops spammers from using emails like that. * The email address is lowercased before processing. * * @param emailAddress The email address that should be encoded to * hexadecimal * @return An hexadecimal encoded email adresse */ public String hexEncodeEmailAddress(String emailAddress) { final String emailAddressLc = emailAddress.toLowerCase(); final StringBuilder rv = new StringBuilder(); for (int i = 0; i < emailAddressLc.length(); ++i) { char c = emailAddressLc.charAt(i); rv.append(c == '@' ? c : String.format("%%%x", (int) c)); } return rv.toString(); } }