package folioxml.export.html; import folioxml.core.InvalidMarkupException; import folioxml.core.TokenUtils; import folioxml.export.NodeListProcessor; import folioxml.xml.Node; import folioxml.xml.NodeList; import java.net.MalformedURLException; import java.net.URL; import java.util.regex.Pattern; public class FixHttpLinks implements NodeListProcessor { public FixHttpLinks() { } protected static Pattern scheme = Pattern.compile("\\A\\s*(mailto:|[a-zA-Z][A-Za-z0-9+.-]*://)", Pattern.CASE_INSENSITIVE); protected static Pattern missingslash = Pattern.compile("\\A\\s*(https?):/([^/])", Pattern.CASE_INSENSITIVE);//\\G\\s++(\\w[\\w-:]*+)(?:\\s*+=\\s*+\"([^\"]*+)\"|\\s*+=\\s*+'([^']*+)'|\\s*+=\\s*+([^\\s=/>]*+)|(\\s*?))"); private boolean validUrl(String address) { try { URL u = new URL(address); return true; } catch (MalformedURLException e) { return false; } } public NodeList process(NodeList nodes) throws InvalidMarkupException { NodeList links = nodes.filterByTagName("a|link", true); for (Node n : links.list()) { //Type may be any of 'folio', 'data-link', 'ole', or 'class-object', as the attributes from the object definition have been merged in //Limit this exclusively to web links; data links are physical paths. if (TokenUtils.fastMatches("folio|data-link", n.get("type"))) { continue; } String url = n.get("href"); String repairedUrl = repairUrl(url); if (repairedUrl != null && !repairedUrl.equals(url)) { n.set("href", repairedUrl); n.setTagName("a"); //Just those we change. } else { if (validUrl(url)) { n.setTagName("a"); //And those that validate as a correct URL w/schema & everything. } } } return nodes; } public String repairUrl(String url) { //Backslashes indicate a physical path; do not touch. if (url == null || url.contains("\\")) return url; //First check if the scheme is missing if (!scheme.matcher(url).find()) { //Try to repair http:/ -> http:// String url2 = missingslash.matcher(url).replaceFirst("$1://$2"); //If the scheme is still missing, add it if (!scheme.matcher(url2).find()) { url2 = "http://" + url.trim(); } //If that causes the URL to be correctly formed, return that repaired url if (validUrl(url2)) { return url2; } } return url; } }