/*
* Copyright (c) 2006-2013 by Public Library of Science
*
* http://plos.org
* http://ambraproject.org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ambraproject.util;
import java.io.StringReader;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URI;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.regex.Pattern;
import java.util.List;
import java.util.ArrayList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import com.opensymphony.util.UrlUtils;
import org.xml.sax.InputSource;
import sun.misc.BASE64Encoder;
/**
* Provides some useful text manipulation functions.
*/
public class TextUtils {
// Scheme prefix that makeValidUrl() prepends when the supplied URL does not verify on its own.
public static final String HTTP_PREFIX = "http://";
// Character class used by isPotentiallyMalicious(): any of < > " ' % ; ( ) & +
private static final Pattern maliciousContentPattern = Pattern.compile("[<>\"\'%;()&+]");
// Line separators replaced by makeHtmlLineBreaks(): Unicode line separators (Zl), CRLF, LF,
// NEL (U+0085) and Unicode paragraph separators (Zp). A lone carriage return is not matched.
private static final Pattern lineBreakPattern = Pattern.compile("\\p{Zl}|\r\n|\n|\u0085|\\p{Zp}");
// Wiki-style markers: three apostrophes (strong), two (emphasis), five (strong + emphasis).
private static final Pattern strongPattern = Pattern.compile("'''");
private static final Pattern emphasizePattern = Pattern.compile("''");
private static final Pattern strongEmphasizePattern = Pattern.compile("'''''");
// Wiki-style markers: two carets (superscript) and two tildes (subscript).
private static final Pattern superscriptPattern = Pattern.compile("\\^\\^");
private static final Pattern subscriptPattern = Pattern.compile("~~");
// Class logger; used by transformXMLtoHtmlText() to report transformation failures.
private static Logger log = LoggerFactory.getLogger(TextUtils.class);
/**
 * Computes the hash of a string.
 * <p/>
 * The string is converted to bytes with {@link String#getBytes()} (no charset is named, so the
 * platform default applies) and the bytes are handed to {@link #createHash(byte[])}.
 *
 * @param string the text to hash
 *
 * @return the hash of the string
 */
public static String createHash(String string) {
  final byte[] rawBytes = string.getBytes();
  return createHash(rawBytes);
}
/**
 * Computes the SHA-1 digest of a byte array and returns it in the text form produced by
 * {@code encodeText}.
 *
 * @param bytes the data to hash
 *
 * @return the encoded hash of the byte array
 */
public static String createHash(byte[] bytes) {
  final MessageDigest sha1;
  try {
    sha1 = MessageDigest.getInstance("SHA-1");
  } catch (NoSuchAlgorithmException ex) {
    // Surface a missing algorithm as an unchecked failure, keeping the cause.
    throw new RuntimeException(ex);
  }
  // digest(byte[]) feeds the input and completes the hash in one call.
  return encodeText(sha1.digest(bytes));
}
/**
 * Produces a String value suitable for rendering in HTML for the given binary data.
 * <p/>
 * The data is Base64 encoded, every slash is replaced by an underscore, surrounding whitespace is
 * trimmed, and the final character is dropped unconditionally (the intent being to lose the
 * trailing "=" padding character).
 */
private static String encodeText(byte[] data) {
  final String encoded = new BASE64Encoder().encodeBuffer(data).replace('/', '_').trim();
  final int lastIndex = encoded.length() - 1;
  return encoded.substring(0, lastIndex);
}
/**
 * Converts a list of URIs into a list holding the {@code toString()} form of each URI, in the
 * same order.
 *
 * @param list a List of URIs
 * @return a new list of strings, one per URI
 */
public static List<String> toStringList(List<URI> list) {
  final List<String> result = new ArrayList<String>();
  final java.util.Iterator<URI> uris = list.iterator();
  while (uris.hasNext()) {
    result.add(uris.next().toString());
  }
  return result;
}
/**
 * Replaces every line separator in the input with a {@code <br/>} tag so that the text keeps its
 * line structure when displayed as HTML. Blank input (null, empty or whitespace only) is returned
 * unchanged.
 *
 * @param input HTML
 * @return the input with each line separator replaced by {@code <br/>}
 */
public static String makeHtmlLineBreaks (final String input) {
  return StringUtils.isBlank(input)
      ? input
      : lineBreakPattern.matcher(input).replaceAll("<br/>");
}
/**
 * Turns wiki-style <code>'''</code> markers into alternating {@code <strong>} and
 * {@code </strong>} tags suitable for display as HTML.
 * For example, <code>foo '''bar''' baz</code> becomes
 * <code>foo &lt;strong&gt;bar&lt;/strong&gt; baz</code>.
 * <p/>
 * The <code>strong</code> tag is used instead of the <code>b</code> tag
 * because <code>strong</code> is preferred for CSS styling.
 * <p/>
 * Markers are not checked for pairing: the first marker opens a tag, the second closes it, and
 * so on, so an odd number of markers leaves a tag open.
 *
 * @param input HTML
 * @return the input with each <code>'''</code> replaced by an opening or closing strong tag;
 *         the input itself when it is blank or contains no marker
 */
public static String makeHtmlStrong (final String input) {
  if (StringUtils.isBlank(input)) {
    return input;
  }
  final java.util.regex.Matcher markup = strongPattern.matcher(input);
  if (!markup.find()) {
    return input; // nothing to transform
  }
  final StringBuffer html = new StringBuffer(input.length() + 16);
  boolean opening = true; // the next marker opens a tag when true, closes one when false
  do {
    markup.appendReplacement(html, opening ? "<strong>" : "</strong>");
    opening = !opening;
  } while (markup.find());
  markup.appendTail(html);
  return html.toString();
}
/**
 * Turns wiki-style <code>''</code> markers into alternating {@code <em>} and {@code </em>} tags
 * suitable for display as HTML.
 * <p/>
 * For example, <code>foo ''bar'' baz</code> becomes <code>foo &lt;em&gt;bar&lt;/em&gt; baz</code>.
 * <p/>
 * The <code>em</code> tag is used instead of the <code>i</code> tag
 * because <code>em</code> is preferred for CSS styling.
 * <p/>
 * Markers are not checked for pairing: the first marker opens a tag, the second closes it, and
 * so on, so an odd number of markers leaves a tag open.
 *
 * @param input HTML
 * @return the input with each <code>''</code> replaced by an opening or closing em tag;
 *         the input itself when it is blank or contains no marker
 */
public static String makeHtmlEmphasized (final String input) {
  if (StringUtils.isBlank(input)) {
    return input;
  }
  final java.util.regex.Matcher markup = emphasizePattern.matcher(input);
  if (!markup.find()) {
    return input; // nothing to transform
  }
  final StringBuffer html = new StringBuffer(input.length() + 16);
  boolean opening = true; // the next marker opens a tag when true, closes one when false
  do {
    markup.appendReplacement(html, opening ? "<em>" : "</em>");
    opening = !opening;
  } while (markup.find());
  markup.appendTail(html);
  return html.toString();
}
/**
 * Turns wiki-style <code>'''''</code> markers into alternating {@code <strong><em>} and
 * {@code </em></strong>} tag pairs suitable for display as HTML.
 * <p/>
 * For example, <code>foo '''''bar''''' baz</code> becomes
 * <code>foo &lt;strong&gt;&lt;em&gt;bar&lt;/em&gt;&lt;/strong&gt; baz</code>.
 * <p/>
 * The <code>em</code> and <code>strong</code> tags are used instead of <code>i</code> and
 * <code>b</code> because they are preferred for CSS styling.
 * <p/>
 * Markers are not checked for pairing: the first marker opens the tags, the second closes them,
 * and so on, so an odd number of markers leaves the tags open.
 *
 * @param input HTML
 * @return the input with each <code>'''''</code> replaced by opening or closing strong/em tags;
 *         the input itself when it is blank or contains no marker
 */
public static String makeHtmlStrongEmphasized (final String input) {
  if (StringUtils.isBlank(input)) {
    return input;
  }
  final java.util.regex.Matcher markup = strongEmphasizePattern.matcher(input);
  if (!markup.find()) {
    return input; // nothing to transform
  }
  final StringBuffer html = new StringBuffer(input.length() + 32);
  boolean opening = true; // the next marker opens the tags when true, closes them when false
  do {
    markup.appendReplacement(html, opening ? "<strong><em>" : "</em></strong>");
    opening = !opening;
  } while (markup.find());
  markup.appendTail(html);
  return html.toString();
}
/**
 * Turns wiki-style <code>^^</code> markers into alternating {@code <sup>} and {@code </sup>}
 * tags suitable for display as HTML.
 * <p/>
 * For example, <code>foo ^^bar^^ baz</code> becomes
 * <code>foo &lt;sup&gt;bar&lt;/sup&gt; baz</code>.
 * <p/>
 * Markers are not checked for pairing: the first marker opens a tag, the second closes it, and
 * so on, so an odd number of markers leaves a tag open.
 *
 * @param input HTML
 * @return the input with each <code>^^</code> replaced by an opening or closing sup tag;
 *         the input itself when it is blank or contains no marker
 */
public static String makeHtmlSuperscript (final String input) {
  if (StringUtils.isBlank(input)) {
    return input;
  }
  final java.util.regex.Matcher markup = superscriptPattern.matcher(input);
  if (!markup.find()) {
    return input; // nothing to transform
  }
  final StringBuffer html = new StringBuffer(input.length() + 16);
  boolean opening = true; // the next marker opens a tag when true, closes one when false
  do {
    markup.appendReplacement(html, opening ? "<sup>" : "</sup>");
    opening = !opening;
  } while (markup.find());
  markup.appendTail(html);
  return html.toString();
}
/**
 * Turns wiki-style <code>~~</code> markers into alternating {@code <sub>} and {@code </sub>}
 * tags suitable for display as HTML.
 * <p/>
 * For example, <code>foo ~~bar~~ baz</code> becomes
 * <code>foo &lt;sub&gt;bar&lt;/sub&gt; baz</code>.
 * <p/>
 * Markers are not checked for pairing: the first marker opens a tag, the second closes it, and
 * so on, so an odd number of markers leaves a tag open.
 *
 * @param input HTML
 * @return the input with each <code>~~</code> replaced by an opening or closing sub tag;
 *         the input itself when it is blank or contains no marker
 */
public static String makeHtmlSubscript (final String input) {
  if (StringUtils.isBlank(input)) {
    return input;
  }
  final java.util.regex.Matcher markup = subscriptPattern.matcher(input);
  if (!markup.find()) {
    return input; // nothing to transform
  }
  final StringBuffer html = new StringBuffer(input.length() + 16);
  boolean opening = true; // the next marker opens a tag when true, closes one when false
  do {
    markup.appendReplacement(html, opening ? "<sub>" : "</sub>");
    opening = !opening;
  } while (markup.find());
  markup.appendTail(html);
  return html.toString();
}
/**
 * Linkifies the text via {@link #hyperlink(String, int)} and wraps the result in a
 * {@code <p>} element.
 *
 * @param text text
 * @param maxLength The max length (in displayed characters) of the text shown inside each anchor tag
 * @return hyperlinked text enclosed in {@code <p>} and {@code </p>}
 */
public static String hyperlinkEnclosedWithPTags(final String text, int maxLength) {
  return "<p>" + hyperlink(text, maxLength) + "</p>";
}
/**
 * Linkifies the text and wraps the result in a {@code <p>} element, passing a maximum display
 * length of zero to {@link #hyperlinkEnclosedWithPTags(String, int)}.
 *
 * @param text text
 * @return hyperlinked text enclosed in {@code <p>} and {@code </p>}
 */
public static String hyperlinkEnclosedWithPTags(final String text) {
  final int maxLength = 0;
  return hyperlinkEnclosedWithPTags(text, maxLength);
}
/**
 * Linkifies any possible web links in the text, excepting email addresses.
 * <p/>
 * Blank text is returned unchanged. Otherwise the text is passed to {@code linkURL}. As a
 * workaround for a closing parenthesis being swallowed into a link (e.g. in
 * "(see http://www.domain.com)"), parentheses are temporarily swapped for curly braces before
 * linking and swapped back afterwards. The swap is skipped when the text already contains a
 * curly brace, because the braces could not be told apart when restoring.
 *
 * @param text text
 * @param maxLength The max length (in displayed characters) of the text shown inside each anchor tag
 * @return hyperlinked text
 */
public static String hyperlink(final String text, int maxLength) {
  if (StringUtils.isBlank(text)) {
    return text;
  }
  final boolean hasCurlyBrace = text.indexOf('}') >= 0 || text.indexOf('{') >= 0;
  if (hasCurlyBrace) {
    return linkURL(text, null, maxLength);
  }
  // Hide the parentheses from the linker, then restore them in its output.
  final String masked = text.replace('(', '{').replace(')', '}');
  final String linked = linkURL(masked, null, maxLength);
  final String openRestored = StringUtils.replace(linked, "{", "(");
  return StringUtils.replace(openRestored, "}", ")");
}
/**
 * Linkifies any possible web links in the text, excepting email addresses, passing a maximum
 * display length of zero to {@link #hyperlink(String, int)}.
 *
 * @param text text
 * @return hyperlinked text
 */
public static String hyperlink(final String text) {
  final int maxLength = 0;
  return hyperlink(text, maxLength);
}
/**
 * Returns the HTML-escaped form of the content, which is useful for making dangerous scripts
 * safe to render.
 * <p/>
 * After escaping, line breaks are replaced with HTML break tags and the wiki-type markers are
 * transformed into HTML tags.
 *
 * @param bodyContent bodyContent
 * @return escaped html text
 */
public static String escapeHtml(final String bodyContent) {
  String html = StringEscapeUtils.escapeHtml(bodyContent);
  html = makeHtmlLineBreaks(html);
  // Order matters for the apostrophe markers: the five-apostrophe marker has to be consumed
  // before the three- and two-apostrophe markers get a chance to match parts of it.
  html = makeHtmlStrongEmphasized(html);
  html = makeHtmlStrong(html);
  html = makeHtmlEmphasized(html);
  html = makeHtmlSuperscript(html);
  return makeHtmlSubscript(html);
}
/**
 * Escapes the content with {@link #escapeHtml(String)} and then linkifies it inside a
 * {@code <p>} element with {@link #hyperlinkEnclosedWithPTags(String, int)}.
 *
 * @param bodyContent bodyContent
 * @return Return escaped and hyperlinked text
 */
public static String escapeAndHyperlink(final String bodyContent) {
  final String escaped = escapeHtml(bodyContent);
  return hyperlinkEnclosedWithPTags(escaped, 0);
}
/**
 * Serializes a DOM node (for example an org.w3c.dom.Document) to a String, without the XML
 * declaration.
 *
 * @param node node to serialize
 * @return String representation of node
 * @throws TransformerException if the transformer cannot be created or the transformation fails
 */
public static String getAsXMLString(final Node node) throws TransformerException {
  final StringWriter out = new StringWriter();
  final Transformer serializer = TransformerFactory.newInstance().newTransformer();
  serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
  serializer.transform(new DOMSource(node), new StreamResult(out));
  return out.toString();
}
/**
 * Checks whether a string can be parsed as a URI and then converted to a URL.
 *
 * @param url A URL
 * @return true when both the parse and the conversion succeed; false when either step throws
 */
public static boolean verifyUrl(final String url) {
  boolean valid;
  try {
    // Only the absence of an exception matters; the resulting URL is discarded.
    new URI(url).toURL();
    valid = true;
  } catch (Exception e) {
    valid = false;
  }
  return valid;
}
/**
 * Makes a valid url from the given input url or url fragment.
 * <p/>
 * The input is returned as is when it already verifies. Otherwise {@link #HTTP_PREFIX} is
 * prepended and the result is returned if that verifies.
 *
 * @param url url
 * @return valid url
 * @throws MalformedURLException if {@code url} is null, or if neither the url nor its
 *         http-prefixed form verifies
 */
public static String makeValidUrl(final String url) throws MalformedURLException {
  // Without this guard a null would be concatenated into the well-formed but bogus "http://null".
  if (url == null) {
    throw new MalformedURLException("Invalid url:" + url);
  }
  if (verifyUrl(url)) {
    return url;
  }
  final String prefixedUrl = HTTP_PREFIX + url;
  if (verifyUrl(prefixedUrl)) {
    return prefixedUrl;
  }
  throw new MalformedURLException("Invalid url:" + url);
}
/**
 * Checks whether the input text is potentially malicious, i.e. whether it contains at least one
 * character matched by {@code maliciousContentPattern}. For more details read:
 * http://www.dwheeler.com/secure-programs/Secure-Programs-HOWTO/cross-site-malicious-content.html
 *
 * @param text text
 * @return true if a suspicious character is found
 */
public static boolean isPotentiallyMalicious(final String text) {
  final java.util.regex.Matcher scan = maliciousContentPattern.matcher(text);
  return scan.find();
}
/**
* Escape html entity characters and high characters (eg "curvy" Word quotes).
* Note this method can also be used to encode XML.
*
* @param s the String to escape.
* @param encodeSpecialChars if true high characters will be encode other wise not.
* @return the escaped string
*/
private static String htmlEncode(String s, boolean encodeSpecialChars) {
s = noNull(s, "");
StringBuilder str = new StringBuilder();
for (int j = 0; j < s.length(); j++) {
char c = s.charAt(j);
// encode standard ASCII characters into HTML entities where needed
if (c < '\200') {
switch (c) {
case '"':
str.append(""");
break;
case '&':
str.append("&");
break;
case '<':
str.append("<");
break;
case '>':
str.append(">");
break;
default:
str.append(c);
}
}
// encode 'ugly' characters (ie Word "curvy" quotes etc)
else if (encodeSpecialChars && (c < '\377')) {
String hexChars = "0123456789ABCDEF";
int a = c % 16;
int b = (c - a) / 16;
str.append("")
.append(hexChars.charAt(b))
.append(hexChars.charAt(a))
.append(';');
}
//add other characters back in - to handle charactersets
//other than ascii
else {
str.append(c);
}
}
return str.toString();
}
/**
 * Wrap all urls ('abc://' and 'www.abc') in specified string with href tags.
 * Any link text beyond the length given by the maxDisplayLength parameter is dropped and three
 * periods ("...") are added in its place.
 *
 * @param str The block of text to check.
 * @param target The target to use for the href (optional).
 * @param maxDisplayLength The max length (in displayed characters) of the text shown inside each anchor tag
 * @return String The block of text with all url's placed in href tags.
 */
//TODO: If openSymphony's implementation of this method one day matches this, we can remove this class
private static String linkURL(String str, String target, int maxDisplayLength) {
  // Leave a little headroom for the markup that is about to be inserted.
  final int capacity = (int) (str.length() * 1.05);
  final StringBuilder buffer = new StringBuilder(capacity).append(str);
  linkURL(buffer, target, maxDisplayLength);
  return buffer.toString();
}
/**
 * Return <code>string</code>, or <code>defaultString</code> if
 * <code>string</code> is <code>null</code> or <code>""</code>.
 * The result is only non-null if <code>defaultString</code> is non-null.
 *
 * <p>Examples:</p>
 * <pre>
 * // prints "hello"
 * String s=null;
 * System.out.println(TextUtils.noNull(s,"hello"));
 *
 * // prints "hello"
 * s="";
 * System.out.println(TextUtils.noNull(s,"hello"));
 *
 * // prints "world"
 * s="world";
 * System.out.println(TextUtils.noNull(s, "hello"));
 * </pre>
 *
 * @param string the String to check.
 * @param defaultString The default string to return if <code>string</code> is <code>null</code> or <code>""</code>
 * @return <code>string</code> if <code>string</code> is non-empty, and <code>defaultString</code> otherwise
 * @see #stringSet(String)
 */
private static String noNull(String string, String defaultString) {
  if (stringSet(string)) {
    return string;
  }
  return defaultString;
}
/**
 * Check whether <code>string</code> has been set to
 * something other than <code>""</code> or <code>null</code>.
 * A string made up only of whitespace counts as set.
 *
 * @param string the <code>String</code> to check
 * @return a boolean indicating whether the string was non-empty (and non-null)
 */
private static boolean stringSet(String string) {
  return string != null && string.length() > 0;
}
/**
 * Get the starting index of the next URL (either 'abc://' or 'www.'), searching from the
 * position after <code>startIndex</code>. When both forms are present the earlier one wins.
 *
 * @param str String builder
 * @param startIndex index after which the search begins
 * @return index where the next URL starts, or -1 when neither form is found
 */
private static int getStartUrl(StringBuilder str, int startIndex) {
  final int schemeStart = getSchemeIndex(str, startIndex);
  final int wwwStart = str.indexOf("www.", startIndex + 1);
  if (schemeStart == -1) {
    // Only the www form can be present; this is also -1 when neither form was found.
    return wwwStart;
  }
  if (wwwStart == -1) {
    return schemeStart;
  }
  return Math.min(schemeStart, wwwStart);
}
private static void linkURL(StringBuilder str, String target, int maxDisplayLength) {
String urlToDisplay;
int lastEndIndex = -1; //Stores the index position, within the whole string, of the ending char of the last URL found.
String targetString = ((target == null) || (target.trim().length() == 0)) ? "" : (" target=\"" + target.trim() + '\"');
while (true) {
int linkStartIndex = getStartUrl(str, lastEndIndex);
//if no more links found - then end the loop
if (linkStartIndex == -1) {
break;
} else {
//Get the whole URL...
//We move forward and add each character to the URL string until we encounter
//an invalid URL character (we assume that the URL ends there).
int linkEndIndex = linkStartIndex;
String urlStr = "";
while (true) {
// if char at linkEndIndex is '&' then we look at the next 4 chars
// to see if they make up "&" altogether. This is the html coded
// '&' and will pretty much stuff up an otherwise valid link becos of the ';'.
// We therefore have to remove it before proceeding...
if (str.charAt(linkEndIndex) == '&') {
if (((linkEndIndex + 6) <= str.length()) && """.equals(str.substring(linkEndIndex, linkEndIndex + 6))) {
break;
} else if (((linkEndIndex + 5) <= str.length()) && "&".equals(str.substring(linkEndIndex, linkEndIndex + 5))) {
str.replace(linkEndIndex, linkEndIndex + 5, "&");
}
}
if (UrlUtils.isValidURLChar(str.charAt(linkEndIndex))) {
urlStr += str.charAt(linkEndIndex);
linkEndIndex++;
if (linkEndIndex == str.length()) { //Reached end of str...
break;
}
} else {
break;
}
}
//if the characters before the linkStart equal 'href="' then don't link the url - CORE-44
if (linkStartIndex >= 6) { //6 = "href\"".length()
String prefix = str.substring(linkStartIndex - 6, linkStartIndex);
if ("href=\"".equals(prefix)) {
lastEndIndex = linkEndIndex;
continue;
}
}
//if the characters after the linkEnd are '</a>' then this url is probably already linked - CORE-44
if (str.length() >= (linkEndIndex + 4)) { //4 = "</a>".length()
String suffix = str.substring(linkEndIndex, linkEndIndex + 4);
if ("</a>".equals(suffix)) {
lastEndIndex = linkEndIndex + 4;
continue;
}
}
//Decrement linkEndIndex back by 1 to reflect the real ending index position of the URL...
linkEndIndex--;
// If the last char of urlStr is a '.' we exclude it. It is most likely a full stop and
// we don't want that to be part of an url.
while (true) {
char lastChar = urlStr.charAt(urlStr.length() - 1);
if (lastChar == '.') {
urlStr = urlStr.substring(0, urlStr.length() - 1);
linkEndIndex--;
} else {
break;
}
}
//if the URL had a '(' before it, and has a ')' at the end, trim the last ')' from the url
//ie '(www.opensymphony.com)' => '(<a href="http://www.openymphony.com/">www.opensymphony.com</a>)'
char lastChar = urlStr.charAt(urlStr.length() - 1);
if (lastChar == ')') {
if ((linkStartIndex > 0) && ('(' == (str.charAt(linkStartIndex - 1)))) {
urlStr = urlStr.substring(0, urlStr.length() - 1);
linkEndIndex--;
}
} else if (lastChar == '\'') {
if ((linkStartIndex > 0) && ('\'' == (str.charAt(linkStartIndex - 1)))) {
urlStr = urlStr.substring(0, urlStr.length() - 1);
linkEndIndex--;
}
}
//perhaps we ended with '>', '<' or '"'
//We need to strip these
//ie '"www.opensymphony.com"' => '"<a href="http://www.openymphony.com/">www.opensymphony.com</a>"'
//ie '<www.opensymphony.com>' => '<<a href="http://www.openymphony.com/">www.opensymphony.com</a>>'
else if (lastChar == ';') {
// 6 = """.length()
if ((urlStr.length() > 6) && """.equalsIgnoreCase(urlStr.substring(urlStr.length() - 6))) {
urlStr = urlStr.substring(0, urlStr.length() - 6);
linkEndIndex -= 6;
}
// 4 = "<".length() || ">".length()
else if (urlStr.length() > 4) {
final String endingStr = urlStr.substring(urlStr.length() - 4);
if ("<".equalsIgnoreCase(endingStr) || ">".equalsIgnoreCase(endingStr)) {
urlStr = urlStr.substring(0, urlStr.length() - 4);
linkEndIndex -= 4;
}
}
}
// we got the URL string, now we validate it and convert it into a hyperlink...
if (maxDisplayLength > 0 && urlStr.length() > maxDisplayLength) {
urlToDisplay = htmlEncode(urlStr.substring(0, maxDisplayLength), true) + "...";
} else {
urlToDisplay = htmlEncode(urlStr, true);
}
if (urlStr.toLowerCase().startsWith("www.")) {
urlStr = "http://" + urlStr;
}
if (UrlUtils.verifyHierachicalURI(urlStr)) {
//Construct the hyperlink for the url...
String urlLink;
if (maxDisplayLength > 0 && urlStr.length() > maxDisplayLength) {
//urlLink = "<a href=\"" + urlStr + "\"" + targetString + ">" + urlToDisplay + "</a>";
urlLink = "<a href=\"" + urlStr + "\"" + targetString + " title=\"" + htmlEncode(urlStr, true) + "\">" + urlToDisplay + "</a>";
} else {
urlLink = "<a href=\"" + urlStr + "\"" + targetString + ">" + urlToDisplay + "</a>";
}
//urlLink = "<a href=\"" + urlStr + '\"' + targetString + '>' + urlToDisplay + "</a>";
//Remove the original urlStr from str and put urlLink there instead...
str.replace(linkStartIndex, linkEndIndex + 1, urlLink);
//Set lastEndIndex to reflect the position of the end of urlLink
//within the whole string...
lastEndIndex = (linkStartIndex - 1) + urlLink.length();
} else {
//lastEndIndex is different from the one above cos' there's no
//<a href...> tags added...
lastEndIndex = (linkStartIndex - 1) + urlStr.length();
}
}
}
}
/**
 * Given a string, and the index to start looking at, find the index of the start of the scheme. Eg.
 * <pre>
 * getSchemeIndex("notes://abc", 0) -> 0
 * getSchemeIndex("abc notes://abc", 0) -> 4
 * </pre>
 * The search for the scheme separator (<code>UrlUtils.SCHEME_URL</code>) begins at
 * <code>startIndex + 1</code>. A separator found at the very start of the string is treated as
 * not found.
 *
 * @param str The string to search in
 * @param startIndex Where to start looking at
 * @return The index where the scheme starts, or -1 if no scheme separator was found.
 */
private static int getSchemeIndex(StringBuilder str, int startIndex) {
  final int separatorIndex = str.indexOf(UrlUtils.SCHEME_URL, startIndex + 1);
  //not found, or found at the start of the string: report 'not found'
  if (separatorIndex <= 0) {
    return -1;
  }
  //walk backwards from the separator for as long as the preceding character is a valid scheme character
  int schemeStart = separatorIndex;
  while (schemeStart > 0 && UrlUtils.isValidSchemeChar(str.charAt(schemeStart - 1))) {
    schemeStart--;
  }
  //The scheme text itself is deliberately not validated here; doing so would let one invalid
  //scheme ruin the linking for later schemes.
  return schemeStart;
}
/**
 * Remove all of the XML and HTML tags from the <code>s</code> parameter.
 * The regular expression in this method removes everything between two "innermost" angle
 * brackets (e.g., {@code <...>}), so it may accidentally remove sections of text that are not
 * tags, just because both a "less than" and a "greater than" symbol exist with no tag between
 * them.
 * <p/>
 * For instance, the title {@code "Yak mass < whale mass, but yak mass > weasel mass"} would
 * be reduced to {@code "Yak mass  weasel mass"}, which is very much not the desired result.
 * That is why this method is prefaced with the label "simple".
 * <p/>
 * Note that the above example only fails because there is no tag between the
 * {@code <} and {@code >} for this method to remove.
 * If the title was, instead, {@code "Yak mass < whale mass, <p>but yak mass > weasel mass"},
 * then the {@code <p>} tag would be removed and the rest of the title would be left alone.
 *
 * TODO: Augment the regular expression to fix the above corner case. This can be accomplished by
 * todo: ensuring all opening tags have matching closing tags, then handling valid singleton tags
 * todo: (e.g., {@code <p/>}) as special cases.
 *
 * @param s The String which will have all of its tags removed
 * @return The <code>s</code> parameter with all tags removed
 */
public static String simpleStripAllTags(String s) {
  final Pattern innermostBrackets = Pattern.compile("<[^<>]*?>");
  return innermostBrackets.matcher(s).replaceAll("");
}
/**
 * Transform an xml string to html text.
 * <p/>
 * The content is wrapped in a temporary root element, parsed, and serialized with the "text"
 * output method, which drops the elements; the remaining text is then escaped using html
 * entities. If any step fails, the failure is logged and an empty string is returned.
 *
 * @param xmlContent xml
 * @return html text; an empty string when <code>xmlContent</code> is null or cannot be transformed
 */
public static String transformXMLtoHtmlText(String xmlContent)
{
  if (xmlContent == null) {
    return "";
  }
  String htmlContent = "";
  try {
    // surround the xml content with temporary root element to make sure that it can be parsed.
    final String wrapped = "<temprootelement>" + xmlContent + "</temprootelement>";
    final DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    final Document parsed = parser.parse(new InputSource(new StringReader(wrapped)));
    // remove all the elements from the xml content
    final StringWriter plainText = new StringWriter();
    final Transformer textOnly = TransformerFactory.newInstance().newTransformer();
    textOnly.setOutputProperty(OutputKeys.METHOD, "text");
    textOnly.transform(new DOMSource(parsed), new StreamResult(plainText));
    htmlContent = plainText.toString();
    // make sure all the characters are escaped using html entities
    htmlContent = StringEscapeUtils.escapeHtml(htmlContent);
  } catch (Exception e) {
    log.info("Failed to transform " + xmlContent + " to html text", e);
  }
  return htmlContent;
}
/**
 * Truncates text to at most <code>truncatedLength</code> characters, appending "..." when text
 * was cut.
 * <p/>
 * Blank text and text that already fits are returned unchanged. Otherwise the text is cut at the
 * last whitespace character that still leaves room for the "..." suffix; if there is no such
 * whitespace the text is cut at <code>truncatedLength - 4</code> characters. When
 * <code>truncatedLength</code> is too small to hold any text plus the suffix (3 or less), the
 * text is simply cut to <code>truncatedLength</code> characters without a suffix (an empty
 * string for zero or negative lengths).
 *
 * @param text text to truncate
 * @param truncatedLength maximum length of the returned text
 * @return truncated text
 */
public static String truncateText(String text, int truncatedLength) {
  if (StringUtils.isBlank(text) || text.length() <= truncatedLength) {
    return text;
  }
  final String abrsfx = "...";
  final int abrsfxlen = abrsfx.length();
  // Not enough room for the suffix: hard cut instead of indexing before the start of the text.
  if (truncatedLength <= abrsfxlen) {
    return text.substring(0, Math.max(truncatedLength, 0));
  }
  // Highest cut position that still leaves room for the suffix; also the fallback cut.
  final int limit = truncatedLength - abrsfxlen - 1;
  int index = limit;
  // attempt to truncate on a word boundary (position 0 is never used as a boundary)
  for (int i = limit; i > 0; i--) {
    if (Character.isWhitespace(text.charAt(i))) {
      index = i;
      break;
    }
  }
  final String truncated = text.substring(0, index) + abrsfx;
  assert truncated.length() <= truncatedLength;
  return truncated;
}
/**
 * Truncates text with {@link #truncateText(String, int)} and, if the truncation left the last
 * <code>&lt;i&gt;</code> tag without a closing <code>&lt;/i&gt;</code>, appends the closing tag.
 * Only the <code>i</code> tag is handled.
 *
 * @param text text to truncate
 * @param truncatedLength truncate length
 * @return truncated text; null when <code>text</code> is null
 */
public static String truncateTextCloseOpenTag(String text, final int truncatedLength) {
  final String shortenedText = truncateText(text, truncatedLength);
  // truncateText hands null straight back; there is nothing to close in that case.
  if (shortenedText == null) {
    return null;
  }
  final int openIndex = shortenedText.lastIndexOf("<i>");
  final boolean leftOpen = openIndex != -1 && shortenedText.indexOf("</i>", openIndex) == -1;
  return leftOpen ? shortenedText + "</i>" : shortenedText;
}
/**
 * Create a display string from a list of authors.
 * <p/>
 * Up to three authors are joined with ", " as they are. With more than three, only the first,
 * second and last authors are used (each trimmed), with "[...]" standing in for the rest.
 *
 * @param authors the list of authors
 *
 * @return a combined string of first, second and last authors
 */
public static String makeAuthorString(String[] authors) {
  final int count = authors.length;
  if (count > 3) {
    //use first two and last.
    final StringBuilder combined = new StringBuilder();
    combined.append(authors[0].trim()).append(", ");
    combined.append(authors[1].trim()).append(", [...], ");
    combined.append(authors[count - 1].trim());
    return combined.toString();
  }
  return StringUtils.join(authors, ", ");
}
}