/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import com.google.common.base.Function;
import com.google.common.collect.MapMaker;
public class TextUtils {
/**
 * Regex used by getFirstWord(): group 1 captures the leading run of
 * non-whitespace characters, the remainder of the input is consumed by
 * {@code .*}.
 */
private static final String FIRSTWORD = "^([^\\s]*).*$";

/**
 * Per-thread holding area for idle Matchers, keyed by the regex string
 * they were compiled from. Each thread lazily gets its own HashMap.
 */
private static final ThreadLocal<Map<String,Matcher>> TL_MATCHER_MAP =
    new ThreadLocal<Map<String,Matcher>>() {
        @Override
        protected Map<String,Matcher> initialValue() {
            return new HashMap<String,Matcher>(50);
        }
    };

/**
 * Process-wide cache of compiled Patterns, keyed by regex string. Values
 * are softly referenced; a missing entry is compiled on demand by the
 * computing function below.
 */
private static final ConcurrentMap<String, Pattern> PATTERNS =
    new MapMaker()
        .concurrencyLevel(16)
        .softValues()
        .makeComputingMap(new Function<String, Pattern>() {
            public Pattern apply(String expr) {
                return Pattern.compile(expr);
            }
        });
/**
 * Obtain a Matcher for the given regex, positioned on the given input.
 *
 * The regex is compiled at most once (compiled Patterns are shared via
 * the PATTERNS cache). In addition, each thread can park one idle Matcher
 * per regex; if the calling thread has one parked it is taken out of the
 * holding map, reset onto the new input and handed back instead of
 * allocating a new Matcher.
 *
 * Callers should hand the Matcher back with recycleMatcher() when done,
 * but nothing breaks if they do not.
 *
 * The input is always wrapped in an InterruptibleCharSequence before it
 * is given to the Matcher.
 *
 * This method is a frequently-hit hotspot.
 *
 * @param pattern regular expression, as a String; must not be null
 * @param input the character sequence the matcher should operate on
 * @return a matcher for {@code pattern} loaded with (the wrapped) input
 * @throws IllegalArgumentException if {@code pattern} is null
 */
public static Matcher getMatcher(String pattern, CharSequence input) {
    if (pattern == null) {
        throw new IllegalArgumentException("String 'pattern' must not be null");
    }
    final CharSequence guarded = new InterruptibleCharSequence(input);
    final Map<String,Matcher> parked = TL_MATCHER_MAP.get();
    final Matcher idle = parked.get(pattern);
    if (idle != null) {
        // take it out of circulation while the caller is using it
        parked.put(pattern, null);
        idle.reset(guarded);
        return idle;
    }
    // nothing parked for this thread: build a fresh Matcher from the
    // shared compiled Pattern
    return PATTERNS.get(pattern).matcher(guarded);
}
/**
 * Park a Matcher obtained from getMatcher() so that the current thread
 * can reuse it for a later request with the same regex.
 *
 * The Matcher is first reset onto an empty string so that, while parked,
 * it does not keep a (potentially large) previous input reachable. It is
 * then stored in the calling thread's holding map under its regex string,
 * replacing whatever was stored there before.
 *
 * @param m the Matcher to park; must not be null
 */
public static void recycleMatcher(Matcher m) {
    m.reset("");
    TL_MATCHER_MAP.get().put(m.pattern().pattern(), m);
}
/**
 * Replace every match of a regex in the input, like
 * {@link String#replaceAll(String, String)}, but using the cached
 * Pattern and the per-thread recycled Matcher provided by getMatcher().
 *
 * @see java.util.regex.Matcher#replaceAll(String)
 * @param pattern regular expression, as a String; must not be null
 * @param input the character sequence to search
 * @param replacement the replacement string for every match (may contain
 * group references, as for Matcher.replaceAll)
 * @return the input with all matches substituted
 * @throws IllegalArgumentException if {@code pattern} is null
 */
public static String replaceAll(
        String pattern, CharSequence input, String replacement) {
    // getMatcher() wraps the input in an InterruptibleCharSequence itself,
    // so it is not wrapped a second time here
    final Matcher m = getMatcher(pattern, input);
    try {
        return m.replaceAll(replacement);
    } finally {
        // always hand the matcher back (and drop its reference to the
        // input), even if the replacement threw
        recycleMatcher(m);
    }
}
/**
 * Replace the first match of a regex in the input, like
 * {@link String#replaceFirst(String, String)}, but using the cached
 * Pattern and the per-thread recycled Matcher provided by getMatcher().
 *
 * @see java.util.regex.Matcher#replaceFirst(String)
 * @param pattern regular expression, as a String; must not be null
 * @param input the character sequence to search
 * @param replacement the replacement string for the first match (may
 * contain group references, as for Matcher.replaceFirst)
 * @return the input with the first match substituted
 * @throws IllegalArgumentException if {@code pattern} is null
 */
public static String replaceFirst(
        String pattern, CharSequence input, String replacement) {
    // getMatcher() wraps the input in an InterruptibleCharSequence itself,
    // so it is not wrapped a second time here
    final Matcher m = getMatcher(pattern, input);
    try {
        return m.replaceFirst(replacement);
    } finally {
        // always hand the matcher back (and drop its reference to the
        // input), even if the replacement threw
        recycleMatcher(m);
    }
}
/**
 * Test whether the whole input matches a regex, like
 * {@link String#matches(String)}, but using the cached Pattern and the
 * per-thread recycled Matcher provided by getMatcher().
 *
 * @see java.util.regex.Matcher#matches()
 * @param pattern regular expression, as a String; must not be null
 * @param input the character sequence to check
 * @return true if the entire character sequence matches the regex
 * @throws IllegalArgumentException if {@code pattern} is null
 */
public static boolean matches(String pattern, CharSequence input) {
    // getMatcher() wraps the input in an InterruptibleCharSequence itself,
    // so it is not wrapped a second time here
    final Matcher m = getMatcher(pattern, input);
    try {
        return m.matches();
    } finally {
        // always hand the matcher back (and drop its reference to the
        // input), even if matching threw
        recycleMatcher(m);
    }
}
/**
 * Split the input around matches of a regex, like
 * {@link String#split(String)}, but using the shared cache of compiled
 * Patterns instead of recompiling the regex on every call.
 *
 * Splitting is done by {@link Pattern#split(CharSequence)}, which manages
 * its own Matcher; no recycled Matcher is borrowed for this operation.
 * The input is wrapped in an InterruptibleCharSequence before splitting.
 *
 * @see java.util.regex.Pattern#split(CharSequence)
 * @param pattern regular expression to split by, as a String; must not
 * be null
 * @param input the character sequence to split
 * @return array of Strings computed by splitting the input around matches
 * @throws IllegalArgumentException if {@code pattern} is null
 */
public static String[] split(String pattern, CharSequence input) {
    if (pattern == null) {
        // same contract as getMatcher(), which this method used to go through
        throw new IllegalArgumentException("String 'pattern' must not be null");
    }
    final Pattern compiled = PATTERNS.get(pattern);
    return compiled.split(new InterruptibleCharSequence(input));
}
/**
 * Return the leading run of non-whitespace characters of a string.
 *
 * The string is matched in full against the FIRSTWORD regex and capture
 * group 1 is returned. Consequences worth knowing:
 * <ul>
 * <li>if the string is empty or starts with whitespace, the result is the
 * empty string, not null;</li>
 * <li>if the string as a whole does not match (for instance because it
 * contains a line terminator, which {@code .} does not match by default),
 * the result is null.</li>
 * </ul>
 *
 * @param s String to find first word in (words are delimited by
 * whitespace)
 * @return the first word, possibly empty, or null if the string does not
 * match the FIRSTWORD pattern
 */
public static String getFirstWord(String s) {
    // getMatcher() never returns null, so no null check is needed
    final Matcher m = getMatcher(FIRSTWORD, s);
    try {
        return m.matches() ? m.group(1) : null;
    } finally {
        // always hand the matcher back, even if matching threw
        recycleMatcher(m);
    }
}
/**
 * Escapes a string so that it can be passed as an argument to a
 * JavaScript call embedded in a JSP/HTML page.
 *
 * Two escaping steps are applied in order: first
 * StringEscapeUtils.escapeJavaScript() (JavaScript string-literal
 * escaping: quotes, line breaks and the like become backslash sequences),
 * then escapeForHTML() on that result.
 *
 * @param s The string to escape
 * @return The escaped string
 */
public static String escapeForHTMLJavascript(String s) {
    final String jsEscaped = StringEscapeUtils.escapeJavaScript(s);
    return escapeForHTML(jsEscaped);
}
/**
 * Escapes a string so that it can be placed inside an XML/HTML attribute
 * value.
 *
 * Delegates to StringEscapeUtils.escapeXml(), which replaces ampersand,
 * less-than, greater-than, single-quote and double-quote with their
 * entity forms.
 *
 * @param s The string to escape
 * @return The escaped string
 */
public static String escapeForMarkupAttribute(String s) {
    final String xmlEscaped = StringEscapeUtils.escapeXml(s);
    return xmlEscaped;
}
/**
 * Minimally escapes a string so that it can be placed in HTML text.
 *
 * Only two characters are replaced: ampersand becomes {@code &amp;amp;}
 * and less-than becomes {@code &amp;lt;}. Quotes and greater-than are
 * left untouched, so for attribute values use escapeForMarkupAttribute()
 * instead.
 *
 * The result is built in a single pass over the input.
 *
 * @param s The string to escape; may be null
 * @return The escaped string, or null if {@code s} was null
 */
public static String escapeForHTML(String s) {
    if (s == null) {
        return null;
    }
    final int len = s.length();
    // leave a little headroom for the entities that get expanded
    final StringBuilder escaped = new StringBuilder(len + 16);
    for (int i = 0; i < len; i++) {
        final char c = s.charAt(i);
        switch (c) {
        case '&':
            escaped.append("&amp;");
            break;
        case '<':
            escaped.append("&lt;");
            break;
        default:
            escaped.append(c);
        }
    }
    return escaped.toString();
}
/**
 * Utility method for writing a (potentially large) String to a Writer
 * (e.g. a JspWriter), escaping it for HTML display, without constructing
 * another large String of the whole content.
 *
 * The input is processed line by line: each line is HTML-escaped with
 * StringEscapeUtils.escapeHtml() and written followed by the platform
 * line separator. As a result every output line, including the last, is
 * terminated, and the input's own line terminators are normalized.
 *
 * Output goes straight to the supplied Writer so that a failing write is
 * reported to the caller instead of being silently swallowed. The Writer
 * is neither flushed nor closed here.
 *
 * @param s String to write; must not be null
 * @param w destination Writer
 * @throws IOException if writing to {@code w} fails
 */
public static void writeEscapedForHTML(String s, Writer w)
        throws IOException {
    // same separator PrintWriter.println() would have used
    final String eol = System.getProperty("line.separator");
    final BufferedReader reader = new BufferedReader(new StringReader(s));
    String line;
    while ((line = reader.readLine()) != null) {
        w.write(StringEscapeUtils.escapeHtml(line));
        w.write(eol);
    }
}
/**
 * Replaces HTML entity encodings with the characters they stand for.
 *
 * @param cs The CharSequence to decode HTML entities in; may be null
 * @return null if {@code cs} is null, otherwise the unescaped String
 * produced by StringEscapeUtils.unescapeHtml()
 */
public static CharSequence unescapeHtml(final CharSequence cs) {
    return (cs == null)
        ? null
        : StringEscapeUtils.unescapeHtml(cs.toString());
}
/**
 * Render an optional message plus an exception's stack trace as a String.
 *
 * @param message Message to put at top of the string returned, followed
 * by a newline. May be null or empty, in which case no header line is
 * written.
 * @param e Exception to write into a string; must not be null
 * @return formatted string made of the passed message (if any) and the
 * stack trace of the passed exception
 */
public static String exceptionToString(String message, Throwable e) {
    final StringWriter sw = new StringWriter();
    // only emit the header line when there actually is a message
    if (message != null && message.length() > 0) {
        sw.write(message);
        sw.write("\n");
    }
    final PrintWriter pw = new PrintWriter(sw);
    e.printStackTrace(pw);
    pw.flush();
    return sw.toString();
}
/**
 * URL-escapes a string as UTF-8 without exposing the caller to a checked
 * exception or a deprecation warning.
 *
 * @param s String to escape
 * @return URL-escaped string
 */
@SuppressWarnings("deprecation")
public static String urlEscape(String s) {
    String encoded;
    try {
        encoded = URLEncoder.encode(s, "UTF8");
    } catch (UnsupportedEncodingException unreachable) {
        // every JVM is required to support UTF8, so this should never
        // run; fall back to the platform-default encoder just in case
        encoded = URLEncoder.encode(s);
    }
    return encoded;
}
/**
 * URL-unescapes a string as UTF-8 without exposing the caller to a
 * checked exception or a deprecation warning.
 *
 * @param s String to unescape
 * @return URL-unescaped String
 */
@SuppressWarnings("deprecation")
public static String urlUnescape(String s) {
    String decoded;
    try {
        decoded = URLDecoder.decode(s, "UTF8");
    } catch (UnsupportedEncodingException unreachable) {
        // every JVM is required to support UTF8, so this should never
        // run; fall back to the platform-default decoder just in case
        decoded = URLDecoder.decode(s);
    }
    return decoded;
}
}