/**
* Basic String manipulation utilities.
* (c) Winterwell 2010 and ThinkTank Mathematics 2007
*/
package winterwell.markdown;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.regex.Pattern;
import winterwell.utils.Mutable;
import winterwell.utils.containers.Pair;
/**
* A collection of general-purpose String handling methods.
*
* @author daniel.winterstein
*/
public final class StringMethods {
/**
 * Removes xml tags, comment blocks and script blocks.
 * <p>
 * A {@code <} only opens a tag when it is immediately followed by a
 * non-whitespace character; a {@code <} followed by whitespace, or sitting
 * at the very end of the page, is kept as literal text. Comment blocks run
 * from {@code <!--} to {@code -->} and script blocks from {@code <script}
 * to {@code /script>}; everything inside them (including any {@code >}
 * characters) is discarded.
 *
 * @param page
 *            the text to strip. Must not be null.
 * @return the page with all xml tags removed.
 */
public static String stripTags(String page) {
    final int length = page.length();
    StringBuilder stripped = new StringBuilder(length);
    boolean inTag = false;
    // Comment blocks and script blocks are given special treatment, since
    // they may legitimately contain '>' characters
    boolean inComment = false;
    boolean inScript = false;
    for (int i = 0; i < length; i++) {
        char c = page.charAt(i);
        if (inTag) {
            if (c == '>') {
                inTag = false;
            }
        } else if (inComment) {
            // A comment only ends at "-->", i.e. BOTH preceding chars must
            // be dashes
            if (c == '>' && i >= 2 && page.charAt(i - 1) == '-'
                    && page.charAt(i - 2) == '-') {
                inComment = false;
            }
        } else if (inScript) {
            // startsWith is bounds-safe: it returns false for a negative
            // offset
            if (c == '>' && page.startsWith("/script", i - 7)) {
                inScript = false;
            }
        } else if (c == '<' && i + 1 < length
                && !Character.isWhitespace(page.charAt(i + 1))) {
            // Start of a tag: comment, script-block or normal tag?
            if (page.startsWith("!--", i + 1)) {
                inComment = true;
            } else if (i + 8 < length && page.startsWith("script", i + 1)) {
                inScript = true;
                // Skip over the word "script" so that the '>' closing the
                // opening tag is not tested against "/script"
                i += 7;
            } else {
                inTag = true; // Normal tag by default
            }
        } else {
            // Append all non-tag chars
            stripped.append(c);
        }
    }
    return stripped.toString();
}
/**
 * The local line-end string, taken from the "line.separator" system
 * property when this class is initialised. Typically \n on unix-like
 * systems and \r\n on windows.
 */
public static final String LINEEND = System.getProperty("line.separator");
// Matches a single non-whitespace character.
// NOTE(review): not referenced by any method visible in this class -
// confirm it is unused before removing.
private static final Pattern abc = Pattern.compile("\\S");
/**
 * Capitalise a string: upper-case the first character and lower-case the
 * rest. Case conversion uses the default locale.
 *
 * @param s
 *            the string to capitalise. Must not be null. May be empty.
 * @return A version of s where the first letter is uppercase and all others
 *         are lowercase. The empty string is returned unchanged.
 */
public static final String capitalise(final String s) {
    // Guard: substring(0, 1) would throw on an empty string
    if (s.length() == 0) {
        return s;
    }
    return s.substring(0, 1).toUpperCase() + s.substring(1).toLowerCase();
}
/**
 * Normalise every line break in the text to the local system's line break
 * ({@link #LINEEND}).
 *
 * @param text
 *            the text to convert
 * @return the text with its line breaks replaced by {@link #LINEEND}
 */
public static final String convertLineBreaks(String text) {
    final String systemBreak = LINEEND;
    return convertLineBreaks(text, systemBreak);
}
/**
 * Convert all line breaks into the specified line break.
 * <p>
 * \r\n, a lone \r and a lone \n each count as exactly one line break. The
 * text is converted in a single pass, so the replacement is never itself
 * re-converted (which matters when br is "\r\n"), and br is inserted
 * literally (characters such as '$' and '\' have no special meaning).
 *
 * @param text
 *            the text to convert. Must not be null.
 * @param br
 *            the line break to use. Must not be null.
 * @return a copy of text in which every line break is br
 */
public static final String convertLineBreaks(String text, String br) {
    if (br == null) {
        throw new NullPointerException("br must not be null");
    }
    final int length = text.length();
    StringBuilder converted = new StringBuilder(length + 16);
    for (int i = 0; i < length; i++) {
        char c = text.charAt(i);
        if (c == '\r') {
            // Treat the 2 character \r\n sequence as ONE line break
            if (i + 1 < length && text.charAt(i + 1) == '\n') {
                i++;
            }
            converted.append(br);
        } else if (c == '\n') {
            converted.append(br);
        } else {
            converted.append(c);
        }
    }
    return converted.toString();
}
/**
 * Count the occurrences of a character in a string.
 *
 * @param string
 *            the string to search. Must not be null.
 * @param character
 *            the character to look for
 * @return the number of times character appears in the string
 * @author Sam Halliday
 */
static public int countCharsInString(String string, char character) {
    int occurrences = 0;
    // Hop from one occurrence to the next until there are no more
    int at = string.indexOf(character);
    while (at != -1) {
        occurrences++;
        at = string.indexOf(character, at + 1);
    }
    return occurrences;
}
/**
 * Find the region around offset that is delimited by the nearest
 * startMarker at or before offset and the nearest endMarker at or after
 * offset. Markers are not balanced/nested: the first one met in each
 * direction is used.
 * <p>
 * E.g.
 * <code>findEnclosingRegion("text with a [region] inside", 15, '[', ']')</code>
 * is (12,20)
 *
 * @param text
 *            the text to search
 * @param offset
 *            the position to search outwards from
 * @param startMarker
 *            the character which opens a region
 * @param endMarker
 *            the character which closes a region
 * @return the smallest enclosed region (including start and end chars, the
 *         1st number is inclusive, the 2nd exclusive), or null if none. So
 *         text.substring(start,end) is the specified region
 */
public static Pair<Integer> findEnclosingRegion(String text, int offset,
        char startMarker, char endMarker) {
    // Scan right for the closing marker
    final int closeAt = findEnclosingRegion2(text, offset, endMarker, 1);
    if (closeAt == -1) {
        return null;
    }
    // Scan left for the opening marker
    final int start = findEnclosingRegion2(text, offset, startMarker, -1);
    if (start == -1) {
        return null;
    }
    // The returned end index is exclusive
    final int end = closeAt + 1;
    // Sanity: the region starts and ends with the markers
    assert text.charAt(start) == startMarker;
    assert text.charAt(end - 1) == endMarker;
    return new Pair<Integer>(start, end);
}
// Step through text from offset, moving by direction each time (+1 =
// forwards, -1 = backwards), and return the index of the first endMarker
// met. Returns -1 if the edge of the text is reached first, or if offset
// is outside the text to begin with.
private static int findEnclosingRegion2(String text, int offset,
        char endMarker, int direction) {
    for (int pos = offset; pos >= 0 && pos < text.length(); pos += direction) {
        if (text.charAt(pos) == endMarker) {
            return pos;
        }
    }
    return -1;
}
/**
 * A convenience wrapper for
 * {@link #findEnclosingRegion(String, int, char, char)} which returns the
 * text of the region rather than its indices. E.g. <code>
 findEnclosingText("text with a [region] inside", 15, '[', ']') .equals("[region]");
 </code>
 *
 * @param text
 *            the text to search
 * @param offset
 *            the position to search outwards from
 * @param startMarker
 *            the character which opens a region
 * @param endMarker
 *            the character which closes a region
 * @return the smallest enclosed region (including start and end chars), or
 *         null if none.
 */
public static String findEnclosingText(String text, int offset,
        char startMarker, char endMarker) {
    final Pair<Integer> bounds = findEnclosingRegion(text, offset,
            startMarker, endMarker);
    return bounds == null ? null : text.substring(bounds.first, bounds.second);
}
/**
 * Format a block of text to use the given line-width. I.e. adjust the line
 * breaks. Also known as <i>hard</i> line-wrapping. Paragraphs are
 * recognised by a line of blank space between them (e.g. two returns).
 * <p>
 * Note: a side-effect of this method is that it converts all line-breaks
 * into the local system's line-breaks. E.g. on Windows, \n will become \r\n
 *
 * @param text
 *            The text to format. Must not be null. Empty text gives an
 *            empty result.
 * @param lineWidth
 *            The number of columns in a line. Typically 78 or 80.
 * @param tabWidth
 *            The number of columns a tab character is counted as when
 *            measuring the length of a line.
 * @param respectLeadingCharacters
 *            Can be null. If set, the specified leading characters will be
 *            copied if the line is split. Use with " \t" to keep indented
 *            paragraphs properly indented. Use with "> \t" to also handle
 *            email-style quoting. Note that respected leading characters
 *            receive no special treatment when they are used inside a
 *            paragraph.
 * @return A copy of text, formatted to the given line-width.
 *         <p>
 *         TODO: recognise paragraphs by changes in the respected leading
 *         characters
 */
public static String format(String text, int lineWidth, int tabWidth,
        String respectLeadingCharacters) {
    // Switch to Linux line breaks for easier internal workings
    text = convertLineBreaks(text, "\n");
    // Find paragraphs
    List<String> paras = format2_splitParagraphs(text,
            respectLeadingCharacters);
    // Rebuild text, putting a double line break BETWEEN paragraphs. (Joining
    // this way, rather than appending a break after every paragraph and
    // then deleting the last one, is safe when there are no paragraphs.)
    StringBuilder sb = new StringBuilder(text.length() + 10);
    boolean firstParagraph = true;
    for (String p : paras) {
        if (!firstParagraph) {
            sb.append("\n\n");
        }
        firstParagraph = false;
        sb.append(format3_oneParagraph(p, lineWidth, tabWidth,
                respectLeadingCharacters));
    }
    // Convert line breaks to system ones
    return convertLineBreaks(sb.toString());
}
/**
 * Split text into paragraphs. A paragraph ends at the second of two line
 * breaks (\r or \n) which have only whitespace between them. Each
 * paragraph keeps its trailing whitespace, including those line breaks, so
 * concatenating the returned list gives back the original text.
 *
 * @param text
 *            the text to split. Must not be null.
 * @param respectLeadingCharacters
 *            currently unused. TODO: spot paragraph breaks by a change in
 *            the leading characters.
 * @return the paragraphs, in order. Empty if text is empty.
 */
private static List<String> format2_splitParagraphs(String text,
        String respectLeadingCharacters) {
    List<String> paras = new ArrayList<String>();
    final int length = text.length();
    int index = 0;
    while (index < length) {
        // One paragraph
        int start = index;
        // true if a line break has been seen since the last
        // non-whitespace character
        boolean inSpace = false;
        while (index < length) {
            char c = text.charAt(index);
            index++;
            if (!Character.isWhitespace(c)) {
                inSpace = false;
                continue;
            }
            // Line end?
            if (c == '\r' || c == '\n') {
                // Double line end - indicating a paragraph break
                if (inSpace) {
                    break;
                }
                inSpace = true;
            }
        }
        paras.add(text.substring(start, index));
    }
    return paras;
}
/**
 * Format a single paragraph to fit the given line width. Line breaks
 * inside the paragraph are turned into spaces, and a new line break is
 * inserted before any word which would push the line past lineWidth.
 * Every line created this way starts with the paragraph's respected
 * leading characters. A word longer than lineWidth is not split.
 *
 * @param p
 *            the paragraph, using \n line breaks
 * @param lineWidth
 *            the number of columns in a line
 * @param tabWidth
 *            the number of columns a tab character counts for
 * @param respectLeadingCharacters
 *            Can be null. The characters which, when found at the start of
 *            the paragraph, are repeated at the start of each new line.
 * @return the reformatted paragraph, using \n line breaks and with trailing
 *         whitespace removed
 */
private static String format3_oneParagraph(String p, int lineWidth,
        int tabWidth, String respectLeadingCharacters) {
    // Collect the reformatted paragraph. Allow for some extra line-breaks
    StringBuilder sb = new StringBuilder(p.length() + 10);
    // Get respected leading chars
    String leadingChars = format4_getLeadingChars(p,
            respectLeadingCharacters);
    final int leadingLength = leadingChars.length();
    // First Line
    sb.append(leadingChars);
    int lineLength = leadingLength;
    int index = leadingLength;
    // Loop
    while (index < p.length()) {
        // Get the next word. This is empty if c is whitespace.
        StringBuilder word = new StringBuilder();
        char c = p.charAt(index);
        index++;
        while (!Character.isWhitespace(c)) {
            word.append(c);
            if (index == p.length()) {
                break;
            }
            c = p.charAt(index);
            index++;
        }
        // Break the line if the word will not fit - but only if the line
        // holds something besides the leading chars. Otherwise breaking
        // gains nothing, and would leave a line holding only the leading
        // chars.
        if (lineLength + word.length() > lineWidth
                && lineLength > leadingLength) {
            trimEnd(sb);
            sb.append('\n');
            // New line
            sb.append(leadingChars);
            lineLength = leadingLength;
        }
        // Add word
        sb.append(word);
        lineLength += word.length();
        // Add the whitespace which ended the word, unless we are at the end
        // of the paragraph or the line is already full
        if (index != p.length() && lineLength < lineWidth) {
            if (c == '\n') {
                // Old line breaks become plain spaces
                c = ' ';
            }
            sb.append(c);
            lineLength += (c == '\t') ? tabWidth : 1;
        }
    }
    // A final trim
    trimEnd(sb);
    // Done
    return sb.toString();
}
/**
 * Pick out the run of respected characters at the start of text.
 *
 * @param text
 *            the text to examine
 * @param respectLeadingCharacters
 *            Can be null, in which case nothing is respected. Must not
 *            contain a line-break.
 * @return The characters at the beginning of text which are respected. E.g.
 *         ("> Hello", " \t>") --> "> "
 */
private static String format4_getLeadingChars(String text,
        String respectLeadingCharacters) {
    if (respectLeadingCharacters == null) {
        return "";
    }
    // Line-breaks cannot be respected
    assert respectLeadingCharacters.indexOf('\n') == -1;
    // Measure the prefix made up purely of respected chars
    int prefixLength = 0;
    while (prefixLength < text.length()
            && respectLeadingCharacters.indexOf(text.charAt(prefixLength)) != -1) {
        prefixLength++;
    }
    return text.substring(0, prefixLength);
}
/**
 * Ensure that line ends with the right line-end character(s), i.e. with
 * {@link #LINEEND}.
 *
 * @param line
 *            the line to fix up
 * @return the line, with one trailing line-ending (of whatever kind)
 *         swapped for {@link #LINEEND}, or with {@link #LINEEND} added if
 *         there was none
 */
public static final String lineEnd(String line) {
    // strip possibly inappropriate line-endings, testing each in turn
    final String[] endings = { "\n", "\r\n", "\r" };
    for (String ending : endings) {
        if (line.endsWith(ending)) {
            line = line.substring(0, line.length() - ending.length());
        }
    }
    // add in proper line end
    return line.endsWith(LINEEND) ? line : line + LINEEND;
}
/**
 * Ensure that line ends with the right line-end character(s). A trailing
 * \r\n, \n or \r is replaced by {@link #LINEEND}; if there is none, then
 * {@link #LINEEND} is appended. This works in place, which is more
 * efficient than the version for Strings.
 *
 * @param line
 *            the buffer to modify
 */
public static final void lineEnd(final StringBuilder line) {
    final int length = line.length();
    // How many trailing chars make up the existing line-ending?
    int ending = 0;
    if (length > 0) {
        final char last = line.charAt(length - 1);
        if (last == '\r') {
            ending = 1;
        } else if (last == '\n') {
            // \r\n or just \n
            boolean windowsStyle = length > 1
                    && line.charAt(length - 2) == '\r';
            ending = windowsStyle ? 2 : 1;
        }
    }
    // Drop the old ending (if any) and put on the proper one
    line.setLength(length - ending);
    line.append(LINEEND);
}
/**
 * Compute the MD5 sum of a string. The string is converted to bytes using
 * the platform's default charset.
 * <p>
 * Note: MD5 is fine as a checksum, but it is not suitable for security
 * purposes such as storing passwords.
 *
 * @param string
 *            the text to hash. Must not be null.
 * @return the MD5 sum of the string using the default charset, as 32
 *         lower-case hexadecimal digits (including any leading zeros). Null
 *         if there was an error in calculating the hash.
 * @author Sam Halliday
 */
public static String md5Hash(String string) {
    final MessageDigest md5;
    try {
        md5 = MessageDigest.getInstance("MD5");
    } catch (NoSuchAlgorithmException e) {
        // Not expected, as MD5 is a standard algorithm. Honour the
        // documented contract rather than hitting a NullPointerException.
        return null;
    }
    byte[] digest = md5.digest(string.getBytes());
    // BigInteger.toString(16) would drop leading zeros, so pad to the full
    // 32 hex digits of a 128 bit hash
    return String.format("%032x", new BigInteger(1, digest));
}
/**
 * Escape the characters that are meaningful to the regex engine. (i.e.
 * create a string that will be treated by replace as if it were literally
 * the input string).
 * <p>
 * Each of <code>\ ^ $ . | ? * + ( ) [ ] { }</code> is given a preceding
 * backslash. All other characters are copied unchanged.
 *
 * @see Pattern
 * @param s
 *            the text to escape. Must not be null.
 * @return s with every regex metacharacter backslash-escaped
 */
public static final String regexEsc(String s) {
    // The characters with a special meaning outside of character classes.
    // (The leading \\ is a single backslash character.)
    final String metaChars = "\\^$.|?*+()[]{}";
    StringBuilder escaped = new StringBuilder(s.length() + 8);
    for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        if (metaChars.indexOf(c) != -1) {
            escaped.append('\\');
        }
        escaped.append(c);
    }
    return escaped.toString();
}
/**
 * Removes HTML-style tags from a string. Everything from a '<' up to and
 * including the next '>' is dropped. An unclosed '<' swallows the rest of
 * the string.
 *
 * @param s
 *            a String from which to remove tags
 * @return a string with all instances of <.*> removed.
 */
public static String removeTags(String s) {
    StringBuilder kept = new StringBuilder();
    boolean insideTag = false;
    for (char ch : s.toCharArray()) {
        switch (ch) {
        case '<':
            // Opens a tag (and is itself part of it)
            insideTag = true;
            break;
        case '>':
            // Closes a tag. A stray '>' outside of any tag is kept.
            if (!insideTag) {
                kept.append(ch);
            }
            insideTag = false;
            break;
        default:
            if (!insideTag) {
                kept.append(ch);
            }
            break;
        }
    }
    return kept.toString();
}
/**
 * Build a String by repeating a character.
 *
 * @param c
 *            the character to repeat
 * @param i
 *            how many times to repeat it. Must not be negative.
 * @return A String consisting of i x c.
 * @example assert repeat('-', 5).equals("-----");
 */
public static String repeat(Character c, int i) {
    StringBuilder repeated = new StringBuilder(i);
    int remaining = i;
    while (remaining > 0) {
        repeated.append(c);
        remaining--;
    }
    return repeated.toString();
}
/**
 * Split a piece of text into separate lines. The line breaks (\n, \r or
 * \r\n) are left at the end of each line. The last line has no line break
 * if the text does not end with one.
 *
 * @param text
 *            the text to split. Must not be null.
 * @return The individual lines in the text.
 */
public static List<String> splitLines(String text) {
    List<String> lines = new ArrayList<String>();
    final int length = text.length();
    int lineStart = 0;
    int pos = 0;
    while (pos < length) {
        char c = text.charAt(pos);
        pos++; // pos is now just past c
        if (c != '\r' && c != '\n') {
            continue;
        }
        // Handle MS Windows 2 character \r\n line breaks
        if (c == '\r' && pos < length && text.charAt(pos) == '\n') {
            pos++;
        }
        // Get the line, with the line break
        lines.add(text.substring(lineStart, pos));
        lineStart = pos;
    }
    // Any text left after the last line break
    if (lineStart < length) {
        lines.add(text.substring(lineStart));
    }
    return lines;
}
/**
 * Remove <i>trailing</i> whitespace, in place. c.f. String#trim() which
 * removes leading and trailing whitespace.
 *
 * @param sb
 *            the buffer to trim. It is left empty if it holds only
 *            whitespace.
 */
private static void trimEnd(StringBuilder sb) {
    // Walk back from the end to just after the last non-whitespace char
    int keep = sb.length();
    while (keep > 0 && Character.isWhitespace(sb.charAt(keep - 1))) {
        keep--;
    }
    sb.setLength(keep);
}
/**
 * Test whether a string is blank.
 *
 * @param s
 *            the string to test. Can be null.
 * @return true if the string is just whitespace, or empty, or null.
 */
public static final boolean whitespace(final String s) {
    if (s != null) {
        // A single non-whitespace character settles it
        for (char c : s.toCharArray()) {
            if (!Character.isWhitespace(c)) {
                return false;
            }
        }
    }
    return true;
}
/**
 * Count the words in a piece of text. This uses a crude measure: a word is
 * any run of letters, digits and underscores, so punctuation as well as
 * whitespace separates words (e.g. "don't" counts as 2 words).
 *
 * @param text
 *            the text to examine. Can be null.
 * @return the number of words in text. 0 if text is null or has no words.
 */
public static int wordCount(String text) {
    if (text == null) {
        return 0;
    }
    int wc = 0;
    for (String bit : text.split("\\W+")) {
        // split() gives an empty first piece if the text starts with a
        // separator (or is empty) - that is not a word
        if (bit.length() > 0) {
            wc++;
        }
    }
    return wc;
}
}