/*
* TaggedFormatter.java
*
* This work is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* This work is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
* Copyright (c) 2004-2009 Per Cederberg. All rights reserved.
*/
package org.liquidsite.core.text;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* A tagged text formatter. This class contains static methods for
* processing tagged text.
*
* @author Per Cederberg, <per at percederberg dot net>
* @version 1.0
*/
public class TaggedFormatter {
/**
 * The trailing whitespace regex. Matches a run of whitespace
 * characters located at the end of the input.
 */
private static final Pattern SPACE_END = Pattern.compile("\\s+$");
/**
 * The horizontal ruler regex. Matches three or more '-' characters
 * or three or more '_' characters, followed by either a newline or
 * the end of the input (or matcher region).
 */
private static final Pattern HORIZ_RULE = Pattern.compile("((---+)|(___+))(\\n|$)");
/**
 * The pre-formatted text tag regex. Matches the characters
 * {@code <pre}, any number of characters other than '>', and the
 * terminating '>'.
 */
private static final Pattern TAG_PRE = Pattern.compile("<pre[^>]*>");
/**
 * Cleans a tagged text string. Unneeded line feeds and space
 * characters will be removed. The text outside pre-formatted
 * sections is normalized block by block, while everything between
 * an opening {@code <pre...>} tag and the next {@code </pre>} is
 * copied unmodified. A pre-formatted section lacking its end tag
 * extends to the end of the text and gets an end tag appended.
 * Trailing whitespace is removed from the final result.
 *
 * @param text           the tagged text string
 *
 * @return the cleaned tagged text string
 */
public static String clean(String text) {
    StringBuffer result = new StringBuffer();
    int pos = 0;
    Matcher m = TAG_PRE.matcher(text);
    String markup;

    while (m.find(pos)) {
        // Only emit the preceding markup and its block separator if
        // something is left after cleaning. Otherwise empty or
        // whitespace-only text before a <pre> tag would add blank
        // lines, and the output would grow each time it is cleaned.
        markup = cleanMarkup(text.substring(pos, m.start()));
        if (markup.length() > 0) {
            result.append(markup);
            result.append("\n\n");
        }
        result.append(m.group());
        pos = text.indexOf("</pre>", m.end());
        if (pos < 0) {
            // Unterminated section, the rest of the text is content
            result.append(text.substring(m.end()));
            pos = text.length();
        } else {
            result.append(text.substring(m.end(), pos));
            pos += 6;    // skip past "</pre>"
        }
        result.append("</pre>\n\n");
    }
    if (pos < text.length()) {
        result.append(cleanMarkup(text.substring(pos)));
    }

    // Remove the separator (or other whitespace) left at the end
    m = SPACE_END.matcher(result);
    if (m.find()) {
        result.setLength(m.start());
    }
    return result.toString();
}
/**
 * Cleans a tagged text string that contains no pre-formatted
 * sections. The whitespace is normalized first, then the text is
 * processed one block at a time with an empty line inserted
 * between consecutive blocks. Finally the whitespace of the
 * assembled result is normalized once more.
 *
 * @param text           the tagged text string
 *
 * @return the cleaned tagged text string
 */
private static String cleanMarkup(String text) {
    String input = cleanWhitespace(text);
    StringBuffer cleaned = new StringBuffer();
    int index = 0;

    while (index < input.length()) {
        // Newlines between blocks carry no information here
        if (input.charAt(index) == '\n') {
            index++;
            continue;
        }
        if (cleaned.length() > 0) {
            cleaned.append("\n\n");
        }
        index = cleanBlock(input, index, cleaned);
    }
    return cleanWhitespace(cleaned.toString());
}
/**
 * Cleans a single block in a tagged text string. This will
 * normalize all tags and clean any inline content. A horizontal
 * ruler is normalized to "---". A block starting with a heading,
 * list or box tag is written with a normalized start tag and
 * always gets the matching end tag appended. Paragraph tags are
 * removed. Any other content is handled as inline text. This
 * method returns when the block ends, i.e. when the inline or
 * list processing stops.
 *
 * This method always advances the text position. A block tag that
 * is not valid at the start of a block (for instance a stray
 * {@code </h1>} or {@code <item>}) is dropped from the output.
 *
 * @param text           the tagged text string
 * @param pos            the current text position, must be a
 *                       valid index in the text
 * @param result         the cleaned tagged text
 *
 * @return the new text position
 */
private static int cleanBlock(String text, int pos, StringBuffer result) {
    Matcher m = HORIZ_RULE.matcher(text);
    int backupLength;
    int newPos;
    int inlineEnd;
    String tag;

    m.region(pos, text.length());
    if (m.lookingAt()) {
        result.append("---");
        return m.end();
    }
    if (text.charAt(pos) != '<') {
        return cleanInline(text, pos, result);
    }

    // Normalize the tag and recover its text from the result
    backupLength = result.length();
    newPos = cleanTag(text, pos, result);
    tag = result.substring(backupLength);
    if (newPos < pos + 3) {
        // Not a recognized tag, a literal "<" was written instead
        pos = cleanInline(text, newPos, result);
    } else if (tag.equals("")) {
        // Suppressed tag (normally <p>), nothing was written
        pos = cleanInline(text, newPos, result);
        if (text.startsWith("</p>", pos)) {
            pos += 4;
        }
    } else if (tag.equals("<h1>")
            || tag.equals("<h2>")
            || tag.equals("<h3>")) {
        pos = cleanInline(text, newPos, result);
        pos = cleanTagEnd(text, pos, "</" + tag.substring(1), result);
    } else if (tag.startsWith("<list")) {
        result.append("\n");
        pos = cleanList(text, newPos, result);
        pos = cleanTagEnd(text, pos, "</list>", result);
    } else if (tag.startsWith("<box")) {
        pos = cleanInline(text, newPos, result);
        pos = cleanTagEnd(text, pos, "</box>", result);
    } else {
        // Inline tag or misplaced block tag, retry as inline text
        result.setLength(backupLength);
        inlineEnd = cleanInline(text, pos, result);
        if (inlineEnd > pos) {
            pos = inlineEnd;
        } else {
            // The inline cleaner refused the tag (a block tag that is
            // invalid here) and consumed nothing. Skip the tag, or
            // the caller would retry this position forever.
            pos = newPos;
        }
    }
    return pos;
}
/**
 * Cleans the inline content in a tagged text string. Ordinary
 * characters are copied and inline tags are normalized. The start
 * tags for bold, italic, code and link are tracked, so that each
 * one is closed properly: an end tag also closes any tags opened
 * after its start tag, an end tag without a start tag is removed,
 * and tags still open when the method returns are closed. This
 * method returns at the end of the text, at a line break followed
 * by an empty line, a ruler or a block tag, or at any tag that is
 * not an inline tag (which is left unprocessed).
 *
 * @param text           the tagged text string
 * @param pos            the current text position
 * @param result         the cleaned tagged text
 *
 * @return the new text position
 */
private static int cleanInline(String text, int pos, StringBuffer result) {
    LinkedList openTags = new LinkedList();
    int index = pos;
    boolean done = false;

    while (!done && index < text.length()) {
        if (isInlineBreak(text, index)) {
            done = true;
        } else if (text.charAt(index) != '<') {
            result.append(text.charAt(index));
            index++;
        } else {
            int mark = result.length();
            int afterTag = cleanTag(text, index, result);
            String tag = result.substring(mark);

            if (afterTag < index + 3) {
                // Not a tag, the literal "<" written is kept
                index = afterTag;
            } else if (tag.equals("<b>")
                    || tag.equals("<i>")
                    || tag.equals("<code>")) {
                openTags.addLast("</" + tag.substring(1));
                index = afterTag;
            } else if (tag.startsWith("<link")) {
                openTags.addLast("</link>");
                index = afterTag;
            } else if (tag.startsWith("<image")) {
                index = afterTag;
            } else if (tag.equals("</b>")
                    || tag.equals("</i>")
                    || tag.equals("</code>")
                    || tag.equals("</link>")) {
                // Re-emit the end tag only if it closes something,
                // preceded by the end tags of everything nested
                result.setLength(mark);
                if (openTags.contains(tag)) {
                    Object closed;
                    do {
                        closed = openTags.removeLast();
                        result.append(closed);
                    } while (!closed.equals(tag));
                }
                index = afterTag;
            } else {
                // Any other tag ends the inline content
                result.setLength(mark);
                done = true;
            }
        }
    }

    // Close whatever is still open, innermost tag first
    while (!openTags.isEmpty()) {
        result.append(openTags.removeLast());
    }
    return index;
}
/**
 * Checks if the text at a position is a line break that ends the
 * inline content, i.e. a newline followed by another newline, a
 * ruler or the start of a block tag.
 *
 * @param text           the tagged text string
 * @param pos            the text position, must be a valid index
 *
 * @return true if the inline content ends at the position, or
 *         false otherwise
 */
private static boolean isInlineBreak(String text, int pos) {
    if (text.charAt(pos) != '\n') {
        return false;
    }
    return text.startsWith("\n\n", pos)
        || text.startsWith("\n---", pos)
        || text.startsWith("\n___", pos)
        || text.startsWith("\n<p", pos)
        || text.startsWith("\n<h", pos)
        || text.startsWith("\n<list", pos)
        || text.startsWith("\n</list", pos)
        || text.startsWith("\n<item", pos)
        || text.startsWith("\n<box", pos);
}
/**
 * Cleans the list content in a tagged text string. Each list item
 * is written on a line of its own, enclosed in item tags. Text or
 * inline tags found outside an item are wrapped in a new item,
 * end tags for items are always written, and nested lists are
 * processed recursively. Newlines between the items are skipped.
 * This method returns at the end of the text or at the first tag
 * that is neither an item, a nested list nor an inline tag. That
 * tag, normally the list end tag, is left unprocessed.
 *
 * @param text           the tagged text string
 * @param pos            the current text position
 * @param result         the cleaned tagged text
 *
 * @return the new text position
 */
private static int cleanList(String text, int pos, StringBuffer result) {
    int index = pos;

    while (index < text.length()) {
        char current = text.charAt(index);
        if (current == '\n') {
            index++;
            continue;
        }
        int mark = result.length();
        if (current != '<') {
            // Plain text directly in the list, wrap it in an item
            result.append("<item>");
            int afterText = cleanInline(text, index, result);
            if (afterText != index) {
                index = cleanTagEnd(text, afterText, "</item>", result);
                result.append("\n");
            } else {
                result.setLength(mark);
            }
            continue;
        }

        int afterTag = cleanTag(text, index, result);
        String tag = result.substring(mark);
        int inlineFrom = -1;
        if (afterTag < index + 3) {
            // Not a tag, the literal "<" written starts a new item
            result.insert(mark, "<item>");
            inlineFrom = afterTag;
        } else if (tag.equals("<item>")) {
            inlineFrom = afterTag;
        } else if (isInlineStartTag(tag)) {
            // Inline tag outside an item, reprocess it inside one
            result.setLength(mark);
            result.append("<item>");
            inlineFrom = index;
        }

        if (inlineFrom >= 0) {
            index = cleanInline(text, inlineFrom, result);
            index = cleanTagEnd(text, index, "</item>", result);
            result.append("\n");
        } else if (tag.equals("</item>")) {
            // Item end tags are written with the item, drop this one
            result.setLength(mark);
            index = afterTag;
        } else if (tag.startsWith("<list")) {
            result.append("\n");
            index = cleanList(text, afterTag, result);
            index = cleanTagEnd(text, index, "</list>", result);
            result.append("\n");
        } else {
            // End of this list, or a tag that cannot be in a list
            result.setLength(mark);
            break;
        }
    }
    return index;
}
/**
 * Checks if a normalized tag is an inline start tag, i.e. a bold,
 * italic, code, link or image tag.
 *
 * @param tag            the normalized tag text
 *
 * @return true if the tag is an inline start tag, or
 *         false otherwise
 */
private static boolean isInlineStartTag(String tag) {
    return tag.equals("<b>")
        || tag.equals("<i>")
        || tag.equals("<code>")
        || tag.startsWith("<link")
        || tag.startsWith("<image");
}
/**
 * Cleans and normalizes a tag in a tagged text string. The tag is
 * assumed to start at the specified position and to extend to the
 * first '>' character outside double quotes. The tag name is the
 * text up to the first space character. If the tag is unterminated
 * or has an unknown name, a single "<" is written and only that
 * character is consumed. Paragraph tags and image end tags are
 * consumed without writing anything. Other tags are written in
 * normalized form, keeping attributes only for link, image, list
 * and box start tags.
 *
 * @param text           the tagged text string
 * @param pos            the current text position
 * @param result         the cleaned tagged text
 *
 * @return the new text position (after the tag)
 */
private static int cleanTag(String text, int pos, StringBuffer result) {
    final int start = pos;
    final int length = text.length();
    boolean quoted = false;
    int close = start;

    // Locate the terminating '>' character, ignoring quoted ones
    while (close < length) {
        char ch = text.charAt(close);
        if (ch == '>' && !quoted) {
            break;
        }
        if (ch == '"') {
            quoted = !quoted;
        }
        close++;
    }
    if (close >= length) {
        result.append("<");
        return start + 1;
    }
    final int end = close + 1;

    // Split off the tag name from any attributes
    int nameEnd = text.indexOf(' ', start);
    if (nameEnd < 0 || nameEnd >= end) {
        nameEnd = close;
    }
    String name = text.substring(start + 1, nameEnd);
    boolean closing = name.startsWith("/");
    if (closing) {
        name = name.substring(1);
    }

    // Unknown names are not tags at all
    if (!isKnownTagName(name)) {
        result.append("<");
        return start + 1;
    }

    // Tags removed from the output
    if (name.equals("p") || (closing && name.equals("image"))) {
        return end;
    }

    // Write the normalized tag
    result.append(closing ? "</" : "<");
    result.append(name);
    if (!closing) {
        if (name.equals("link")
         || name.equals("image")
         || name.equals("list")
         || name.equals("box")) {

            cleanTagAttributes(text, nameEnd, end - 1, name, result);
        }
        if (name.equals("image")) {
            result.append(" /");
        }
    }
    result.append(">");
    return end;
}
/**
 * Checks if a name is one of the tag names handled by the tag
 * cleaner.
 *
 * @param name           the tag name, without any leading '/'
 *
 * @return true if the tag name is known, or
 *         false otherwise
 */
private static boolean isKnownTagName(String name) {
    String[] known = {
        "h1", "h2", "h3", "p", "b", "i",
        "link", "image", "list", "item", "box", "code"
    };

    for (int i = 0; i < known.length; i++) {
        if (known[i].equals(name)) {
            return true;
        }
    }
    return false;
}
/**
 * Cleans an end tag in a tagged text string. The end tag is
 * always written to the result. The text position is only moved
 * past the end tag if the text actually contains that exact tag
 * at the current position.
 *
 * @param text           the tagged text string
 * @param pos            the current text position
 * @param tag            the end tag to print
 * @param result         the cleaned tagged text
 *
 * @return the new text position (after the tag)
 */
private static int cleanTagEnd(String text,
                               int pos,
                               String tag,
                               StringBuffer result) {

    result.append(tag);
    return text.startsWith(tag, pos) ? pos + tag.length() : pos;
}
/**
 * Cleans and normalizes a tag attribute string. The attributes
 * are parsed and only the ones supported by the tag are written,
 * in a fixed order and with the values in double quotes. A list
 * keeps a non-empty "type", a link always gets a "url" and keeps
 * "window", an image always gets a "url" and keeps "layout", and
 * a box keeps "layout". A trailing '>' and, for images, a trailing
 * '/' are excluded from the attribute string.
 *
 * @param text           the tagged text string
 * @param pos            the current text position
 * @param end            the end position of the tag (inclusive)
 * @param tagName        the tag name
 * @param result         the cleaned tagged text
 */
private static void cleanTagAttributes(String text,
                                       int pos,
                                       int end,
                                       String tagName,
                                       StringBuffer result) {

    boolean isImage = tagName.equals("image");
    int last = end;
    HashMap attributes;
    String value;

    // Parse tag attributes
    if (text.charAt(last) == '>') {
        last--;
    }
    if (isImage && text.charAt(last) == '/') {
        last--;
    }
    attributes = parseTagAttributes(text, pos, last + 1);

    // Normalize attributes
    if (tagName.equals("list")) {
        value = (String) attributes.get("type");
        if (value != null && !value.equals("")) {
            appendQuoted(result, " type=\"", value);
        }
    } else if (isImage || tagName.equals("link")) {
        // The url attribute is mandatory, an empty one is added
        value = (String) attributes.get("url");
        appendQuoted(result, " url=\"", (value == null) ? "" : value);
        if (isImage) {
            value = (String) attributes.get("layout");
            if (value != null) {
                appendQuoted(result, " layout=\"", value);
            }
        } else {
            value = (String) attributes.get("window");
            if (value != null) {
                appendQuoted(result, " window=\"", value);
            }
        }
    } else if (tagName.equals("box")) {
        value = (String) attributes.get("layout");
        if (value != null) {
            appendQuoted(result, " layout=\"", value);
        }
    }
}
/**
 * Appends an attribute to the result. The value is written after
 * the specified prefix and followed by a closing double quote.
 *
 * @param result         the cleaned tagged text
 * @param prefix         the attribute text up to and including
 *                       the opening double quote
 * @param value          the attribute value
 */
private static void appendQuoted(StringBuffer result,
                                 String prefix,
                                 String value) {

    result.append(prefix);
    result.append(value);
    result.append("\"");
}
/**
 * Cleans a tagged text string for excessive whitespace. Each line
 * is trimmed, empty lines at the start and at the end are removed,
 * the remaining tab characters are replaced with spaces, and runs
 * of three or more newlines are reduced to two (i.e. at most one
 * empty line is kept between two lines of text).
 *
 * The text is processed in linear passes, instead of repeatedly
 * copying the remaining text and searching from the start.
 *
 * @param text           the tagged text string
 *
 * @return the cleaned tagged text string
 */
private static String cleanWhitespace(String text) {
    StringBuffer buffer = new StringBuffer(text.length());
    StringBuffer cleaned;
    int start = 0;
    int pos;
    int newlines = 0;
    char c;

    // Trim each line
    while ((pos = text.indexOf('\n', start)) >= 0) {
        buffer.append(text.substring(start, pos).trim());
        buffer.append('\n');
        start = pos + 1;
    }
    buffer.append(text.substring(start).trim());

    // Remove empty starting and ending lines
    pos = 0;
    while (pos < buffer.length() && buffer.charAt(pos) == '\n') {
        pos++;
    }
    buffer.delete(0, pos);
    pos = buffer.length();
    while (pos > 0 && buffer.charAt(pos - 1) == '\n') {
        pos--;
    }
    buffer.setLength(pos);

    // Replace tab characters with spaces and remove duplicate
    // empty lines. The lines are trimmed, so consecutive newlines
    // are always adjacent and a simple counter is sufficient.
    cleaned = new StringBuffer(buffer.length());
    for (pos = 0; pos < buffer.length(); pos++) {
        c = buffer.charAt(pos);
        if (c == '\n') {
            newlines++;
            if (newlines > 2) {
                continue;
            }
        } else {
            newlines = 0;
            if (c == '\t') {
                c = ' ';
            }
        }
        cleaned.append(c);
    }
    return cleaned.toString();
}
/**
 * Formats a tagged text string in HTML. This method will resolve
 * any links in the tagged text and convert the tags to valid
 * HTML tags. The content of pre-formatted sections is not
 * processed as tagged text, but is only HTML escaped. A section
 * without an end tag extends to the end of the text and gets an
 * end tag appended.
 *
 * @param text           the tagged text string
 * @param context        the formatting context
 *
 * @return the HTML encoded text
 */
public static String formatHtml(String text, FormattingContext context) {
    final String closeTag = "</pre>";
    StringBuffer html = new StringBuffer();
    Matcher pre = TAG_PRE.matcher(text);
    int cursor = 0;

    while (pre.find(cursor)) {
        int bodyStart = pre.end();

        // Tagged text located before the pre-formatted section
        if (pre.start() > 0) {
            String markup = text.substring(cursor, pre.start());
            html.append(formatHtmlMarkup(markup, context));
        }

        // The section itself, with the start tag copied as is
        html.append(pre.group());
        int bodyEnd = text.indexOf(closeTag, bodyStart);
        boolean closed = (bodyEnd >= 0);
        if (!closed) {
            bodyEnd = text.length();
        }
        String body = text.substring(bodyStart, bodyEnd);
        html.append(PlainFormatter.escapeHtml(body));
        html.append(closeTag);
        cursor = closed ? bodyEnd + closeTag.length() : bodyEnd;
    }

    // Tagged text located after the last pre-formatted section
    if (cursor < text.length()) {
        html.append(formatHtmlMarkup(text.substring(cursor), context));
    }
    return html.toString();
}
/**
 * Formats a tagged text string without pre-formatted sections in
 * HTML. The text is processed one block at a time, skipping the
 * newlines between the blocks and separating the HTML output of
 * consecutive blocks with an empty line.
 *
 * @param text           the tagged text string
 * @param context        the formatting context
 *
 * @return the HTML encoded text
 */
private static String formatHtmlMarkup(String text, FormattingContext context) {
    StringBuffer html = new StringBuffer();
    int index = 0;

    while (index < text.length()) {
        if (text.charAt(index) == '\n') {
            index++;
            continue;
        }
        if (html.length() > 0) {
            html.append("\n\n");
        }
        index = formatHtmlBlock(text, index, context, html);
    }
    return html.toString();
}
/**
 * Formats a single block in a tagged text string. A horizontal
 * ruler is written as {@code <hr/>}, a list is formatted as an
 * HTML list, and a heading or box block is formatted from its own
 * tags. Any other content is enclosed in a paragraph. This method
 * returns when a block break or double newline is encountered.
 *
 * This method always advances the text position. A list item tag
 * or list end tag found outside a list cannot start a block, and
 * is removed from the output.
 *
 * @param text           the tagged text string
 * @param pos            the current text position, must be a
 *                       valid index in the text
 * @param context        the formatting context
 * @param result         the HTML output
 *
 * @return the new text position
 */
private static int formatHtmlBlock(String text,
                                   int pos,
                                   FormattingContext context,
                                   StringBuffer result) {

    Matcher m = HORIZ_RULE.matcher(text);
    int backupLength;
    int newPos;

    m.region(pos, text.length());
    if (m.lookingAt()) {
        result.append("<hr/>");
        pos = m.end();
    } else if (text.startsWith("<list", pos)) {
        pos = formatHtmlList(text, pos, context, result);
    } else if (text.startsWith("<h1>", pos)
            || text.startsWith("<h2>", pos)
            || text.startsWith("<h3>", pos)
            || text.startsWith("<box>", pos)
            || text.startsWith("<box ", pos)) {

        // These tags are converted to block elements of their own.
        // A box with attributes must be included, or the paragraph
        // written for the box would be nested in another paragraph.
        pos = formatHtmlInline(text, pos, context, result);
    } else {
        backupLength = result.length();
        result.append("<p>");
        newPos = formatHtmlInline(text, pos, context, result);
        if (newPos > pos) {
            result.append("</p>");
            pos = newPos;
        } else {
            // The inline formatter stopped at once, on a list tag
            // outside a list. Drop the tag and the empty paragraph,
            // or the caller would retry this position forever.
            result.setLength(backupLength);
            newPos = text.indexOf('>', pos);
            pos = (newPos < 0) ? pos + 1 : newPos + 1;
        }
    }
    return pos;
}
/**
 * Formats an inline text in a tagged text string. Tags are
 * converted by the tag formatter, single newlines are written as
 * HTML line breaks, carriage returns are removed, and characters
 * with a special meaning in HTML are written as character
 * entities. This method returns at the end of the text, at a
 * double newline, at a newline followed by a horizontal ruler, or
 * at a list or list item tag.
 *
 * @param text           the tagged text string
 * @param pos            the current text position
 * @param context        the formatting context
 * @param result         the HTML output
 *
 * @return the new text position
 */
private static int formatHtmlInline(String text,
                                    int pos,
                                    FormattingContext context,
                                    StringBuffer result) {

    while (pos < text.length()
        && !text.startsWith("\n\n", pos)
        && !text.startsWith("\n---", pos)
        && !text.startsWith("\n___", pos)
        && !text.startsWith("<list", pos)
        && !text.startsWith("</list>", pos)
        && !text.startsWith("<item>", pos)
        && !text.startsWith("</item>", pos)) {

        switch (text.charAt(pos)) {
        case '<':
            pos = formatHtmlTag(text, pos, context, result);
            break;
        case '>':
            // Must be escaped, the text is inserted into HTML
            result.append("&gt;");
            pos++;
            break;
        case '&':
            // Must be escaped, or the text following it could be
            // read as a character entity
            result.append("&amp;");
            pos++;
            break;
        case '\n':
            result.append("<br/>\n");
            pos++;
            break;
        case '\r':
            pos++;
            break;
        case '@':
            // Written as a numeric entity, which is displayed as
            // '@' but keeps the character out of the HTML source
            result.append("&#64;");
            pos++;
            break;
        default:
            result.append(text.charAt(pos));
            pos++;
        }
    }
    return pos;
}
/**
 * Formats a list block in a tagged text string. The list start
 * tag at the specified position is converted first, then the list
 * content is processed until the list end tag has been converted
 * or the end of the text is reached. Nested lists are processed
 * recursively, and text outside list items is enclosed in a list
 * item. A list without an end tag is closed at the end of the
 * text.
 *
 * @param text           the tagged text string
 * @param pos            the current text position
 * @param context        the formatting context
 * @param result         the cleaned tagged text
 *
 * @return the new text position
 */
private static int formatHtmlList(String text,
                                  int pos,
                                  FormattingContext context,
                                  StringBuffer result) {

    int index = formatHtmlTag(text, pos, context, result);

    result.append("\n");
    while (index < text.length()) {
        if (text.charAt(index) == '\n') {
            index++;
            continue;
        }
        if (text.startsWith("</list>", index)) {
            // End of this list, convert the tag and return
            return formatHtmlTag(text, index, context, result);
        }
        if (text.startsWith("<item>", index)) {
            index = formatHtmlTag(text, index, context, result);
            index = formatHtmlInline(text, index, context, result);
        } else if (text.startsWith("</item>", index)) {
            index = formatHtmlTag(text, index, context, result);
            result.append("\n\n");
        } else if (text.startsWith("<list", index)) {
            index = formatHtmlList(text, index, context, result);
            result.append("\n");
        } else {
            // Content outside any item gets an item of its own
            result.append("<li>");
            index = formatHtmlInline(text, index, context, result);
            result.append("</li>\n");
        }
    }

    // The text ended before the list end tag was found
    result.append("</ul>");
    return index;
}
/**
 * Formats a tag in a tagged text string. The tag is assumed to
 * start at the specified position and to extend to the first '>'
 * character outside double quotes. The tag name is the text up
 * to the first space character. Paragraph, heading, bold, italic
 * and code tags are written without attributes. Link, image,
 * list, item and box tags are converted to the corresponding HTML
 * elements, with link and image URLs resolved by the formatting
 * context and HTML escaped. If the tag is unterminated or has an
 * unknown name, only the '<' character is consumed and written
 * as a character entity, so that the text cannot introduce HTML
 * elements of its own.
 *
 * @param text           the tagged text string
 * @param pos            the current text position
 * @param context        the formatting context
 * @param result         the HTML output
 *
 * @return the new text position
 */
private static int formatHtmlTag(String text,
                                 int pos,
                                 FormattingContext context,
                                 StringBuffer result) {

    int start = pos;
    int end;
    String name;
    HashMap attributes;
    boolean insideQuote = false;
    String str;

    // Find ending '>' character
    while (pos < text.length()
        && (text.charAt(pos) != '>' || insideQuote)) {

        if (text.charAt(pos) == '"') {
            insideQuote = !insideQuote;
        }
        pos++;
    }
    if (pos >= text.length()) {
        // Unterminated tag, the '<' is ordinary text
        result.append("&lt;");
        return start + 1;
    }
    end = pos + 1;

    // Find tag name and attribute start
    pos = text.indexOf(' ', start);
    if (pos < 0 || pos >= end) {
        pos = end - 1;
    }
    name = text.substring(start + 1, pos);

    // Format tag
    if (name.equals("p") || name.equals("/p")
     || name.equals("h1") || name.equals("/h1")
     || name.equals("h2") || name.equals("/h2")
     || name.equals("h3") || name.equals("/h3")
     || name.equals("b") || name.equals("/b")
     || name.equals("i") || name.equals("/i")
     || name.equals("code") || name.equals("/code")) {

        // Identical in HTML, but any attributes are dropped
        result.append("<");
        result.append(name);
        result.append(">");
    } else if (name.equals("link")) {
        attributes = parseTagAttributes(text, pos, end - 1);
        result.append("<a href=\"");
        str = context.linkTo((String) attributes.get("url"));
        result.append(PlainFormatter.escapeHtml(str));
        result.append("\"");
        str = (String) attributes.get("window");
        if (str != null && str.equals("new")) {
            result.append(" target=\"_blank\"");
        }
        result.append(">");
    } else if (name.equals("/link")) {
        result.append("</a>");
    } else if (name.equals("image")) {
        attributes = parseTagAttributes(text, pos, end - 1);
        result.append("<img src=\"");
        str = context.linkTo((String) attributes.get("url"));
        result.append(PlainFormatter.escapeHtml(str));
        result.append("\" alt=\"\"");
        str = (String) attributes.get("layout");
        if (str != null && str.equals("right")) {
            result.append(" style=\"float: right;\"");
        } else if (str != null && str.equals("left")) {
            result.append(" style=\"float: left;\"");
        }
        result.append(" />");
    } else if (name.equals("list")) {
        // All list types are unordered lists, styled differently
        attributes = parseTagAttributes(text, pos, end - 1);
        result.append("<ul");
        str = (String) attributes.get("type");
        if (str != null && str.equals("*")) {
            result.append(" style=\"list-style-type: disc;\"");
        } else if (str != null && str.equals("1")) {
            result.append(" style=\"list-style-type: decimal;\"");
        } else if (str != null && str.equals("i")) {
            result.append(" style=\"list-style-type: lower-roman;\"");
        } else if (str != null && str.equals("I")) {
            result.append(" style=\"list-style-type: upper-roman;\"");
        } else if (str != null && str.equals("a")) {
            result.append(" style=\"list-style-type: lower-alpha;\"");
        } else if (str != null && str.equals("A")) {
            result.append(" style=\"list-style-type: upper-alpha;\"");
        }
        result.append(">");
    } else if (name.equals("/list")) {
        result.append("</ul>");
    } else if (name.equals("item")) {
        result.append("<li>");
    } else if (name.equals("/item")) {
        result.append("</li>");
    } else if (name.equals("box")) {
        // A box is a paragraph with a layout specific CSS class
        attributes = parseTagAttributes(text, pos, end - 1);
        result.append("<p class=\"box-layout-");
        str = (String) attributes.get("layout");
        if (str != null && str.equals("right")) {
            result.append("right");
        } else if (str != null && str.equals("left")) {
            result.append("left");
        }
        result.append("\">");
    } else if (name.equals("/box")) {
        result.append("</p>");
    } else {
        // Unknown tag, the '<' is ordinary text and the rest of
        // the tag is left for the caller to format as text
        result.append("&lt;");
        end = start + 1;
    }
    return end;
}
/**
 * Parses the tag attributes. This method extracts all the
 * attributes and their values from the string and returns the
 * mappings in a hash map. An attribute consists of a name, a '='
 * character and a value. The value is either enclosed in double
 * quotes or extends to the next space character. Space and
 * newline characters between the '=' and the value are skipped.
 * Text without any '=' character, or an attribute without a
 * value, is mapped to an empty string.
 *
 * @param text           the tagged text string
 * @param pos            the current text position
 * @param end            the end position (exclusive)
 *
 * @return the hash map with attribute names and values
 */
private static HashMap parseTagAttributes(String text, int pos, int end) {
    HashMap result = new HashMap();
    String name;
    String value;
    int temp;

    while (pos < end) {
        if (text.charAt(pos) == ' ') {
            pos++;
            continue;
        }

        // Find the attribute name
        temp = text.indexOf('=', pos);
        if (temp <= 0 || temp >= end) {
            // No more values, the remainder is a single name
            name = text.substring(pos, end).trim();
            result.put(name, "");
            break;
        }
        name = text.substring(pos, temp).trim();

        // Skip to the attribute value. The position is checked
        // before each character is read, since a '=' at the very
        // end of the text would otherwise read outside the string.
        pos = temp + 1;
        while (pos < end
            && (text.charAt(pos) == ' ' || text.charAt(pos) == '\n')) {

            pos++;
        }
        if (pos >= end) {
            result.put(name, "");
            break;
        }

        // Find the attribute value
        if (text.charAt(pos) == '"') {
            temp = text.indexOf('"', pos + 1);
            if (temp < 0 || temp >= end) {
                // Unterminated quote, use everything remaining
                value = text.substring(pos + 1, end);
                pos = end;
            } else {
                value = text.substring(pos + 1, temp);
                pos = temp + 1;
            }
        } else {
            temp = text.indexOf(' ', pos);
            if (temp < 0 || temp >= end) {
                value = text.substring(pos, end);
                pos = end;
            } else {
                value = text.substring(pos, temp);
                pos = temp + 1;
            }
        }
        result.put(name, value);
    }
    return result;
}
}