/* Copyright (c) 2008 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.gdata.util.common.html; import java.util.regex.Pattern; import com.google.gdata.util.common.base.StringUtil; /** * Convert provided html formatted string to text format. * * */ public final class HtmlToText { /** * Regular expression to match html line breaks or paragraph tags * and adjacent whitespace */ private static final Pattern htmlNewlinePattern = Pattern.compile("\\s*<(br|/?p)>\\s*"); /** Regular expression to match list tags and adjacent whitespace */ private static final Pattern htmlListPattern = Pattern.compile("\\s*<li>\\s*"); /** Regular expression to match any remaining html tags */ private static final Pattern htmlTagPattern = Pattern.compile("</?([^<]*)>"); /** Maximum length of a line in email body (in characters) */ public static final int EMAIL_LINE_WIDTH_MAX = 72; // This class should not be instantiated, hence the private constructor private HtmlToText() {} /** * Convert provided html string to plain text preserving the formatting * as much as possible. Ensure line wrapping to 72 chars as default. * NOTE: add support for more HTML tags here. * For the present, convert <br> to '\n' * convert <p> and </p> to '\n' * convert <li> to "\n- " * @throws NullPointerException */ public static String htmlToPlainText(String html) { if (html == null) { throw new NullPointerException("Html parameter may not be null."); } // Clear any html indentation and incidental whitespace String text = StringUtil.stripAndCollapse(html); /* * Replace <br> and <p> tags with new line characters. * Replace <li> tags (HTML bullets) with dashes. * Remove any remaining HTML tags not supported yet. * Finally replace any HTML escape string with appropriate character */ text = htmlNewlinePattern.matcher(text).replaceAll("\n"); text = htmlListPattern.matcher(text).replaceAll("\n- "); text = htmlTagPattern.matcher(text).replaceAll(""); text = StringUtil.unescapeHTML(text).trim(); /* * Ensure no line of plain text is longer than default (72 chars) * NOTE: Use String.split, NOT StringUtil.split, in order to preserve * consecutive newline characters originating from <br> and <p> tags */ return StringUtil.fixedWidth(text.split("\n"), EMAIL_LINE_WIDTH_MAX); } }