/**
*
*/
package org.howsun.util;
import java.util.regex.Pattern;
/**
 * Static helpers for stripping, escaping and un-escaping HTML markup.
 *
 * <p>Everything here is plain string / regular-expression processing, not an
 * HTML parser. The methods are best-effort filters and must not be the only
 * line of defence against hostile markup.
 *
 * @author 张纪豪
 * Build time: Mar 28, 2009
 */
public abstract class Htmls {

    private static final String TAG_SCRIPT_START = "<script";
    private static final String TAG_SCRIPT_END = "</script>";
    private static final String TAG_OBJECT_START = "<object";
    private static final String TAG_OBJECT_END = "</object>";
    private static final String TAG_IFRAME_START = "<iframe";
    private static final String TAG_IFRAME_END = "</iframe>";
    private static final String TAG_STYLE_START = "<style";
    private static final String TAG_STYLE_END = "</style>";

    /** Opening prefix / closing tag pairs removed by {@link #removeScriptTags(String)}. */
    private static final String[][] ACTIVE_ELEMENTS = {
        {TAG_SCRIPT_START, TAG_SCRIPT_END},
        {TAG_OBJECT_START, TAG_OBJECT_END},
        {TAG_IFRAME_START, TAG_IFRAME_END},
        {TAG_STYLE_START, TAG_STYLE_END},
    };

    /** Token to replacement table used by {@link #stringToHtml(String)}. */
    private static final String[][] UNESCAPE_TABLE = {
        {"&lt;", "<"},
        {"&gt;", ">"},
        {"&nbsp;", " "},
        {"&quot;", "\""},
        {"&#39;", "'"},
        {"&amp;", "&"},
        {"<br/>", "\n"},
    };

    /** A single tag: '<', then anything that is neither '<' nor '>', then '>'. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^<>]*>");

    private static final Pattern SCRIPT_ELEMENT = Pattern.compile(
            "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
            Pattern.CASE_INSENSITIVE);
    private static final Pattern STYLE_ELEMENT = Pattern.compile(
            "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
            Pattern.CASE_INSENSITIVE);
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    private static final Pattern WORD_PARAGRAPH =
            Pattern.compile("(<P)([^>]*)(>.*?)(<\\/P>)");
    private static final Pattern WORD_TAGS = Pattern.compile(
            "<[/]?(html|HTML|head|HEAD|link|LINK|title|TITLE|font|FONT|span|SPAN|xml|XML"
            + "|del|DEL|ins|INS|meta|META|[ovwxpOVWXP]:\\w+)[^>]*?>");
    // Value alternatives: 'single quoted', "double quoted", or a bare token that
    // stops at whitespace so it cannot swallow the attributes that follow it.
    private static final Pattern WORD_ATTRIBUTE = Pattern.compile(
            "<([^>]*)(?:lang|LANG|class|CLASS|style|STYLE|size|SIZE|face|FACE|[ovwxpOVWXP]:\\w+)"
            + "=(?:'[^']*'|\"[^\"]*\"|[^\\s>]+)([^>]*)>");

    private static final Pattern XSS_EVAL = Pattern.compile("eval\\((.*)\\)");
    private static final Pattern XSS_JAVASCRIPT_URL = Pattern.compile(
            "[\"'][\\s]*javascript:(.*)[\"']", Pattern.CASE_INSENSITIVE);
    private static final Pattern XSS_SCRIPT_WORD =
            Pattern.compile("script", Pattern.CASE_INSENSITIVE);

    /**
     * Removes every HTML tag from the text and converts line feeds to {@code <br/>}.
     *
     * <p>A tag is {@code <...>} with no nested angle bracket, so a lone {@code >}
     * in the text (as in {@code 1 > 0}) is preserved.
     *
     * <p>NOTE(review): the previous implementation passed the argument through the
     * project helper {@code Strings.toString(...)}, presumably only to map
     * {@code null} to an empty string; that is inlined here. Confirm the helper did
     * nothing else.
     *
     * @param surText the text to clean, may be {@code null}
     * @return the text without tags; an empty string when {@code surText} is {@code null}
     */
    public static String cleanHtmlTag(String surText) {
        if (surText == null) {
            return "";
        }
        return HTML_TAG.matcher(surText).replaceAll("").replace("\n", "<br/>");
    }

    /**
     * Removes {@code script}, {@code object}, {@code iframe} and {@code style}
     * elements together with their content, regardless of letter case.
     *
     * <p>An element runs from its opening prefix (for example {@code <script}) to the
     * first matching closing tag after it. Opening tags without a closing tag are
     * left untouched. Removal is repeated until nothing changes, so an element that
     * only appears after another one was cut out is removed as well.
     *
     * @param html the markup to filter, may be {@code null}
     * @return the filtered markup; the same instance when nothing was removed; an
     *         empty string when {@code html} is {@code null}
     */
    public static String removeScriptTags(String html) {
        if (html == null) {
            return "";
        }
        String current = html;
        int lengthBefore;
        do {
            // Every removal shortens the text, so an unchanged length means a fixed point.
            lengthBefore = current.length();
            for (String[] element : ACTIVE_ELEMENTS) {
                current = stripElements(current, element[0], element[1]);
            }
        } while (current.length() != lengthBefore);
        return current;
    }

    /**
     * Neutralises the given tags by replacing the {@code <} of their opening and
     * closing tags with {@code &lt;}. All other markup is left as is.
     *
     * <p>Tag names are matched case-insensitively and by prefix, so {@code "b"}
     * also affects {@code <br>} and {@code <body>}. Names may be given bare
     * ({@code "script"}) or decorated ({@code "<script>"}, {@code "</script>"}).
     *
     * @param html the markup to process, may be {@code null}
     * @param tags the tag names to disable; {@code null}, empty or blank entries are ignored
     * @return the processed markup; the same instance when nothing matched; an empty
     *         string when {@code html} is {@code null}
     */
    public static String htmlEscape(String html, String[] tags) {
        if (html == null) {
            return "";
        }
        if (tags == null || tags.length == 0) {
            return html;
        }
        String[] names = new String[tags.length];
        int nameCount = 0;
        for (String tag : tags) {
            if (tag == null) {
                continue;
            }
            String name = tag.replace("<", "").replace(">", "").replace("/", "").trim();
            if (name.length() > 0) {
                names[nameCount++] = name;
            }
        }
        if (nameCount == 0) {
            return html;
        }

        StringBuilder out = null; // allocated lazily so untouched input is returned as is
        int copyFrom = 0;
        for (int lt = html.indexOf('<'); lt >= 0; lt = html.indexOf('<', lt + 1)) {
            if (startsListedTag(html, lt, names, nameCount)) {
                if (out == null) {
                    out = new StringBuilder(html.length() + 16);
                }
                out.append(html, copyFrom, lt).append("&lt;");
                copyFrom = lt + 1;
            }
        }
        if (out == null) {
            return html;
        }
        return out.append(html, copyFrom, html.length()).toString();
    }

    /**
     * Escapes text for display inside HTML.
     *
     * <p>Replacements: {@code &} to {@code &amp;}, {@code <} to {@code &lt;},
     * {@code >} to {@code &gt;}, space to {@code &nbsp;}, {@code "} to
     * {@code &quot;}, {@code '} to {@code &#39;} and line feed to {@code <br/>}.
     * {@link #stringToHtml(String)} is the inverse.
     *
     * @param text the text to escape
     * @return the escaped text; {@code text} itself when it is {@code null} or empty
     */
    public static String htmlEscape(String text) {
        if (text == null || text.length() == 0) {
            return text;
        }
        StringBuilder out = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    out.append("&amp;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case ' ':
                    out.append("&nbsp;");
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                case '\'':
                    out.append("&#39;");
                    break;
                case '\n':
                    out.append("<br/>");
                    break;
                default:
                    out.append(c);
            }
        }
        return out.toString();
    }

    /**
     * Reverses {@link #htmlEscape(String)}: turns the entities {@code &lt;},
     * {@code &gt;}, {@code &nbsp;}, {@code &quot;}, {@code &#39;} and {@code &amp;}
     * back into characters and {@code <br/>} back into a line feed.
     *
     * <p>The text is decoded in a single pass, so the output of one replacement is
     * never re-interpreted (an escaped literal {@code &lt;br/&gt;} stays a literal
     * tag instead of becoming a line feed).
     *
     * @param text the escaped text
     * @return the decoded text; {@code text} itself when it is {@code null} or empty
     */
    public static String stringToHtml(String text) {
        if (text == null || text.length() == 0) {
            return text;
        }
        StringBuilder out = new StringBuilder(text.length());
        int i = 0;
        while (i < text.length()) {
            String[] entry = unescapeEntryAt(text, i);
            if (entry == null) {
                out.append(text.charAt(i));
                i++;
            } else {
                out.append(entry[1]);
                i += entry[0].length();
            }
        }
        return out.toString();
    }

    /**
     * Removes {@code script} and {@code style} elements (any letter case) and then
     * deletes all whitespace.
     *
     * <p>Other tags are NOT removed: the generic tag-stripping step was already
     * disabled in the previous implementation and that behaviour is kept.
     *
     * @param inputString the markup to process, may be {@code null}
     * @return the processed text; an empty string when {@code inputString} is {@code null}
     */
    public static String html2Text(String inputString) {
        if (inputString == null) {
            return "";
        }
        String text = SCRIPT_ELEMENT.matcher(inputString).replaceAll("");
        text = STYLE_ELEMENT.matcher(text).replaceAll("");
        return WHITESPACE.matcher(text).replaceAll("");
    }

    /**
     * Cleans up markup pasted from Microsoft Word.
     *
     * <p>Steps, in order:
     * <ol>
     * <li>{@code <P ...>...</P>} (upper case, single line) becomes
     *     {@code <p>...</p>} without attributes;</li>
     * <li>html, head, link, title, font, span, xml, del, ins, meta and Office
     *     namespace tags ({@code o:}, {@code v:}, {@code w:}, {@code x:},
     *     {@code p:}) are removed, keeping their content;</li>
     * <li>lang, class, style, size, face and Office namespace attributes are
     *     removed from the remaining tags, repeating until none is left;</li>
     * <li>complete {@code <style>...</style>} blocks are removed in any letter
     *     case; an unclosed {@code <style} is left alone.</li>
     * </ol>
     *
     * @param content the markup to clean, may be {@code null}
     * @return the cleaned markup, or {@code null} when {@code content} is {@code null}
     */
    public static String clearWordFormat(String content) {
        if (content == null) {
            return null;
        }
        String cleaned = WORD_PARAGRAPH.matcher(content).replaceAll("<p$3</p>");
        cleaned = WORD_TAGS.matcher(cleaned).replaceAll("");

        // One pass strips at most one attribute per tag, so iterate to a fixed point.
        String before;
        do {
            before = cleaned;
            cleaned = WORD_ATTRIBUTE.matcher(cleaned).replaceAll("<$1$2>");
        } while (!cleaned.equals(before));

        return stripElements(cleaned, TAG_STYLE_START, TAG_STYLE_END);
    }

    /**
     * Best-effort XSS filter for a request value.
     *
     * <p>First removes {@code eval(...)} calls, quoted {@code javascript:} URLs
     * (replaced by an empty pair of double quotes) and the word {@code script} in
     * any letter case, repeating until the value is stable so that a removal cannot
     * assemble a new match. Then encodes {@code <}, {@code >}, {@code (}, {@code )}
     * and {@code '} as HTML entities.
     *
     * <p>The removals run before the encoding because the patterns look for literal
     * parentheses and quotes, which no longer exist once they are encoded.
     *
     * @param value the value to clean, may be {@code null}
     * @return the cleaned value, or {@code null} when {@code value} is {@code null}
     */
    public static String cleanXSS(String value) {
        if (value == null) {
            return null;
        }
        String cleaned = value;
        String before;
        do {
            // Each replacement is shorter than what it matched, so this terminates.
            before = cleaned;
            cleaned = XSS_EVAL.matcher(cleaned).replaceAll("");
            cleaned = XSS_JAVASCRIPT_URL.matcher(cleaned).replaceAll("\"\"");
            cleaned = XSS_SCRIPT_WORD.matcher(cleaned).replaceAll("");
        } while (!cleaned.equals(before));

        return cleaned.replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("(", "&#40;")
                .replace(")", "&#41;")
                .replace("'", "&#39;");
    }

    /**
     * Removes every span that starts at {@code openPrefix} and ends with the first
     * {@code closeTag} after it, matching both case-insensitively.
     * Returns the same instance when nothing was removed.
     */
    private static String stripElements(String html, String openPrefix, String closeTag) {
        StringBuilder out = null;
        int copyFrom = 0;
        int open = indexOfIgnoreCase(html, openPrefix, 0);
        while (open >= 0) {
            int close = indexOfIgnoreCase(html, closeTag, open + openPrefix.length());
            if (close < 0) {
                break; // unclosed element: keep the remainder as is
            }
            if (out == null) {
                out = new StringBuilder(html.length());
            }
            out.append(html, copyFrom, open);
            copyFrom = close + closeTag.length();
            open = indexOfIgnoreCase(html, openPrefix, copyFrom);
        }
        if (out == null) {
            return html;
        }
        return out.append(html, copyFrom, html.length()).toString();
    }

    /**
     * Locale-independent, case-insensitive {@code indexOf}; works on the original
     * text so no lower-cased copy (whose length may differ) is needed.
     */
    private static int indexOfIgnoreCase(String text, String needle, int fromIndex) {
        int last = text.length() - needle.length();
        for (int i = Math.max(fromIndex, 0); i <= last; i++) {
            if (text.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Tells whether the {@code <} at {@code lt} starts an opening or closing tag
     * whose name begins with one of the first {@code count} entries of {@code names}.
     */
    private static boolean startsListedTag(String html, int lt, String[] names, int count) {
        int nameStart = lt + 1;
        if (nameStart < html.length() && html.charAt(nameStart) == '/') {
            nameStart++;
        }
        for (int k = 0; k < count; k++) {
            if (html.regionMatches(true, nameStart, names[k], 0, names[k].length())) {
                return true;
            }
        }
        return false;
    }

    /** Returns the {@link #UNESCAPE_TABLE} entry whose token starts at {@code index}, or {@code null}. */
    private static String[] unescapeEntryAt(String text, int index) {
        char c = text.charAt(index);
        if (c != '&' && c != '<') {
            return null;
        }
        for (String[] entry : UNESCAPE_TABLE) {
            if (text.startsWith(entry[0], index)) {
                return entry;
            }
        }
        return null;
    }
}