/** * Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved. * EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. * http://www.ewcms.com */ package com.ewcms.common.io; import java.io.StringReader; import java.util.List; import java.util.regex.Pattern; import org.apache.commons.codec.binary.Hex; import org.apache.commons.lang.StringEscapeUtils; import org.apache.html.dom.HTMLDocumentImpl; import org.cyberneko.html.parsers.DOMFragmentParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.html.HTMLDocument; import org.xml.sax.InputSource; /** * 字符操作 * * @author 吴智俊 */ public class HtmlStringUtil { private static final Logger logger = LoggerFactory.getLogger(HtmlStringUtil.class); public static Pattern patternHtmlTag = Pattern.compile("<[^<>]+>", 32); public static String clearHtmlTag(String html) { String text = patternHtmlTag.matcher(html).replaceAll(""); if (isEmpty(text)) { return ""; } else { text = htmlDecode(html); return text.replaceAll("[\\s\u3000]{2,}", " "); } } public static boolean isEmpty(String str) { return str == null || str.length() == 0; } public static boolean isNotEmpty(String str) { return !isEmpty(str); } public static String htmlDecode(String txt) { txt = replaceEx(txt, "•", "\267"); return StringEscapeUtils.unescapeHtml(txt); } public static String replaceEx(String str, String subStr, String reStr) { if (str == null) return null; if (subStr == null || subStr.equals("") || subStr.length() > str.length() || reStr == null) return str; StringBuffer sb = new StringBuffer(); int lastIndex = 0; do { int index = str.indexOf(subStr, lastIndex); if (index >= 0) { sb.append(str.substring(lastIndex, index)); sb.append(reStr); lastIndex = index + subStr.length(); } else { sb.append(str.substring(lastIndex)); return sb.toString(); } } while (true); } public static String getPureText(String html) { try { DOMFragmentParser parser; org.w3c.dom.DocumentFragment fragment; parser = new DOMFragmentParser(); HTMLDocument document = new HTMLDocumentImpl(); fragment = document.createDocumentFragment(); String txt; parser.parse(new InputSource(new StringReader(html)), fragment); txt = getPureText(((Node) (fragment))); return htmlDecode(txt); } catch (Exception e) { logger.error("XML中存在非法字符"); return null; } } public static String getPureText(Node node) { if (!node.hasChildNodes() && isTextNode(node)) return node.getNodeValue(); if (isFiltered(node)) return ""; if (node.hasAttributes()) { Node a = node.getAttributes().getNamedItem("style"); if (a != null) { String style = a.getNodeValue(); Pattern p = Pattern.compile("display\\s*\\:\\s*none", 2); if (p.matcher(style).find()) return ""; } } StringBuffer sb = new StringBuffer(); NodeList list = node.getChildNodes(); for (int i = 0; i < list.getLength(); i++) { Node child = list.item(i); String name = child.getNodeName(); sb.append(getPureText(child)); sb.append(" "); if (name.equals("TR") || name.equals("P") || name.equals("DIV")) sb.append("\n"); } return sb.toString(); } public static boolean isTextNode(Node node) { if (node == null) return false; short nodeType = node.getNodeType(); return nodeType == 4 || nodeType == 3; } private static boolean isFiltered(Node node) { short type = node.getNodeType(); String name = node.getNodeName(); if (type == 8) return true; if (name.equals("SCRIPT")) return true; if (name.equals("LINK")) return true; if (name.equals("STYLE")) return true; return name.equals("OBJECT"); } public static String hexEncode(byte bs[]) { return new String((new Hex()).encode(bs)); } public static String join(Object arr[]) { return join(arr, ","); } public static String join(Object arr[][]) { return join(arr, "\n", ","); } public static String join(Object arr[], String spliter) { if (arr == null) return null; StringBuffer sb = new StringBuffer(); for (int i = 0; i < arr.length; i++) { if (i != 0) sb.append(spliter); sb.append(arr[i]); } return sb.toString(); } public static String join(Object arr[][], String spliter1, String spliter2) { if (arr == null) return null; StringBuffer sb = new StringBuffer(); for (int i = 0; i < arr.length; i++) { if (i != 0) sb.append(spliter2); sb.append(join(arr[i], spliter2)); } return sb.toString(); } public static String join(List<String> list) { return join(list, ","); } public static String join(List<String> list, String spliter) { if (list == null) return null; StringBuffer sb = new StringBuffer(); for (int i = 0; i < list.size(); i++) { if (i != 0) sb.append(spliter); sb.append(list.get(i)); } return sb.toString(); } }