package com.lgq.rssreader.utils; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.Jsoup; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.lgq.rssreader.dal.ImageRecordDalHelper; import android.text.TextUtils; public class HtmlHelper { public static String UrlEncodeUpper(String str) { StringBuilder builder = new StringBuilder(); for(char c : str.toCharArray()) { if (TextUtils.htmlEncode(String.valueOf(c)).length() > 1) { builder.append(TextUtils.htmlEncode(String.valueOf(c)).toUpperCase()); } else { builder.append(c); } } return builder.toString(); } public static String unescape (String s) { while (true) { int n=s.indexOf("&#"); if (n<0) break; int m=s.indexOf(";",n+2); if (m<0) break; try { s=s.substring(0,n)+(char)(Integer.parseInt(s.substring(n+2,m)))+ s.substring(m+1); } catch (Exception e) { return s; } } s=s.replace(""","\""); s=s.replace("<","<"); s=s.replace(">",">"); s=s.replace("&","&"); return s; } public static String trim(String s) { return s.replace("\n"," ").replace("'", " "); } public static String HtmlToText(String str){ str=str.replace("<br />", "\n"); str=str.replace("<br/>", "\n"); str=str.replace("  ", "\t"); str=str.replace(" ", " "); str=str.replace("'","\\"); str=str.replace(""", "\\"); str=str.replace(">",">"); str=str.replace("<","<"); str=str.replace("&", "&"); return str; } private final static String regxpForHtml = "<([^>]*)>"; // 过滤所有以<开头以>结尾的标签 private final static String regxpForImgTag = "<\\s*img\\s+([^>]*)\\s*>"; // // 找出IMG标签 private final static String regxpForImaTagSrcAttrib = "src=\"([^\"]+)\""; // // 找出IMG标签的SRC属性 /** * * 基本功能:替换标记以正常显示 * <p> * * @param input * @return String */ public static String replaceTag(String input) { if (!hasSpecialChars(input)) { return input; } StringBuffer filtered = new StringBuffer(input.length()); char c; for (int i = 0; i <= input.length() - 1; i++) { c = input.charAt(i); switch (c) { case '<' : filtered.append("<"); break; case '>' : filtered.append(">"); break; case '"' : filtered.append("""); break; case '&' : filtered.append("&"); break; default : filtered.append(c); } } return (filtered.toString()); } /** * * 基本功能:判断标记是否存在 * <p> * * @param input * @return boolean */ public static boolean hasSpecialChars(String input) { boolean flag = false; if ((input != null) && (input.length() > 0)) { char c; for (int i = 0; i <= input.length() - 1; i++) { c = input.charAt(i); switch (c) { case '>' : flag = true; break; case '<' : flag = true; break; case '"' : flag = true; break; case '&' : flag = true; break; } } } return flag; } /** * * 基本功能:过滤所有以"<"开头以">"结尾的标签 * <p> * * @param str * @return String */ public static String filterHtml(String str) { Pattern pattern = Pattern.compile(regxpForHtml); Matcher matcher = pattern.matcher(str); StringBuffer sb = new StringBuffer(); boolean result1 = matcher.find(); while (result1) { matcher.appendReplacement(sb, ""); result1 = matcher.find(); } matcher.appendTail(sb); return sb.toString(); } /** * * 基本功能:过滤指定标签 * <p> * * @param str * @param tag * 指定标签 * @return String */ public static String fiterHtmlTag(String str, String tag) { String regxp = "<\\s*" + tag + "\\s+([^>]*)\\s*>"; Pattern pattern = Pattern.compile(regxp); Matcher matcher = pattern.matcher(str); StringBuffer sb = new StringBuffer(); boolean result1 = matcher.find(); while (result1) { matcher.appendReplacement(sb, ""); result1 = matcher.find(); } matcher.appendTail(sb); return sb.toString(); } /** * * 基本功能:替换指定的标签 * <p> * * @param str * @param beforeTag * 要替换的标签 * @param tagAttrib * 要替换的标签属性值 * @param startTag * 新标签开始标记 * @param endTag * 新标签结束标记 * @return String * @如:替换img标签的src属性值为[img]属性值[/img] */ public static String replaceHtmlTag(String str, String beforeTag, String tagAttrib, String startTag, String endTag) { String regxpForTag = "<\\s*" + beforeTag + "\\s+([^>]*)\\s*>"; String regxpForTagAttrib = tagAttrib + "=\"([^\"]+)\""; Pattern patternForTag = Pattern.compile(regxpForTag); Pattern patternForAttrib = Pattern.compile(regxpForTagAttrib); Matcher matcherForTag = patternForTag.matcher(str); StringBuffer sb = new StringBuffer(); boolean result = matcherForTag.find(); while (result) { StringBuffer sbreplace = new StringBuffer(); Matcher matcherForAttrib = patternForAttrib.matcher(matcherForTag .group(1)); if (matcherForAttrib.find()) { matcherForAttrib.appendReplacement(sbreplace, startTag + matcherForAttrib.group(1) + endTag); } matcherForTag.appendReplacement(sb, sbreplace.toString()); result = matcherForTag.find(); } matcherForTag.appendTail(sb); return sb.toString(); } public static String ConvertHtmlToEnml(String html) //, Note note) { String[] prohibitedArray = new String[] { "applet", "base", "basefont", "bgsound", "blink", "body", "button", "dir", "embed", "fieldset", "form", "frame", "frameset", "head", "html", "iframe", "ilayer", "input", "isindex", "label", "layer", "legend", "link", "marquee", "menu", "meta", "noframes", "noscript", "object", "optgroup", "option", "param", "plaintext", "script", "select", "style", "textarea", "xml", "image" }; String[] disableAttributesArray = new String[] {"id", "class", "accesskey", "data", "dynsrc", "tabindex", "sizset"}; List<String> prohibited = Arrays.asList(prohibitedArray); List<String> disableAttributes = Arrays.asList(disableAttributesArray); Document doc = Jsoup.parse(html); //var imgs = new ImageRecordDbPersistence().LoadFromFile().ToList(); ImageRecordDalHelper helper = new ImageRecordDalHelper(); Elements nodes = doc.getAllElements(); int total = nodes.size() - 1; for (int j = total; j >= 0; j--) { //remove all prohibited node if (prohibited.contains(nodes.get(j).tagName())) { if (!(nodes.get(j).childNodeSize() > 0)) nodes.get(j).remove(); else { for(Element child : nodes.get(j).children()) nodes.get(j).parent().appendChild(child); nodes.get(j).remove(); } } //remove disabled attribute if (nodes.get(j).attributes().size() > 0) { int count = nodes.get(j).attributes().size() - 1; List<Attribute> attributes = nodes.get(j).attributes().asList(); //int count = disableAttributes.size(); for (int i = count; i >= 0; i--) { if (disableAttributes.contains(attributes.get(i).getKey())) { nodes.get(j).removeAttr(attributes.get(i).getKey()); continue; } //deal with on* if (attributes.get(i).getKey().startsWith("on")) { nodes.get(j).removeAttr(attributes.get(i).getKey()); continue; } if (attributes.get(i).getKey().startsWith("sizcache")) { nodes.get(j).removeAttr(attributes.get(i).getKey()); continue; } if (attributes.get(i).getKey().startsWith("f-size")) { nodes.get(j).removeAttr(attributes.get(i).getKey()); continue; } } } //deal with relative href if (nodes.get(j).tagName().equals("a")) { if (nodes.get(j).attributes().size() > 0 && nodes.get(j).hasAttr("href")){ String href = nodes.get(j).attr("href"); if (!href.startsWith("http") || !href.startsWith("https") || !href.startsWith("www")) { nodes.get(j).removeAttr("href"); } } } //deal with cached img, replace with online src if (nodes.get(j).tagName().equals("img") && nodes.get(j).attributes().size() > 0) { if (nodes.get(j).hasAttr("xsrc")) { if (nodes.get(j).attr("xsrc").startsWith("mnt:")) { String src = nodes.get(j).attr("xsrc"); for(Attribute attr : nodes.get(j).attributes().asList()){ nodes.get(j).removeAttr(attr.getKey()); } //nodes.get(j).attr("src", imgs.First(r => src.Contains(r.StoredName)).OriginUrl); nodes.get(j).attr("src", helper.GetImageRecordEntityByStoreName(src).OriginUrl); } else { String xsrc = nodes.get(j).attr("xsrc");; for(Attribute attr : nodes.get(j).attributes().asList()){ nodes.get(j).removeAttr(attr.getKey()); } nodes.get(j).attr("src", xsrc); } } else{ for(Attribute attr : nodes.get(j).attributes().asList()){ nodes.get(j).removeAttr(attr.getKey()); } } //better reading experience in mobile client and web client nodes.get(j).attr("style", "max-height:100%; max-width:100%;"); } if (nodes.get(j).tagName().equals("a") && nodes.get(j).attributes().size() > 0) { if (nodes.get(j).hasAttr("href")) { String href = nodes.get(j).attr("href"); for(Attribute attr : nodes.get(j).attributes().asList()){ nodes.get(j).removeAttr(attr.getKey()); } nodes.get(j).attr("href", href); } else { for(Attribute attr : nodes.get(j).attributes().asList()){ nodes.get(j).removeAttr(attr.getKey()); } } } } char[] xmlChar = doc.html().toCharArray(); for (int i = 0; i < xmlChar.length; ++i) { if (xmlChar[i] > 0xFFFD) { //或者直接替换掉0xb xmlChar[i] = ' '; // 用空格替换 } else if (xmlChar[i] < 0x20 && xmlChar[i] != 't' & xmlChar[i] != 'n' & xmlChar[i] != 'r') { //或者直接替换掉0xb xmlChar[i] = ' '; // 用空格替换 } } helper.Close(); return new String(xmlChar).replace("<?xml version=\"1.0\" encoding=\"utf-8\"?>", ""); } }