HtmlHelper.java example

Explorer
Android_RssReader-master
package com.lgq.rssreader.utils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;

import com.lgq.rssreader.dal.ImageRecordDalHelper;

import android.text.TextUtils;

public class HtmlHelper {
	public static String UrlEncodeUpper(String str)
    {
        StringBuilder builder = new StringBuilder();
        for(char c : str.toCharArray())
        {
            if (TextUtils.htmlEncode(String.valueOf(c)).length() > 1)
            {
                builder.append(TextUtils.htmlEncode(String.valueOf(c)).toUpperCase());
            }
            else
            {
                builder.append(c);
            }
        }
        return builder.toString();
    }
		
	public static String unescape (String s)
	{
	    while (true)
	    {
	        int n=s.indexOf("&#");
	        if (n<0) break;
	        int m=s.indexOf(";",n+2);
	        if (m<0) break;
	        try
	        {
	            s=s.substring(0,n)+(char)(Integer.parseInt(s.substring(n+2,m)))+
	                s.substring(m+1);
	        }
	        catch (Exception e)
	        {
	            return s;
	        }
	    }
	    s=s.replace(""","\"");
	    s=s.replace("<","<");
	    s=s.replace(">",">");
	    s=s.replace("&","&");
	    return s;
	}
	
	public static String trim(String s)
	{
		return s.replace("\n"," ").replace("'", " ");		
	}
	
	public static String HtmlToText(String str){
		str=str.replace("<br />", "\n");
		str=str.replace("<br/>", "\n");
		str=str.replace("  ", "\t");
		str=str.replace(" ", " ");
		str=str.replace("'","\\");
		str=str.replace(""", "\\");
		str=str.replace(">",">");
		str=str.replace("<","<");
		str=str.replace("&", "&");

		return str;
	}	
	
	private final static String regxpForHtml = "<([^>]*)>"; // 过滤所有以<开头以>结尾的标签

	private final static String regxpForImgTag = "<\\s*img\\s+([^>]*)\\s*>";
	// // 找出IMG标签

	private final static String regxpForImaTagSrcAttrib = "src=\"([^\"]+)\"";
	// // 找出IMG标签的SRC属性

	/**
	 * 
	 * 基本功能：替换标记以正常显示
	 * <p>
	 * 
	 * @param input
	 * @return String
	 */
	public static String replaceTag(String input) {
		if (!hasSpecialChars(input)) {
			return input;
		}
		StringBuffer filtered = new StringBuffer(input.length());
		char c;
		for (int i = 0; i <= input.length() - 1; i++) {
			c = input.charAt(i);
			switch (c) {
				case '<' :
					filtered.append("<");
					break;
				case '>' :
					filtered.append(">");
					break;
				case '"' :
					filtered.append(""");
					break;
				case '&' :
					filtered.append("&");
					break;
				default :
					filtered.append(c);
			}

		}
		return (filtered.toString());
	}

	/**
	 * 
	 * 基本功能：判断标记是否存在
	 * <p>
	 * 
	 * @param input
	 * @return boolean
	 */
	public static boolean hasSpecialChars(String input) {
		boolean flag = false;
		if ((input != null) && (input.length() > 0)) {
			char c;
			for (int i = 0; i <= input.length() - 1; i++) {
				c = input.charAt(i);
				switch (c) {
					case '>' :
						flag = true;
						break;
					case '<' :
						flag = true;
						break;
					case '"' :
						flag = true;
						break;
					case '&' :
						flag = true;
						break;
				}
			}
		}
		return flag;
	}

	/**
	 * 
	 * 基本功能：过滤所有以"<"开头以">"结尾的标签
	 * <p>
	 * 
	 * @param str
	 * @return String
	 */
	public static String filterHtml(String str) {
		Pattern pattern = Pattern.compile(regxpForHtml);
		Matcher matcher = pattern.matcher(str);
		StringBuffer sb = new StringBuffer();
		boolean result1 = matcher.find();
		while (result1) {
			matcher.appendReplacement(sb, "");
			result1 = matcher.find();
		}
		matcher.appendTail(sb);
		return sb.toString();
	}

	/**
	 * 
	 * 基本功能：过滤指定标签
	 * <p>
	 * 
	 * @param str
	 * @param tag
	 *            指定标签
	 * @return String
	 */
	public static String fiterHtmlTag(String str, String tag) {
		String regxp = "<\\s*" + tag + "\\s+([^>]*)\\s*>";
		Pattern pattern = Pattern.compile(regxp);
		Matcher matcher = pattern.matcher(str);
		StringBuffer sb = new StringBuffer();
		boolean result1 = matcher.find();
		while (result1) {
			matcher.appendReplacement(sb, "");
			result1 = matcher.find();
		}
		matcher.appendTail(sb);
		return sb.toString();
	}

	/**
	 * 
	 * 基本功能：替换指定的标签
	 * <p>
	 * 
	 * @param str
	 * @param beforeTag
	 *            要替换的标签
	 * @param tagAttrib
	 *            要替换的标签属性值
	 * @param startTag
	 *            新标签开始标记
	 * @param endTag
	 *            新标签结束标记
	 * @return String
	 * @如：替换img标签的src属性值为[img]属性值[/img]
	 */
	public static String replaceHtmlTag(String str, String beforeTag,
			String tagAttrib, String startTag, String endTag) {
		String regxpForTag = "<\\s*" + beforeTag + "\\s+([^>]*)\\s*>";
		String regxpForTagAttrib = tagAttrib + "=\"([^\"]+)\"";
		Pattern patternForTag = Pattern.compile(regxpForTag);
		Pattern patternForAttrib = Pattern.compile(regxpForTagAttrib);
		Matcher matcherForTag = patternForTag.matcher(str);
		StringBuffer sb = new StringBuffer();
		boolean result = matcherForTag.find();
		while (result) {
			StringBuffer sbreplace = new StringBuffer();
			Matcher matcherForAttrib = patternForAttrib.matcher(matcherForTag
					.group(1));
			if (matcherForAttrib.find()) {
				matcherForAttrib.appendReplacement(sbreplace, startTag
						+ matcherForAttrib.group(1) + endTag);
			}
			matcherForTag.appendReplacement(sb, sbreplace.toString());
			result = matcherForTag.find();
		}
		matcherForTag.appendTail(sb);
		return sb.toString();
	}
	
	public static String ConvertHtmlToEnml(String html) //, Note note)
    {
		String[] prohibitedArray = new String[]
            {
                "applet", "base", "basefont", "bgsound", "blink", "body", "button", "dir", "embed", "fieldset",
                "form", "frame", "frameset",
                "head", "html", "iframe", "ilayer", "input", "isindex", "label", "layer", "legend", "link",
                "marquee", "menu", "meta", "noframes",
                "noscript", "object", "optgroup", "option", "param", "plaintext", "script", "select", "style",
                "textarea", "xml", "image"
            };

		String[] disableAttributesArray = new String[]
            {"id", "class", "accesskey", "data", "dynsrc", "tabindex", "sizset"};
		
		
		List<String> prohibited = Arrays.asList(prohibitedArray);
		List<String> disableAttributes = Arrays.asList(disableAttributesArray);
		
		Document doc = Jsoup.parse(html);

        //var imgs = new ImageRecordDbPersistence().LoadFromFile().ToList();
		ImageRecordDalHelper helper = new ImageRecordDalHelper();

        Elements nodes = doc.getAllElements();
        int total = nodes.size() - 1;
        for (int j = total; j >= 0; j--)
        {
            //remove all prohibited node
            if (prohibited.contains(nodes.get(j).tagName()))
            {
                if (!(nodes.get(j).childNodeSize() > 0))
                    nodes.get(j).remove();
                else
                {
                	for(Element child : nodes.get(j).children())
                		nodes.get(j).parent().appendChild(child);
                    nodes.get(j).remove();
                }
            }

            //remove disabled attribute
            if (nodes.get(j).attributes().size() > 0)
            {
                int count = nodes.get(j).attributes().size() - 1;
                List<Attribute> attributes = nodes.get(j).attributes().asList();
            	//int count = disableAttributes.size();
                for (int i = count; i >= 0; i--)
                {
                    if (disableAttributes.contains(attributes.get(i).getKey()))
                    {
                    	nodes.get(j).removeAttr(attributes.get(i).getKey());
                        continue;
                    }

                    //deal with on*
                    if (attributes.get(i).getKey().startsWith("on"))
                    {
                    	nodes.get(j).removeAttr(attributes.get(i).getKey());
                        continue;
                    }
                    
                    if (attributes.get(i).getKey().startsWith("sizcache"))
                    {
                    	nodes.get(j).removeAttr(attributes.get(i).getKey());
                        continue;
                    }
                    
                    if (attributes.get(i).getKey().startsWith("f-size"))
                    {
                    	nodes.get(j).removeAttr(attributes.get(i).getKey());
                        continue;
                    }
                }
            }

            //deal with relative href
            if (nodes.get(j).tagName().equals("a"))
            {
                if (nodes.get(j).attributes().size() > 0 && nodes.get(j).hasAttr("href")){
                    String href = nodes.get(j).attr("href");

                    if (!href.startsWith("http") || !href.startsWith("https") || !href.startsWith("www"))
                    {
                    	nodes.get(j).removeAttr("href");
                    }
                }
            }

            //deal with cached img, replace with online src
            if (nodes.get(j).tagName().equals("img") && nodes.get(j).attributes().size() > 0)
            {
                if (nodes.get(j).hasAttr("xsrc"))
                {
                    if (nodes.get(j).attr("xsrc").startsWith("mnt:"))
                    {
                        String src = nodes.get(j).attr("xsrc");
                        for(Attribute attr : nodes.get(j).attributes().asList()){
                    		nodes.get(j).removeAttr(attr.getKey());
                    	}
                        
                        //nodes.get(j).attr("src", imgs.First(r => src.Contains(r.StoredName)).OriginUrl);
                        nodes.get(j).attr("src", helper.GetImageRecordEntityByStoreName(src).OriginUrl);
                    }
                    else
                    {
                    	String xsrc = nodes.get(j).attr("xsrc");;
                    	for(Attribute attr : nodes.get(j).attributes().asList()){
                    		nodes.get(j).removeAttr(attr.getKey());
                    	}
                    	nodes.get(j).attr("src", xsrc);
                    }
                }
                else{
                	for(Attribute attr : nodes.get(j).attributes().asList()){
                		nodes.get(j).removeAttr(attr.getKey());
                	}                	
                }
                
                //better reading experience in mobile client and web client
                nodes.get(j).attr("style", "max-height:100%; max-width:100%;");
            }

            if (nodes.get(j).tagName().equals("a") && nodes.get(j).attributes().size() > 0)
            {
            	if (nodes.get(j).hasAttr("href"))
                {
                    String href = nodes.get(j).attr("href");
                    for(Attribute attr : nodes.get(j).attributes().asList()){
                		nodes.get(j).removeAttr(attr.getKey());
                	}
                    nodes.get(j).attr("href", href);
                }
                else
                {
                	for(Attribute attr : nodes.get(j).attributes().asList()){
                		nodes.get(j).removeAttr(attr.getKey());
                	}
                }
            }            
        }

        char[] xmlChar = doc.html().toCharArray();
        for (int i = 0; i < xmlChar.length; ++i)
        {
            if (xmlChar[i] > 0xFFFD)
            {
                //或者直接替换掉0xb 
                xmlChar[i] = ' '; // 用空格替换
            }
            else if (xmlChar[i] < 0x20 && xmlChar[i] != 't' & xmlChar[i] != 'n' & xmlChar[i] != 'r')
            {
                //或者直接替换掉0xb
                xmlChar[i] = ' '; // 用空格替换
            }
        }
        
        helper.Close();

        return new String(xmlChar).replace("<?xml version=\"1.0\" encoding=\"utf-8\"?>", "");
    }
}