package org.jcommons.file.xml; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.List; import org.jcommons.common.CommonUtils; /** * 强大的XML标签属性过滤【通过正则】 * @author weiwei l.weiwei@163.com * @date 2013-1-5 下午09:44:24 */ public class Attrs { public static void main(String[] args){ //XML文本 String xml = "<div style='width:250; height:auto;'>This is div.<img src='http://www.baidu.com/logo.gif' alt='This is img' /></div><p style='padding:5px;'>This is p.<ul><li>This is li.<a href='http://www.baidu.com'>This is link.</a></li></ul></p>"; //删除所有标签的所有属性 String rs = Attrs.me().xml(xml).rm().ok(); System.out.println("<div >This is div.<img /></div><p >This is p.<ul><li>This is li.<a >This is link.</a></li></ul></p>".equals(rs)); //删除所有标签的style属性和alt属性 String rs2 = Attrs.me().xml(xml).rm("style", "alt").exe().Tags().ok(); System.out.println("<div >This is div.<img src='http://www.baidu.com/logo.gif' /></div><p >This is p.<ul><li>This is li.<a href='http://www.baidu.com'>This is link.</a></li></ul></p>".equals(rs2)); //删除img标签的src、alt属性,删除div、p标签的style属性 String rs3 = Attrs.me().xml(xml).tag("img").rm("src", "alt").exe().tag("div", "p").rm("style").ok(); System.out.println("<div >This is div.<img /></div><p >This is p.<ul><li>This is li.<a href='http://www.baidu.com'>This is link.</a></li></ul></p>".equals(rs3)); } private String xml = null;//需要操作的xml文本 private Collection<String> tags = new HashSet<String>(); private Collection<String> rms = new HashSet<String>(); private Collection<String> kps = new HashSet<String>(); /** * 构造一个Attrs实例对象 * @date 2013-1-7 下午03:54:48 * @return */ public static Attrs me(){ return new Attrs(); } public Attrs filter(Filter filter){ final String fmt = "(?<=<%s{1,999999999} [\\s\\S]{0,999999999})%s=([\"'])[^=]*\\1"; for (String tag : this.tags) { final String regex = String.format(fmt, tag, "\\w+"); List<String> attrs = CommonUtils.findByRegex(xml, regex); if (attrs == null) continue; for (String attr : attrs) { String name = CommonUtils.findOneByRegex(attr, ".*(?=\\=)").trim(); String value = CommonUtils.findOneByRegex(attr, "(?<=\\=).*").trim(); String new_value = filter.onAttr(tag, name, value); xml = xml.replace(attr, " "+name+"="+new_value+" "); } } return this; } /** * 设置要操作的XML文本 * @date 2013-1-7 下午03:55:05 * @param xml * @return */ public Attrs xml(String xml){ this.xml = xml; return this; } /** * 切换到Tags * @date 2013-1-7 下午03:55:14 * @return */ public Tags Tags(){ return Tags.me().xml(xml); } /** * 删除所有标签的所有属性 * @date 2013-1-7 下午03:55:39 * @return */ public Attrs rm(){ xml = removeXmlTagAttr(xml, "", null); return this; } public Attrs tag(String tag){ this.tags.add(tag); return this; } public Attrs tag(String... tags){ this.tags.addAll(Arrays.asList(tags)); return this; } /** * 删除当前标签的指定属性 * @date 2013-1-7 下午03:56:21 * @param attr * @return */ public Attrs rm(String... attr){ this.rms.addAll(Arrays.asList(attr)); return this; } /** * 保留当前标签指定的属性,其他都删除 * @param attr * @return */ public Attrs kp(String... attr){ this.kps.addAll(Arrays.asList(attr)); return this; } /** * 删除当前标签的指定属性 * @date 2013-1-7 下午03:56:21 * @param attr * @return */ public Attrs rm(String attr){ this.rms.add(attr); return this; } /** * 保留当前标签指定的属性,其他都删除 * @param attr * @return */ public Attrs kp(String attr){ this.kps.add(attr); return this; } public Attrs exe(){ if (this.rms != null && !this.rms.isEmpty()) { xml = removeXmlTagAttr(xml, this.tags, this.rms); } if (this.kps != null) { xml = removeOtherXmlTagAttr(xml, this.tags, this.kps); } tags.clear(); rms.clear(); kps.clear(); return this; } /** * 返回已处理过的XML文本 * @date 2013-1-7 下午03:56:50 * @return */ public String ok(){ exe(); return xml; } /** * 删除XML文本里给定标签的属性 * @date 2013-1-7 下午03:57:04 * @param html * @param tags * @param attrs * @return */ public String removeXmlTagAttr(String xml, Collection<String> tags, Collection<String> attrs){ if (xml == null || xml.trim().length() == 0) return ""; if (tags == null || tags.isEmpty()) return removeXmlTagAttr(xml, "", attrs); String rs = xml; for (String tag : tags){ rs = removeXmlTagAttr(rs, tag, attrs); } return rs; } /** * 删除XML文本里给定标签的属性 * @date 2013-1-7 下午03:58:04 * @param xml * @param tag * @param attrs * @return */ public String removeXmlTagAttr(String xml, String tag, Collection<String> attrs){ if (xml == null || xml.trim().length() == 0) return ""; // String fmt = "(?<=<%s{1,255})\\s+%s=[\"'][^'\"]*[\"']"; // final String fmt = "(?<=<%s{1,255})\\s+%s=([\"'=])[^=]*\\1"; final String fmt = "(?<=<%s{1,999999999} [\\s\\S]{0,999999999})%s=([\"'])[^=]*\\1"; if (tag == null || tag.trim().length() == 0) tag = "\\w";//all tags if (attrs == null || attrs.size() == 0) { String regex = String.format(fmt, tag, "\\w+"); return xml.replaceAll(regex, "");//all attributes } for (String _attr : attrs){ String attr = _attr; if (attr == null || attr.trim().length() == 0) continue; String regex = String.format(fmt, tag, attr); List<String> values = CommonUtils.findByRegex(xml, regex); if (values == null) continue; for (String _value : values) { String value = _value; xml = xml.replace(value, ""); } } return xml; } /** * 删除除了XML文本里给定标签的属性之外的属性 * @date 2013-1-7 下午03:58:04 * @param xml * @param tag * @param attrs * @return */ public String removeOtherXmlTagAttr(String xml, Collection<String> tags, Collection<String> keeps){ if (xml == null || xml.trim().length() == 0) return ""; final String fmt = "(?<=<%s{1,999999999} [\\s\\S]{0,999999999})%s=([\"'])[^=]*\\1"; if (tags == null || tags.isEmpty()) tags = Arrays.asList(".");//all tags if (keeps == null || keeps.size() == 0) { return xml; } String regex = String.format(fmt, "\\w", "\\w+"); //拿到所有属性 List<String> allAttrValues = CommonUtils.findByRegex(xml, regex); if (allAttrValues == null) return xml; for (String attrVal : allAttrValues){ if (attrVal == null || attrVal.trim().length() == 0) continue; boolean isKp = false; for (String keep : keeps) { label:for (String tag : tags) { String _regex = String.format(fmt, tag, keep); List<String> keepAttrValues = CommonUtils.findByRegex(xml, _regex); if (keepAttrValues != null && keepAttrValues.contains(attrVal)) { isKp = true; break label; } } } if (isKp) continue; xml = xml.replace(attrVal, ""); } return xml; } public static final String regex(String tag, String attr){ // String fmt = "(?<=<%s{1,255})\\s+%s=([\"'=])[^=]*\\1"; final String fmt = "(?<=<%s{1,999999999} [\\s\\S]{0,999999999})%s=([\"'])[^=]*\\1"; String regex = String.format(fmt, tag, attr); return regex; } public static interface Filter{ public String onAttr(String tag, String attrName, String attrValue); } }