package jef.tools; import java.io.File; import java.io.FileOutputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import jef.common.log.LogUtil; import jef.tools.string.StringSpliterEx; import jef.tools.string.Substring; import jef.tools.string.SubstringIterator; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** * 自行实现的Xpath计算 *TODO 需要修改部分功能,进一步兼容W3C的Xpath实现 * * @author jiyi * */ public class SimpleXPath { /** * 控制台打印出节点和其下属的内容,在解析和调试DOM时很有用。 * * @param node * 要打印的节点 * @param maxLevel * 打印几层 */ public static void printChilrenNodes(Node node, int maxLevel, boolean attFlag) { if (maxLevel < 0) maxLevel = Integer.MAX_VALUE; printChilrenNodes(node, 0, maxLevel, attFlag); } private static void printChilrenNodes(Node parentNode, int level, int maxLevel, boolean attFlag) { for (Node node : XMLUtils.toArray(parentNode.getChildNodes())) { if (node.getNodeType() == Node.TEXT_NODE) { continue; } String span = StringUtils.repeat(" ", level); StringBuilder sb = new StringBuilder(); sb.append(span); sb.append("<").append(node.getNodeName()); if (attFlag && node.getAttributes() != null) {// 打印属性 Node[] atts = XMLUtils.toArray(node.getAttributes()); for (Node att : atts) { sb.append(" "); sb.append(att.getNodeName() + "=\"" + att.getNodeValue() + "\""); } } if (node.hasChildNodes()) { sb.append(">"); if (node.getChildNodes().getLength() == 1 && node.getFirstChild().getNodeType() == Node.TEXT_NODE) { sb.append(node.getFirstChild().getNodeValue().trim()); sb.append("</" + node.getNodeName() + ">"); LogUtil.show(sb.toString()); } else { LogUtil.show(sb.toString()); if (maxLevel > level) { printChilrenNodes(node, level + 1, maxLevel, attFlag); } LogUtil.show(span + "</" + node.getNodeName() + ">"); } } else { sb.append("/>"); LogUtil.show(sb.toString()); } } } static final int INDEX_DEFAULT = -999; static final int INDEX_RANGE = -1000; // 1号解析函数,处理单个节点的运算 // 解析并获取Xpath的对象,这个方法不够安全,限制类内部使用 // 返回以下对象 // String / Node /NodeList/ List<String> private static Object getByXPath(Node node, String xPath, boolean allowNull) { if (StringUtils.isEmpty(xPath)) return node; Node curNode = node; XPathFunction function = null; for (Iterator<Substring> iter = new SubstringIterator(new Substring(xPath), XPATH_KEYS, true); iter.hasNext();) { Substring str = iter.next(); if (str.isEmpty()) continue; if (str.startsWith("/count:")) { Assert.isNull(function); function = XPathFunction.COUNT; continue; } else if (str.startsWith("/plain:")) { Assert.isNull(function); function = XPathFunction.PLAIN; continue; } else if (str.startsWith("/childrenplain:")) { Assert.isNull(function); function = XPathFunction.CHILDRENPLAIN; continue; } else if (str.startsWith("/text:")) { Assert.isNull(function); function = XPathFunction.TEXT; continue; } else if (str.startsWith("/find:")) { str = StringUtils.stringRight(str.toString(), "find:", false); Element newNode = XMLUtils.findElementById(curNode, str.toString()); if (newNode == null) { throw new IllegalArgumentException("There's no element with id=" + str + " under xpath " + getXPath(curNode)); } curNode = newNode; continue; } else if (str.startsWith("/findby:")) { str = StringUtils.stringRight(str.toString(), "findby:", false); String[] args = StringUtils.split(str.toString(), ':'); Assert.isTrue(args.length == 2, "findby function must have to args, divide with ':'"); Element[] newNodes = XMLUtils.findElementsByAttribute(curNode, args[0], args[1]); if (newNodes.length == 0) { throw new IllegalArgumentException("There's no element with attrib is " + str + " under xpath " + getXPath(curNode)); } else if (newNodes.length == 1) { curNode = newNodes[0]; continue; } else { return withFunction(getByXPath(XMLUtils.toNodeList(newNodes), iter), function);// 按照NodeList继续进行计算 } } else if (str.startsWith("/parent:")) { str = StringUtils.stringRight(str.toString(), "parent:", false); Element newNode = XMLUtils.firstParent(curNode, str.toString()); if (newNode == null) { throw new IllegalArgumentException("There's no element with name=" + str + " under xpath " + getXPath(curNode)); } curNode = newNode; continue; } else if (str.startsWith("/next:")) { str = StringUtils.stringRight(str.toString(), "next:", false); Element newNode = XMLUtils.firstSibling(curNode, str.toString()); if (newNode == null) { throw new IllegalArgumentException("There's no element with name=" + str + " under xpath " + getXPath(curNode)); } curNode = newNode; continue; } else if (str.startsWith("/prev:")) { str = StringUtils.stringRight(str.toString(), "prev:", false); Element newNode = XMLUtils.firstPrevSibling(curNode, str.toString()); if (newNode == null) { throw new IllegalArgumentException("There's no element with name=" + str + " under xpath " + getXPath(curNode)); } curNode = newNode; continue; } else if (str.equals("/.")) { continue; } else if (str.equals("/..")) { curNode = (Element) curNode.getParentNode(); continue; } String elementName = null; String index = null; boolean isWild = false; if (str.startsWith("//")) { StringSpliterEx sp = new StringSpliterEx(str.sub(2, str.length())); if (sp.setKeys("[", "]") == StringSpliterEx.RESULT_BOTH_KEY) { elementName = sp.getLeft().toString(); index = sp.getMiddle().toString(); } else { elementName = sp.getSource().toString(); } isWild = true; } else if (str.startsWith("/")) { StringSpliterEx sp = new StringSpliterEx(str.sub(1, str.length())); if (sp.setKeys("[", "]") == StringSpliterEx.RESULT_BOTH_KEY) { elementName = sp.getLeft().toString(); index = sp.getMiddle().toString(); } else { elementName = sp.getSource().toString(); } } else if (str.startsWith("@")) { String attribName = str.sub(1, str.length()).toString(); String value = null; if (attribName.equals("#text")) { value = XMLUtils.nodeText(curNode); } else if (attribName.equals("#alltext")) { value = XMLUtils.nodeText(curNode, true); } else { Element el = null; if (curNode instanceof Document) { el = ((Document) curNode).getDocumentElement(); } else { el = (Element) curNode; } if (str.siblingLeft().equals("//")) { return XMLUtils.attribs(el, attribName); } value = XMLUtils.attrib(el, attribName); } if (value == null) value = ""; if (iter.hasNext()) throw new IllegalArgumentException("Xpath invalid, there's no attributes after."); return withFunction(value, function); // 返回节点内容 } else { StringSpliterEx sp = new StringSpliterEx(str); if (sp.setKeys("[", "]") == StringSpliterEx.RESULT_BOTH_KEY) { elementName = sp.getLeft().toString(); index = sp.getMiddle().toString(); } else { elementName = sp.getSource().toString(); } } NodeList nds = null; int i; if ("?".equals(index)) { i = INDEX_DEFAULT; } else if (index != null && index.lastIndexOf("-") > 0) {// 指定的是一个Index范围 i = INDEX_RANGE; } else { i = StringUtils.toInt(index, 1); } if (StringUtils.isNotEmpty(elementName)) { if (isWild) { nds = XMLUtils.toNodeList(XMLUtils.getElementsByTagNames(curNode, StringUtils.split(elementName, '|'))); } else { nds = XMLUtils.toNodeList(XMLUtils.childElements(curNode, StringUtils.split(elementName, '|'))); } if ((!iter.hasNext() && index == null)) i = INDEX_DEFAULT;// 没有下一个并且没有显式指定序号 if (i == INDEX_DEFAULT) {// && nds.getLength()!=1 return withFunction(getByXPath(nds, iter), function);// 按照NodeList继续进行计算 } else if (i == INDEX_RANGE) { // 指定序号范围 Node[] nArray = XMLUtils.toArray(nds); int x = index.indexOf("--"); if (x < 0) x = index.lastIndexOf('-'); int iS = StringUtils.toInt(index.substring(0, x), 1); if (iS < 0) iS += nds.getLength() + 1; int iE = StringUtils.toInt(index.substring(x + 1), nArray.length); if (iE < 0) iE += nds.getLength() + 1; nds = XMLUtils.toNodeList(ArrayUtils.subArray(nArray, iS - 1, iE)); return withFunction(getByXPath(nds, iter), function);// 按照NodeList继续进行计算 } else if (i < 0) {// 倒数第i个节点 if (nds.getLength() < Math.abs(i)) { if (allowNull) return null; throw new NoSuchElementException("Node not found:" + getXPath(curNode) + " " + str + "the parent nodelist has " + nds.getLength() + " elements, but index is " + i); } else { curNode = (Element) nds.item(nds.getLength() + i); } } else {// 正数第i个节点 if (nds.getLength() < i) { if (allowNull) return null; throw new NoSuchElementException("Node not found:" + getXPath(curNode) + " /" + elementName + " element.[" + str + "] the nodelist has " + nds.getLength() + " elements, but index is " + i); } else { curNode = (Element) nds.item(i - 1); } } } else {// 无视节点名称 } } return withFunction(curNode, function); } /** * 得到指定节点的Xpath * * @param node * @return */ public static String getXPath(Node node) { String path = ""; if (node.getNodeType() == Node.ATTRIBUTE_NODE) { path = "@" + node.getNodeName(); node = ((Attr) node).getOwnerElement(); } while (node != null) { int index = getIndexOfNode(node); String tmp = "/" + ((index > 1) ? node.getNodeName() + "[" + index + "]" : node.getNodeName()); path = tmp + path; node = node.getParentNode(); } return path; } @SuppressWarnings({ "rawtypes" }) private static Object withFunction(Object obj, XPathFunction function) { if (function == null) return obj; if (function == XPathFunction.COUNT) { if (obj instanceof NodeList) { return String.valueOf(((NodeList) obj).getLength()); } else if (obj instanceof NamedNodeMap) { return String.valueOf(((NamedNodeMap) obj).getLength()); } else if (obj instanceof List) { return String.valueOf(((List) obj).size()); } else { throw new IllegalArgumentException(); } } else if (function == XPathFunction.PLAIN) { return toPlainText(obj, true); } else if (function == XPathFunction.CHILDRENPLAIN) { return toPlainText(obj, false); } else if (function == XPathFunction.TEXT) { if (obj instanceof Node) { return htmlNodeToString((Node) obj, true); } else if (obj instanceof NodeList) { StringBuilder sb = new StringBuilder(); Node[] list = XMLUtils.toArray((NodeList) obj); for (int i = 0; i < list.length; i++) { Node node = list[i]; if (i > 0) sb.append("\n"); sb.append(htmlNodeToString(node, true)); } return sb.toString(); } else if (obj instanceof List) { StringBuilder sb = new StringBuilder(); for (Object o : (List) obj) { if (sb.length() > 0) sb.append(","); sb.append(o.toString()); } return sb.toString(); } else { return obj.toString(); } } return obj; } @SuppressWarnings("rawtypes") private static String toPlainText(Object obj, boolean includeMe) { if (obj instanceof Node) { if (includeMe) { return XMLUtils.toString((Node) obj); } else { StringBuilder sb = new StringBuilder(); for (Node node : XMLUtils.toArray(((Node) obj).getChildNodes())) { sb.append(XMLUtils.toString(node)); } return sb.toString(); } } else if (obj instanceof NodeList) { StringBuilder sb = new StringBuilder(); for (Node node : XMLUtils.toArray((NodeList) obj)) { sb.append(toPlainText(node, includeMe)); } return sb.toString(); } else if (obj instanceof List) { StringBuilder sb = new StringBuilder(); for (Object o : (List) obj) { if (sb.length() > 0) sb.append(","); sb.append(o.toString()); } return sb.toString(); } else { return obj.toString(); } } // 2号解析函数,处理节点集的下一步运算,合并结果集 @SuppressWarnings("unchecked") private static Object getByXPath(NodeList nds, Iterator<Substring> iter) { for (; iter.hasNext();) { List<Node> nlist = new ArrayList<Node>(); List<String> slist = new ArrayList<String>(); Tee type = null; String xpath = iter.next().toString(); if (xpath.indexOf(':') > -1) {// 如果是函数,就将剩下的字串全部交给1号解析函数处理 for (; iter.hasNext();) { xpath += iter.next(); } } if (StringUtils.isEmpty(xpath)) continue; for (Node node : XMLUtils.toArray(nds)) { if (node.getNodeType() != Node.ELEMENT_NODE) { throw new UnsupportedOperationException("Unsupport node type:" + node.getNodeType()); } Object obj = getByXPath(node, xpath, true); if (obj == null) { // skip it; } else if (obj instanceof List) { if (type == null) { type = Tee.StringList; } else if (type != Tee.StringList) { throw new UnsupportedOperationException(); } slist.addAll((List<String>) obj); } else if (obj instanceof Node) { if (type == null) { type = Tee.Node; } else if (type != Tee.Node) { throw new UnsupportedOperationException("old type is " + type.name()); } nlist.add((Node) obj); } else if (obj instanceof NodeList) { if (type == null) { type = Tee.NodeList; } else if (type != Tee.NodeList) { throw new UnsupportedOperationException(); } nlist.addAll(XMLUtils.toList((NodeList) obj)); } else if (obj instanceof String) { if (type == null) { type = Tee.String; } else if (type != Tee.String) { throw new UnsupportedOperationException(); } slist.add((String) obj); } else { throw new UnsupportedOperationException(); } } if (type == Tee.String || type == Tee.StringList) { return slist; } else if (type == Tee.Node || type == Tee.NodeList) { nds = XMLUtils.toNodeList(nlist); } } return nds; } private static enum Tee { StringList, NodeList, Node, String } /** * JEF Enhanced Xpath <li>// 表示当前节点下多层</li> <li>/ 当前节点下一层</li> <li>../ 上一层</li> * <li>@ 取属性</li> <li>@#text 取节点文本</li> <li>| 允许选择多个不同名称的节点,用|分隔</li> <li> * [n] 选择器:返回第n个</li> <li>[-2] 选择器:倒数第二个</li> <li>[2--2] 选择器:从第2个到倒数第2个</li> * <li>[?] 选择器:返回所有</li> <li>/count: 函数,用于计算节点的数量</li> <li>/plain: * 函数,获得节点内下属结点转换而成文本(含节点本身)</li> <li>/childrenplain: * 函数,节点本身和下属结点转换而成文本(不含节点本身)</li> <li>/text: * 函数,节点下的所有文本节点输出,如果碰到HTML标签作一定的处理</li> <li>/find:<code>str</code> * 函数,自动查找id=str的节点</li> <li>/findby:<code>name:value</code> * 函数,自动查找属性名城和属性值匹配的节点</li> <li>/parent:<code>str</code> * 函数,向上级查找节点指定名称的父节点,如不指定,则等效于../</li> <li>/parent:<code>str</code> * 函数,向上级查找节点指定名称的父节点,如不指定,则等效于../</li> <li>/next:<code>str</code> 函数, * 在平级向后查找指定名称的兄弟节点,如不指定,则取后第一个兄弟节点</li> <li>/prev:<code>str</code> 函数, * 在平级向前查找指定名称的兄弟节点,如不指定,则取前第一个兄弟节点</li> */ public static enum XPathFunction { COUNT, // 用于计算节点的数量 PLAIN, // 用于获得节点内下属结点转换而成文本(含节点本身) CHILDRENPLAIN, // 用于获得节点本身和下属结点转换而成文本(不含节点本身) TEXT, // 将节点下的所有文本节点输出,如果碰到HTML标签作一定的处理 FIND, FINDBY, PARENT, NEXT, PREV } /** * W3C Xpath的语法规范功能很强,但是性能不佳,许多时候我只需要用一个表达式定位节点即可,不需要复杂考虑的逻辑运算、命名空间等问题。 * JEF在xpath规范的基础上,从新提炼了一个简化版的Xpath规则,参见:{@link XPathFunction} * * @param node * @param xPath * 简易版Xpath表达式 * @return 计算后的NodeList */ public static NodeList getNodeListByXPath(Node node, String xPath) { Object re = getByXPath(node, xPath, false); if (re instanceof NodeList) return (NodeList) re; throw new IllegalArgumentException("Can not return NodeList, the result type of xpath[" + xPath + "] is " + re.getClass().getSimpleName()); } /** * W3C Xpath的语法规范功能很强,但是性能不佳,许多时候我只需要用一个表达式定位节点即可,不需要复杂考虑的逻辑运算、命名空间等问题。 * JEF在xpath规范的基础上,从新提炼了一个简化版的Xpath规则,参见:{@link XPathFunction} * * 注意,这个方法的目的是在文档中定位,当运算中间结果出现多个元素时,会自动取第一个元素而不抛出异常。 下面提供了三个方法 * getAttributeByXPath / getNodeByXPath / getNodeListByXPath 用于快速计算XPath表达式。 * * @param node * @param xPath * 简易版Xpath表达式 * @return 计算后的节点 */ public static Node getNodeByXPath(Node node, String xPath) { try { Object re = getByXPath(node, xPath, false); if (re instanceof Node) return (Node) re; if (re instanceof NodeList) { NodeList l = ((NodeList) re); if (l.getLength() == 0) return null; return l.item(0); } throw new IllegalArgumentException("Can not return node, Xpath [" + xPath + "] result is a " + re.getClass().getSimpleName()); } catch (NullPointerException e) { try { File file = new File("c:/dump" + StringUtils.getTimeStamp() + ".xml"); FileOutputStream out = new FileOutputStream(file); XMLUtils.printNode(node, out); out.write(("======\nXPATH:" + xPath).getBytes()); LogUtil.show("Xpath error, dump file is:" + file.getAbsolutePath()); IOUtils.closeQuietly(out); } catch (Exception e1) { LogUtil.exception(e1); } throw new IllegalArgumentException(e); } } private static final String[] XPATH_KEYS = { "//", "@", "/" }; /** * W3C Xpath的语法规范功能很强,但是性能不佳,许多时候我只需要用一个表达式定位节点即可,不需要复杂考虑的逻辑运算、命名空间等问题。 * JEF在xpath规范的基础上,从新提炼了一个简化版的Xpath规则,参见:{@link XPathFunction} * * <p> * 提供了若干方法用于快速计算XPath表达式。 * </p> * * @param node * @param xPath * jef-xpath表达式 * @return 计算后的属性值,多值文本列表 */ @SuppressWarnings("unchecked") public static String[] getAttributesByXPath(Node node, String xPath) { Object re = getByXPath(node, xPath, false); if (re instanceof String) return new String[] { (String) re }; if (re instanceof List) return ((List<String>) re).toArray(ArrayUtils.EMPTY_STRING_ARRAY); if (re instanceof NodeList) { if (((NodeList) re).getLength() == 0) { return ArrayUtils.EMPTY_STRING_ARRAY; } } throw new IllegalArgumentException("Can not return Attribute, Xpath expression[" + xPath + "] result is a " + re.getClass().getSimpleName()); } /** * 将一个HTML节点内转换成格式文本 * * @param node * @param keepenter * 是否保留原来文字当中的换行符 * @return */ public static String htmlNodeToString(Node node, boolean... keepenter) { boolean keepEnter = (keepenter.length == 0 || keepenter[0] == true); if (node.getNodeType() == Node.TEXT_NODE) { if (keepEnter) { return node.getTextContent(); } else { String str = node.getTextContent(); str = StringUtils.remove(str, '\t'); str = StringUtils.remove(str, '\n'); return str; } } else { StringBuilder sb = new StringBuilder(); if ("BR".equals(node.getNodeName()) || "TR".equals(node.getNodeName()) || "P".equals(node.getNodeName())) { // if (keepEnter) { sb.append("\n"); // } } else if ("TD".equals(node.getNodeName())) { // if (keepEnter) { sb.append("\t"); // } } else if ("IMG".equals(node.getNodeName())) { sb.append("[img]").append(XMLUtils.attrib((Element) node, "src")).append("[img]"); } for (Node child : XMLUtils.toArray(node.getChildNodes())) { sb.append(htmlNodeToString(child, keepenter)); } return sb.toString(); } } private static int getIndexOfNode(Node node) { if (node.getParentNode() == null) return 0; int count = 0; for (Node e : XMLUtils.toArray(node.getParentNode().getChildNodes())) { if (e.getNodeName().equals(node.getNodeName())) { count++; if (e == node) return count; } } throw new RuntimeException("Cann't locate the node's index of its parent."); } /** * W3C Xpath的语法规范功能很强,但是性能不佳,许多时候我只需要用一个表达式定位节点即可,不需要复杂考虑的逻辑运算、命名空间等问题。 * JEF在xpath规范的基础上,从新提炼了一个简化版的Xpath规则,参见:{@link XPathFunction} * */ public static String getAttributeByXPath(Node node, String xPath) { String[] re = getAttributesByXPath(node, xPath); if (re.length > 0) return re[0]; throw new IllegalArgumentException("No proper attribute matchs. can not return Attribute."); } public static void setAttributeByXPath(Node node, String xPath, String value) { int i = xPath.lastIndexOf('@'); if (i < 0) throw new IllegalArgumentException("there is no @ in your xpath."); String left = xPath.substring(0, i); String right = xPath.substring(i + 1); Node n = getNodeByXPath(node, left); if (n instanceof Element) { if ("#text".equals(right)) { XMLUtils.setText(n, value); } else { ((Element) n).setAttribute(right, value); } } else { throw new IllegalArgumentException("node at " + left + " is not a element!"); } } /** * 根据xpath设置若干属性的值 * * @param node * @param xPath * @param attribute * @param isSubNode */ public static void setAttributeByXpath(Node node, String xPath, Map<String, Object> attribute, boolean isSubNode) { int i = xPath.lastIndexOf('@'); if (i >= 0) throw new IllegalArgumentException("there is @ in your xpath."); Node n = getNodeByXPath(node, xPath); if (n instanceof Element) { XMLUtils.setAttributesByMap((Element) n, attribute, isSubNode); } else { throw new IllegalArgumentException("node at " + xPath + " is not a element!"); } } final static class NodeListIterable implements Iterable<Node> { private int n; private int len; private NodeList nds; NodeListIterable(NodeList nds) { this.nds = nds; this.len = nds.getLength(); } public Iterator<Node> iterator() { return new Iterator<Node>() { public boolean hasNext() { return n < len; } public Node next() { return nds.item(n++); } public void remove() { throw new UnsupportedOperationException(); } }; } } }