// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2004 Rogers George // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/filters/CssSelectorNodeFilter.java,v $ // $Author: derrickoswald $ // $Date: 2005/05/15 11:49:04 $ // $Revision: 1.6 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.filters; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Tag; import org.htmlparser.util.NodeList; /** * A NodeFilter that accepts nodes based on whether they match a CSS2 selector. * Refer to <a href="http://www.w3.org/TR/REC-CSS2/selector.html"> * http://www.w3.org/TR/REC-CSS2/selector.html</a> for syntax. * <p> * Todo: more thorough testing, any relevant pseudo-classes, css3 features */ public class CssSelectorNodeFilter implements NodeFilter { /** * Regular expression to split the selector into tokens. */ private static Pattern tokens = Pattern.compile("(" + "/\\*.*?\\*/" // comments + ") | (" + " \".*?[^\"]\"" // double quoted string + " | \'.*?[^\']\'" // single quoted string + " | \"\" | \'\' " // empty quoted string + ") | (" + " [\\~\\*\\$\\^]? = " // attrib-val relations + ") | (" + " [a-zA-Z_\\*](?:[a-zA-Z0-9_-]|\\\\.)* " // bare name + ") | \\s*(" + " [+>~\\s] " // combinators + ")\\s* | (" + " [\\.\\[\\]\\#\\:)(] " // class/id/attr/param delims + ") | (" + " [\\,] " // comma + ") | ( . )" // everything else (bogus) , Pattern.CASE_INSENSITIVE | Pattern.DOTALL | Pattern.COMMENTS); /** * Comment token type. */ private static final int COMMENT = 1; /** * quoted string token type. */ private static final int QUOTEDSTRING = 2; /** * Relation token type. */ private static final int RELATION = 3; /** * Name token type. */ private static final int NAME = 4; /** * Combinator token type. */ private static final int COMBINATOR = 5; /** * Delimiter token type. */ private static final int DELIM = 6; /** * Comma token type. */ private static final int COMMA = 7; private NodeFilter therule; private Matcher m = null; private int tokentype = 0; private String token = null; /** * Create a Cascading Style Sheet node filter. * @param selector The selector expression. */ public CssSelectorNodeFilter(String selector) { m = tokens.matcher (selector); if (nextToken ()) therule = parse (); } /** * Accept nodes that match the selector expression. * @param node The node to check. * @return <code>true</code> if the node matches, * <code>false</code> otherwise. */ public boolean accept (Node node) { return (therule.accept (node)); } private boolean nextToken () { if (m != null && m.find ()) for (int i = 1; i < m.groupCount (); i++) if (null != m.group (i)) { tokentype = i; token = m.group (i); return true; } tokentype = 0; token = null; return (false); } private NodeFilter parse () { NodeFilter ret; ret = null; do { switch (tokentype) { case COMMENT: case NAME: case DELIM: if (ret == null) ret = parseSimple (); else ret = new AndFilter (ret, parseSimple ()); break; case COMBINATOR: switch (token.charAt (0)) { case '+': ret = new AdjacentFilter (ret); break; case '>': ret = new HasParentFilter (ret); break; default: // whitespace ret = new HasAncestorFilter (ret); } nextToken (); break; case COMMA: ret = new OrFilter (ret, parse ()); nextToken (); break; } } while (token != null); return (ret); } private NodeFilter parseSimple() { boolean done = false; NodeFilter ret = null; if (token != null) do { switch (tokentype) { case COMMENT: nextToken(); break; case NAME: if ("*".equals(token)) ret = new YesFilter(); else if (ret == null) ret = new TagNameFilter(unescape(token)); else ret = new AndFilter(ret, new TagNameFilter(unescape(token))); nextToken(); break; case DELIM: switch (token.charAt(0)) { case '.': nextToken(); if (tokentype != NAME) throw new IllegalArgumentException("Syntax error at " + token); if (ret == null) ret = new HasAttributeFilter("class", unescape(token)); else ret = new AndFilter(ret, new HasAttributeFilter("class", unescape(token))); break; case '#': nextToken(); if (tokentype != NAME) throw new IllegalArgumentException("Syntax error at " + token); if (ret == null) ret = new HasAttributeFilter("id", unescape(token)); else ret = new AndFilter(ret, new HasAttributeFilter("id", unescape(token))); break; case ':': nextToken(); if (ret == null) ret = parsePseudoClass(); else ret = new AndFilter(ret, parsePseudoClass()); break; case '[': nextToken(); if (ret == null) ret = parseAttributeExp(); else ret = new AndFilter(ret, parseAttributeExp()); break; } nextToken(); break; default: done = true; } } while (!done && token != null); return ret; } private NodeFilter parsePseudoClass() { throw new IllegalArgumentException("pseudoclasses not implemented yet"); } private NodeFilter parseAttributeExp() { NodeFilter ret = null; if (tokentype == NAME) { String attrib = token; nextToken(); if ("]".equals(token)) ret = new HasAttributeFilter(unescape(attrib)); else if (tokentype == RELATION) { String val = null, rel = token; nextToken(); if (tokentype == QUOTEDSTRING) val = unescape(token.substring(1, token.length() - 1)); else if (tokentype == NAME) val = unescape(token); if ("~=".equals(rel) && val != null) ret = new AttribMatchFilter(unescape(attrib), "\\b" + val.replaceAll("([^a-zA-Z0-9])", "\\\\$1") + "\\b"); else if ("=".equals(rel) && val != null) ret = new HasAttributeFilter(attrib, val); } } if (ret == null) throw new IllegalArgumentException("Syntax error at " + token + tokentype); nextToken(); return ret; } /** * Replace escape sequences in a string. * @param escaped The string to examine. * @return The argument with escape sequences replaced by their * equivalent character. */ public static String unescape(String escaped) { StringBuffer result = new StringBuffer(escaped.length()); Matcher m = Pattern.compile("\\\\(?:([a-fA-F0-9]{2,6})|(.))").matcher( escaped); while (m.find()) { if (m.group(1) != null) m.appendReplacement(result, String.valueOf((char)Integer.parseInt(m.group(1), 16))); else if (m.group(2) != null) m.appendReplacement(result, m.group(2)); } m.appendTail(result); return result.toString(); } private static class HasAncestorFilter implements NodeFilter { private NodeFilter atest; public HasAncestorFilter(NodeFilter n) { atest = n; } public boolean accept(Node n) { while (n != null) { n = n.getParent(); if (atest.accept(n)) return true; } return false; } } private static class AdjacentFilter implements NodeFilter { private NodeFilter sibtest; public AdjacentFilter(NodeFilter n) { sibtest = n; } public boolean accept(Node n) { if (n.getParent() != null) { NodeList l = n.getParent().getChildren(); for (int i = 0; i < l.size(); i++) if (l.elementAt(i) == n && i > 0) return (sibtest.accept(l.elementAt(i - 1))); } return false; } } private static class YesFilter implements NodeFilter { public boolean accept(Node n) {return true;} } private static class AttribMatchFilter implements NodeFilter { private Pattern rel; private String attrib; public AttribMatchFilter(String attrib, String regex) { rel = Pattern.compile(regex); this.attrib = attrib; } public boolean accept(Node node) { if (node instanceof Tag && ((Tag)node).getAttribute(attrib) != null) if (rel != null && !rel.matcher(((Tag)node).getAttribute(attrib)).find()) return false; else return true; else return false; } } }