/* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.riotfamily.search.index.html; import org.htmlparser.NodeFilter; import org.htmlparser.Tag; import org.htmlparser.beans.StringBean; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.nodes.TagNode; import org.htmlparser.tags.MetaTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class HtmlParserUtils { /** * Extracts all text nodes from the given NodeList that are accepted by * the specified filter. */ public static String extractText(NodeList nodeList, NodeFilter filter) { return toText(nodeList.extractAllNodesThatMatch(filter, true)); } /** * Extracts all text nodes from the given NodeList. Non-breaking spaces * are replaced by normal space characters. Subsequent whitespace characters * are collapsed to a single character. */ public static String toText(NodeList nodeList) { try { StringBean toStringVisitor = new StringBean(); nodeList.visitAllNodesWith(toStringVisitor); return toStringVisitor.getStrings(); } catch (ParserException e) { return null; } } /** * Returns the specified attribute value of the first node that is accepted * by the given filter. */ public static String extractAttribute(NodeList nodeList, NodeFilter filter, String attributeName) throws ParserException { NodeList matches = nodeList.extractAllNodesThatMatch(filter, true); if (matches.size() > 0) { return ((TagNode) matches.elementAt(0)).getAttribute(attributeName); } return null; } /** * Returns the <code>content</code> attribute of the first <code>META</code> * tag with the specified name. */ public static String getMeta(NodeList nodeList, String name) { return getMeta(nodeList, name, false); } /** * Returns the content of the first <code>META</code> tag with the * specified <code>http-eqiv</code> attribute. */ public static String getHttpEquiv(NodeList nodeList, String name) { return getMeta(nodeList, name, true); } private static String getMeta(NodeList nodeList, String name, boolean httpEquiv) { NodeFilter filter = new AndFilter( new NodeClassFilter(MetaTag.class), new AttributeNodeFilter(httpEquiv ? "http-equiv" : "name", name)); NodeList nodes = nodeList.extractAllNodesThatMatch(filter, true); if (nodes.size() > 0) { Tag tag = (Tag) nodes.elementAt(0); return tag.getAttribute("content"); } return null; } }