// QueryHtml.java example
//
// Explorer
// uml-auto-assessment-master
// - web-cat-src
package student.web.internal.tests;
 /**
 * Demo of screenscraping using TagSoup and XPATH as described at
 * http://blog.oroup.com/2006/11/05/the-joys-of-screenscraping/
 *
 * This example class downloads the content of a page from Google
 * Finance and parses it for the Google stock price. It completely
 * omits all error handling for brevity. Also a lot of objects
 * should be cached and re-used if you were really going to call
 * this multiple times.
 *
 * @author Oliver Roup <oroup@oroup.com>
 */

import java.io.InputStream;
import java.io.StringWriter;
import java.lang.ref.SoftReference;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMResult;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.ccil.cowan.tagsoup.Parser;
import org.w3c.dom.Attr;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import student.web.HtmlElement;
import student.web.HtmlHeadingElement;
import student.web.internal.MutableNamespaceContext;

public class QueryHtml
{
    // ----------------------------------------------------------
    /**
     * Demo driver: loads a fixed page, then runs a series of sample XPath
     * queries against it, printing the first match and then every match
     * for each query.
     *
     * @param args Command-line arguments (not used)
     * @throws Exception if loading the page or evaluating a query fails
     */
    public static void main(String[] args)
        throws Exception
    {
        QueryHtml engine = new QueryHtml();

        // Fetch the page and coerce it into an XML DOM.  The entire document
        // is held in memory, so very large pages should be trimmed first
        // (for example with SAX).
        Node document = engine.getHtmlUrlNode("http://localhost/mypage.html");

        // Sample expressions; the "html" prefix is bound to the XHTML
        // namespace in the QueryHtml constructor.
        final String[] queries = {
            "//html:a",
            "//html:h1",
            "//html:li",
            "//html:li//html:a",
            "//html:li/html:a",
            "//html:p//html:a",
            "//html:img",

            // Every img or p element
            "//html:img|//html:p",
            // Every heading or anchor element
            "//html:h1|//html:h2|//html:h3|//html:h4|//html:h5|//html:h6"
            + "|//html:a",
            // Elements selected by id
            "//*[@id='myid']",
            // Elements selected by class
            "//*[@class='green']"
        };

        for (int i = 0; i < queries.length; i++)
        {
            String expression = queries[i];

            System.out.println("--------------------");
            System.out.println("searching for first: " + expression);
            dumpTagDetails(engine.xPathQueryFirst(document, expression));

            System.out.println();
            System.out.println("searching for all: " + expression);
            List<HtmlElement> matches =
                engine.xPathQueryAll(document, expression);
            System.out.println(
                matches.isEmpty() ? "No matches found." : "results:");

            // Dump every match (nothing is printed for an empty list).
            for (HtmlElement match : matches)
            {
                dumpTagDetails(match);
            }
            System.out.println();
        }
    }


    // ----------------------------------------------------------
    /**
     * Prints an element to standard output: first the element itself, then
     * (when it is not null) its type, its text, and each attribute name
     * with its value.
     *
     * @param tag The element to print; null prints just "null"
     */
    public static void dumpTagDetails(HtmlElement tag)
    {
        // Printing the reference first makes a missing match show as "null".
        System.out.println(tag);
        if (tag != null)
        {
            System.out.println("    type = " + tag.getType());
            System.out.println("    text = " + tag.getText());
            System.out.println("    attributes:");

            Iterator<String> names = tag.getAttributes().iterator();
            while (names.hasNext())
            {
                String name = names.next();
                String value = tag.getAttributeValue(name);
                System.out.println("        " + name + " => " + value);
            }
        }
    }


    // ----------------------------------------------------------
    public QueryHtml()
    {
        // Create a mutable namespace context. This should really be provided
        // by the JDK, but the default implementation does not allow new
        // entries to be added.
        nc = new MutableNamespaceContext();

        // Set the prefix "html" to correspond to the xhtml namespace.
        // This can be called multiple times with different prefixes.
        nc.setNamespace("html", "http://www.w3.org/1999/xhtml");
//        nc.setNamespace("", "http://www.w3.org/1999/xhtml");
        xpath.setNamespaceContext(nc);

        try
        {
//            parser.setFeature(
//                "http://xml.org/sax/features/namespace-prefixes",true);
            xformer = TransformerFactory.newInstance().newTransformer();
            xformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }


    // ----------------------------------------------------------
    /**
     * @param urlString The URL of the page to retrieve
     * @return A Node with a well formed XML doc coerced from the page.
     * @throws Exception if something goes wrong. No error handling at all
     * for brevity.
     */
    public Node getHtmlUrlNode(String urlString)
        throws Exception
    {
        TransformerHandler th = stf.newTransformerHandler();

        // This DOM result receives the output of the transformation.
        DOMResult dr = new DOMResult();
        th.setResult(dr);

        // Route the parser's SAX events into the transformer handler.
        parser.setContentHandler(th);

        URL url = new URL(urlString);
        URLConnection urlConn = url.openConnection();
        InputStream stream = urlConn.getInputStream();
        try
        {
            // This is where the magic happens to convert HTML to XML.
            parser.parse(new InputSource(stream));
        }
        finally
        {
            // Always release the connection's stream, even when parsing
            // fails part-way through.
            stream.close();
        }

        return dr.getNode();
    }


    // ----------------------------------------------------------
    /**
     * Wraps a DOM node in the matching element wrapper: a heading wrapper
     * when the node name is "h" or "H" followed by a digit 1-6, and a plain
     * wrapper otherwise.
     *
     * @param node The node to wrap
     * @return A wrapper around the node that shares this object's
     *         transformer
     */
    private HtmlElement tagForNode(Node node)
    {
        String name = node.getNodeName();

        boolean isHeading = false;
        if (name != null && name.length() == 2)
        {
            char letter = name.charAt(0);
            char digit = name.charAt(1);
            isHeading = (letter == 'h' || letter == 'H')
                && digit >= '1'
                && digit <= '6';
        }

        if (isHeading)
        {
            return new HtmlHeadingNodeTag(node, xformer);
        }
        return new HtmlNodeTag(node, xformer);
    }


    // ----------------------------------------------------------
    /**
     * Evaluates an XPath query and returns only the first matching node.
     *
     * @param node An XML DOM tree to query
     * @param query An XPath query to run against the DOM tree
     * @return A wrapper for the first node in the result, or null when the
     *         query selects nothing
     * @throws Exception if the query cannot be evaluated
     */
    public HtmlElement xPathQueryFirst(Node node, String query)
        throws Exception
    {
        Object evaluated =
            xpath.evaluate(query, node, XPathConstants.NODESET);
        NodeList matches = (NodeList)evaluated;

        if (matches != null && matches.getLength() != 0)
        {
            return tagForNode(matches.item(0));
        }
        return null;
    }


    // ----------------------------------------------------------
    /**
     * Evaluates an XPath query and wraps every matching node.
     *
     * @param node An XML DOM tree to query
     * @param query An XPath query to run against the DOM tree; prefixes are
     *        resolved through the namespace context installed on this
     *        object's XPath engine
     * @return A list of wrappers, one per node selected by the query, in
     *         the order the result set reports them; empty (never null)
     *         when nothing matches
     * @throws Exception if the query cannot be evaluated
     */
    public List<HtmlElement> xPathQueryAll(Node node, String query)
        throws Exception
    {
        NodeList nl =
            (NodeList)xpath.evaluate(query, node, XPathConstants.NODESET);

        // Guard against a null result set, as xPathQueryFirst() does.
        int count = (nl == null) ? 0 : nl.getLength();

        List<HtmlElement> result = new ArrayList<HtmlElement>(count);
        for (int i = 0; i < count; i++)
        {
            result.add(tagForNode(nl.item(i)));
        }
        return result;
    }


    //~ private classes .......................................................

    // ----------------------------------------------------------
    /**
     * Iterates over the names of the attributes in a DOM attribute map.
     * The object is both an Iterator (a single pass over the names) and an
     * Iterable (each call to iterator() starts a new pass), so it can be
     * used in a for-each loop more than once.  A null map is treated as an
     * empty one.
     */
    private static class AttributeIterator
        implements Iterator<String>, Iterable<String>
    {
        // ----------------------------------------------------------
        /**
         * @param map The attributes to walk; may be null, which is how the
         *        DOM reports "no attributes" for nodes that are not elements
         */
        public AttributeIterator(NamedNodeMap map)
        {
            inner = map;
            pos = 0;
        }


        // ----------------------------------------------------------
        /**
         * @return true while at least one attribute name is left
         */
        public boolean hasNext()
        {
            return inner != null && pos < inner.getLength();
        }


        // ----------------------------------------------------------
        /**
         * @return The name of the next attribute
         * @throws java.util.NoSuchElementException if no attribute is left
         */
        public String next()
        {
            if (!hasNext())
            {
                // Honor the Iterator contract rather than failing with a
                // NullPointerException on the missing item.
                throw new java.util.NoSuchElementException();
            }
            Attr attr = (Attr)inner.item(pos);
            pos++;
            return attr.getName();
        }


        // ----------------------------------------------------------
        /**
         * Removal is not supported.
         *
         * @throws UnsupportedOperationException always
         */
        public void remove()
        {
            throw new UnsupportedOperationException();
        }


        // ----------------------------------------------------------
        /**
         * @return A new iterator positioned at the first attribute, so that
         *         repeated traversals each see every name
         */
        public Iterator<String> iterator()
        {
            return new AttributeIterator(inner);
        }


        //~ Instance/static variables .........................................
        private final NamedNodeMap inner;
        private int pos;
    }


    // ----------------------------------------------------------
    /**
     * Element wrapper for heading nodes; adds the heading level, read from
     * the second character of the node name.
     */
    private static class HtmlHeadingNodeTag
        extends HtmlNodeTag
        implements HtmlHeadingElement
    {
        // ----------------------------------------------------------
        /**
         * @param node The heading node to wrap
         * @param transformer The transformer used to serialize the node
         */
        public HtmlHeadingNodeTag(Node node, Transformer transformer)
        {
            super(node, transformer);
        }


        // ----------------------------------------------------------
        /**
         * @return The digit that follows the first character of the node
         *         name, as a number; computed on first use and then cached
         */
        public int getHeadingLevel()
        {
            if (level != 0)
            {
                return level;
            }
            level = getType().charAt(1) - '0';
            return level;
        }


        //~ Instance/static variables .........................................
        // Zero means "not computed yet".
        private int level = 0;
    }


    // ----------------------------------------------------------
    /**
     * Element wrapper around a DOM node.  The node's serialized markup and
     * its inner markup are produced on demand with the supplied transformer
     * and cached through soft references.  Attribute accessors tolerate
     * nodes that have no attribute map (the DOM returns null for anything
     * that is not an element).
     */
    private static class HtmlNodeTag
        implements HtmlElement
    {
        // ----------------------------------------------------------
        /**
         * @param node The node to wrap
         * @param transformer The transformer used to serialize the node
         */
        public HtmlNodeTag(Node node, Transformer transformer)
        {
            inner = node;
            xformer = transformer;
        }


        // ----------------------------------------------------------
        /**
         * @return The wrapped node's name
         */
        public String getType()
        {
            return inner.getNodeName();
        }


        // ----------------------------------------------------------
        /**
         * @return The inner markup with every tag removed, or null when
         *         getInnerHTML() returns null
         */
        public String getText()
        {
            String result = getInnerHTML();
            if (result != null)
            {
                Matcher m = INNER_TAG_TRIMMER.matcher(result);
                result = m.replaceAll("");
            }
            return result;
        }


        // ----------------------------------------------------------
        /**
         * @return The markup between the node's opening and closing tags,
         *         or null when the node could not be serialized or its
         *         serialized form does not match TAG_TRIMMER
         */
        public String getInnerHTML()
        {
            // A previous call already established there is no inner markup.
            if (nodeChildrenAsTextIsNull)
            {
                return null;
            }

            String result = nodeChildrenAsText == null
                ? null
                : nodeChildrenAsText.get();
            if (result == null)
            {
                // Not cached (or the cache was cleared): derive it from the
                // full serialized form.
                result = toString();
                if (result != null)
                {
                    Matcher m = TAG_TRIMMER.matcher(result);
                    if (m.find())
                    {
                        result = m.group(1);
                        nodeChildrenAsText = new SoftReference<String>(result);
                    }
                    else
                    {
                        result = null;
                        nodeChildrenAsTextIsNull = true;
                    }
                }
            }
            return result;
        }


        // ----------------------------------------------------------
        /**
         * @param attributeName The attribute to look for
         * @return true if the node has an attribute with that name; false
         *         otherwise, including when the node has no attribute map
         */
        public boolean hasAttribute(String attributeName)
        {
            NamedNodeMap attributes = inner.getAttributes();
            return attributes != null
                && attributes.getNamedItem(attributeName) != null;
        }


        // ----------------------------------------------------------
        /**
         * @param attributeName The attribute to read
         * @return The attribute's value, or null if the node has no such
         *         attribute or no attribute map at all
         */
        public String getAttributeValue(String attributeName)
        {
            NamedNodeMap attributes = inner.getAttributes();
            if (attributes == null)
            {
                return null;
            }
            Attr attr = (Attr)attributes.getNamedItem(attributeName);
            return attr == null
                ? null
                : attr.getNodeValue();
        }


        // ----------------------------------------------------------
        /**
         * @return The names of the node's attributes; empty when the node
         *         has no attribute map
         */
        public Iterable<String> getAttributes()
        {
            NamedNodeMap attributes = inner.getAttributes();
            if (attributes == null)
            {
                return new ArrayList<String>();
            }
            return new AttributeIterator(attributes);
        }


        // ----------------------------------------------------------
        /**
         * @return The node serialized as markup.  If serialization fails
         *         the exception is printed and the result is null.
         */
        public String toString()
        {
            String result = nodeAsText == null ? null : nodeAsText.get();
            if (result == null)
            {
                try
                {
                    result = dumpNode(inner);
                    nodeAsText = new SoftReference<String>(result);
                }
                catch (Exception e)
                {
                    e.printStackTrace();
                }
            }
            return result;
        }


        // ----------------------------------------------------------
        /**
         * @param node A node to be dumped to a string
         * @return A string representation of the node.
         * @throws Exception If anything goes wrong. Error handling omitted.
         */
        private String dumpNode(Node node)
            throws Exception
        {
            StringWriter sw = new StringWriter();
            Result result = new StreamResult(sw);
            Source source = new DOMSource(node);
            xformer.transform(source, result);
            return sw.toString();
        }


        //~ Instance/static variables .........................................
        private final Transformer xformer;
        private final Node inner;
        // Cached serialized form of the node.
        private SoftReference<String> nodeAsText;
        // Cached markup between the node's outermost tags.
        private SoftReference<String> nodeChildrenAsText;
        // Set once the serialized form is known not to match TAG_TRIMMER.
        private boolean nodeChildrenAsTextIsNull;
    }


    //~ Instance/static variables .............................................

    // Maps XPath prefixes to namespaces; populated in the constructor.
    private MutableNamespaceContext nc;
    // Creates the handlers that turn the parser's SAX events into a DOM.
    private SAXTransformerFactory stf =
        (SAXTransformerFactory)TransformerFactory.newInstance();
    // TagSoup parser used to read HTML pages.
    private Parser parser = new Parser();
    private XPathFactory xpf = XPathFactory.newInstance();
    // Evaluates the XPath queries; uses nc to resolve prefixes.
    private XPath xpath = xpf.newXPath();
    // Serializes nodes to text; handed to every element wrapper.
    private Transformer xformer;
    // Captures everything between an element's outermost opening and
    // closing tags.  DOTALL lets "." span line terminators, so elements
    // whose markup covers several lines are matched as well.
    private static final Pattern TAG_TRIMMER =
        Pattern.compile("^<[^>]*>(.*)</[^>]*>$", Pattern.DOTALL);
    // Matches any single tag, so that tags can be stripped from markup.
    private static final Pattern INNER_TAG_TRIMMER =
        Pattern.compile("<[^>]*>");
}