TurboWebBot.java example

Explorer
uml-auto-assessment-master
- web-cat-src
/*==========================================================================*\
 |  $Id: TurboWebBot.java,v 1.3 2010/02/23 17:06:36 stedwar2 Exp $
 |*-------------------------------------------------------------------------*|
 |  Copyright (C) 2007-2010 Virginia Tech
 |
 |  This file is part of the Student-Library.
 |
 |  The Student-Library is free software; you can redistribute it and/or
 |  modify it under the terms of the GNU Lesser General Public License as
 |  published by the Free Software Foundation; either version 3 of the
 |  License, or (at your option) any later version.
 |
 |  The Student-Library is distributed in the hope that it will be useful,
 |  but WITHOUT ANY WARRANTY; without even the implied warranty of
 |  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 |  GNU Lesser General Public License for more details.
 |
 |  You should have received a copy of the GNU Lesser General Public License
 |  along with the Student-Library; if not, see <http://www.gnu.org/licenses/>.
\*==========================================================================*/

package student.web;

import java.io.File;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.List;
import java.util.regex.Pattern;

//-------------------------------------------------------------------------
/**
 *  This advanced WebBot provides additional methods useful for
 *  extracting content from web pages based on tag type, tag id, CSS class,
 *  or other features.
 *
 *  @author  Stephen Edwards
 *  @author Last changed by $Author: stedwar2 $
 *  @version $Revision: 1.3 $, $Date: 2010/02/23 17:06:36 $
 */
public class TurboWebBot
    extends WebBot
{
    //~ Constructors ..........................................................

    // ----------------------------------------------------------
    /**
     * Creates a new WebBot that is not yet viewing any web page.
     */
    public TurboWebBot()
    {
        super();
    }


    // ----------------------------------------------------------
    /**
     * Creates a new robot and immediately sends it to the given URI.
     * The robot is first built with the no-argument constructor (so it
     * starts out viewing nothing) and then {@code jumpToPage(uri)} is
     * called on it.
     *
     * @param uri The web page where the robot will start.
     */
    public TurboWebBot(URI uri)
    {
        this();
        jumpToPage(uri);
    }


    // ----------------------------------------------------------
    /**
     * Creates a new robot and immediately sends it to the given URL.
     * The robot is first built with the no-argument constructor (so it
     * starts out viewing nothing) and then {@code jumpToPage(url)} is
     * called on it.
     *
     * @param url The web page where the robot will start.
     */
    public TurboWebBot(URL url)
    {
        this();
        jumpToPage(url);
    }


    // ----------------------------------------------------------
    /**
     * Creates a new robot for a URL given as a string.  All of the work
     * is delegated to the superclass constructor that takes a string.
     *
     * @param url The web page where the robot will start.
     */
    public TurboWebBot(String url)
    {
        super(url);
    }


    // ----------------------------------------------------------
    /**
     * Creates a new robot and immediately sends it to the given file.
     * The robot is first built with the no-argument constructor (so it
     * starts out viewing nothing) and then {@link #jumpToPage(File)} is
     * called on it, so the preconditions of that method apply here too.
     *
     * @param file The web page where the robot will start.
     */
    public TurboWebBot(File file)
    {
        this();
        jumpToPage(file);
    }


    //~ Public methods ........................................................

    // ----------------------------------------------------------
    /**
     * Sets the key phrase of interest to look for in documents.  The
     * string is compiled as a case-insensitive
     * {@link Pattern regular expression} and remembered as the robot's
     * target phrase.
     *
     * @param phrase a regular expression
     * @throws java.util.regex.PatternSyntaxException if {@code phrase} is
     *         not a valid regular expression
     */
    public void setPhraseOfInterest(String phrase)
    {
        Pattern compiled = Pattern.compile(phrase, Pattern.CASE_INSENSITIVE);
        targetPhrase = compiled;
    }


    // ----------------------------------------------------------
    /**
     * Counts how many times the phrase of interest occurs in the page the
     * robot is currently viewing.
     *
     * <b>Requires</b> the bot to be viewing a web page, and that the
     * phrase of interest has been set (see
     * {@link #setPhraseOfInterest(String)}).  Both conditions are checked
     * with assertions.
     *
     * @return The number of occurrences of the phrase of interest in the
     * current web page
     */
    public int getPagePhraseCount()
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert targetPhrase != null
            : "You must set the phrase of interest first";
        PageLocation current = pages.peek();
        return current.page.getPatternCount();
    }


    // ----------------------------------------------------------
    /**
     * Reports how frequently the phrase of interest appears in the
     * current page.  The result is a number between 0 and 1 approximating
     * the fraction of the page made up by the target phrase: the combined
     * size of all occurrences of the phrase divided by the total size of
     * the document.
     *
     * <p>Expect small values.  Even an interesting phrase usually accounts
     * for only a tiny part of a page that carries a meaningful amount of
     * information.  The value is still useful as a relative measure of
     * how often a phrase is used, normalized by document size.</p>
     *
     * <b>Requires</b> the bot to be viewing a web page, and that the
     * phrase of interest has been set (see
     * {@link #setPhraseOfInterest(String)}).  Both conditions are checked
     * with assertions.
     *
     * @return The frequency of the phrase of interest in the
     * current web page
     */
    public double getPagePhraseFrequency()
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert targetPhrase != null
            : "You must set the phrase of interest first";
        PageLocation current = pages.peek();
        return current.page.getPatternFrequency();
    }


    // ----------------------------------------------------------
    /**
     * Moves the robot one step forward in the current document, so that it
     * is looking at (or standing on) the next HTML element of interest.
     * Which elements count as "of interest" is controlled by
     * {@link #resetElementsOfInterest(String...)} (the default is all links
     * and all heading tags).  Once the robot has moved past the last
     * element of interest it is at the end of the page, and further calls
     * leave it there.
     *
     * <b>Requires</b> the bot to be viewing a web page.
     */
    public void advanceToNextElement()
    {
        assert isViewingWebPage() : "Not viewing a web page";
        PageLocation here = pages.peek();
        if (here.pos >= here.len())
        {
            // Already at the end of the page: nowhere left to go.
            return;
        }
        here.pos++;
    }


    // ----------------------------------------------------------
    /**
     * Advance the robot forward in the current document until it is looking
     * at (or standing on) the next HTML element of the specified type that it
     * can find.  The search always starts <em>after</em> the robot's current
     * position, so calling this method repeatedly walks through every
     * element of the given type in turn; this matches what
     * {@link #hasNextElement(String)} reports.  If the robot has not yet
     * stepped onto any element, the search starts at the first element of
     * interest.
     *
     * The specified element type must be one of the elements of
     * interest, as specified by calling
     * {@link #resetElementsOfInterest(String...)} (the default is all links
     * and all heading tags).  If there are no more elements of the desired
     * type in the document, or the desired type is not an element of interest,
     * the robot will end up looking at the end of the page.
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @param tagType The type of element to look for (case-sensitive)
     */
    public void advanceToNextElement(String tagType)
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert tagType != null    : "You must provide a tagType";
        PageLocation loc = pages.peek();

        if (loc.pos < 0)
        {
            // Not standing on anything yet: start at the first element.
            loc.pos = 0;
        }
        else if (loc.pos < loc.len())
        {
            // Step off the current element first.  Without this, a robot
            // already standing on an element of the requested type would
            // never move, and a hasNextElement()/advanceToNextElement()
            // loop would never terminate.
            loc.pos++;
        }

        // Skip over elements of other types until a match or the end.
        while (loc.hasCurrentElt()
            && !tagType.equals(loc.currentElt().getType()))
        {
            loc.pos++;
        }
    }


    // ----------------------------------------------------------
    /**
     * Reports whether any HTML elements of interest remain further down
     * the page from the robot's current position.  Which elements count as
     * "of interest" is controlled by
     * {@link #resetElementsOfInterest(String...)} (the default is all links
     * and all heading tags).
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @return True if there are any more elements of interest in the remainder
     *         of document
     */
    public boolean hasNextElement()
    {
        assert isViewingWebPage() : "Not viewing a web page";
        PageLocation here = pages.peek();
        int lastIndex = here.len() - 1;
        return here.pos < lastIndex;
    }


    // ----------------------------------------------------------
    /**
     * Reports whether any HTML elements of the specified type remain
     * further down the page from the robot's current position.  Only the
     * elements <em>after</em> the current position are examined; the robot
     * itself does not move.  The specified element type must be one of the
     * elements of interest, as specified by calling
     * {@link #resetElementsOfInterest(String...)} (the default is all links
     * and all heading tags).
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @param tagType The type of element to look for
     * @return True if there are any more elements of the specified type in the
     *         remainder of document.  False if there are no more elements of
     *         that type, or if the specified tag type is not an element of
     *         interest.
     */
    public boolean hasNextElement(String tagType)
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert tagType != null    : "You must provide a tagType";
        PageLocation here = pages.peek();

        for (int i = here.pos + 1; i < here.len(); i++)
        {
            if (tagType.equals(here.elts().get(i).getType()))
            {
                return true;
            }
        }
        return false;
    }


    // ----------------------------------------------------------
    /**
     * Reports whether the robot is currently looking at (or standing on)
     * an HTML element of interest on the current page.  Which elements
     * count as "of interest" is controlled by
     * {@link #resetElementsOfInterest(String...)} (the default is all links
     * and all heading tags).
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @return True if the robot is positioned at an element of interest,
     *         or false otherwise.
     */
    public boolean isLookingAtElement()
    {
        assert isViewingWebPage() : "Not viewing a web page";
        PageLocation here = pages.peek();
        return here.hasCurrentElt();
    }


    // ----------------------------------------------------------
    /**
     * Reports whether the robot is currently looking at (or standing on)
     * an HTML element of the specified type on the current page.  The
     * specified element type must be one of the elements of interest, as
     * specified by calling {@link #resetElementsOfInterest(String...)} (the
     * default is all links and all heading tags).
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @param tagType The type of element to check for (compared with
     *        {@code equals}, so the match is case-sensitive)
     * @return True if the robot is positioned at an element of the desired
     *         type, or false otherwise.  Also false if the specified tag
     *         type is not an element of interest.
     */
    public boolean isLookingAtElement(String tagType)
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert tagType != null    : "You must provide a tagType";
        PageLocation here = pages.peek();
        if (!here.hasCurrentElt())
        {
            return false;
        }
        return tagType.equals(here.currentElt().getType());
    }


    // ----------------------------------------------------------
    /**
     * Finds the first HTML element of the specified type on this web page.
     * The robot does not move, and the elements of interest play no role:
     * any HTML element type may be given, and the whole page is searched
     * regardless of where the robot is currently standing.
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @param tagType The kind of element to search for.
     * @return The first matching element on the current web page, or null if
     *         none is found.
     * @see #getAllMatchingElements(String)
     */
    public HtmlElement getFirstMatchingElement(String tagType)
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert tagType != null    : "You must provide a tagType";
        PageLocation here = pages.peek();
        String xpath = HTML_NODE_PREFIX + tagType;
        return here.page.xPathFindFirst(xpath);
    }


    // ----------------------------------------------------------
    /**
     * Get the first HTML element of the specified type on this web page, based
     * on the context where the element appears.  For example, if you want the
     * first anchor in the first row of the first table on a page, you could
     * use this call:
     *
     * <pre>
     * HtmlElement result = myBot.getFirstMatchingElement("table", "tr", "a");
     * </pre>
     *
     * This method supports a variable number of arguments.  It will find the
     * first occurrence of the first element type listed.  Then,
     * <em>inside</em> that element, it will look for the first occurrence of
     * the second element type, and then search <em>inside</em> that one for
     * the first occurrence of the third element type, and so on.  It returns
     * the most deeply nested element in this series that it finds.
     *
     * This method does not affect the robot's current position (the robot
     * will not move), and it does not depend on the elements of interest.  The
     * specified tag type(s) can be any HTML element, and the robot will search
     * for and find the first matching element on the page, regardless of where
     * the robot is currently standing.
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @param parentTag The first element to search for.
     * @param childTag Additional elements to find--each one will be searched
     *        for <em>within</em> the contents of the element immediately
     *        preceding it in the argument list.  Null entries are skipped,
     *        and a null array is treated the same as no child tags at all.
     * @return The first matching element on the current web page, or null if
     *         none is found.
     * @see #getAllMatchingElements(String, String...)
     */
    public HtmlElement getFirstMatchingElement(
        String parentTag, String ... childTag)
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert parentTag != null  : "You must provide a parentTag";

        // A caller may pass an explicit null array for the varargs.
        int childCount = (childTag == null) ? 0 : childTag.length;

        // The builder never leaves this method, so the unsynchronized
        // StringBuilder is sufficient.
        StringBuilder xpath = new StringBuilder(10 + 10 * childCount);
        xpath.append(HTML_NODE_PREFIX);
        xpath.append(parentTag);
        if (childTag != null)
        {
            for (String tag : childTag)
            {
                if (tag != null)
                {
                    xpath.append(HTML_NODE_PREFIX);
                    xpath.append(tag);
                }
            }
        }
        return pages.peek().page.xPathFindFirst(xpath.toString());
    }


    // ----------------------------------------------------------
    /**
     * Finds every HTML element of the specified type on this web page.
     * This is the "all matches" counterpart of
     * {@link #getFirstMatchingElement(String)}.  The robot does not move,
     * and the elements of interest play no role: any HTML element type may
     * be given, and the whole page is searched regardless of where the
     * robot is currently standing.
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @param tagType The kind of element to search for.
     * @return A list of all the matching elements.  The list will be empty if
     *         none are found.
     * @see #getFirstMatchingElement(String)
     */
    public List<HtmlElement> getAllMatchingElements(String tagType)
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert tagType != null    : "You must provide a tagType";
        PageLocation here = pages.peek();
        String xpath = HTML_NODE_PREFIX + tagType;
        return here.page.xPathFindAll(xpath);
    }


    // ----------------------------------------------------------
    /**
     * Get all HTML elements of the specified type on this web page, based
     * on the context where the elements appear. This method is just like
     * {@link #getFirstMatchingElement(String, String...)},
     * except that it returns all matches instead of just the first one.
     * This method does not affect the robot's current position (the robot
     * will not move), and it does not depend on the elements of interest.
     * The specified tag types can be any HTML element, and the robot will
     * search for and find all such elements on the page, regardless of where
     * the robot is currently standing.
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @param parentTag The first element to search for.
     * @param childTag Additional elements to find--each one will be searched
     *        for <em>within</em> the contents of the element immediately
     *        preceding it in the argument list.  Null entries are skipped,
     *        and a null array is treated the same as no child tags at all.
     * @return A list of all the matching elements.  The list will be empty if
     *         none are found.
     * @see #getFirstMatchingElement(String, String...)
     */
    public List<HtmlElement> getAllMatchingElements(
        String parentTag, String ... childTag)
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert parentTag != null  : "You must provide a parentTag";

        // A caller may pass an explicit null array for the varargs.
        int childCount = (childTag == null) ? 0 : childTag.length;

        // The builder never leaves this method, so the unsynchronized
        // StringBuilder is sufficient.
        StringBuilder xpath = new StringBuilder(10 + 10 * childCount);
        xpath.append(HTML_NODE_PREFIX);
        xpath.append(parentTag);
        if (childTag != null)
        {
            for (String tag : childTag)
            {
                if (tag != null)
                {
                    xpath.append(HTML_NODE_PREFIX);
                    xpath.append(tag);
                }
            }
        }
        return pages.peek().page.xPathFindAll(xpath.toString());
    }


    // ----------------------------------------------------------
    /**
     * Get the first HTML element with the specified id on this web page,
     * using the HTML id="..." attribute on the element.
     * This method does not affect the robot's current position (the robot
     * will not move), and it does not depend on the elements of interest.
     * The robot will search for and find the first element with the given
     * id on the page, regardless of where the robot is currently standing.
     *
     * The id is compared literally: quote characters inside it are
     * handled safely and cannot change the meaning of the search.
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @param id The id to search for.
     * @return The first (and usually only) element on the current web page
     *         with the given id, or null if none is found.
     */
    public HtmlElement getElementById(String id)
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert id != null         : "You must provide an id";
        return pages.peek().page.xPathFindFirst(
            "//*[@id=" + xpathStringLiteral(id) + "]");
    }


    // ----------------------------------------------------------
    /**
     * Get all the HTML elements with the specified CSS class on this web
     * page, using the HTML class="..." attribute on the elements.
     * An element matches when the given class is one of the
     * whitespace-separated names in its class attribute, so an element
     * declared as class="note important" is found when searching for
     * either "note" or "important".
     * This method does not affect the robot's current position (the robot
     * will not move), and it does not depend on the elements of interest.
     * The robot will search for and find all the elements with the given
     * CSS class on the page, regardless of where the robot is currently
     * standing.
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @param cssClass The CSS class to search for.
     * @return A list of all elements on the current web page with
     *         the given CSS class.  The list will be empty if none are found.
     */
    public List<HtmlElement> getElementsByCssClass(String cssClass)
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert cssClass != null   : "You must provide a cssClass";

        // Pad both the attribute value and the requested class with single
        // spaces so that a whole class name is matched, never a fragment
        // of a longer one.  The leading "@class and" keeps elements that
        // have no class attribute at all from matching an empty request.
        String xpath = "//*[@class and contains("
            + "concat(' ', normalize-space(@class), ' '), "
            + "concat(' ', normalize-space("
            + xpathStringLiteral(cssClass) + "), ' '))]";
        return pages.peek().page.xPathFindAll(xpath);
    }


    // ----------------------------------------------------------
    /**
     * Turn arbitrary text into an XPath 1.0 string literal.  XPath 1.0 has
     * no escape sequences inside literals, so text containing both kinds
     * of quote characters is expressed as a concat() call instead.
     *
     * @param value The text to quote (must not be null)
     * @return An XPath expression that evaluates to exactly {@code value}
     */
    private static String xpathStringLiteral(String value)
    {
        if (value.indexOf('\'') < 0)
        {
            return "'" + value + "'";
        }
        if (value.indexOf('"') < 0)
        {
            return "\"" + value + "\"";
        }

        // Both quote kinds are present: split on apostrophes, quote each
        // piece with apostrophes, and re-insert the apostrophes themselves
        // as double-quoted literals.
        StringBuilder literal = new StringBuilder("concat(");
        int start = 0;
        for (int quote = value.indexOf('\'');
            quote >= 0;
            quote = value.indexOf('\'', start))
        {
            literal.append('\'');
            literal.append(value, start, quote);
            literal.append("', \"'\", ");
            start = quote + 1;
        }
        literal.append('\'');
        literal.append(value, start, value.length());
        literal.append("')");
        return literal.toString();
    }


    // ----------------------------------------------------------
    /**
     * Move the WebBot back to the beginning of the page and reset the
     * set of elements that it can walk over to the given set of elements.  By
     * default, a WebBot is interested in links and headings (a,
     * h1, h2, h3, h4, h5, h6), but you can change the set of elements it
     * will step through to any group of HTML elements you like.  This method
     * supports a variable number of arguments, so you can provide as many
     * different element types as you like--if you provide no arguments, it
     * will reset back to the default of all links and headings.
     * <p>
     * For example, to ignore all elements (including links and headings)
     * except for image elements, use:</p>
     * <pre>
     * myBot.resetElementsOfInterest("img");
     * </pre>
     * <p>
     * If you want to look at links and at table cells:</p>
     * <pre>
     * myBot.resetElementsOfInterest("a", "td");
     * </pre>
     * <p>
     * <b>Requires</b> the bot to be viewing a web page.
     * </p>
     *
     * @param tagTypes a list of zero or more element types to look for.  If
     *        none are specified, the default of ("a", "h1", "h2", "h3", "h4",
     *        "h5", "h6") will be used instead.  Null entries are ignored; if
     *        nothing but nulls is given, the default is used as well.
     */
    public void resetElementsOfInterest(String ... tagTypes)
    {
        assert isViewingWebPage() : "Not viewing a web page";

        // Build the union "prefix+tag1|prefix+tag2|..." of all usable tags.
        StringBuilder xpath = new StringBuilder(
            (tagTypes == null) ? 0 : tagTypes.length * 10);
        if (tagTypes != null)
        {
            for (String tag : tagTypes)
            {
                if (tag == null)
                {
                    // Skip nulls rather than searching for a tag literally
                    // named "null"; this mirrors the matching-element
                    // methods in this class.
                    continue;
                }
                if (xpath.length() > 0)
                {
                    xpath.append('|');
                }
                xpath.append(HTML_NODE_PREFIX);
                xpath.append(tag);
            }
        }

        if (xpath.length() == 0)
        {
            // Nothing usable was requested: null selects the default set.
            pages.peek().setElementXpath(null);
        }
        else
        {
            pages.peek().setElementXpath(xpath.toString());
        }
    }


    // ----------------------------------------------------------
    /**
     * Binds a symbolic name to an XML namespace URL, so that the name can
     * then be used as a namespace prefix on identifiers in XPATH
     * expressions.  This method is for <b>advanced users only</b>.  You
     * only need it when your WebBot is manipulating content that is not
     * HTML/XHTML and you have to write XPATH expressions in some other XML
     * namespace.  By default the prefix "html" is bound to the namespace
     * http://www.w3.org/1999/xhtml.  You can add as many additional
     * namespaces as you need in order to build your own XPATH expressions.
     *
     * The work itself is done by the superclass implementation, to which
     * this method simply delegates.
     *
     * @param name The symbolic prefix to use for this namespace
     * @param url  The URL identifying this XML namespace
     */
    @Override
    public void addXpathNamespace(String name, String url)
    {
        super.addXpathNamespace(name, url);
    }


    // ----------------------------------------------------------
    /**
     * Finds nodes within the current document using an XPATH expression.
     * This method is for <b>advanced users only</b>, and requires that you
     * understand XPATH.  The robot does not move, and the elements of
     * interest play no role: every node on the page that matches the given
     * expression is found, regardless of where the robot is currently
     * standing.
     *
     * Your XPATH expression must use namespaces for all element names.  By
     * default the prefix "html" is bound to the namespace
     * http://www.w3.org/1999/xhtml.  If you need more bindings, add them
     * yourself with {@link #addXpathNamespace(String, String)}.
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @param xpathExpression The XPATH expression to search for
     * @return A list of all matching nodes.  The list will be empty if
     *         no matches were found.
     */
    public List<HtmlElement> getAllElementsMatchingXpath(
        String xpathExpression)
    {
        assert isViewingWebPage() : "Not viewing a web page";
        assert xpathExpression != null : "You must provide an xpathExpression";
        PageLocation here = pages.peek();
        return here.page.xPathFindAll(xpathExpression);
    }


    // ----------------------------------------------------------
    /**
     * Returns the entire content of the current web page as a string.
     * The work itself is done by the superclass implementation, to which
     * this method simply delegates.
     *
     * <b>Requires</b> the bot to be viewing a web page.
     *
     * @return The page's content
     */
    @Override
    public String getPageContent()
    {
        return super.getPageContent();
    }


    // ----------------------------------------------------------
    /**
     * Causes the bot to temporarily leave the current page and hop over to
     * the specified file.  The bot will "remember" where it came from,
     * keeping track of past pages in a stack.  After working with the other
     * page, you can use {@link #returnToPreviousPage()} to come back to the
     * point where you left off.
     *
     * <b>Requires</b> that the file is not null and exists in the file
     * system.  Both conditions are checked with assertions, so they are
     * only enforced when assertions are enabled.
     *
     * @param file The new page to jump to
     */
    public void jumpToPage(File file)
    {
        assert file != null : "Specified file cannot be null";
        assert file.exists() : "Specified file must exist in file system";
        jumpToNormalizedURL(file);
    }


    // ----------------------------------------------------------
    /**
     * Checks whether this robot has visited the given file before.  The
     * file is made absolute and converted to a URL, and that URL is then
     * looked up using the URL-based version of this method.
     *
     * @param file The page to check
     * @return True if this robot has previously visited (or is currently on)
     * the given web page.  False if it has not, or if the file's location
     * cannot be expressed as a URL.
     */
    public boolean hasVisitedPage(File file)
    {
        boolean visited;
        try
        {
            visited = hasVisitedPage(makeFileAbsolute(file).toURI().toURL());
        }
        catch (MalformedURLException ignored)
        {
            // A file whose location cannot be written as a URL can never
            // have been visited, so this is reported as "not visited".
            visited = false;
        }
        return visited;
    }
}