/*==========================================================================*\
| $Id: TurboWebBot.java,v 1.3 2010/02/23 17:06:36 stedwar2 Exp $
|*-------------------------------------------------------------------------*|
| Copyright (C) 2007-2010 Virginia Tech
|
| This file is part of the Student-Library.
|
| The Student-Library is free software; you can redistribute it and/or
| modify it under the terms of the GNU Lesser General Public License as
| published by the Free Software Foundation; either version 3 of the
| License, or (at your option) any later version.
|
| The Student-Library is distributed in the hope that it will be useful,
| but WITHOUT ANY WARRANTY; without even the implied warranty of
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
| GNU Lesser General Public License for more details.
|
| You should have received a copy of the GNU Lesser General Public License
| along with the Student-Library; if not, see <http://www.gnu.org/licenses/>.
\*==========================================================================*/
package student.web;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.List;
import java.util.regex.Pattern;
//-------------------------------------------------------------------------
/**
* This advanced WebBot provides additional methods useful for
* extracting content from web pages based on tag type, tag id, CSS class,
* or other features.
*
* @author Stephen Edwards
* @author Last changed by $Author: stedwar2 $
* @version $Revision: 1.3 $, $Date: 2010/02/23 17:06:36 $
*/
public class TurboWebBot
extends WebBot
{
//~ Constructors ..........................................................
// ----------------------------------------------------------
/**
* Creates a new WebBot that is not yet viewing any web page.
*/
public TurboWebBot()
{
super();
}
// ----------------------------------------------------------
/**
 * Creates a new TurboWebBot and immediately sends it to the page
 * identified by the given URI.
 *
 * @param uri The web page where the robot will start.
 */
public TurboWebBot(URI uri)
{
    this();
    this.jumpToPage(uri);
}
// ----------------------------------------------------------
/**
 * Creates a new TurboWebBot and immediately sends it to the page
 * identified by the given URL.
 *
 * @param url The web page where the robot will start.
 */
public TurboWebBot(URL url)
{
    this();
    this.jumpToPage(url);
}
// ----------------------------------------------------------
/**
 * Creates a new TurboWebBot for a URL given in string form.  The string
 * is handed unchanged to the matching WebBot constructor.
 *
 * @param url The web page where the robot will start.
 */
public TurboWebBot(String url)
{
    // All of the work is done by the superclass constructor.
    super(url);
}
// ----------------------------------------------------------
/**
 * Creates a new TurboWebBot and immediately sends it to the page stored
 * in the given file.
 *
 * @param file The web page where the robot will start.
 * @see #jumpToPage(File)
 */
public TurboWebBot(File file)
{
    this();
    this.jumpToPage(file);
}
//~ Public methods ........................................................
// ----------------------------------------------------------
/**
 * Set the key phrase of interest to look for in documents.  The string
 * is compiled as a case-insensitive {@link Pattern regular expression}
 * and remembered as the robot's target phrase.
 *
 * @param phrase a regular expression
 * @throws java.util.regex.PatternSyntaxException if the phrase is not a
 *         valid regular expression
 */
public void setPhraseOfInterest(String phrase)
{
    final int matchFlags = Pattern.CASE_INSENSITIVE;
    targetPhrase = Pattern.compile(phrase, matchFlags);
}
// ----------------------------------------------------------
/**
 * Count how many times the phrase of interest occurs in the current
 * page.
 *
 * <b>Requires</b> the bot to be viewing a web page, and that the
 * phrase of interest has been set (see
 * {@link #setPhraseOfInterest(String)}).
 *
 * @return The number of occurrences of the phrase of interest in the
 *         current web page
 */
public int getPagePhraseCount()
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert targetPhrase != null
        : "You must set the phrase of interest first";

    // The page on top of the stack is the one currently being viewed.
    PageLocation current = pages.peek();
    return current.page.getPatternCount();
}
// ----------------------------------------------------------
/**
 * Get the frequency of the phrase of interest in the current page.
 * The result is a number between 0 and 1 approximating the fraction of
 * the page made up by the target phrase: the combined size of all
 * occurrences of the phrase divided by the document's total size.
 *
 * <p>This number tends to be small, because even an interesting phrase
 * usually makes up only a small part of a page that holds any real
 * amount of information.  It is still useful as a relative measure of
 * how often a phrase is used, normalized by the size of the
 * document.</p>
 *
 * <b>Requires</b> the bot to be viewing a web page, and that the
 * phrase of interest has been set (see
 * {@link #setPhraseOfInterest(String)}).
 *
 * @return The frequency of the phrase of interest in the
 *         current web page
 */
public double getPagePhraseFrequency()
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert targetPhrase != null
        : "You must set the phrase of interest first";

    // The page on top of the stack is the one currently being viewed.
    PageLocation current = pages.peek();
    return current.page.getPatternFrequency();
}
// ----------------------------------------------------------
/**
 * Move the robot forward in the current document so that it is looking
 * at (or standing on) the next HTML element of interest.  The set of
 * elements of interest is controlled by
 * {@link #resetElementsOfInterest(String...)} (the default is all links
 * and all heading tags).  When there are no further elements of
 * interest, the robot ends up looking at the end of the page and
 * further calls leave it there.
 *
 * <b>Requires</b> the bot to be viewing a web page.
 */
public void advanceToNextElement()
{
    assert isViewingWebPage() : "Not viewing a web page";

    PageLocation here = pages.peek();
    if (here.pos >= here.len())
    {
        // Already at the end of the page: nothing left to step onto.
        return;
    }
    here.pos++;
}
// ----------------------------------------------------------
/**
 * Advance the robot forward in the current document until it is looking
 * at (or standing on) the next HTML element of the specified type that it
 * can find.  The specified element type must be one of the elements of
 * interest, as specified by calling
 * {@link #resetElementsOfInterest(String...)} (the default is all links
 * and all heading tags).  If there are no more elements of the desired
 * type in the document, or the desired type is not an element of interest,
 * the robot will end up looking at the end of the page.
 *
 * <p>The element the robot is currently standing on is never counted as
 * the "next" one, even if it has the requested type.  This matches
 * {@link #hasNextElement(String)}, which only looks beyond the current
 * position, so the two can be combined safely in a loop:</p>
 * <pre>
 * while (myBot.hasNextElement("a"))
 * {
 *     myBot.advanceToNextElement("a");
 *     ...
 * }
 * </pre>
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @param tagType The type of element to look for (case-sensitive)
 */
public void advanceToNextElement(String tagType)
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert tagType != null : "You must provide a tagType";
    PageLocation loc = pages.peek();

    if (loc.pos < 0)
    {
        // Not on any element yet: start scanning at the first element.
        loc.pos = 0;
    }
    else if (loc.hasCurrentElt())
    {
        // Step off the element the robot is standing on.  Without this,
        // a robot already standing on an element of the requested type
        // would never move, and repeated calls would make no progress.
        loc.pos++;
    }

    // Skip forward over every element of a different type; stops either
    // on a match or once there is no current element (end of page).
    while (loc.hasCurrentElt()
        && !tagType.equals(loc.currentElt().getType()))
    {
        loc.pos++;
    }
}
// ----------------------------------------------------------
/**
 * Check whether any HTML elements of interest remain further down the
 * page from the robot's current position.  The set of elements of
 * interest is controlled by
 * {@link #resetElementsOfInterest(String...)} (the default is all links
 * and all heading tags).
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @return True if there are any more elements of interest in the
 *         remainder of the document
 */
public boolean hasNextElement()
{
    assert isViewingWebPage() : "Not viewing a web page";

    PageLocation here = pages.peek();
    int current = here.pos;
    int lastIndex = here.len() - 1;

    // Something follows only while the robot is before the last element.
    return current < lastIndex;
}
// ----------------------------------------------------------
/**
 * Check whether any HTML elements of the specified type remain further
 * down the page from the robot's current position.  The element the
 * robot is currently on is not considered.  The specified element type
 * must be one of the elements of interest, as set by
 * {@link #resetElementsOfInterest(String...)} (the default is all links
 * and all heading tags).
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @param tagType The type of element to look for
 * @return True if there are any more elements of the specified type in
 *         the remainder of the document.  False if there are no more
 *         elements of that type, or if the specified tag type is not an
 *         element of interest.
 */
public boolean hasNextElement(String tagType)
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert tagType != null : "You must provide a tagType";

    PageLocation here = pages.peek();

    // Look at every element after the current one, without moving.
    for (int i = here.pos + 1; i < here.len(); i++)
    {
        if (tagType.equals(here.elts().get(i).getType()))
        {
            return true;
        }
    }
    return false;
}
// ----------------------------------------------------------
/**
 * Check whether the robot is looking at (or standing on) an HTML element
 * of interest on the current page.  The set of elements of interest is
 * controlled by {@link #resetElementsOfInterest(String...)} (the default
 * is all links and all heading tags).
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @return True if the robot is positioned at an element of interest,
 *         or false otherwise.
 */
public boolean isLookingAtElement()
{
    assert isViewingWebPage() : "Not viewing a web page";

    PageLocation here = pages.peek();
    return here.hasCurrentElt();
}
// ----------------------------------------------------------
/**
 * Check whether the robot is looking at (or standing on) an HTML element
 * of the specified type on the current page.  The specified element
 * type must be one of the elements of interest, as set by
 * {@link #resetElementsOfInterest(String...)} (the default is all links
 * and all heading tags).
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @param tagType The type of element to check for
 * @return True if the robot is positioned at an element of the desired
 *         type, or false otherwise.  Also false if the specified tag
 *         type is not an element of interest.
 */
public boolean isLookingAtElement(String tagType)
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert tagType != null : "You must provide a tagType";

    PageLocation here = pages.peek();
    if (!here.hasCurrentElt())
    {
        // Not standing on any element, so certainly not on this type.
        return false;
    }
    return tagType.equals(here.currentElt().getType());
}
// ----------------------------------------------------------
/**
 * Find the first HTML element of the specified type on this web page.
 * The robot does not move, and the search does not depend on the
 * elements of interest: the tag type can be any HTML element, and the
 * first such element on the page is found regardless of where the robot
 * is currently standing.
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @param tagType The kind of element to search for.
 * @return The first matching element on the current web page, or null if
 *         none is found.
 * @see #getAllMatchingElements(String)
 */
public HtmlElement getFirstMatchingElement(String tagType)
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert tagType != null : "You must provide a tagType";

    // The search is expressed as an XPath query over the current page.
    final String xpath = HTML_NODE_PREFIX + tagType;
    return pages.peek().page.xPathFindFirst(xpath);
}
// ----------------------------------------------------------
/**
 * Get the first HTML element of the specified type on this web page, based
 * on the context where the element appears.  For example, if you want the
 * first anchor in the first row of the first table on a page, you could
 * use this call:
 *
 * <pre>
 * HtmlElement result = myBot.getFirstMatchingElement("table", "tr", "a");
 * </pre>
 *
 * This method supports a variable number of arguments.  It will find the
 * first occurrence of the first element type listed.  Then,
 * <em>inside</em> that element, it will look for the first occurrence of
 * the second element type, and then search <em>inside</em> that one for
 * the first occurrence of the third element type, and so on.  It returns
 * the most deeply nested element in this series that it finds.
 *
 * This method does not affect the robot's current position (the robot
 * will not move), and it does not depend on the elements of interest.  The
 * specified tag type(s) can be any HTML element, and the robot will search
 * for and find the first matching element on the page, regardless of where
 * the robot is currently standing.
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @param parentTag The first element to search for.
 * @param childTag  Additional elements to find--each one will be searched
 *                  for <em>within</em> the contents of the element
 *                  immediately preceding it in the argument list.  Null
 *                  entries are ignored, and a null array is treated the
 *                  same as supplying no additional elements.
 * @return The first matching element on the current web page, or null if
 *         none is found.
 * @see #getAllMatchingElements(String, String...)
 */
public HtmlElement getFirstMatchingElement(
    String parentTag, String ... childTag)
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert parentTag != null : "You must provide a parentTag";

    // A caller can pass an explicit null array for the varargs parameter;
    // treat that like an empty list instead of failing on childTag.length.
    int childCount = (childTag == null) ? 0 : childTag.length;

    // Build one XPath expression: each tag is prefixed with
    // HTML_NODE_PREFIX and the pieces are concatenated in order.
    StringBuilder xpath = new StringBuilder(10 + 10 * childCount);
    xpath.append(HTML_NODE_PREFIX).append(parentTag);
    for (int i = 0; i < childCount; i++)
    {
        String tag = childTag[i];
        if (tag != null)
        {
            xpath.append(HTML_NODE_PREFIX).append(tag);
        }
    }
    return pages.peek().page.xPathFindFirst(xpath.toString());
}
// ----------------------------------------------------------
/**
 * Find every HTML element of the specified type on this web page.
 * This works just like {@link #getFirstMatchingElement(String)}, except
 * that all matches are returned instead of only the first one.
 * The robot does not move, and the search does not depend on the
 * elements of interest: the tag type can be any HTML element, and all
 * such elements on the page are found regardless of where the robot is
 * currently standing.
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @param tagType The kind of element to search for.
 * @return A list of all the matching elements.  The list will be empty if
 *         none are found.
 * @see #getFirstMatchingElement(String)
 */
public List<HtmlElement> getAllMatchingElements(String tagType)
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert tagType != null : "You must provide a tagType";

    // The search is expressed as an XPath query over the current page.
    final String xpath = HTML_NODE_PREFIX + tagType;
    return pages.peek().page.xPathFindAll(xpath);
}
// ----------------------------------------------------------
/**
 * Get all HTML elements of the specified type on this web page, based
 * on the context where the elements appear.  This method is just like
 * {@link #getFirstMatchingElement(String, String...)},
 * except that it returns all matches instead of just the first one.
 * This method does not affect the robot's current position (the robot
 * will not move), and it does not depend on the elements of interest.
 * The specified tag types can be any HTML element, and the robot will
 * search for and find all such elements on the page, regardless of where
 * the robot is currently standing.
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @param parentTag The first element to search for.
 * @param childTag  Additional elements to find--each one will be searched
 *                  for <em>within</em> the contents of the element
 *                  immediately preceding it in the argument list.  Null
 *                  entries are ignored, and a null array is treated the
 *                  same as supplying no additional elements.
 * @return A list of all the matching elements.  The list will be empty if
 *         none are found.
 * @see #getFirstMatchingElement(String, String...)
 */
public List<HtmlElement> getAllMatchingElements(
    String parentTag, String ... childTag)
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert parentTag != null : "You must provide a parentTag";

    // A caller can pass an explicit null array for the varargs parameter;
    // treat that like an empty list instead of failing on childTag.length.
    int childCount = (childTag == null) ? 0 : childTag.length;

    // Build one XPath expression: each tag is prefixed with
    // HTML_NODE_PREFIX and the pieces are concatenated in order.
    StringBuilder xpath = new StringBuilder(10 + 10 * childCount);
    xpath.append(HTML_NODE_PREFIX).append(parentTag);
    for (int i = 0; i < childCount; i++)
    {
        String tag = childTag[i];
        if (tag != null)
        {
            xpath.append(HTML_NODE_PREFIX).append(tag);
        }
    }
    return pages.peek().page.xPathFindAll(xpath.toString());
}
// ----------------------------------------------------------
/**
 * Get the first HTML element with the specified id on this web page,
 * using the HTML id="..." attribute on the element.
 * This method does not affect the robot's current position (the robot
 * will not move), and it does not depend on the elements of interest.
 * The robot will search for and find the first element with the given
 * id on the page, regardless of where the robot is currently standing.
 *
 * <p>The id is treated purely as text: quote characters in it are
 * quoted safely and can never change the structure of the underlying
 * XPath query.</p>
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @param id The id to search for.
 * @return The first (and usually only) element on the current web page
 *         with the given id, or null if none is found.
 */
public HtmlElement getElementById(String id)
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert id != null : "You must provide an id";

    // XPath 1.0 string literals have no escape syntax, so pick a
    // delimiter the id does not contain.  If it contains both kinds of
    // quote, assemble it with concat() from single-quoted pieces joined
    // by a double-quoted apostrophe.
    String literal;
    if (id.indexOf('\'') < 0)
    {
        literal = "'" + id + "'";
    }
    else if (id.indexOf('"') < 0)
    {
        literal = "\"" + id + "\"";
    }
    else
    {
        literal = "concat('" + id.replace("'", "', \"'\", '") + "')";
    }
    return pages.peek().page.xPathFindFirst("//*[@id=" + literal + "]");
}
// ----------------------------------------------------------
/**
 * Get all the HTML elements with the specified CSS class on this web
 * page, using the HTML class="..." attribute on the elements.
 * An element matches if its class attribute is exactly the given text,
 * or if the given class is one of several whitespace-separated classes
 * listed in the attribute (for example, "note" matches
 * class="note important").
 * This method does not affect the robot's current position (the robot
 * will not move), and it does not depend on the elements of interest.
 * The robot will search for and find all the elements with the given
 * CSS class on the page, regardless of where the robot is currently
 * standing.
 *
 * <p>The class name is treated purely as text: quote characters in it
 * are quoted safely and can never change the structure of the underlying
 * XPath query.</p>
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @param cssClass The CSS class to search for.
 * @return A list of all elements on the current web page with
 *         the given CSS class.  The list will be empty if none are found.
 */
public List<HtmlElement> getElementsByCssClass(String cssClass)
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert cssClass != null : "You must provide a cssClass";

    // XPath 1.0 string literals have no escape syntax, so pick a
    // delimiter the class name does not contain.  If it contains both
    // kinds of quote, assemble it with concat() from single-quoted
    // pieces joined by a double-quoted apostrophe.
    String literal;
    if (cssClass.indexOf('\'') < 0)
    {
        literal = "'" + cssClass + "'";
    }
    else if (cssClass.indexOf('"') < 0)
    {
        literal = "\"" + cssClass + "\"";
    }
    else
    {
        literal = "concat('" + cssClass.replace("'", "', \"'\", '") + "')";
    }

    // The first test keeps the exact-attribute match.  The second pads
    // the whitespace-normalized attribute and the class name with spaces
    // so the name is only found as a whole token in a multi-class list.
    String xpath = "//*[@class=" + literal
        + " or contains(concat(' ', normalize-space(@class), ' '), "
        + "concat(' ', " + literal + ", ' '))]";
    return pages.peek().page.xPathFindAll(xpath);
}
// ----------------------------------------------------------
/**
 * Move the WebBot back to the beginning of the page and reset the
 * set of elements that it can walk over to the given set of elements.  By
 * default, a WebBot is interested in links and headings (a,
 * h1, h2, h3, h4, h5, h6), but you can change the set of elements it
 * will step through to any group of HTML elements you like.  This method
 * supports a variable number of arguments, so you can provide as many
 * different element types as you like--if you provide no arguments, it
 * will reset back to the default of all links and headings.
 * <p>
 * For example, to ignore all elements (including links and headings)
 * except for image elements, use:</p>
 * <pre>
 * myBot.resetElementsOfInterest("img");
 * </pre>
 * <p>
 * If you want to look at links and at table cells:</p>
 * <pre>
 * myBot.resetElementsOfInterest("a", "td");
 * </pre>
 * <p>
 * <b>Requires</b> the bot to be viewing a web page.
 * </p>
 *
 * @param tagTypes a list of zero or more element types to look for.  Null
 *     entries are ignored.  If none are specified (or every entry is
 *     null), the default of ("a", "h1", "h2", "h3", "h4", "h5", "h6")
 *     will be used instead
 */
public void resetElementsOfInterest(String ... tagTypes)
{
    assert isViewingWebPage() : "Not viewing a web page";

    // Build an XPath union ("x|y|z"), with each tag prefixed by
    // HTML_NODE_PREFIX.
    StringBuilder xpath = new StringBuilder();
    if (tagTypes != null)
    {
        for (String tag : tagTypes)
        {
            if (tag == null)
            {
                // Skip null entries, as the getFirstMatchingElement and
                // getAllMatchingElements varargs overloads do, instead
                // of appending the literal text "null" as a tag name.
                continue;
            }
            if (xpath.length() > 0)
            {
                xpath.append('|');
            }
            xpath.append(HTML_NODE_PREFIX).append(tag);
        }
    }

    if (xpath.length() == 0)
    {
        // Nothing usable was supplied: pass null, exactly as is done for
        // an empty argument list, to go back to the default elements.
        pages.peek().setElementXpath(null);
    }
    else
    {
        pages.peek().setElementXpath(xpath.toString());
    }
}
// ----------------------------------------------------------
/**
 * Bind a symbolic name to an XML namespace URL, so that the name can be
 * used as a namespace prefix on identifiers in XPATH expressions.
 * This method is for <b>advanced users only</b>.  It is only needed when
 * your WebBot is manipulating content that is not HTML/XHTML and you
 * have to write XPATH expressions in some other XML namespace.  By
 * default, the prefix "html" is bound to the namespace
 * http://www.w3.org/1999/xhtml.  You can add as many further namespaces
 * as you need in order to build your own XPATH expressions.
 *
 * @param name The symbolic prefix to use for this namespace
 * @param url  The URL identifying this XML namespace
 */
public void addXpathNamespace(String name, String url)
{
    // Simply exposes the inherited implementation as a public method.
    super.addXpathNamespace(name, url);
}
// ----------------------------------------------------------
/**
 * Find nodes within the current document using an XPATH expression.
 * This method is for <b>advanced users only</b>, and requires that you
 * understand XPATH.
 * The robot does not move, and the search does not depend on the
 * elements of interest: all nodes on the page that match the given
 * XPATH expression are found, regardless of where the robot is
 * currently standing.
 *
 * Your XPATH expression must use namespaces for all element names.  By
 * default, the prefix "html" is bound to the namespace
 * http://www.w3.org/1999/xhtml.  You can add further namespace bindings
 * yourself using {@link #addXpathNamespace(String, String)} if you need
 * more.
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @param xpathExpression The XPATH expression to search for
 * @return A list of all matching nodes.  The list will be empty if
 *         no matches were found.
 */
public List<HtmlElement> getAllElementsMatchingXpath(
    String xpathExpression)
{
    assert isViewingWebPage() : "Not viewing a web page";
    assert xpathExpression != null : "You must provide an xpathExpression";

    // The expression is passed to the current page exactly as given.
    PageLocation current = pages.peek();
    return current.page.xPathFindAll(xpathExpression);
}
// ----------------------------------------------------------
/**
 * Get the entire content of the current web page as a single string.
 *
 * <b>Requires</b> the bot to be viewing a web page.
 *
 * @return The page's content
 */
public String getPageContent()
{
    // Simply exposes the inherited implementation as a public method.
    String content = super.getPageContent();
    return content;
}
// ----------------------------------------------------------
/**
 * Make the bot temporarily leave the current page and hop over to the
 * specified file.  The bot "remembers" where it came from, keeping
 * track of past pages in a stack.  After working with the other page,
 * use {@link #returnToPreviousPage()} to come back to the point where
 * you left off.
 *
 * <b>Requires</b> the file to be non-null and to exist in the file
 * system.
 *
 * @param file The new page to jump to
 */
public void jumpToPage(File file)
{
    // Check the preconditions before handing the file on.
    assert file != null : "Specified file cannot be null";
    assert file.exists() : "Specified file must exist in file system";

    jumpToNormalizedURL(file);
}
// ----------------------------------------------------------
/**
 * Check whether this robot has visited the page stored in the given file
 * before.  The file is made absolute and converted to a URL, and the
 * check is then performed on that URL.
 *
 * @param file The page to check
 * @return True if this robot has previously visited (or is currently on)
 *         the given web page.  False otherwise, including when the file
 *         cannot be expressed as a URL.
 */
public boolean hasVisitedPage(File file)
{
    try
    {
        URL location = makeFileAbsolute(file).toURI().toURL();
        return hasVisitedPage(location);
    }
    catch (MalformedURLException e)
    {
        // A file that cannot be turned into a URL cannot have been
        // visited, so report it as not visited rather than failing.
        return false;
    }
}
}