/*==========================================================================*\ | $Id: WebBot.java,v 1.5 2010/02/23 17:06:36 stedwar2 Exp $ |*-------------------------------------------------------------------------*| | Copyright (C) 2007-2010 Virginia Tech | | This file is part of the Student-Library. | | The Student-Library is free software; you can redistribute it and/or | modify it under the terms of the GNU Lesser General Public License as | published by the Free Software Foundation; either version 3 of the | License, or (at your option) any later version. | | The Student-Library is distributed in the hope that it will be useful, | but WITHOUT ANY WARRANTY; without even the implied warranty of | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | GNU Lesser General Public License for more details. | | You should have received a copy of the GNU Lesser General Public License | along with the Student-Library; if not, see <http://www.gnu.org/licenses/>. \*==========================================================================*/ package student.web; import static student.testingsupport.SystemIOUtilities.isOnServer; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.PrintStream; import java.io.PrintWriter; import java.io.Reader; import java.io.StringReader; import java.io.StringWriter; import java.lang.ref.SoftReference; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Stack; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.transform.OutputKeys; import javax.xml.transform.Result; import javax.xml.transform.Source; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMResult; import 
javax.xml.transform.dom.DOMSource; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathFactory; import org.ccil.cowan.tagsoup.Parser; import org.w3c.dom.Attr; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import student.IOHelper; import student.testingsupport.PrintWriterWithHistory; import student.web.internal.MutableNamespaceContext; // ------------------------------------------------------------------------- /** * This class represents a robot that knows how to walk through a web * page and identify headings and links. It will automatically transform * "messy" real-world html into conforming XHTML as it visits pages, so * all tag matching and other support should presume XHTML conventions. * * @author Stephen Edwards * @author Last changed by $Author: stedwar2 $ * @version $Revision: 1.5 $, $Date: 2010/02/23 17:06:36 $ */ public class WebBot { //~ Constructors .......................................................... // ---------------------------------------------------------- /** * Creates a new WebBot that is not yet viewing any web page. */ public WebBot() { out = isOnServer() ? new PrintWriterWithHistory() : new PrintWriterWithHistory(System.out, true); // Create a mutable namespace context. This should really be provided // by the JDK, but the default implementation does not allow new // entries to be added. nc = new MutableNamespaceContext(); // Set the prefix "html" to correspond to the xhtml namespace. // This can be called multiple times with different prefixes. 
addXpathNamespace("html", "http://www.w3.org/1999/xhtml"); // nc.setNamespace("", "http://www.w3.org/1999/xhtml"); xpath.setNamespaceContext(nc); try { // parser.setFeature( // "http://xml.org/sax/features/namespace-prefixes",true); xformer = TransformerFactory.newInstance().newTransformer(); xformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); } catch (Exception e) { e.printStackTrace(out); } } // ---------------------------------------------------------- /** * Creates a new WebBot for a given URL. * @param url The web page where the robot will start. */ public WebBot(String url) { this(); jumpToPage(url); } //~ Public methods ........................................................ // ---------------------------------------------------------- /** * Is the robot currently viewing a real web page with readable contents? * Normally, this would be true, but may be false if the bot has not been * given a web page to start on, or if it has been given a malformed or * nonexistent URL address, or even if the server for the targeted page * is not available. * @return True if the robot is currently viewing a real web page with * readable contents */ public boolean isViewingWebPage() { return pages.size() > 0; } // ---------------------------------------------------------- /** * Has the robot advanced through all the contents (headings and links) * on the current page? Will also return true if * {@link #isViewingWebPage()} returns false. * @return True if the robot has advanced over all the headings and links * in the current document, or false if there are more headings and/or * links to visit. */ public boolean isLookingAtEndOfPage() { return !isViewingWebPage() || pages.peek().pos >= pages.peek().len(); } // ---------------------------------------------------------- /** * Moves the robot back to the start of the current page. * * <b>Requires</b> the bot to be viewing a web page. 
*/ public void returnToStartOfPage() { assert isViewingWebPage() : "Not viewing a web page"; pages.peek().pos = -1; } // ---------------------------------------------------------- /** * Get the title the current web page. * * <b>Requires</b> the bot to be viewing a web page. * * @return The page's title, or null if the page has no title. */ public String getPageTitle() { assert isViewingWebPage() : "Not viewing a web page"; return pages.peek().page.getTitle(); } // ---------------------------------------------------------- /** * Echo the current web page title to the robot's default output channel. * * <b>Requires</b> the bot to be viewing a web page. */ public void echoPageTitle() { getOutputChannel().print(getPageTitle()); getOutputChannel().flush(); } // ---------------------------------------------------------- /** * Get the URL for the current web page. * * <b>Requires</b> the bot to be viewing a web page. * * @return The page's URL, if it exists. */ public URL getPageURL() { assert isViewingWebPage() : "Not viewing a web page"; return pages.peek().page.url; } // ---------------------------------------------------------- /** * Get a printable summary of this robot. * * @return The page's content */ public String toString() { String result = getClass().getName(); if (isViewingWebPage()) { result += "[" + getPageURL(); String title = getPageTitle(); if (title != null) { result += " => " + title; } result += "]"; } else { result += "[no page]"; } return result; } // ---------------------------------------------------------- /** * Get the HTML element of interest that the robot is currently standing * on. * * <b>Requires</b> the bot to be looking at an element on the * current web page. * * @return The heading's title. 
*/ public HtmlElement getCurrentElement() { assert isViewingWebPage() : "Not viewing a web page"; assert !isLookingAtEndOfPage() : "Already passed all content on this web page"; PageLocation loc = pages.peek(); assert loc.hasCurrentElt() : "Not looking at any element"; return loc.currentElt(); } // ---------------------------------------------------------- /** * Is the robot looking at (or standing on) an HTML heading element on * the current page? * @return True if the robot is positioned at a heading, or false * otherwise. */ public boolean isLookingAtHeading() { return isViewingWebPage() && isHeading(pages.peek().currentElt()); } // ---------------------------------------------------------- /** * Advance the robot forward in the current document until it is looking * at (or standing on) the next HTML heading element it can find. If * there are no more headings in the document, it will end up looking * at the end of the page. * * <b>Requires</b> the bot to be viewing a web page. */ public void advanceToNextHeading() { assert isViewingWebPage() : "Not viewing a web page"; PageLocation loc = pages.peek(); if (loc.pos >= loc.len()) { // at end of page return; } loc.pos++; while ( loc.hasCurrentElt() && !isHeading(loc.currentElt())) { loc.pos++; } } // ---------------------------------------------------------- /** * Get an iterator over all headings in the current document. This * method is designed to make it easy to write foreach-style loops * over page headings. * * <b>Requires</b> the bot to be viewing a web page. * * @return an iterator of {@link HtmlHeadingElement} objects describing the * headings in the page. */ public List<HtmlHeadingElement> getHeadings() { assert isViewingWebPage() : "Not viewing a web page"; return pages.peek().page.getHeadings(6); } // ---------------------------------------------------------- /** * Get an iterator over all headings in the current document with a level * less than or equal to the value specified. 
This method is designed to * make it easy to write foreach-style loops over page headings. * * <b>Requires</b> the bot to be viewing a web page. * * @param level Only include headings at this level or above (i.e., * numerically less than or equal to this number) * @return an iterator of {@link HtmlHeadingElement} objects describing the * headings in the page with levels less than or equal to the * specified level. */ public List<HtmlHeadingElement> getHeadingsToLevel( int level ) { assert isViewingWebPage() : "Not viewing a web page"; return pages.peek().page.getHeadings(level); } // ---------------------------------------------------------- /** * Echo the text of the current HTML element (heading, link, etc.) to the * robot's default output channel. * * <b>Requires</b> the bot to be viewing an existing HTML element on the * current web page. */ public void echoCurrentElementText() { getOutputChannel().print(getCurrentElementText()); getOutputChannel().flush(); } // ---------------------------------------------------------- /** * Get the text of the current HTML element on this web page--i.e., the * title of a heading or the text associated with a link. * * <b>Requires</b> the bot to be looking at an element on the * current web page. * * @return The text contained by this element on the web page. */ public String getCurrentElementText() { assert isViewingWebPage() : "Not viewing a web page"; assert !isLookingAtEndOfPage() : "Already passed all content on this web page"; PageLocation loc = pages.peek(); assert loc.hasCurrentElt() : "Not looking at any element"; return loc.currentElt().getText(); } // ---------------------------------------------------------- /** * Get the heading level (1-6) of the current heading on this web page. * * <b>Requires</b> the bot to be looking at a heading element on the * current web page. * * @return The heading's level. 
*/ public int getHeadingLevel() { assert isViewingWebPage() : "Not viewing a web page"; assert isLookingAtHeading() : "Not looking at a heading"; return levelOf(pages.peek().currentElt()); } // ---------------------------------------------------------- /** * Is the robot looking at (or standing on) an HTML anchor containing * an href attribute (that is, a link to another web page) on * the current page? * @return True if the robot is positioned at a link, or false * otherwise. */ public boolean isLookingAtLink() { return isViewingWebPage() && isLink(pages.peek().currentElt()); } // ---------------------------------------------------------- /** * Advance the robot forward in the current document until it is looking * at (or standing on) the next HTML anchor containing an href attribute * that it can find. If there are no more headings in the document, it * will end up looking at the end of the page. * * <b>Requires</b> the bot to be viewing a web page. */ public void advanceToNextLink() { assert isViewingWebPage() : "Not viewing a web page"; PageLocation loc = pages.peek(); if (loc.pos >= loc.len()) { // at end of page return; } loc.pos++; while ( loc.hasCurrentElt() && !isLink(loc.currentElt())) { loc.pos++; } } // ---------------------------------------------------------- /** * Get the URI of the current link on this web page. * * <b>Requires</b> the bot to be looking at a link (anchor) element on * the current web page. * * @return The link's destination. */ public URI getLinkURI() { assert isViewingWebPage() : "Not viewing a web page"; assert isLookingAtLink() : "Not looking at a link"; PageLocation loc = pages.peek(); return resolveURIFromPage(loc.currentElt().getAttributeValue("href")); } // ---------------------------------------------------------- /** * Check whether the URL of the current link on this web page refers to * a different page, or just another location within the current page. 
* * <b>Requires</b> the bot to be looking at a link (anchor) element on * the current web page. * * @return True if the link refers to a different page */ public boolean linkGoesToAnotherPage() { URI thisUri = getLinkURI(); URI parent = pages.peek().page.uri; boolean result = false; if (thisUri != null) { if (parent == null) { result = true; } else { result = ( !thisUri.getHost().equals(parent.getHost()) || !thisUri.getPath().equals(parent.getPath()) ); } } return result; } // ---------------------------------------------------------- /** * Check whether the URL of the current link on this web page refers to * a page on a separate server, or simply another location on the same * server. * * <b>Requires</b> the bot to be looking at a link (anchor) element on * the current web page. * * @return True if the link refers to a page located on a different server */ public boolean linkGoesToAnotherServer() { URI thisUri = getLinkURI(); URI parent = pages.peek().page.uri; boolean result = false; if (thisUri != null) { if (parent == null) { result = true; } else { result = !thisUri.getHost().equals(parent.getHost()); } } return result; } // ---------------------------------------------------------- /** * Get an iterator over all links in the current document. This method * is designed to make it easy to write foreach-style loops over links. * * <b>Requires</b> the bot to be viewing a web page. * * @return an iterator of {@link URI} objects describing the * links in the page. */ public List<URI> getLinks() { assert isViewingWebPage() : "Not viewing a web page"; return pages.peek().page.getLinks(ALL_LINKS); } // ---------------------------------------------------------- /** * Get an iterator over all links in the current document that refer to * other web pages. This is a subset of those returned by * {@link #getLinks()}, with any links to other locations within the same * page filtered out. This method is designed to make it easy to write * foreach-style loops over links. 
* * <b>Requires</b> the bot to be viewing a web page. * * @return an iterator of {@link URI} objects describing the * links in the page. */ public List<URI> getLinksToOtherPages() { assert isViewingWebPage() : "Not viewing a web page"; return pages.peek().page.getLinks(OTHER_PAGE_LINKS); } // ---------------------------------------------------------- /** * Get an iterator over all links in the current document that refer to * pages on other servers. This is a subset of those returned by * {@link #getLinks()}, with any links to pages on the same server as the * current page filtered out. This method is designed to make it easy * to write foreach-style loops over links. * * <b>Requires</b> the bot to be viewing a web page. * * @return an iterator of {@link URI} objects describing the * links in the page. */ public List<URI> getLinksOffServer() { assert isViewingWebPage() : "Not viewing a web page"; return pages.peek().page.getLinks(OTHER_SITE_LINKS); } // ---------------------------------------------------------- /** * Causes the bot to temporarily leave the current page and hop over to * the page at the end of the current link. The bot will "remember" where * it came from, keeping track of past pages in a stack. After working * with the other page, you can use {@link #returnToPreviousPage()} to * come back to the point where you left off. * * <b>Requires</b> the bot to be looking at a link (anchor) element on * the current web page. */ public void jumpToLinkedPage() { jumpToNormalizedURI(getLinkURI()); } // ---------------------------------------------------------- /** * Causes the bot to leave the current page and return to the page it was * previously visiting, at the location where it left off. The previous * page is the one that was most recently "remembered", or alternatively, * the one on top of the stack of previous pages that have been visited. * Use this method in conjunction with {@link #jumpToLinkedPage()} to * explore multiple pages. 
* * <b>Requires</b> the bot to have some previous page to return to. */ public void returnToPreviousPage() { assert hasPreviousPage() : "No previous page available"; pages.pop(); } // ---------------------------------------------------------- /** * Check to see if this bot previously visited a different page that it * can now return to. Is the stack of previous pages empty or not? * @return True if there is at least one previous page on the stack of * previous visited pages, or false if there are none. */ public boolean hasPreviousPage() { return pages.size() > 1; } // ---------------------------------------------------------- /** * How deep is the stack of previous pages that this robot can return to? * Each time the robot jumps to a new page, it remembers its previous * page so you can {@link #returnToPreviousPage()}. These previous pages * are remembered on a stack, and this method allows you to determine * how deep this stack is--that is, how many times you can repeatedly * call returnToPreviousPage() successfully. * @return The depth of the previous page stack. This result is zero if * the robot is on a page, but has not yet jumped to any others, or -1 * if there is no current page at all. */ public int numberOfPreviousPages() { return pages.size() - 1; } // ---------------------------------------------------------- /** * Causes the bot to temporarily leave the current page and hop over to * the page specified by the URL (as a string). The bot will "remember" * where it came from, keeping track of past pages in a stack. After * working with the other page, you can use {@link #returnToPreviousPage()} * to come back to the point where you left off. * @param url The new page to jump to */ public void jumpToPage(String url) { jumpToPage(urlForString(url)); } // ---------------------------------------------------------- /** * Causes the bot to temporarily leave the current page and hop over to * the page specified by the URL. 
The bot will "remember" where * it came from, keeping track of past pages in a stack. After working * with the other page, you can use {@link #returnToPreviousPage()} to * come back to the point where you left off. * @param url The new page to jump to */ public void jumpToPage(URL url) { assert url != null : "Specified url cannot be null"; jumpToNormalizedURL(normalizeURL(url)); } // ---------------------------------------------------------- /** * Causes the bot to temporarily leave the current page and hop over to * the page specified by the URL. The bot will "remember" where * it came from, keeping track of past pages in a stack. After working * with the other page, you can use {@link #returnToPreviousPage()} to * come back to the point where you left off. * @param uri The new page to jump to */ public void jumpToPage(URI uri) { assert uri != null : "Specified URI cannot be null"; jumpToNormalizedURI(uri.normalize()); } // ---------------------------------------------------------- /** * Causes the bot to temporarily leave the current page and hop over to * a specific HTML string provided as a parameter. Instead of reading * web content from the internet, the text you pass in will be used * instead. The bot will "remember" where it was before, keeping track * of past pages in a stack. After working with the provided HTML * content you pass in, you can use {@link #returnToPreviousPage()} * to come back to the point where you left off in the previous page. * @param html A string containing an HTML document to treat as if it * came from the web */ public void jumpToThisHTML(String html) { Page newPage = new Page(html); if ( newPage.success ) { jumpToPage(newPage); } } // ---------------------------------------------------------- /** * Get a fully-resolved URI from a (possibly relative) string URI, such as * the value of an anchor's href or an img's src attribute. 
If the * input parameter is a relative URI, it will be converted into an * appropriate absolute URI relative to the current page's web location. * * <b>Requires</b> the bot to be viewing a web page. * * @param uri The URI to convert to absolute form * @return The equivalent, fully-resolved URI, or null if * there is none. */ public URI resolveURIFromPage(String uri) { URI result = null; Page page = pages.peek().page; if (uri != null) { if (page.uri == null) { try { result = new URI(uri); } catch (URISyntaxException e) { try { URL resultAsUrl = new URL(uri); result = new URI( resultAsUrl.getProtocol(), resultAsUrl.getUserInfo(), resultAsUrl.getHost(), resultAsUrl.getPort(), resultAsUrl.getPath(), resultAsUrl.getQuery(), resultAsUrl.getRef() ); } catch (Exception ee) { ee.printStackTrace(out); } } } else { try { result = page.uri.resolve(uri); } catch (IllegalArgumentException e) { try { URL resultAsUrl = page.url == null ? new URL(page.url, uri) : new URL(uri); result = new URI( resultAsUrl.getProtocol(), resultAsUrl.getUserInfo(), resultAsUrl.getHost(), resultAsUrl.getPort(), resultAsUrl.getPath(), resultAsUrl.getQuery(), resultAsUrl.getRef() ); } catch (Exception ee) { ee.printStackTrace(out); } } } } if (result != null) { result = result.normalize(); } return result; } // ---------------------------------------------------------- /** * Check whether this robot has visited this page before. * @param uri The page to check * @return True if this robot has previously visited (or is currently on) * the given web page */ public boolean hasVisitedPage(URI uri) { try { return hasVisitedPage(uri.normalize().toURL()); } catch (MalformedURLException e) { return false; } } // ---------------------------------------------------------- /** * Check whether this robot has visited this page before. 
* @param url The page to check * @return True if this robot has previously visited (or is currently on) * the given web page */ public boolean hasVisitedPage(URL url) { return pageCache.containsKey(normalizeURL(url)); } // ---------------------------------------------------------- /** * Tell this bot where to send its output. Whenever you tell the bot to * echo content or headings, they will go to this destination. By * default, output goes to the standard output channel, but you can * change the destination here. * @param output The output channel to send messages to */ public void setOutputChannel(PrintWriter output) { assert output != null : "output parameter cannot be null"; if (output != trueChannel && output != out) { trueChannel = output; out = new PrintWriterWithHistory(trueChannel, true); } } // ---------------------------------------------------------- /** * Get the output channel where this bot is sending its output. * @return The current output channel for this bot */ public PrintWriterWithHistory getOutputChannel() { return out; } // ---------------------------------------------------------- /** * Get the output channel where this bot is sending its output. * This is just a short convenience synonym for * {@link #getOutputChannel()}. * @return The current output channel for this bot */ public PrintWriterWithHistory out() { return getOutputChannel(); } // ---------------------------------------------------------- /** * Check whether this robot's output should be treated as plain text, * or as HTML markup. The default is false (treat as plain text). * @return True if the output should be treated as HTML markup */ public boolean outputIsHtml() { return outputIsHtml; } // ---------------------------------------------------------- /** * Set whether this robot's output should be treated as plain text, * or as HTML markup. 
* @param value True if the output should be treated as HTML markup, false * if it should be treated as plain text */ public void setOutputIsHtml(boolean value) { outputIsHtml = value; } // ---------------------------------------------------------- /** * Execute this robot's built-in sequence of steps. The default sequence * is to do nothing, but subclasses can override this method to add * their own behaviors. These behaviors will be automatically run * if the robot is attached to a {@link RobotViewer}. */ public void run() { // The default does nothing } //~ Protected nested classes .............................................. // ---------------------------------------------------------- /** * Represents a web page that can be visited by this bot. This class is * not static, since it uses the output channel of the bot. */ protected class Page { /** This page's URL. */ public URL url; /** This page's URL as a URI. */ public URI uri; /** This page's title. */ private String title; /** This page's entire content as a string. */ private String content; /** This page's entire content as a string. */ private SoftReference<String> softContent; /** This page's entire content as a DOM tree. */ private SoftReference<Node> doc; /** Was this page read and initialized successfully? */ public boolean success = false; // ---------------------------------------------------------- /** * Create a new page by reading it from the web. * @param url the page's URL */ public Page(URL url) { this.url = url; // Initialize the uri field if (url != null) { try { uri = url.toURI(); success = true; } catch (URISyntaxException e) { e.printStackTrace(out); } } initialize(); } // ---------------------------------------------------------- /** * Create a new page by reading it from a local file. 
* @param file The file to read from */ public Page(File file) { try { FileReader in = new FileReader(file); content = readContentFrom(in); String name = file.getCanonicalPath(); String fileSeparator = System.getProperty("file.separator"); if (fileSeparator != null && fileSeparator != "/") { name.replaceAll("\\Q" + fileSeparator + "\\E", "/"); } if (!name.startsWith("/")) { name = "/" + name; } name = name.replaceAll(" ","%20"); url = new URL("file://" + name); success = true; } catch (IOException e) { e.printStackTrace(out); } initialize(); } // ---------------------------------------------------------- /** * Create a new page by reading it from a given HTML string. * @param htmlContent The content to use for this page */ public Page(String htmlContent) { content = htmlContent; success = content != null; initialize(); } // ---------------------------------------------------------- /** * Get an iterator over the headings in this document. * @param level The level of headings to get, where 0 is all headings, * and 1-6 are only the headings <= the given number * @return an iterator over the requested set of headings */ @SuppressWarnings("unchecked") public List<HtmlHeadingElement> getHeadings(int level) { return (List<HtmlHeadingElement>)(List)xPathFindAll(HTML_HEADING); } // ---------------------------------------------------------- /** * Get an iterator over the links in this document. * @param kind One of the constants ALL_LINKS, OTHER_PAGE_LINKS, * or OTHER_SITE_LINKS, indicating which links to include in the * iterator. 
* @return an iterator over the requested set of links */ public List<URI> getLinks(int kind) { List<HtmlElement> anchors = xPathFindAll(HTML_ANCHOR); List<URI> result = new ArrayList<URI>(); for (HtmlElement anchor : anchors) { URI thisUri = resolveURIFromPage(anchor.getAttributeValue("href")); if (thisUri != null) { if (kind == ALL_LINKS) { result.add(thisUri); } else { String scheme = uri.getScheme().toLowerCase(); if (!uri.isOpaque() || "http".equals(scheme) || "https".equals(scheme) || "file".equals(scheme)) { if (kind == OTHER_PAGE_LINKS) { result.add(thisUri); } else if (thisUri.getHost() != null && (!thisUri.getHost().equals( uri.getHost()))) { // kind must be OTHER_SITE_LINKS at this point result.add(thisUri); } } } } } return result; } // ---------------------------------------------------------- /** * Get this document's title a string. * @return The document title */ public String getTitle() { if (title == null) { HtmlElement e = xPathFindFirst("/html:html/html:head/html:title"); if (e != null) { title = e.getText(); } } return title; } // ---------------------------------------------------------- /** * Get this document's entire content as a string. * @return The document content */ public String getContent() { if (content != null) return content; String result = (softContent == null) ? null : softContent.get(); if (result == null) { result = student.web.internal.WebContent.get(url); softContent = new SoftReference<String>(result); } return result; } // ---------------------------------------------------------- /** * Get this document's entire content as a DOM tree. * @return a DOM node */ public Node getDoc() { Node result = (doc == null) ? 
null : doc.get(); if (result == null) { String docContent = getContent(); if (docContent != null) { try { TransformerHandler th = stf.newTransformerHandler(); // This dom result will contain the results of the // transformation DOMResult dr = new DOMResult(); th.setResult(dr); parser.setContentHandler(th); parser.parse( new InputSource(new StringReader(getContent()))); result = dr.getNode(); doc = new SoftReference<Node>(result); } catch (Exception e) { e.printStackTrace(out); success = false; } } else { success = false; } } return result; } // ---------------------------------------------------------- /** * @param xpathQuery An XPATH query to run against the DOM Tree * @return The first HTML element in the document that matches the * query. */ public HtmlElement xPathFindFirst(String xpathQuery) { NodeList nl = null; try { nl = (NodeList)xpath.evaluate( xpathQuery, getDoc(), XPathConstants.NODESET); } catch (Exception e) { e.printStackTrace(out); } return (nl == null || nl.getLength() == 0) ? null : tagForNode(nl.item(0)); } // ---------------------------------------------------------- /** * @param xpathQuery An XPATH query to run against the DOM Tree * @return A list of HTML elements that result from running the * query against the document. */ public List<HtmlElement> xPathFindAll(String xpathQuery) { NodeList nl = null; try { nl = (NodeList)xpath.evaluate( xpathQuery, getDoc(), XPathConstants.NODESET); } catch (Exception e) { e.printStackTrace(out); } ArrayList<HtmlElement> result = new ArrayList<HtmlElement>(); if (nl != null) { for (int i = 0; i < nl.getLength(); i++) { result.add(tagForNode(nl.item(i))); } } return result; } // ---------------------------------------------------------- /** * Get the number of times the {@link #targetPhrase} occurs in * this page. 
* @return The number of times the {@link #targetPhrase} occurred */ public int getPatternCount() { if ( lastPattern == null || !lastPattern.equals(targetPhrase) || patternCount < 0) { if (targetPhrase == null || content == null || content.length() == 0) { patternCount = 0; patternFrequency = 0.0; } else { Matcher matcher = targetPhrase.matcher(content); patternCount = 0; int chars = 0; while (matcher.find()) { patternCount++; chars += matcher.end() - matcher.start(); } if (chars == 0) { patternFrequency = 0.0; } else { patternFrequency = (double)chars/(double)content.length(); } } lastPattern = targetPhrase; } return patternCount; } // ---------------------------------------------------------- /** * Get the frequency of the {@link #targetPhrase}, which approximates * the size of all the occurrences of the target phrase in the document * divided by the document's total size. * @return The {@link #targetPhrase} frequency */ public double getPatternFrequency() { // Force it to be calculated first by getting the count getPatternCount(); // Now, just return the cached value return patternFrequency; } // ---------------------------------------------------------- private HtmlElement tagForNode(Node node) { String tagName = node.getNodeName(); if (tagName != null && tagName.length() == 2 && (tagName.charAt(0) == 'h' || tagName.charAt(0) == 'H') && tagName.charAt(1) > '0' && tagName.charAt(1) < '7') { return new HtmlHeadingNodeTag(node, xformer); } else { return new HtmlNodeTag(node, xformer); } } // ---------------------------------------------------------- /** * Dump this page for diagnostic purposes. 
* @param outstream The output channel to dump on */ private String readContentFrom(Reader in) { StringWriter writer = new StringWriter(8192); char buff[] = new char[8192]; try { int len = in.read(buff); while (len > -1) { writer.write(buff, 0, len); len = in.read(buff); } } catch (IOException e) { e.printStackTrace(out); } finally { try { in.close(); } catch (IOException e) { e.printStackTrace(out); } try { writer.close(); } catch (IOException e) { e.printStackTrace(out); } } return writer.toString(); } // ---------------------------------------------------------- /** * Dump this page for diagnostic purposes. * @param outstream The output channel to dump on */ public void dump(PrintStream outstream) { outstream.println("dumping doc: url = " + url); outstream.println(" success = " + success); outstream.println(" uri = " + uri); outstream.println(" title = " + title); outstream.println(" begin-content"); outstream.println(content); outstream.println(" end-content"); } // ---------------------------------------------------------- private void initialize() { // Force the document to be parsed getDoc(); } // ---------------------------------------------------------- private Pattern lastPattern; private int patternCount = -1; private double patternFrequency = -1.0; } // ---------------------------------------------------------- /** * Represents a bot location on a specific web page. */ protected static class PageLocation { /** The page containing this location. */ public Page page; /** The position within the list of elements of interest. */ public int pos = -1; private int len = -1; private String elementXpath = null; private SoftReference<List<HtmlElement>> elts; // ---------------------------------------------------------- /** * Create a new page location with its own list of elements of * interest. 
* @param p The page */ public PageLocation(Page p) { page = p; } // ---------------------------------------------------------- /** * Set the xpath expression defining the elements of interest * on this page, which will reset the current position and the * current lis of elements of interest. * @param xpath The new xpath expression */ public void setElementXpath(String xpath) { if (xpath == null) { if (elementXpath == null) return; pos = -1; len = -1; elts = null; elementXpath = xpath; } else if (!xpath.equals(elementXpath)) { pos = -1; len = -1; elts = null; elementXpath = xpath; } } // ---------------------------------------------------------- /** * Get the length of the current list of elements of interest. * Use this instead of elts().size() where possible. * @return The number of elements of interest on this page. */ public int len() { if (len == -1) { len = elts().size(); } return len; } // ---------------------------------------------------------- /** * Get the current list of elements of interest. * @return The list of elements of interest */ public List<HtmlElement> elts() { List<HtmlElement> result = (elts == null) ? null : elts.get(); if (result == null) { String xpathQuery = elementXpath; if (xpathQuery == null) { xpathQuery = HTML_HEADING_OR_ANCHOR; } result = page.xPathFindAll(xpathQuery); elts = new SoftReference<List<HtmlElement>>(result); } return result; } // ---------------------------------------------------------- /** * Determine whether this position refers to a current element. * @return True if this position is standing on an element */ public boolean hasCurrentElt() { return pos >= 0 && pos < len(); } // ---------------------------------------------------------- /** * Get the current element. * @return The current element, or null if none */ public HtmlElement currentElt() { return hasCurrentElt() ? 
elts().get(pos) : null; } } // ---------------------------------------------------------- private static class AttributeIterator implements Iterator<String>, Iterable<String> { // ---------------------------------------------------------- public AttributeIterator(NamedNodeMap map) { inner = map; pos = 0; } // ---------------------------------------------------------- public boolean hasNext() { return pos < inner.getLength(); } // ---------------------------------------------------------- public String next() { Attr attr = (Attr)inner.item(pos); pos++; return attr.getName(); } // ---------------------------------------------------------- public void remove() { throw new UnsupportedOperationException(); } // ---------------------------------------------------------- public Iterator<String> iterator() { return this; } //~ Instance/static variables ......................................... private NamedNodeMap inner; private int pos; } // ---------------------------------------------------------- private static class HtmlHeadingNodeTag extends HtmlNodeTag implements HtmlHeadingElement { // ---------------------------------------------------------- public HtmlHeadingNodeTag(Node node, Transformer transformer) { super(node, transformer); } // ---------------------------------------------------------- public int getHeadingLevel() { if (level == 0) { String name = getType(); level = (int)(name.charAt(1) - '0'); } return level; } //~ Instance/static variables ......................................... 
private int level = 0; } // ---------------------------------------------------------- private static class HtmlNodeTag implements HtmlElement { // ---------------------------------------------------------- public HtmlNodeTag(Node node, Transformer transformer) { inner = node; xformer = transformer; } // ---------------------------------------------------------- public String getType() { return inner.getNodeName(); } // ---------------------------------------------------------- public String getText() { String result = getInnerHTML(); if (result != null) { Matcher m = INNER_TAG_TRIMMER.matcher(result); result = m.replaceAll(""); } return result; } // ---------------------------------------------------------- public String getInnerHTML() { if (nodeChildrenAsTextIsNull) { return null; } String result = nodeChildrenAsText == null ? null : nodeChildrenAsText.get(); if (result == null) { result = toString(); if (result != null) { Matcher m = TAG_TRIMMER.matcher(result); if (m.find()) { result = m.group(1); nodeChildrenAsText = new SoftReference<String>(result); } else { result = null; nodeChildrenAsTextIsNull = true; } } } return result; } // ---------------------------------------------------------- public boolean hasAttribute(String attributeName) { return inner.getAttributes().getNamedItem(attributeName) != null; } // ---------------------------------------------------------- public String getAttributeValue(String attributeName) { Attr attr = (Attr)inner.getAttributes().getNamedItem(attributeName); return attr == null ? null : attr.getNodeValue(); } // ---------------------------------------------------------- public Iterable<String> getAttributes() { return new AttributeIterator(inner.getAttributes()); } // ---------------------------------------------------------- public String toString() { String result = nodeAsText == null ? 
null : nodeAsText.get(); if (result == null) { try { result = dumpNode(inner); nodeAsText = new SoftReference<String>(result); } catch (Exception e) { e.printStackTrace(); } } return result; } // ---------------------------------------------------------- /** * @param node A node to be dumped to a string * @param omitDeclaration A boolean whether to omit the XML declaration * @return A string representation of the node. * @throws Exception If anything goes wrong. Error handling omitted. */ private String dumpNode(Node node) throws Exception { StringWriter sw = new StringWriter(); Result result = new StreamResult(sw); Source source = new DOMSource(node); xformer.transform(source, result); return sw.toString(); } //~ Instance/static variables ......................................... private Transformer xformer; private Node inner; private SoftReference<String> nodeAsText; private SoftReference<String> nodeChildrenAsText; private boolean nodeChildrenAsTextIsNull; } //~ Protected methods ..................................................... // ---------------------------------------------------------- /** * Get the current web page's entire content as a string. * * <b>Requires</b> the bot to be viewing a web page. * * @return The page's content */ protected String getPageContent() { assert isViewingWebPage() : "Not viewing a web page"; return pages.peek().page.getContent(); } // ---------------------------------------------------------- /** * Bind a symbolic name to an XML namespace URL so that the symbolic name * can be used as a namespace prefix on identifiers in XPATH expressions. * This method is for <b>advanced users only</b>. It is only necessary * if your WebBot is manipulating content that is not HTML/XHTML, and you * need to write XPATH expressions in some other XML namespace. The * default namespace bindings are for the prefix "html" to be bound to * the namespace http://www.w3.org/1999/xhtml. 
You can add as many * additional namespaces as you need in order to build your own XPATH * expressions. * * @param name The symbolic prefix to use for this namesapce * @param url The URL identifying this XML namespace */ protected void addXpathNamespace(String name, String url) { nc.setNamespace(name, url); } // ---------------------------------------------------------- /** * Determine whether a given HTML element is an anchor tag with an HREF * attribute. * @param element The HTML element to test * @return True if it is a link */ protected boolean isLink(HtmlElement element) { if (element == null) return false; String name = element.getType(); return "a".equals(name) || "A".equals(name); } // ---------------------------------------------------------- /** * Determine whether a given HTML element is a heading tag. * @param element The HTML element to test * @return True if it is a heading (any level) */ protected boolean isHeading(HtmlElement element) { return element != null && element instanceof HtmlHeadingElement; } // ---------------------------------------------------------- /** * Convert an HTML element representing a heading tag into its * corresponding level number. * @param element The HTML element to look up * @return The heading's level, 1-6, or 0 if this is not a heading */ protected int levelOf(HtmlElement element) { return isHeading(element) ? ((HtmlHeadingElement)element).getHeadingLevel() : 0; } // ---------------------------------------------------------- /** * Retrieve the cached page for the given URL. This method will * create the page and insert it in the cache if it does not yet exist. * Assumes the URL has been normalized and is absolute. 
* @param url The URL to look up * @return the page object for this URL */ protected Page cachedPageFor(URL url) { Page result = pageCache.get(url); if (result == null) { result = new Page(url); pageCache.put(url, result); } return result; } // ---------------------------------------------------------- /** * Performs cleanup once this bot has completed all its tasks. Users * should never need to explicitly call this operation. */ protected void releaseCachedResources() { pages.clear(); pageCache.clear(); } // ---------------------------------------------------------- /** * Convert a string to a URL. * @param url The string to convert * @return the URL, if one exists, or null if a conversion error occurs. */ protected URL urlForString(String url) { try { return new URI(url).normalize().toURL(); } catch (URISyntaxException e) { e.printStackTrace(out); } catch (MalformedURLException e) { e.printStackTrace(out); } return null; } // ---------------------------------------------------------- /** * Normalize a URL. * @param url The url to normalize * @return the normalized version of the URL */ protected URL normalizeURL(URL url) { try { if (url.toString().indexOf(' ') >= 0) { url = new URL(url.toString().replaceAll(" ","%20")); } return url.toURI().normalize().toURL(); } catch (URISyntaxException e) { e.printStackTrace(out); } catch (MalformedURLException e) { e.printStackTrace(out); } return null; } // ---------------------------------------------------------- /** * The worker method for the various flavors of {@link #jumpToPage(URI)}. * This method assumes the given URI has been normalized. * @param uri The new page to jump to */ protected void jumpToNormalizedURI(URI uri) { try { jumpToNormalizedURL(uri.toURL()); } catch (MalformedURLException e) { e.printStackTrace(out); } } // ---------------------------------------------------------- /** * The worker method for the various flavors of {@link #jumpToPage(URL)}. * This method assumes the given URL has been normalized. 
* @param url The new page to jump to */ protected void jumpToNormalizedURL(URL url) { if (url == null) return; Page newPage = cachedPageFor(url); if (newPage.success) { jumpToPage(newPage); } } // ---------------------------------------------------------- /** * The worker method for the various flavors of {@link #jumpToPage(URL)}. * This method assumes the given URL has been normalized. * @param file The new page to jump to */ protected void jumpToNormalizedURL(File file) { try { jumpToNormalizedURL(normalizeURL(makeFileAbsolute(file).toURI().toURL())); } catch (MalformedURLException e) { e.printStackTrace(out); } } // ---------------------------------------------------------- /** * Adds this page to the history stack, enforcing required stack size * limit. * @param page The new page to add to the stack */ protected void jumpToPage(Page page) { pages.push(new PageLocation(page)); } // ---------------------------------------------------------- /** * This is needed to get around issues with relative file names when * the current working directory is unknown or when running on a * server. * @param file The file to turn into an absolute path * @return An absolute version of the file, relative to the "logical" * current working directory from a student perspective, which may be * different than the JVM's true cwd. * @see IOHelper#getFile(File) */ protected File makeFileAbsolute(File file) { File result = file; try { if (!file.isAbsolute()) { result = IOHelper.getFile(file); } } catch (NoClassDefFoundError e) { if (isOnServer()) { throw new RuntimeException( "You must use cs1705.IOHelper.getFile() to create file " + " objects on the server"); } } return result; } //~ Instance/static variables ............................................. /** The stack of pages in the current history trail, where the top of * the stack is the current page. */ protected Stack<PageLocation> pages = new Stack<PageLocation>(); /** The current output channel. 
*/
    protected PrintWriter trueChannel;
    /** The current output channel; exception stack traces are printed
     *  here as well. */
    protected PrintWriterWithHistory out;
    /** The target phrase to search for. */
    protected Pattern targetPhrase;

    /** Internal constant used to specify the set of links to get from a
     *  page. */
    protected static final int ALL_LINKS = 0;
    /** Internal constant used to specify the set of links to get from a
     *  page. */
    protected static final int OTHER_PAGE_LINKS = 1;
    /** Internal constant used to specify the set of links to get from a
     *  page. */
    protected static final int OTHER_SITE_LINKS = 2;

    /** Internal constant used as search + namespace prefix for xpath nodes.
     *  Its value is "//html:". */
    protected static final String HTML_NODE_PREFIX = "//html:";

    /** A cache of all pages visited so far. */
    private Map<URL, Page> pageCache = new HashMap<URL, Page>();

    /** Should the output stream generated by this bot be treated as HTML?. */
    private boolean outputIsHtml = false;

    /** The namespace prefix bindings, created in the constructor and
     *  extended through addXpathNamespace(). */
    private MutableNamespaceContext nc;
    /** The TagSoup parser that reads page content when a page's DOM tree
     *  is built. */
    private Parser parser = new Parser();
    /** Creates the handler that receives the parser's output and builds
     *  the DOM tree. */
    private SAXTransformerFactory stf =
        (SAXTransformerFactory)TransformerFactory.newInstance();
    /** Factory for {@link #xpath}; must be declared before it. */
    private XPathFactory xpf = XPathFactory.newInstance();
    /** Evaluates the xpath queries run against page DOM trees. */
    private XPath xpath = xpf.newXPath();
    /** Handed to every HTML element wrapper, which uses it to serialize
     *  its DOM node to text. */
    private Transformer xformer;

    /** Matches text that starts with one tag and ends with another,
     *  capturing everything in between as group 1. */
    private static final Pattern TAG_TRIMMER =
        Pattern.compile("^<[^>]*>(.*)</[^>]*>$", Pattern.DOTALL);
    /** Matches any single tag; used to strip tags out of inner HTML. */
    private static final Pattern INNER_TAG_TRIMMER =
        Pattern.compile("<[^>]*>", Pattern.DOTALL);
    /** The xpath query selecting every anchor element. */
    private static final String HTML_ANCHOR = HTML_NODE_PREFIX + "a";
    /** The xpath query selecting every heading element, h1 through h6. */
    private static final String HTML_HEADING =
        HTML_NODE_PREFIX + "h1|"
        + HTML_NODE_PREFIX + "h2|"
        + HTML_NODE_PREFIX + "h3|"
        + HTML_NODE_PREFIX + "h4|"
        + HTML_NODE_PREFIX + "h5|"
        + HTML_NODE_PREFIX + "h6";
    /** The xpath query selecting every heading or anchor element; the
     *  default set of elements of interest for a page location. */
    private static final String HTML_HEADING_OR_ANCHOR =
        HTML_HEADING + "|" + HTML_ANCHOR;
}