// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v $
// $Author: derrickoswald $
// $Date: 2005/04/12 11:27:41 $
// $Revision: 1.9 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.parserapplications;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;

import javax.swing.JFileChooser;
import javax.swing.JOptionPane;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Save a web site locally.
 * Illustrative program to save a web site's contents locally.
 * It was created to demonstrate URL rewriting in its simplest form.
 * It uses customized tags in the NodeFactory to alter the URLs.
 * This program has a number of limitations:
 * <ul>
 * <li>it doesn't capture forms, this would involve too many assumptions</li>
 * <li>it doesn't capture script references, so funky onMouseOver and other
 * non-static content will not be faithfully reproduced</li>
 * <li>it doesn't handle style sheets</li>
 * <li>it doesn't dig into attributes that might reference resources, so
 * for example, background images won't necessarily be captured</li>
 * <li>worst of all, it gets confused when a URL both has content and is
 * the prefix for other content,
 * i.e. http://whatever.com/top and http://whatever.com/top/sub.html both
 * yield content, since this cannot be faithfully replicated to a static
 * directory structure (this happens a lot with servlet based sites)</li>
 * </ul>
 */
public class SiteCapturer
{
    /**
     * The web site to capture.
     * This is used as the base URL in deciding whether to adjust a link
     * and whether to capture a page or not.
     */
    protected String mSource;

    /**
     * The local directory to capture to.
     * This is used as a base prefix for files saved locally.
     */
    protected String mTarget;

    /**
     * The list of pages to capture.
     * Links are added to this list as they are discovered, and removed in
     * sequential order (FIFO queue), leading to a breadth-first traversal
     * of the web site space.
     */
    protected ArrayList mPages;
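
    // A minimal sketch of the queue discipline described above; the
    // authoritative loop lives in capture() and process() below. Pages come
    // off the front (FIFO) and newly discovered links go on the back, which
    // is what makes the traversal breadth first:
    //
    //     while (0 != mPages.size ())
    //     {
    //         String page = (String)mPages.remove (0); // dequeue the oldest
    //         // ... parse the page; each undiscovered link is enqueued
    //         // at the tail with mPages.add (link) ...
    //     }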

    /**
     * The set of pages already captured.
     * Used to avoid repeated acquisition of the same page.
     */
    protected HashSet mFinished;

    /**
     * The list of resources to copy.
     * Images and other resources are added to this list as they are discovered.
     */
    protected ArrayList mImages;

    /**
     * The set of resources already copied.
     * Used to avoid repeated acquisition of the same images and other resources.
     */
    protected HashSet mCopied;

    /**
     * The parser to use for processing.
     */
    protected Parser mParser;

    /**
     * If <code>true</code>, save resources locally too,
     * otherwise, leave resource links pointing to original page.
     */
    protected boolean mCaptureResources;

    /**
     * The filter to apply to the nodes retrieved.
     */
    protected NodeFilter mFilter;

    /**
     * Copy buffer size.
     * Resources are moved to disk in chunks this size or less.
     */
    protected final int TRANSFER_SIZE = 4096;

    /**
     * Create a web site capturer.
     */
    public SiteCapturer ()
    {
        PrototypicalNodeFactory factory;

        mSource = null;
        mTarget = null;
        mPages = new ArrayList ();
        mFinished = new HashSet ();
        mImages = new ArrayList ();
        mCopied = new HashSet ();
        mParser = new Parser ();
        factory = new PrototypicalNodeFactory ();
        factory.registerTag (new LocalLinkTag ());
        factory.registerTag (new LocalFrameTag ());
        factory.registerTag (new LocalBaseHrefTag ());
        factory.registerTag (new LocalImageTag ());
        mParser.setNodeFactory (factory);
        mCaptureResources = true;
        mFilter = null;
    }

    /**
     * Getter for property source.
     * @return Value of property source.
     */
    public String getSource ()
    {
        return (mSource);
    }

    /**
     * Setter for property source.
     * This is the base URL to capture. URLs that don't start with this prefix
     * are ignored (left as is), while the ones with this URL as a base are
     * re-homed to the local target.
     * @param source New value of property source.
     */
    public void setSource (String source)
    {
        if (source.endsWith ("/"))
            source = source.substring (0, source.length () - 1);

        mSource = source;
    }

    /**
     * Getter for property target.
     * @return Value of property target.
     */
    public String getTarget ()
    {
        return (mTarget);
    }

    /**
     * Setter for property target.
     * This is the local directory under which to save the site's pages.
     * @param target New value of property target.
     */
    public void setTarget (String target)
    {
        mTarget = target;
    }

    /**
     * Getter for property captureResources.
     * If <code>true</code>, the images and other resources referenced by
     * the site and within the base URL tree are also copied locally to the
     * target directory. If <code>false</code>, the image links are left 'as
     * is', still referring to the original site.
     * @return Value of property captureResources.
     */
    public boolean getCaptureResources ()
    {
        return (mCaptureResources);
    }

    /**
     * Setter for property captureResources.
     * @param capture New value of property captureResources.
     */
    public void setCaptureResources (boolean capture)
    {
        mCaptureResources = capture;
    }

    /**
     * Getter for property filter.
     * @return Value of property filter.
     */
    public NodeFilter getFilter ()
    {
        return (mFilter);
    }

    /**
     * Setter for property filter.
     * @param filter New value of property filter.
     */
    public void setFilter (NodeFilter filter)
    {
        mFilter = filter;
    }

    /**
     * Returns <code>true</code> if the link is one we are interested in.
     * @param link The link to be checked.
     * @return <code>true</code> if the link has the source URL as a prefix
     * and doesn't contain '?' or '#'; the former because we won't be able to
     * handle server side queries in the static target directory structure and
     * the latter because presumably the full page with that reference has
     * already been captured previously. This performs a case insensitive
     * comparison, which is cheating really, but it's cheap.
     */
    protected boolean isToBeCaptured (String link)
    {
        return (
            link.toLowerCase ().startsWith (getSource ().toLowerCase ())
            && (-1 == link.indexOf ("?"))
            && (-1 == link.indexOf ("#")));
    }
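
    // Illustrative behaviour of isToBeCaptured(), assuming a hypothetical
    // source of "http://example.com/site" (set via setSource()):
    //
    //     isToBeCaptured ("http://example.com/site/a.html")     -> true
    //     isToBeCaptured ("http://EXAMPLE.com/site/a.html")     -> true  (case insensitive)
    //     isToBeCaptured ("http://example.com/site/a.html#top") -> false ('#')
    //     isToBeCaptured ("http://example.com/site/q?x=1")      -> false ('?')
    //     isToBeCaptured ("http://other.com/a.html")            -> false (different prefix)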

    /**
     * Returns <code>true</code> if the link contains text/html content.
     * @param link The URL to check for content type.
     * @return <code>true</code> if the HTTP header indicates the type is
     * "text/html".
     * @exception ParserException If the supplied URL can't be read from.
     */
    protected boolean isHtml (String link) throws ParserException
    {
        URL url;
        URLConnection connection;
        String type;
        boolean ret;

        ret = false;
        try
        {
            url = new URL (link);
            connection = url.openConnection ();
            type = connection.getContentType ();
            if (type == null)
                ret = false;
            else
                ret = type.startsWith ("text/html");
        }
        catch (Exception e)
        {
            throw new ParserException ("URL " + link + " has a problem", e);
        }

        return (ret);
    }

    /**
     * Converts a link to local.
     * A relative link can be used to construct both a URL and a file name.
     * Basically, the operation is to strip off the base URL, if any,
     * and then prepend as many dot-dots as necessary to make
     * it relative to the current page.
     * A bit of a kludge handles the root page specially by calling it
     * index.html, even though that probably isn't its real file name.
     * This isn't pretty, but it works for me.
     * @param link The link to make relative.
     * @param current The current page URL, or empty if it's an absolute URL
     * that needs to be converted.
     * @return The URL relative to the current page.
     */
    protected String makeLocalLink (String link, String current)
    {
        int i;
        int j;
        String ret;

        if (link.equals (getSource ())
            || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
            ret = "index.html"; // handle the root page specially
        else if (link.startsWith (getSource ())
            && (link.length () > getSource ().length ()))
            ret = link.substring (getSource ().length () + 1);
        else
            ret = link; // give up

        // make it relative to the current page by prepending "../" for
        // each '/' in the current local path
        if ((null != current)
            && link.startsWith (getSource ())
            && (current.length () > getSource ().length ()))
        {
            current = current.substring (getSource ().length () + 1);
            i = 0;
            while (-1 != (j = current.indexOf ('/', i)))
            {
                ret = "../" + ret;
                i = j + 1;
            }
        }

        return (ret);
    }

    /**
     * Unescape a URL to form a file name.
     * Very crude.
     * @param raw The escaped URI.
     * @return The native URI.
     */
    protected String decode (String raw)
    {
        int length;
        int start;
        int index;
        int value;
        StringBuffer ret;

        ret = new StringBuffer (raw.length ());
        length = raw.length ();
        start = 0;
        while (-1 != (index = raw.indexOf ('%', start)))
        {
            // append the part up to the % sign
            ret.append (raw.substring (start, index));
            // there must be two hex digits after the percent sign
            if (index + 2 < length)
            {
                try
                {
                    value = Integer.parseInt (raw.substring (index + 1, index + 3), 16);
                    ret.append ((char)value);
                    start = index + 3;
                }
                catch (NumberFormatException nfe)
                {
                    ret.append ('%');
                    start = index + 1;
                }
            }
            else
            {
                // this case is actually illegal in a URI, but...
                ret.append ('%');
                start = index + 1;
            }
        }
        ret.append (raw.substring (start));

        return (ret.toString ());
    }
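
    // Worked examples for makeLocalLink() and decode(), again assuming a
    // hypothetical source of "http://example.com/site":
    //
    //     makeLocalLink ("http://example.com/site", "")
    //         -> "index.html"                  (root page kludge)
    //     makeLocalLink ("http://example.com/site/a/b.html", "")
    //         -> "a/b.html"                    (base stripped)
    //     makeLocalLink ("http://example.com/site/a/b.html",
    //                    "http://example.com/site/c/d.html")
    //         -> "../a/b.html"                 (one "../" per '/' in "c/d.html")
    //
    //     decode ("my%20page.html") -> "my page.html"
    //     decode ("100%25.html")    -> "100%.html"
    //     decode ("50%")            -> "50%"   (trailing '%' passed through)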

    /**
     * Copy a resource (image) locally.
     * Removes one element from the 'to be copied' list and saves the
     * resource it points to locally as a file.
     */
    protected void copy ()
    {
        String link;
        String raw;
        String name;
        File file;
        File dir;
        URL source;
        byte[] data;
        InputStream in;
        FileOutputStream out;
        int read;

        link = (String)mImages.remove (0);
        mCopied.add (link);
        if (getCaptureResources ())
        {
            raw = makeLocalLink (link, "");
            name = decode (raw);
            file = new File (getTarget (), name);
            System.out.println ("copying " + link + " to " + file.getAbsolutePath ());
            // ensure directory exists
            dir = file.getParentFile ();
            if (!dir.exists ())
                dir.mkdirs ();
            try
            {
                source = new URL (link);
                data = new byte[TRANSFER_SIZE];
                try
                {
                    in = source.openStream ();
                    try
                    {
                        out = new FileOutputStream (file);
                        try
                        {
                            while (-1 != (read = in.read (data, 0, data.length)))
                                out.write (data, 0, read);
                        }
                        finally
                        {
                            out.close ();
                        }
                    }
                    catch (FileNotFoundException fnfe)
                    {
                        fnfe.printStackTrace ();
                    }
                    finally
                    {
                        in.close ();
                    }
                }
                catch (FileNotFoundException fnfe)
                {
                    System.err.println ("broken link " + fnfe.getMessage () + " ignored");
                }
            }
            catch (MalformedURLException murle)
            {
                murle.printStackTrace ();
            }
            catch (IOException ioe)
            {
                ioe.printStackTrace ();
            }
        }
    }

    /**
     * Process a single page.
     * @param filter The filter to apply to the collected nodes.
     * @exception ParserException If a parse error occurs.
     */
    protected void process (NodeFilter filter) throws ParserException
    {
        String url;
        int bookmark;
        NodeList list;
        NodeList robots;
        MetaTag robot;
        String content;
        File file;
        File dir;
        PrintWriter out;

        // get the next URL and add it to the done pile
        url = (String)mPages.remove (0);
        System.out.println ("processing " + url);
        mFinished.add (url);

        try
        {
            bookmark = mPages.size ();
            // fetch the page and gather the list of nodes
            mParser.setURL (url);
            try
            {
                list = new NodeList ();
                for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
                    list.add (e.nextNode ()); // URL conversion occurs in the tags
            }
            catch (EncodingChangeException ece)
            {
                // fix bug #998195 SiteCapturer just crashed
                // try again with the encoding now set correctly
                // hopefully mPages, mImages, mCopied and mFinished won't be corrupted
                mParser.reset ();
                list = new NodeList ();
                for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
                    list.add (e.nextNode ());
            }

            // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
            // <meta name="robots" content="index,follow" />
            // <meta name="robots" content="noindex,nofollow" />
            robots = list.extractAllNodesThatMatch (
                new AndFilter (
                    new NodeClassFilter (MetaTag.class),
                    new HasAttributeFilter ("name", "robots")), true);
            if (0 != robots.size ())
            {
                robot = (MetaTag)robots.elementAt (0);
                content = robot.getAttribute ("content");
                // guard against a robots meta tag with no content attribute
                content = (null == content) ? "" : content.toLowerCase ();
                if ((-1 != content.indexOf ("none"))
                    || (-1 != content.indexOf ("nofollow")))
                    // reset mPages to its state at the bookmark, removing
                    // from the tail so the shrinking list doesn't cause
                    // entries to be skipped
                    while (mPages.size () > bookmark)
                        mPages.remove (mPages.size () - 1);
                if ((-1 != content.indexOf ("none"))
                    || (-1 != content.indexOf ("noindex")))
                    return;
            }

            if (null != filter)
                list.keepAllNodesThatMatch (filter, true);

            // save the page locally
            file = new File (getTarget (), makeLocalLink (url, ""));
            dir = file.getParentFile ();
            if (!dir.exists ())
                dir.mkdirs ();
            else if (!dir.isDirectory ())
            {
                dir = new File (dir.getParentFile (), dir.getName () + ".content");
                if (!dir.exists ())
                    dir.mkdirs ();
                file = new File (dir, file.getName ());
            }

            try
            {
                out = new PrintWriter (new FileOutputStream (file));
                for (int i = 0; i < list.size (); i++)
                    out.print (list.elementAt (i).toHtml ());
                out.close ();
            }
            catch (FileNotFoundException fnfe)
            {
                fnfe.printStackTrace ();
            }
        }
        catch (ParserException pe)
        {
            String message;

            // this exception handling is suboptimal,
            // but it recognizes resources that aren't text/html
            message = pe.getMessage ();
            if ((null != message) && (message.endsWith ("does not contain text")))
            {
                if (!mCopied.contains (url))
                    if (!mImages.contains (url))
                        mImages.add (url);
                mFinished.remove (url);
            }
            else
                throw pe;
        }
    }
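
    // An illustration of the name-collision handling in process() above
    // (paths hypothetical, assuming a source of "http://example.com"). If
    // http://example.com/top yields content, it is saved as the plain file
    // "top"; when http://example.com/top/sub.html is captured later, "top"
    // cannot also become a directory, so the page is diverted into a sibling
    // "top.content" directory instead:
    //
    //     <target>/top                     page content for /top
    //     <target>/top.content/sub.html    page content for /top/sub.html
    //
    // This is the best-effort answer to the servlet-site limitation noted in
    // the class comment; links rewritten on other pages are not adjusted for
    // the diversion.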

    /**
     * Link tag that rewrites the HREF.
     * The HREF is changed to a local target if it matches the source.
     */
    class LocalLinkTag extends LinkTag
    {
        public void doSemanticAction () throws ParserException
        {
            boolean html;
            String link;

            // get the link
            link = getLink ();
            // check if it needs to be captured
            if (isToBeCaptured (link))
            {
                // add the link to a list to be processed
                if (mFinished.contains (link))
                    html = true;
                else if (mPages.contains (link))
                    html = true;
                else if (mCopied.contains (link))
                    html = false;
                else if (mImages.contains (link))
                    html = false;
                else
                {
                    // this test is expensive, do it reluctantly
                    html = isHtml (link);
                    if (html)
                        mPages.add (link);
                    else
                        mImages.add (link);
                }
                // alter the link
                if (html || (!html && getCaptureResources ()))
                    link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());
                setLink (link);
            }
        }
    }

    /**
     * Frame tag that rewrites the SRC URLs.
     * The SRC URLs are mapped to local targets if they match the source.
     */
    class LocalFrameTag extends FrameTag
    {
        public void doSemanticAction () throws ParserException
        {
            boolean html;
            String link;

            // get the link
            link = getFrameLocation ();
            // check if it needs to be captured
            if (isToBeCaptured (link))
            {
                // add the link to a list to be processed
                if (mFinished.contains (link))
                    html = true;
                else if (mPages.contains (link))
                    html = true;
                else if (mCopied.contains (link))
                    html = false;
                else if (mImages.contains (link))
                    html = false;
                else
                {
                    // this test is expensive, do it reluctantly
                    html = isHtml (link);
                    if (html)
                        mPages.add (link);
                    else
                        mImages.add (link);
                }
                // alter the link
                if (html || (!html && getCaptureResources ()))
                    link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());
                setFrameLocation (link);
            }
        }
    }

    /**
     * Image tag that rewrites the SRC URL.
     * If resources are being captured the SRC is mapped to a local target if
     * it matches the source, otherwise it is converted to a full URL to point
     * back to the original site.
     */
    class LocalImageTag extends ImageTag
    {
        public void doSemanticAction () throws ParserException
        {
            String image;

            // get the image url
            image = getImageURL ();
            // check if it needs to be captured
            if (isToBeCaptured (image))
            {
                // add the image to the list needing to be copied
                if (!mCopied.contains (image))
                    if (!mImages.contains (image))
                        mImages.add (image);
                if (getCaptureResources ())
                    image = makeLocalLink (image, mParser.getLexer ().getPage ().getUrl ());
                // alter the link
                setImageURL (image);
            }
        }
    }

    /**
     * Base tag that doesn't show.
     * The toHtml() method is overridden to return an empty string,
     * effectively shutting off the base reference.
     */
    class LocalBaseHrefTag extends BaseHrefTag
    {
        // we don't want to have a base pointing back at the source page
        public String toHtml ()
        {
            return ("");
        }
    }

    /**
     * Perform the capture.
     */
    public void capture ()
    {
        mPages.clear ();
        mPages.add (getSource ());
        while (0 != mPages.size ())
            try
            {
                process (getFilter ());
                while (0 != mImages.size ())
                    copy ();
            }
            catch (ParserException pe)
            {
                // this exception handling is suboptimal,
                // but it messages correctly about broken links
                Throwable throwable;

                throwable = pe.getThrowable ();
                if (null != throwable)
                {
                    throwable = throwable.getCause ();
                    if (throwable instanceof FileNotFoundException)
                        System.err.println ("broken link "
                            + ((FileNotFoundException)throwable).getMessage ()
                            + " ignored");
                    else
                        pe.printStackTrace ();
                }
                else
                    pe.printStackTrace ();
            }
    }
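
    // A minimal programmatic usage sketch (URL and directory are
    // illustrative); main() below does the same with dialog fallbacks:
    //
    //     SiteCapturer capturer = new SiteCapturer ();
    //     capturer.setSource ("http://example.com/site");
    //     capturer.setTarget ("/tmp/example.com");
    //     capturer.setCaptureResources (true); // also copy images etc.
    //     capturer.capture ();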

    /**
     * Mainline to capture a web site locally.
     * @param args The command line arguments.
     * There are three arguments: the web site to capture, the local directory
     * to save it to, and a flag (true or false) to indicate whether resources
     * such as images and video are to be captured as well.
     * These are requested via dialog boxes if not supplied.
     * @exception MalformedURLException If the supplied URL is invalid.
     * @exception IOException If an error occurs reading the page or resources.
     */
    public static void main (String[] args) throws MalformedURLException, IOException
    {
        SiteCapturer worker;
        String url;
        JFileChooser chooser;
        URL source;
        String path;
        File target;
        Boolean capture;
        int ret;

        worker = new SiteCapturer ();
        if (0 >= args.length)
        {
            url = (String)JOptionPane.showInputDialog (
                null,
                "Enter the URL to capture:",
                "Web Site",
                JOptionPane.PLAIN_MESSAGE,
                null,
                null,
                "http://htmlparser.sourceforge.net/wiki");
            if (null != url)
                worker.setSource (url);
            else
                System.exit (1);
        }
        else
            worker.setSource (args[0]);
        if (1 >= args.length)
        {
            url = worker.getSource ();
            source = new URL (url);
            path = new File (new File ("." + File.separator),
                source.getHost () + File.separator).getCanonicalPath ();
            target = new File (path);
            chooser = new JFileChooser (target);
            chooser.setDialogType (JFileChooser.SAVE_DIALOG);
            chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);
            chooser.setSelectedFile (target); // this doesn't frickin' work
            chooser.setMultiSelectionEnabled (false);
            chooser.setDialogTitle ("Target Directory");
            ret = chooser.showSaveDialog (null);
            if (ret == JFileChooser.APPROVE_OPTION)
                worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ());
            else
                System.exit (1);
        }
        else
            worker.setTarget (args[1]);
        if (2 >= args.length)
        {
            capture = (Boolean)JOptionPane.showInputDialog (
                null,
                "Should resources be captured:",
                "Capture Resources",
                JOptionPane.PLAIN_MESSAGE,
                null,
                new Object[] { Boolean.TRUE, Boolean.FALSE },
                Boolean.TRUE);
            if (null != capture)
                worker.setCaptureResources (capture.booleanValue ());
            else
                System.exit (1);
        }
        else
            worker.setCaptureResources (Boolean.valueOf (args[2]).booleanValue ());
        worker.capture ();
        System.exit (0);
    }
}
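
// Example invocation (class path and arguments illustrative, not prescribed
// by this file); with fewer than three arguments, main() falls back to the
// dialogs above:
//
//     java -classpath htmlparser.jar \
//         org.htmlparser.parserapplications.SiteCapturer \
//         http://example.com/site ./example.com true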