/******************************************************************************* * CogTool Copyright Notice and Distribution Terms * CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). * * CogTool is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * CogTool is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with CogTool; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * CogTool makes use of several third-party components, with the * following notices: * * Eclipse SWT version 3.448 * Eclipse GEF Draw2D version 3.2.1 * * Unless otherwise indicated, all Content made available by the Eclipse * Foundation is provided to you under the terms and conditions of the Eclipse * Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this * Content and is also available at http://www.eclipse.org/legal/epl-v10.html. * * CLISP version 2.38 * * Copyright (c) Sam Steingold, Bruno Haible 2001-2006 * This software is distributed under the terms of the FSF Gnu Public License. * See COPYRIGHT file in clisp installation folder for more information. * * ACT-R 6.0 * * Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere & * John R Anderson. * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). * * Apache Jakarta Commons-Lang 2.1 * * This product contains software developed by the Apache Software Foundation * (http://www.apache.org/) * * jopt-simple version 1.0 * * Copyright (c) 2004-2013 Paul R. Holser, Jr. * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * Mozilla XULRunner 1.9.0.5 * * The contents of this file are subject to the Mozilla Public License * Version 1.1 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/. * Software distributed under the License is distributed on an "AS IS" * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the * License for the specific language governing rights and limitations * under the License. * * The J2SE(TM) Java Runtime Environment version 5.0 * * Copyright 2009 Sun Microsystems, Inc., 4150 * Network Circle, Santa Clara, California 95054, U.S.A. All * rights reserved. U.S. * See the LICENSE file in the jre folder for more information. ******************************************************************************/ package edu.cmu.cs.hcii.cogtool.controller; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry; import edu.cmu.cs.hcii.cogtool.model.URLLabeledLink; /** * The basic algorithm for crawling a set of URLs. */ public class WebCrawler { /** * Throw this exception whenever parsing a URL * fails in some way. */ public static class URLParseError extends RuntimeException { public URLParseError(String url) { super("Malformed URL: " + url); } public URLParseError(String url, Throwable t) { super("Malformed URL: " + url, t); } } /** * Throw this exception whenever the crawl algorithm detects an error. */ public static class CrawlError extends RuntimeException { public CrawlError() { super("URL could not be fetched"); } public CrawlError(Throwable t) { super("URL could not be fetched", t); } } /** * Throw this exception whenever parsing fetched HTML for a URL * fails in some way. */ public static class HTMLParseError extends RuntimeException { public HTMLParseError() { super("HTML parse error"); } public HTMLParseError(Throwable t) { super("HTML parse error", t); } } /** * Whenever no maximum number of URLs to visit is specified, * the following maximum count is used. */ public static final int DEFAULT_MAX_TO_CRAWL = 500; /** * The representation of a page after being visited and parsed. * Keeps track of the page's URL, the protocol/host/port prefix, * and the nested child links. * Subclasses (or friends of subclasses) of WebCrawler may subclass this * to add information (e.g., the page's image). */ public static class PageInfo { public String url; // Child links public List<URLLabeledLink> links = new ArrayList<URLLabeledLink>(); public PageInfo(String pageURL) { url = pageURL; } } // Maps URL to PageInfo protected Map<String, PageInfo> crawledURLs = new LinkedHashMap<String, PageInfo>(); protected LinkedList<URLCrawlEntry> urlsToCrawl = new LinkedList<URLCrawlEntry>(); /** * Return the current queue of URLs to crawl. */ public List<URLCrawlEntry> getURLsToCrawl() { return urlsToCrawl; } public void addURLsToCrawl(List<URLCrawlEntry> urls) { urlsToCrawl.addAll(urls); } /** * Crawl the URL specifications contained by the given list -- the member * objects should be instances of URLCrawlEntry or a subclass. * The number of visits will be limited to DEFAULT_MAX_TO_CRAWL, * using an infinite default depth. Visits are performed breadth-first. * * Fetch resulting page descriptions afterward via getCrawledURLs(). * Each call to crawlWeb will add new descriptions to the collection. * * @param crawlEntries the list of URLCrawlEntry instances */ public void crawlWeb(List<URLCrawlEntry> crawlEntries) { // Stop after a default number of visits crawlWeb(crawlEntries, DEFAULT_MAX_TO_CRAWL); } /** * Determine whether the specified URL should be visited. */ protected boolean crawlNeeded(URLCrawlEntry entry) { // Subclasses may override; if so, // check super.crawlNeeded(entry) first. // No need to progress if URL has been crawled return ! crawledURLs.containsKey(entry.getURL()); } /** * Determine whether the next URL should be visited. */ protected boolean crawlMayContinue() { // Subclasses may override, especially if being performed // as a background thread and cancel is a possibility. return true; } /** * Utility constant for "documentation" purposes */ protected static final boolean IGNORE_CASE = true; /** * Allowed absolute prefixes */ protected static final String[] allowedProtocols = new String[] { "http:", "https:", "file:" }; /** * Allowed file extensions; probably should get this * list from a resource! TODO: */ protected static final String[] allowedExtensions = new String[] { ".htm", ".html", ".xhtml", ".shtml", ".php", ".jsp", ".asp", ".aspx", ".cfm", ".pl", ".py", ".rb" }; protected boolean isAllowedExtension(String url) { String extension = ""; try { String path = new URL(url).getPath(); int extPos = path.lastIndexOf('.'); if (extPos != -1) { extension = path.substring(extPos); } // otherwise, no extension! } catch (MalformedURLException e) { // Hmm; postpone dealing with this for now // TODO: Possibly, return false to eliminate from consideration? return true; } if (extension.length() == 0) { return true; } for (String allowedExtension: allowedExtensions) { if (allowedExtension.equalsIgnoreCase(extension)) { return true; } } return false; } /** * Determine whether the given child link should be added to the * crawl queue. Subclasses may override to allow for pruning. * Ensures that the URL's protocol is appropriate for crawling. * Also check file extensions that might represent HTML. * Possibly should also check toDepth? */ protected boolean shouldCrawlLink(URLCrawlEntry newLink) { // Subclasses may override; if so, it should check // super.shouldCrawlLink first. if (newLink.getToDepth() >= 0) { String url = newLink.getURL(); /* if(url.substring(0,5).equalsIgnoreCase("file:")) { //System.out.println("file " + url); try { FileReader file= new FileReader(url); } catch( FileNotFoundException ex) { return false; } }*/ for (String allowedProtocol : allowedProtocols) { if (url.regionMatches(IGNORE_CASE, 0, allowedProtocol, 0, allowedProtocol.length())) { if(newLink.getDomain().equals("Unrestricted")|| newLink.getURL().startsWith(newLink.getDomain())) { return isAllowedExtension(url); } } } } return false; } /** * Utility to parse the given string URL for its protocol/host/port prefix, * which may then be used to make a relative URL absolute. */ public static String getURLPrefix(String parentURL) { try { return getURLPrefix(new URL(parentURL)); } catch (IOException ex) { throw new URLParseError(parentURL, ex); } } /** * Utility to parse the given URL for its protocol/host/port prefix, * which may then be used to make a relative URL absolute. */ public static String getURLPrefix(URL parentURL) { String protocol = parentURL.getProtocol(); if (protocol != null) { String host = parentURL.getHost(); String path = parentURL.getPath(); if ((host != null) && ! host.equals("")) { int port = parentURL.getPort(); String portStr = (port == -1) ? "" : (":" + Integer.toString(port)); int dirEnd = path.lastIndexOf('/'); String dirPath = (dirEnd == -1) ? "" : path.substring(0, dirEnd); return protocol + "://" + host + portStr + dirPath; } if (protocol.toLowerCase().equals("file")) { File asFile = new File(path); return protocol + "://" + asFile.getParent(); } } return ""; } /** * Visit and parse the page associated with the given URL entry. * Return the page information. * * This provides a default implementation. * * CURRENTLY NOT COMPLETELY IMPLEMENTED. */ protected PageInfo fetchPage(URLCrawlEntry entry) { // Subclasses may override and provide a different implementation. BufferedReader urlReader = null; try { URL url = new URL(entry.getURL()); urlReader = new BufferedReader(new InputStreamReader(url.openStream())); // ... } catch (IOException ex) { throw new URLParseError(entry.getURL(), ex); } finally { if (urlReader != null) { try { urlReader.close(); } catch (IOException ex) { throw new URLParseError(entry.getURL(), ex); } } } return null; //... } /** * Crawl the URL specifications contained by the given list -- the member * objects should be instances of URLCrawlEntry or a subclass. * The number of visits will be limited to maxURLs, * using an infinite default depth. Visits are performed breadth-first. * * Fetch resulting page descriptions afterward via getCrawledURLs(). * Each call to crawlWeb will add new descriptions to the collection. * * @param crawlEntries the list of URLCrawlEntry instances * @param maxURLs the maximum number of valid pages to visit */ public void crawlWeb(List<URLCrawlEntry> crawlEntries, int maxURLs) { crawlWeb(crawlEntries, URLCrawlEntry.INFINITE_DEPTH, maxURLs); } /** * Crawl the URL specifications contained by the given list -- the member * objects should be instances of URLCrawlEntry or a subclass. * The number of visits will be limited to maxURLs, * using the given default depth. Visits are performed breadth-first. * * Fetch resulting page descriptions afterward via getCrawledURLs(). * Each call to crawlWeb will add new descriptions to the collection. * * @param crawlEntries the list of URLCrawlEntry instances * @param defaultDepth the default depth for URLs without specified depths * @param maxURLs the maximum number of valid pages to visit */ public void crawlWeb(List<URLCrawlEntry> crawlEntries, int defaultDepth, int maxURLs) { int numURLsCrawled = 0; // FIFO tracking of URLCrawlEntry's yet to crawl for(URLCrawlEntry entry: crawlEntries) { if(shouldCrawlLink(entry)) { urlsToCrawl.add(entry); } } // Continue fetching pages as long as there are pages in the queue // AND the number of pages fetched is below the maximum requested // AND the subclass thinks it's ok to continue (for example, // ImportWebCrawler's override of crawlMayContinue() checks // if the cancel button has been pushed) while (! urlsToCrawl.isEmpty() && (numURLsCrawled < maxURLs) && crawlMayContinue()) { // important to pick it off the front of the list (truly implement a fifo), // so we do a breadth first walk URLCrawlEntry nextEntry = urlsToCrawl.removeFirst(); // Strip #... fragment from the URL // only for root urls, anything lower down will have already been stripped nextEntry.stripFragment(); if (nextEntry.isEmpty()) { continue; // string is now empty! } // This part only helps those URLs initially in the list; // see below for part that makes relative links absolute. try { nextEntry.ensureAbsolute(); } catch (IOException ex) { throw new URLParseError(nextEntry.getURL(), ex); } // Check that we still need to crawl this entry; default // implementation checks that the entry hasn't already been seen. if (crawlNeeded(nextEntry)) { PageInfo urlPage = fetchPage(nextEntry); // If the page is acceptable, record and count it. if (urlPage != null) { numURLsCrawled++; // Update the count fetched this time // Record page's absolute URL; used by crawlNeeded() // to decide that this URL no longer needs to be fetched. crawledURLs.put(nextEntry.getURL(), urlPage); // If the depth for this page allows more crawling, // add its child links to the queue. int toDepth = nextEntry.getToDepth(); if (toDepth == URLCrawlEntry.USE_DEFAULT_DEPTH) { // can only happen at top level of the tree being walked toDepth = defaultDepth; } if (toDepth > 0) { Iterator<URLLabeledLink> newLinks = urlPage.links.iterator(); URL contextURL = null; // If needed, the URL of the parent page while (newLinks.hasNext()) { URLLabeledLink newLink = newLinks.next(); newLink.setDomain(nextEntry.getDomain()); // Again, the #... fragment is useless to us newLink.stripFragment(); // Ensure the transitive link is "absolute" // for protocol scheme check inside shouldCrawlLink if (! newLink.isAbsolute()) { if (contextURL == null) { try { // Get the URL of the current page // to use as the context for all // relative links that it contains contextURL = new URL(urlPage.url); } catch (IOException ex) { throw new URLParseError(urlPage.url, ex); } } // This will deal with "../" and other relative // path issues try { URL absoluteURL = new URL(contextURL, newLink.getURL()); newLink.setURL(absoluteURL.toString()); } catch (IOException ex) { throw new URLParseError(newLink.getURL(), ex); } } newLink.setToDepth(toDepth - 1); // Allow subclass to prune. If the child link // should be crawled, if (shouldCrawlLink(newLink)) { urlsToCrawl.add(newLink); } } } } } } } /** * Return the current collection of page descriptions of URLs visited. */ public Collection<PageInfo> getCrawledURLs() { return crawledURLs.values(); } }