/*******************************************************************************
* CogTool Copyright Notice and Distribution Terms
* CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University
* This software is distributed under the terms of the FSF Lesser
* Gnu Public License (see LGPL.txt).
*
* CogTool is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* CogTool is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with CogTool; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* CogTool makes use of several third-party components, with the
* following notices:
*
* Eclipse SWT version 3.448
* Eclipse GEF Draw2D version 3.2.1
*
* Unless otherwise indicated, all Content made available by the Eclipse
* Foundation is provided to you under the terms and conditions of the Eclipse
* Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this
* Content and is also available at http://www.eclipse.org/legal/epl-v10.html.
*
* CLISP version 2.38
*
* Copyright (c) Sam Steingold, Bruno Haible 2001-2006
* This software is distributed under the terms of the FSF Gnu Public License.
* See COPYRIGHT file in clisp installation folder for more information.
*
* ACT-R 6.0
*
* Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere &
* John R Anderson.
* This software is distributed under the terms of the FSF Lesser
* Gnu Public License (see LGPL.txt).
*
* Apache Jakarta Commons-Lang 2.1
*
* This product contains software developed by the Apache Software Foundation
* (http://www.apache.org/)
*
* jopt-simple version 1.0
*
* Copyright (c) 2004-2013 Paul R. Holser, Jr.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Mozilla XULRunner 1.9.0.5
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/.
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
* License for the specific language governing rights and limitations
* under the License.
*
* The J2SE(TM) Java Runtime Environment version 5.0
*
* Copyright 2009 Sun Microsystems, Inc., 4150
* Network Circle, Santa Clara, California 95054, U.S.A. All
* rights reserved. U.S.
* See the LICENSE file in the jre folder for more information.
******************************************************************************/
package edu.cmu.cs.hcii.cogtool.controller;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry;
import edu.cmu.cs.hcii.cogtool.model.URLLabeledLink;
/**
* The basic algorithm for crawling a set of URLs.
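 * <p>
 * A rough usage sketch (hypothetical: {@code MyWebCrawler} stands in for a
 * concrete subclass that supplies {@code fetchPage}, and the
 * {@code URLCrawlEntry} constructor shown is assumed):
 * <pre>{@code
 * WebCrawler crawler = new MyWebCrawler();
 * List<URLCrawlEntry> seeds = new ArrayList<URLCrawlEntry>();
 * seeds.add(new URLCrawlEntry("http://www.example.com/"));
 * crawler.crawlWeb(seeds, 2, 100);   // depth 2, at most 100 pages
 * for (WebCrawler.PageInfo page : crawler.getCrawledURLs()) {
 *     System.out.println(page.url + " (" + page.links.size() + " links)");
 * }
 * }</pre>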
*/
public class WebCrawler
{
/**
* Throw this exception whenever parsing a URL
* fails in some way.
*/
public static class URLParseError extends RuntimeException
{
public URLParseError(String url)
{
super("Malformed URL: " + url);
}
public URLParseError(String url, Throwable t)
{
super("Malformed URL: " + url, t);
}
}
/**
* Throw this exception whenever the crawl algorithm detects an error.
*/
public static class CrawlError extends RuntimeException
{
public CrawlError()
{
super("URL could not be fetched");
}
public CrawlError(Throwable t)
{
super("URL could not be fetched", t);
}
}
/**
* Throw this exception whenever parsing fetched HTML for a URL
* fails in some way.
*/
public static class HTMLParseError extends RuntimeException
{
public HTMLParseError()
{
super("HTML parse error");
}
public HTMLParseError(Throwable t)
{
super("HTML parse error", t);
}
}
/**
* Whenever no maximum number of URLs to visit is specified,
* the following maximum count is used.
*/
public static final int DEFAULT_MAX_TO_CRAWL = 500;
/**
* The representation of a page after being visited and parsed.
* Keeps track of the page's URL, the protocol/host/port prefix,
* and the nested child links.
* Subclasses (or friends of subclasses) of WebCrawler may subclass this
* to add information (e.g., the page's image).
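     * <p>
     * A minimal sketch of such a subclass (the class and field names here are
     * hypothetical):
     * <pre>{@code
     * class CapturedPageInfo extends WebCrawler.PageInfo {
     *     byte[] imageBytes;                 // e.g., a rendered screenshot
     *     CapturedPageInfo(String pageURL) { super(pageURL); }
     * }
     * }</pre>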
*/
public static class PageInfo
{
public String url;
// Child links
public List<URLLabeledLink> links = new ArrayList<URLLabeledLink>();
public PageInfo(String pageURL)
{
url = pageURL;
}
}
// Maps URL to PageInfo
protected Map<String, PageInfo> crawledURLs =
new LinkedHashMap<String, PageInfo>();
protected LinkedList<URLCrawlEntry> urlsToCrawl =
new LinkedList<URLCrawlEntry>();
/**
* Return the current queue of URLs to crawl.
*/
public List<URLCrawlEntry> getURLsToCrawl()
{
return urlsToCrawl;
}
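    /**
     * Append the given entries to the queue of URLs to crawl.
     */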
public void addURLsToCrawl(List<URLCrawlEntry> urls)
{
urlsToCrawl.addAll(urls);
}
/**
* Crawl the URL specifications contained by the given list -- the member
* objects should be instances of URLCrawlEntry or a subclass.
* The number of visits will be limited to DEFAULT_MAX_TO_CRAWL,
* using an infinite default depth. Visits are performed breadth-first.
*
* Fetch resulting page descriptions afterward via getCrawledURLs().
* Each call to crawlWeb will add new descriptions to the collection.
*
* @param crawlEntries the list of URLCrawlEntry instances
*/
public void crawlWeb(List<URLCrawlEntry> crawlEntries)
{
// Stop after a default number of visits
crawlWeb(crawlEntries, DEFAULT_MAX_TO_CRAWL);
}
/**
* Determine whether the specified URL should be visited.
*/
protected boolean crawlNeeded(URLCrawlEntry entry)
{
// Subclasses may override; if so,
// check super.crawlNeeded(entry) first.
// No need to progress if URL has been crawled
return ! crawledURLs.containsKey(entry.getURL());
}
/**
* Determine whether the next URL should be visited.
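     * <p>
     * A sketch of the kind of override a cancellable subclass might provide
     * (the {@code cancelRequested} flag here is hypothetical):
     * <pre>{@code
     * protected boolean crawlMayContinue() {
     *     return super.crawlMayContinue() && ! cancelRequested;
     * }
     * }</pre>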
*/
protected boolean crawlMayContinue()
{
        // Subclasses may override, especially if the crawl is being run on a
        // background thread and cancellation is a possibility.
return true;
}
/**
* Utility constant for "documentation" purposes
*/
protected static final boolean IGNORE_CASE = true;
/**
* Allowed absolute prefixes
*/
protected static final String[] allowedProtocols =
new String[] { "http:", "https:", "file:" };
/**
     * Allowed file extensions; TODO: this list should probably
     * come from a resource.
*/
protected static final String[] allowedExtensions =
new String[] { ".htm", ".html", ".xhtml", ".shtml",
".php", ".jsp", ".asp", ".aspx",
".cfm", ".pl", ".py", ".rb" };
protected boolean isAllowedExtension(String url)
{
String extension = "";
try {
String path = new URL(url).getPath();
int extPos = path.lastIndexOf('.');
if (extPos != -1) {
extension = path.substring(extPos);
}
// otherwise, no extension!
}
catch (MalformedURLException e) {
// Hmm; postpone dealing with this for now
// TODO: Possibly, return false to eliminate from consideration?
return true;
}
if (extension.length() == 0) {
return true;
}
for (String allowedExtension: allowedExtensions) {
if (allowedExtension.equalsIgnoreCase(extension)) {
return true;
}
}
return false;
}
/**
* Determine whether the given child link should be added to the
* crawl queue. Subclasses may override to allow for pruning.
* Ensures that the URL's protocol is appropriate for crawling.
* Also check file extensions that might represent HTML.
* Possibly should also check toDepth?
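     * <p>
     * A pruning override might look like the following sketch (the substring
     * filter is purely illustrative):
     * <pre>{@code
     * protected boolean shouldCrawlLink(URLCrawlEntry newLink) {
     *     return super.shouldCrawlLink(newLink)
     *            && ! newLink.getURL().contains("/archive/");
     * }
     * }</pre>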
*/
protected boolean shouldCrawlLink(URLCrawlEntry newLink)
{
// Subclasses may override; if so, it should check
// super.shouldCrawlLink first.
if (newLink.getToDepth() >= 0) {
String url = newLink.getURL();
for (String allowedProtocol : allowedProtocols) {
if (url.regionMatches(IGNORE_CASE, 0,
allowedProtocol, 0,
allowedProtocol.length()))
{
if(newLink.getDomain().equals("Unrestricted")|| newLink.getURL().startsWith(newLink.getDomain()))
{
return isAllowedExtension(url);
}
}
}
}
return false;
}
/**
* Utility to parse the given string URL for its protocol/host/port prefix,
* which may then be used to make a relative URL absolute.
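     * <p>
     * For example, {@code getURLPrefix("http://example.com:8080/docs/index.html")}
     * returns {@code "http://example.com:8080/docs"}.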
*/
public static String getURLPrefix(String parentURL)
{
try {
return getURLPrefix(new URL(parentURL));
}
catch (IOException ex) {
throw new URLParseError(parentURL, ex);
}
}
/**
* Utility to parse the given URL for its protocol/host/port prefix,
* which may then be used to make a relative URL absolute.
*/
public static String getURLPrefix(URL parentURL)
{
String protocol = parentURL.getProtocol();
if (protocol != null) {
String host = parentURL.getHost();
String path = parentURL.getPath();
if ((host != null) && ! host.equals("")) {
int port = parentURL.getPort();
String portStr =
(port == -1) ? "" : (":" + Integer.toString(port));
int dirEnd = path.lastIndexOf('/');
String dirPath =
(dirEnd == -1) ? "" : path.substring(0, dirEnd);
return protocol + "://" + host + portStr + dirPath;
}
if (protocol.toLowerCase().equals("file")) {
File asFile = new File(path);
return protocol + "://" + asFile.getParent();
}
}
return "";
}
/**
* Visit and parse the page associated with the given URL entry.
* Return the page information.
*
* This provides a default implementation.
*
* CURRENTLY NOT COMPLETELY IMPLEMENTED.
*/
protected PageInfo fetchPage(URLCrawlEntry entry)
{
// Subclasses may override and provide a different implementation.
BufferedReader urlReader = null;
try {
URL url = new URL(entry.getURL());
urlReader =
new BufferedReader(new InputStreamReader(url.openStream()));
// ...
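            // A complete implementation would read the page text from urlReader,
            // parse it for anchor tags, and populate a PageInfo whose links list
            // holds one URLLabeledLink per anchor found.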
}
catch (IOException ex) {
throw new URLParseError(entry.getURL(), ex);
}
finally {
if (urlReader != null) {
try {
urlReader.close();
}
catch (IOException ex) {
throw new URLParseError(entry.getURL(), ex);
}
}
}
return null; //...
}
/**
* Crawl the URL specifications contained by the given list -- the member
* objects should be instances of URLCrawlEntry or a subclass.
* The number of visits will be limited to maxURLs,
* using an infinite default depth. Visits are performed breadth-first.
*
* Fetch resulting page descriptions afterward via getCrawledURLs().
* Each call to crawlWeb will add new descriptions to the collection.
*
* @param crawlEntries the list of URLCrawlEntry instances
* @param maxURLs the maximum number of valid pages to visit
*/
public void crawlWeb(List<URLCrawlEntry> crawlEntries, int maxURLs)
{
crawlWeb(crawlEntries, URLCrawlEntry.INFINITE_DEPTH, maxURLs);
}
/**
* Crawl the URL specifications contained by the given list -- the member
* objects should be instances of URLCrawlEntry or a subclass.
* The number of visits will be limited to maxURLs,
* using the given default depth. Visits are performed breadth-first.
*
* Fetch resulting page descriptions afterward via getCrawledURLs().
* Each call to crawlWeb will add new descriptions to the collection.
*
* @param crawlEntries the list of URLCrawlEntry instances
* @param defaultDepth the default depth for URLs without specified depths
* @param maxURLs the maximum number of valid pages to visit
*/
public void crawlWeb(List<URLCrawlEntry> crawlEntries,
int defaultDepth,
int maxURLs)
{
int numURLsCrawled = 0;
        // Seed the FIFO queue of URLCrawlEntry's yet to be crawled
        for (URLCrawlEntry entry : crawlEntries) {
            if (shouldCrawlLink(entry)) {
                urlsToCrawl.add(entry);
            }
        }
// Continue fetching pages as long as there are pages in the queue
// AND the number of pages fetched is below the maximum requested
// AND the subclass thinks it's ok to continue (for example,
// ImportWebCrawler's override of crawlMayContinue() checks
// if the cancel button has been pushed)
while (! urlsToCrawl.isEmpty() &&
(numURLsCrawled < maxURLs) &&
crawlMayContinue())
{
            // Important: take the entry off the front of the list (a true FIFO)
            // so that the walk is breadth-first.
            URLCrawlEntry nextEntry = urlsToCrawl.removeFirst();
            // Strip any #... fragment from the URL; this is only needed for root
            // URLs, since links found lower down have already been stripped.
nextEntry.stripFragment();
if (nextEntry.isEmpty()) {
continue; // string is now empty!
}
// This part only helps those URLs initially in the list;
// see below for part that makes relative links absolute.
try {
nextEntry.ensureAbsolute();
}
catch (IOException ex) {
throw new URLParseError(nextEntry.getURL(), ex);
}
// Check that we still need to crawl this entry; default
// implementation checks that the entry hasn't already been seen.
if (crawlNeeded(nextEntry)) {
PageInfo urlPage = fetchPage(nextEntry);
// If the page is acceptable, record and count it.
if (urlPage != null) {
numURLsCrawled++; // Update the count fetched this time
// Record page's absolute URL; used by crawlNeeded()
// to decide that this URL no longer needs to be fetched.
crawledURLs.put(nextEntry.getURL(), urlPage);
// If the depth for this page allows more crawling,
// add its child links to the queue.
int toDepth = nextEntry.getToDepth();
if (toDepth == URLCrawlEntry.USE_DEFAULT_DEPTH) {
// can only happen at top level of the tree being walked
toDepth = defaultDepth;
}
if (toDepth > 0) {
Iterator<URLLabeledLink> newLinks =
urlPage.links.iterator();
                        // The URL of the parent page, if needed as a context
                        // for resolving relative links
                        URL contextURL = null;
while (newLinks.hasNext()) {
URLLabeledLink newLink = newLinks.next();
newLink.setDomain(nextEntry.getDomain());
// Again, the #... fragment is useless to us
newLink.stripFragment();
// Ensure the transitive link is "absolute"
// for protocol scheme check inside shouldCrawlLink
if (! newLink.isAbsolute()) {
if (contextURL == null) {
try {
// Get the URL of the current page
// to use as the context for all
// relative links that it contains
contextURL = new URL(urlPage.url);
}
catch (IOException ex) {
throw new URLParseError(urlPage.url,
ex);
}
}
// This will deal with "../" and other relative
// path issues
try {
URL absoluteURL =
new URL(contextURL, newLink.getURL());
newLink.setURL(absoluteURL.toString());
}
catch (IOException ex) {
throw new URLParseError(newLink.getURL(),
ex);
}
}
newLink.setToDepth(toDepth - 1);
// Allow subclass to prune. If the child link
// should be crawled,
if (shouldCrawlLink(newLink)) {
urlsToCrawl.add(newLink);
}
}
}
}
}
}
}
/**
* Return the current collection of page descriptions of URLs visited.
*/
public Collection<PageInfo> getCrawledURLs()
{
return crawledURLs.values();
}
}