/*******************************************************************************
* CogTool Copyright Notice and Distribution Terms
* CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University
* This software is distributed under the terms of the FSF Lesser
* Gnu Public License (see LGPL.txt).
*
* CogTool is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* CogTool is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with CogTool; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* CogTool makes use of several third-party components, with the
* following notices:
*
* Eclipse SWT version 3.448
* Eclipse GEF Draw2D version 3.2.1
*
* Unless otherwise indicated, all Content made available by the Eclipse
* Foundation is provided to you under the terms and conditions of the Eclipse
* Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this
* Content and is also available at http://www.eclipse.org/legal/epl-v10.html.
*
* CLISP version 2.38
*
* Copyright (c) Sam Steingold, Bruno Haible 2001-2006
* This software is distributed under the terms of the FSF Gnu Public License.
* See COPYRIGHT file in clisp installation folder for more information.
*
* ACT-R 6.0
*
* Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere &
* John R Anderson.
* This software is distributed under the terms of the FSF Lesser
* Gnu Public License (see LGPL.txt).
*
* Apache Jakarta Commons-Lang 2.1
*
* This product contains software developed by the Apache Software Foundation
* (http://www.apache.org/)
*
* jopt-simple version 1.0
*
* Copyright (c) 2004-2013 Paul R. Holser, Jr.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Mozilla XULRunner 1.9.0.5
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/.
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
* License for the specific language governing rights and limitations
* under the License.
*
* The J2SE(TM) Java Runtime Environment version 5.0
*
* Copyright 2009 Sun Microsystems, Inc., 4150
* Network Circle, Santa Clara, California 95054, U.S.A. All
* rights reserved. U.S.
* See the LICENSE file in the jre folder for more information.
******************************************************************************/
package edu.cmu.cs.hcii.cogtool.controller;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry;
import edu.cmu.cs.hcii.cogtool.util.AggregateException;
import edu.cmu.cs.hcii.cogtool.util.Cancelable;
import edu.cmu.cs.hcii.cogtool.util.ProgressCallback;
/**
* Web crawler that uses the SWT Mozilla browser to visit each page,
* allowing for a DOM walk to parse child links and, if requested,
* acquiring the image of each page. It is expected that this crawler
* will do its work (the crawlWeb invocation) in a child thread.
*/
public class ImportWebCrawler extends WebCrawler
{
    /**
     * Strategy for importing one page: an implementation fetches the page
     * for a crawl entry and reports what it found as an
     * {@link ImportPageInfo}.  Per the original author's description, the
     * result covers the page's background image with its location/extent
     * and, for each child link, the link's label and location/extent.
     */
    public interface IImportURL
    {
        /**
         * Page information extended with optional background-image data.
         * All fields start out at their Java defaults (null / zero) and are
         * meant to be filled in by whoever fetches the page.
         */
        class ImportPageInfo extends WebCrawler.PageInfo
        {
            /** Background image data; null until assigned. */
            public byte[] background;

            /** X coordinate associated with the background image. */
            public int bkgImageX;

            /** Y coordinate associated with the background image. */
            public int bkgImageY;

            /** Width associated with the background image. */
            public int bkgImageWidth;

            /** Height associated with the background image. */
            public int bkgImageHeight;

            /**
             * @param pageURL the URL of the page, handed to the superclass
             */
            public ImportPageInfo(String pageURL)
            {
                super(pageURL);
            }
        }

        /**
         * Fetch the page described by the given crawl entry.
         *
         * @param entry the crawl entry identifying the page to fetch
         * @return the information gathered for that page
         */
        ImportPageInfo fetchPage(URLCrawlEntry entry);

        /**
         * @return any exceptions thrown by the last call to fetchPage
         */
        AggregateException getThrownExceptions();

        /**
         * Recover any system resources held by this importer.
         */
        void dispose();
    }

    /** Optional cancellation source; null means "never canceled here". */
    protected Cancelable cancelState;

    /** Optional progress sink; null means no progress is reported. */
    protected ProgressCallback progressState;

    // URL fetches are currently serialized, so a single importer suffices.
    protected IImportURL importURL;

    /** Optional set of URLs to exclude from crawling; null disables pruning. */
    protected Set<String> pruneURLs;

    /**
     * Initialize the web crawler.  Per the original author's note, this
     * must be invoked in the main thread.
     *
     * @param importAlgorithm the algorithm to use for importing each page;
     *                        it is dereferenced without a null check by
     *                        fetchPage, getThrownExceptions and dispose
     * @param cancelable may be null; if not, allows user to cancel the process
     * @param progressCB may be null; if not, used to indicate which URL
     *                   is currently being visited
     * @param pruneURLSet may be null; if not, URLs contained in this set
     *                    are excluded by addURLsToCrawl and shouldCrawlLink
     */
    public ImportWebCrawler(IImportURL importAlgorithm,
                            Cancelable cancelable,
                            ProgressCallback progressCB,
                            Set<String> pruneURLSet)
    {
        this.importURL = importAlgorithm;
        this.cancelState = cancelable;
        this.progressState = progressCB;
        this.pruneURLs = pruneURLSet;
    }

    /**
     * Adds URLs to the initial crawl queue.  When a prune set is present
     * (URLs fetched by a previous crawl), only entries whose URL is absent
     * from that set are queued; without a prune set the superclass
     * behavior is used unchanged.
     *
     * @param urls the candidate entries to queue
     */
    // TODO this appears never to be called, but I'm too chicken
    //      to just excise it
    @Override
    public void addURLsToCrawl(List<URLCrawlEntry> urls)
    {
        if (pruneURLs == null) {
            // Nothing to prune; defer entirely to the default behavior.
            super.addURLsToCrawl(urls);
            return;
        }

        for (URLCrawlEntry candidate : urls) {
            if (pruneURLs.contains(candidate.getURL())) {
                // Already covered by the prune set; skip it.
                continue;
            }

            urlsToCrawl.add(candidate);
        }
    }

    /**
     * Decides whether the crawl may proceed.  If a Cancelable was supplied,
     * the answer is simply whether it has not been canceled; otherwise the
     * superclass decides.
     *
     * @return true if crawling may continue
     */
    @Override
    protected boolean crawlMayContinue()
    {
        if (cancelState == null) {
            return super.crawlMayContinue();
        }

        return ! cancelState.isCanceled();
    }

    /**
     * Asks the superclass whether the entry needs crawling and, if so,
     * notifies the progress callback (when one exists) of the URL that is
     * about to be visited.
     *
     * @param entry the entry under consideration
     * @return true if the entry should be crawled
     */
    @Override
    protected boolean crawlNeeded(URLCrawlEntry entry)
    {
        if (! super.crawlNeeded(entry)) {
            return false;
        }

        if (progressState != null) {
            progressState.updateProgress(0.0, entry.getURL());
        }

        return true;
    }

    /**
     * A link is crawled only if the superclass accepts it and, when a prune
     * set exists (i.e., URLs fetched by a previous crawl), its URL is not
     * in that set.
     *
     * @param newLink the link under consideration
     * @return true if the link should be crawled
     */
    @Override
    protected boolean shouldCrawlLink(URLCrawlEntry newLink)
    {
        if (! super.shouldCrawlLink(newLink)) {
            return false;
        }

        if (pruneURLs == null) {
            return true;
        }

        return ! pruneURLs.contains(newLink.getURL());
    }

    /**
     * Delegates the page fetch to the IImportURL instance supplied at
     * construction.
     *
     * @param entry the entry whose page should be fetched
     * @return whatever the import algorithm produced for the entry
     */
    @Override
    protected PageInfo fetchPage(URLCrawlEntry entry)
    {
        IImportURL.ImportPageInfo fetched = importURL.fetchPage(entry);

        return fetched;
    }

    /**
     * @return the exceptions reported by the import algorithm for its
     *         last fetchPage call
     */
    public AggregateException getThrownExceptions()
    {
        return importURL.getThrownExceptions();
    }

    /**
     * Disposes the import algorithm.  Per the original author's note, this
     * must be called when done in order to recover Browser and Shell
     * resources.
     */
    public void dispose()
    {
        importURL.dispose();
    }
}